def pipeline(x_train, y_train, x_val, y_val, x_test, y_test, num_classes,
             img_rows, img_cols, batch_size, epochs, augment):
    # Pre-process all three splits and recover the model's input shape
    x_train, y_train, x_val, y_val, x_test, y_test, input_shape = pre_process(
        x_train, y_train, x_val, y_val, x_test, y_test, num_classes,
        img_rows, img_cols)
    # Train on the processed data, optionally with augmentation
    model, history = apply_training(x_train, y_train, x_val, y_val, batch_size,
                                    epochs, input_shape, num_classes, augment)
    return (model, history, x_train, y_train, x_val, y_val, x_test, y_test,
            input_shape)
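# Hypothetical usage sketch (not part of the original project): it only shows how
# pipeline() above could be called once train/val/test splits exist. The random
# arrays, class count, image size and hyperparameter values are illustrative
# assumptions.
import numpy as np

x_tr, y_tr = np.random.rand(100, 28, 28), np.random.randint(0, 10, 100)
x_va, y_va = np.random.rand(20, 28, 28), np.random.randint(0, 10, 20)
x_te, y_te = np.random.rand(20, 28, 28), np.random.randint(0, 10, 20)

model, history, *processed_splits = pipeline(
    x_tr, y_tr, x_va, y_va, x_te, y_te,
    num_classes=10, img_rows=28, img_cols=28,
    batch_size=32, epochs=5, augment=False)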
def data2image(data_vect, labels, num_classes, one_hot):
    # input image dimensions
    fv = StructDataTransformer()
    fv.fit(data_vect.to_numpy(), labels.to_numpy())
    img_rows, img_cols = fv.image_dim, fv.image_dim
    # pre-processing
    fv.imgs, labels, input_shape, data_mn, data_std = pre_process(
        fv.imgs, img_rows, img_cols, labels, num_classes, one_hot)
    return fv, labels, input_shape, data_mn, data_std
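# Hypothetical call sketch (an illustrative assumption, not from the original
# code): data2image() calls .to_numpy() on both arguments, so pandas objects are
# assumed. The random frame below is a placeholder for real structured data.
import numpy as np
import pandas as pd

df = pd.DataFrame(np.random.rand(100, 16))
y = pd.Series(np.random.randint(0, 2, 100))
fv, y_enc, input_shape, data_mn, data_std = data2image(df, y, num_classes=2,
                                                       one_hot=True)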
def cross_validation(hyper_parameters):
    # extract hyper parameters of the neural net
    features = hyper_parameters[0]
    num_neurons = hyper_parameters[1]
    p_value = hyper_parameters[2]
    lr_1 = hyper_parameters[3]
    lr_2 = hyper_parameters[4]
    lr_3 = hyper_parameters[5]

    # start training and testing
    # pre-process the data, using the function defined in preprocessing.py
    data = pre_process()

    # keep only chosen features
    columns = np.append(features, [1] * output_size)
    features_idx = [i for i, x in enumerate(columns) if x == 1]
    data = data.iloc[:, features_idx]

    # split data for later use (k cross validation)
    splitted_data = np.split(data, k_cross_validation)

    # train using cross validation
    all_train_losses = []
    all_test_losses = []
    all_train_correctness = []
    all_test_correctness = []
    for i in range(k_cross_validation):
        # extract train and test data, split input and target
        X_train, Y_train = train_data(splitted_data, i)
        X_test, Y_test = test_data(splitted_data, i)

        # train the model and print loss, confusion matrix and correctness
        reg_model, loss, correctness = train(X_train, Y_train, num_neurons,
                                             p_value, lr_1, lr_2, lr_3)

        # test the model on test data
        test_loss, test_correctness = test(X_test, Y_test, reg_model)

        # append losses and correctness
        all_train_losses.append(loss)
        all_test_losses.append(test_loss)
        all_train_correctness.append(correctness)
        all_test_correctness.append(test_correctness)

    # print average loss and correctness on training and testing data
    train_loss_avg = (sum(all_train_losses) / len(all_train_losses)).item()
    test_loss_avg = (sum(all_test_losses) / len(all_test_losses)).item()
    print('average loss on training data', train_loss_avg)
    print('average loss on testing data', test_loss_avg)
    train_correctness_avg = sum(all_train_correctness) / len(all_train_correctness)
    test_correctness_avg = sum(all_test_correctness) / len(all_test_correctness)
    print('average correctness on training data', train_correctness_avg)
    print('average correctness on testing data', test_correctness_avg)
    print('')

    # display performance of each model
    if plot_each_run:
        # losses
        plt.figure()
        plt.plot(all_train_losses, label='training data', color='blue')
        plt.plot(all_test_losses, label='testing data', color='red')
        plt.axhline(y=train_loss_avg, linestyle=':',
                    label='training data average loss', color='blue')
        plt.axhline(y=test_loss_avg, linestyle=':',
                    label='testing data average loss', color='red')
        plt.legend()
        plt.title('losses of model on training and testing data')
        plt.show()

        # correctness
        plt.figure()
        plt.plot(all_train_correctness, label='training data', color='blue')
        plt.plot(all_test_correctness, label='testing data', color='red')
        plt.axhline(y=train_correctness_avg, linestyle=':',
                    label='training data average correctness', color='blue')
        plt.axhline(y=test_correctness_avg, linestyle=':',
                    label='testing data average correctness', color='red')
        plt.legend()
        plt.title('correctness of model on training and testing data')
        plt.show()

    print("settings: ", features_idx, num_neurons, p_value, lr_1, lr_2, lr_3)
    print("---------------------------------------\n")

    return test_correctness_avg, train_correctness_avg, test_loss_avg, train_loss_avg
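# Hypothetical call sketch (values are assumptions, not from the original code):
# hyper_parameters packs a binary feature mask followed by the number of neurons,
# the p_value setting and three learning rates, matching the indices unpacked at
# the top of cross_validation().
feature_mask = [1, 0, 1, 1, 0, 1]  # which input columns to keep
hp = [feature_mask, 64, 0.5, 1e-2, 1e-3, 1e-4]
test_acc, train_acc, test_loss, train_loss = cross_validation(hp)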
def run():
    train_x, train_y = get_data('Dataset/haspeede2_dev_taskAB.tsv')
    lang = ['italian']
    models = ['log_reg']
    embeddings = ['tfidf']

    corpus = []
    for i in range(0, len(train_x)):
        sentence = preprocessing.pre_process(train_x[i].lower(), False)
        if sentence != ' ':
            # sentence = get_stem(lang[1], sentence)
            corpus.append(sentence)
        else:
            train_y = np.delete(train_y, i, 0)
    train_x = corpus

    best_score = 0.0
    best_model = None
    best_embedding = None
    for e in embeddings:
        embedded_train_x = get_embeddings(lang[0], train_x, "train_", e, False)
        for m in models:
            print("Lang: " + lang[0] + "\tEmbeddings: " + e + "\tModel: " + m)
            score, model = classifier(embedded_train_x, train_y, m)
            print("SCORE: " + str(score))
            if score > best_score:
                best_score = score
                best_model = model
                best_embedding = e

    print("BEST MODEL: ")
    print(best_model)
    print(m + " " + best_embedding + ": " + str(score))

    embedded_train_x = get_embeddings(lang[0], train_x, "train_", best_embedding, False)
    best_model.fit(embedded_train_x, train_y)
    with open("taskA/model_" + m + "_" + e + ".pk", "wb") as fout:
        pickle.dump(best_model, fout)
    print("Model saved.")

    ##### TESTING on TWEETS #####
    test = []
    test_x = get_test_data('Dataset/haspeede2_test_taskAB-tweets.tsv')
    for i in range(0, len(test_x)):
        sentence = preprocessing.pre_process(test_x[i].lower(), False)
        test.append(sentence)
    test_x = test
    test_x = get_embeddings(lang[0], test_x, "test_tweets_", best_embedding, True)
    test_y = best_model.predict(test_x)
    with open("taskA/test_y_tweets_" + m + e + ".pk", "wb") as testyout:
        pickle.dump(test_y, testyout)

    ##### TESTING on NEWS #####
    test = []
    test_x = get_test_data('Dataset/haspeede2-test_taskAB-news.tsv')
    for i in range(0, len(test_x)):
        sentence = preprocessing.pre_process(test_x[i].lower(), False)
        test.append(sentence)
    test_x = test
    test_x = get_embeddings(lang[0], test_x, "test_news_", best_embedding, True)
    test_y = best_model.predict(test_x)
    with open("taskA/test_y_news_" + m + "_" + e + ".pk", "wb") as testyout:
        pickle.dump(test_y, testyout)

    return
        imp_numerical = Imputer(missing_values='NaN', strategy='mean', axis=0,
                                copy=False)
        val[:, j] = imp_numerical.fit_transform(val[:, j].reshape(-1, 1)).T

    # label encoder
    for j in range(val.shape[1]):
        if j == val.shape[1] - 1 or categorical[j]:
            val[:, j] = le.fit_transform(val[:, j])

    # one-hot encoding and standardization
    data_numeric = val[:, :-1]
    data_labels = val[:, -1]
    data_numeric, categorical = preprocessing.pre_process(
        raw_data=data_numeric, categorical=categorical, impute=False,
        standardize=True, one_hot_encode=True)
    val = np.append(data_numeric, np.array(data_labels, ndmin=2).T, axis=1)

    pd.DataFrame(val, index=None, columns=None).to_csv(
        '{}/{}.csv'.format(dirname, dataset_name), index=None, header=None)
    print("dataset {} finished \n \n".format(dataset_name))
except:
    print("!!! error encountered on {} \n \n".format(dataset_name))
import preprocessing
from preprocessing import pre_process
import math

features = ['Budget', 'Runtime', 'vote_average', 'Popularity', 'vote_count']
complex_features = ['Genres']


class DenseTransformer(TransformerMixin):

    def transform(self, X, y=None, **fit_params):
        return X.todense()

    def fit_transform(self, X, y=None, **fit_params):
        self.fit(X, y, **fit_params)
        return self.transform(X)

    def fit(self, X, y=None, **fit_params):
        return self


dynamodb = boto3.resource('dynamodb')
table = dynamodb.Table('movies')
rows = table.scan(FilterExpression=Attr('Budget').gt(1000))['Items']

data, target = pre_process(rows, features, complex_features)

reg = make_pipeline(DictVectorizer(), DenseTransformer(), LinearRegression())
scores = cross_val_score(reg, data, target, cv=10)
print(scores)
print(np.mean(np.array(scores)))
def confusion_matrix(Y, Y_predicted):
    confusion = torch.zeros(5, 5)
    correct_num = 0
    for i in range(Y.shape[0]):
        actual_class = interpret_output(Y[i])
        predicted_class = interpret_output(Y_predicted[i])
        confusion[actual_class[1]][predicted_class[1]] += 1
        if actual_class == predicted_class:
            correct_num += 1
    return confusion, correct_num


################################ main ###################################
if __name__ == "__main__":
    # pre-process the data, using the function defined in preprocessing.py
    data = pre_process()

    # split data for later use (k cross validation)
    splitted_data = np.split(data, k_cross_validation)

    # train using cross validation
    all_train_losses = []
    all_test_losses = []
    all_train_correctness = []
    all_test_correctness = []
    for i in range(k_cross_validation):
        # extract train and test data, split input and target
        X_train, Y_train = train_data(splitted_data, i)
        X_test, Y_test = test_data(splitted_data, i)

        # train the model and print loss, confusion matrix and correctness
### WANDB Run ###
import wandb

import logConfig, load_data, accuracy_loss, train, preprocessing, plot

train_X, train_Y, test_X, test_Y, labels = load_data.load_data()

(N, w, h), n_labels = train_X.shape, len(labels)

# Dimension of datapoints
d = w * h

# Data Preprocessing
(train_x, train_y), (val_x, val_y), (test_x, test_y) = preprocessing.pre_process(
    d, n_labels, train_X, train_Y, test_X, test_Y)


def main(config=None):
    run = wandb.init(config=config, resume=True)
    config = wandb.config

    hl = [config.hidden_layer_size] * config.hidden_layers  # Hidden layers
    ol = [len(train_y[0])]  # Output layers
    n_hl = len(hl)

    name = "hl_" + str(config.hidden_layers) + "_bs_" + str(config.batch_size) + "_ac_" + config.ac
    run.name = name
    logConfig.logConfig(config)

    # Set Loss function here
    loss_functions = ["cross_entropy", "sq_loss"]
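# Hypothetical sweep configuration (an assumption for illustration, not taken from
# the original repo): it lists the config fields main() reads above
# (hidden_layers, hidden_layer_size, batch_size, ac). One possible way to drive
# main() is a W&B sweep; the project name below is a placeholder.
sweep_config = {
    "method": "grid",
    "parameters": {
        "hidden_layers": {"values": [2, 3]},
        "hidden_layer_size": {"values": [64, 128]},
        "batch_size": {"values": [32, 64]},
        "ac": {"values": ["relu", "tanh"]},
    },
}
# sweep_id = wandb.sweep(sweep_config, project="example-project")
# wandb.agent(sweep_id, function=main)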
def sentiment(movie_name):
    # print(datetime.now())
    model = joblib.load('./Sentimov/sentiment/model.sav')
    feature_words = fwords.create_feature_words('./Sentimov/sentiment/feature_words.txt')
    emo_list = fwords.create_feature_words('./Sentimov/sentiment/emo_list.txt')
    word_features = feature_words + emo_list

    name = movie_name.replace(" ", "").replace(":", "").replace("'", "").replace("-", "")
    hashtag = str("#" + name)

    inputf = open("./Sentimov/sentiment/" + name + ".txt")
    lines = inputf.readlines()
    inputf.close()

    output = open("./Sentimov/sentiment/" + name + '.tsv', 'w')

    tweetlist = []
    pos_list = []
    neg_list = []
    neu_list = []
    # count = 0
    for line in lines:
        # count = count + 1
        if line != "\n":
            try:
                d = json.loads(line)
                text = d[u'text']
                tweet = preprocessing.pre_process(text, hashtag)
                text = text.replace("\n", "")
                text = html_parser.unescape(text).encode('utf-8')
                tweetlist.append(tweet)

                # remove stopwords and special characters
                stop_words = set(stopwords.words('english'))
                # word_tokens = TweetTokenizer.tokenize(tweet)
                word_tokens = word_tokenize(tweet)
                # word_tokens = cleanText.remove_repeated_characters(word_tokens)
                label = model.classify(fwords.find_features(word_tokens, word_features))

                arr = []
                if 'retweeted_status' in d:
                    arr.append(d[u'retweeted_status'][u'text'])
                    arr.append(d[u'created_at'])
                    arr.append(d[u'retweeted_status'][u'user'][u'screen_name'])
                    arr.append(d[u'retweeted_status'][u'id_str'])
                    arr.append(d[u'retweeted_status'][u'user'][u'profile_image_url_https'])
                    arr.append("retweeted")
                    arr.append(d[u'user'][u'screen_name'])
                else:
                    arr.append(text)
                    arr.append(d[u'created_at'])
                    arr.append(d[u'user'][u'screen_name'])
                    arr.append(d[u'id_str'])
                    arr.append(d[u'user'][u'profile_image_url_https'])
                    arr.append("not_retweeted")

                if label == "positive":
                    pos_list.append(arr)
                    output.write(str(text) + "\tpositive\n")
                elif label == "negative":
                    neg_list.append(arr)
                    output.write(str(text) + "\tnegative\n")
                else:
                    neu_list.append(arr)
                    output.write(str(text) + "\tneutral\n")
            except:
                pass

    output.close()
    result = {'pos': pos_list, 'neg': neg_list, 'neu': neu_list}
    # print(result)
    return result
        [tokenized_sentences[i] for i in [pair[0] for pair in top_sentences]])
    return summary


if __name__ == '__main__':
    article_no = int(parse_args().article_no)
    print '==== Summarising article No: {} ===='.format(article_no)

    news_data = preprocessing.get_news_data_from_csv()
    print news_data[article_no]['title']
    print 'Article'
    print news_data[article_no]['article']

    clean_data = preprocessing.pre_process(news_data)
    print '==== Data Pre Processing Complete ===='

    lemmatiser = WordNetLemmatizer()
    stopwords_list = get_stopwords()

    corpus_data = map(lambda record: record['article'], clean_data)
    corpus_data = set(corpus_data)

    # Uncomment the following lines, if retraining the Count Vectorizer
    # count_vect = CountVectorizer()
    # count_vect = count_vect.fit(corpus_data)
    # util.save_to_disk(count_vect, current_directory + '/pickle_objects/count_vect')
    count_vect = util.load_from_disk(current_directory + '/pickle_objects/count_vect')

    freq_term_matrix = count_vect.transform(corpus_data)
    test_path = os.path.join(output_path, "test")
    for i, author in enumerate(authors):
        # bag = Counter()
        author_path = os.path.join(test_path, author)
        files_of_author = list_files(author_path)
        for filename in files_of_author:
            file_path = os.path.join(author_path, filename)
            tokens = tokenize_file(file_path)
            author_candidates = calculate_probability_of_author(
                tokens=tokens, training_bags=training_bags, doc_counts=doc_counts)
            candidate_index = authors.index(author_candidates[0][0])
            confusion_matrix[i, candidate_index] += 1
    # print(confusion)
    return confusion_matrix


if __name__ == "__main__":
    print("main func")
    num_documents_by_author = pre_process()
    training_bag_of_author, doc_count_of_author = create_BOW()
    confusion = calculate_confusion_matrix(
        training_bags=training_bag_of_author, doc_counts=doc_count_of_author)
    # print(confusion)
    np.savetxt('confusion.txt', confusion, fmt='%d')
    print('tp : {:.2f}%'.format(100 * sum(np.diag(confusion)) / sum(sum(confusion))))