def main():
    """Machine-translation baseline.

    Trains a LinearSVC on English averaged word vectors and evaluates it on
    machine-translated target-language data (es, ca, eu), for both the binary
    and the fine-grained (4-class) setting. Predictions are written under
    predictions/<lang>/mt/.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-dataset',
                        default='opener_sents',
                        help="dataset to train and test on (default: opener)")
    args = parser.parse_args()

    vecs = WordVecs('/home/jeremy/NS/Keep/Temp/Exps/EMBEDDINGS/BLSE/google.txt')

    # English training data: fine-grained and binary variants share the same
    # averaged-embedding representation.
    en_path = os.path.join('datasets', 'en', args.dataset)
    en = General_Dataset(en_path, vecs, one_hot=False, rep=ave_vecs,
                         lowercase=False)
    en_binary = General_Dataset(en_path, vecs, one_hot=False, rep=ave_vecs,
                                binary=True, lowercase=False)

    def run_setting(header, train, test, pred_path):
        # Tune C on the dev split, refit, score on the translated test set
        # and dump the predictions for later inspection.
        print(header)
        best_c, best_f1 = get_best_C(train, test)
        clf = LinearSVC(C=best_c)
        clf.fit(train._Xtrain, train._ytrain)
        acc, prec, rec, f1 = scores(clf, test)
        print_prediction(clf, test, pred_path)
        print('acc: {0:.3f}'.format(acc))
        print('prec: {0:.3f}'.format(prec))
        print('rec: {0:.3f}'.format(rec))
        print('f1: {0:.3f}'.format(f1))

    for lang in ['es', 'ca', 'eu']:
        print('#### {0} ####'.format(lang))
        trans_path = os.path.join('datasets', 'trans', lang, args.dataset)
        cross_dataset = General_Dataset(trans_path, vecs, one_hot=False,
                                        rep=ave_vecs, lowercase=False)
        binary_cross_dataset = General_Dataset(trans_path, vecs, one_hot=False,
                                               rep=ave_vecs, binary=True,
                                               lowercase=False)
        run_setting('-binary-', en_binary, binary_cross_dataset,
                    os.path.join('predictions', lang, 'mt',
                                 '{0}-bi.txt'.format(args.dataset)))
        run_setting('-fine-', en, cross_dataset,
                    os.path.join('predictions', lang, 'mt',
                                 '{0}-4cls.txt'.format(args.dataset)))
def main():
    """Train an RNN-based BLSE model (rnn_blse or rnn_attn_blse) on the
    source language and evaluate it on the target-language test set.

    Side effects: creates the save directory, writes checkpoints under
    <savedir>/<model>/<dataset>-<target_lang>-<bi|4cls>, and writes a report
    under results/.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-sl', '--source_lang',
                        help="source language: es, ca, eu, en (default: en)",
                        default='en')
    parser.add_argument('-tl', '--target_lang',
                        help="target language: es, ca, eu, en (default: es)",
                        default='es')
    parser.add_argument('-bi', '--binary',
                        help="binary or 4-class (default: True)",
                        default=True, type=str2bool)
    parser.add_argument('-e', '--epochs',
                        help="training epochs (default: 200)",
                        default=200, type=int)
    # Help text used to claim .001 while the actual default is .1.
    parser.add_argument('-a', '--alpha',
                        help="trade-off between projection and classification objectives (default: 0.1)",
                        default=.1, type=float)
    parser.add_argument('-pl', '--proj_loss',
                        help="projection loss: mse, cosine (default: cosine)",
                        default='cosine')
    # Help text used to claim 50 while the actual default is 21.
    parser.add_argument('-bs', '--batch_size',
                        help="classification batch size (default: 21)",
                        default=21, type=int)
    parser.add_argument('-sv', '--src_vecs',
                        help=" source language vectors (default: GoogleNewsVecs )",
                        default='embeddings/original/google.txt')
    parser.add_argument('-tr', '--trans',
                        help='translation pairs (default: Bing Liu Sentiment Lexicon Translations)',
                        default='bingliu')
    parser.add_argument('-da', '--dataset',
                        help="dataset to train and test on (default: opener_sents)",
                        default='opener_sents')
    parser.add_argument('-sd', '--savedir',
                        help="where to dump weights during training (default: ./models)",
                        default='models')
    parser.add_argument('-lr', '--learning_rate',
                        help="optimizer learning rate (default: 0.0001)",
                        default=0.0001, type=float)
    parser.add_argument('-m', '--model',
                        help="model to train: rnn_attn_blse, rnn_blse (default: rnn_attn_blse)",
                        default='rnn_attn_blse')
    # type=bool is broken for argparse (bool('False') is True); use str2bool
    # like the other boolean flags in this file.
    parser.add_argument('-cu', '--to_cuda',
                        help="move the model to the GPU when available (default: True)",
                        default=True, type=str2bool)
    args = parser.parse_args()

    if args.model not in ['rnn_attn_blse', 'rnn_blse']:
        print("no such model: {}".format(args.model))
        exit(1)

    # If there's no savedir, create it
    os.makedirs(args.savedir, exist_ok=True)

    if args.binary:
        output_dim = 2
        b = 'bi'
    else:
        output_dim = 4
        b = '4cls'
    weight_dir = "{}/{}/{}-{}-{}".format(args.savedir, args.model,
                                         args.dataset, args.target_lang, b)
    # Strip trailing zeros from the learning rate so the file name is stable
    # regardless of float formatting.
    results_file_name = "results/report_{}_alpha-{}_batch_size-{}_epochs-{}_lr-{}.txt".format(
        args.model, args.alpha, args.batch_size, args.epochs,
        '{0:.15f}'.format(args.learning_rate).rstrip('0').rstrip('.'))

    # import datasets (representation will depend on final classifier)
    print()
    print('training model')
    print('Parameters:')
    print('model: {0}'.format(args.model))
    print('binary: {0}'.format(b))
    print('epochs: {0}'.format(args.epochs))
    print('alpha (projection loss coef): {0}'.format(args.alpha))
    print('batchsize: {0}'.format(args.batch_size))
    print('learning rate: {0}'.format(args.learning_rate))
    print('weight_dir: {0}'.format(weight_dir))
    print('results_file_name: {0}'.format(results_file_name))
    print()
    print('importing datasets')
    dataset = General_Dataset(os.path.join('datasets', args.source_lang, args.dataset),
                              None, binary=args.binary, rep=words, one_hot=False)
    cross_dataset = General_Dataset(os.path.join('datasets', args.target_lang, args.dataset),
                                    None, binary=args.binary, rep=words, one_hot=False)

    # Import monolingual vectors
    print('importing word embeddings')
    trg_vecs_file_path = "embeddings/original/sg-300-{}.txt".format(args.target_lang)
    print("trg_vecs_file_path: {}".format(trg_vecs_file_path))
    src_vecs = WordVecs(args.src_vecs)
    trg_vecs = WordVecs(trg_vecs_file_path)

    # Get sentiment synonyms and antonyms to check how they move during training
    synonyms1, synonyms2, neg = get_syn_ant(args.source_lang, src_vecs)
    cross_syn1, cross_syn2, cross_neg = get_syn_ant(args.target_lang, trg_vecs)

    # Import translation pairs
    translation_file_path = "lexicons/{}/en-{}.txt".format(args.trans, args.target_lang)
    print("translation_file_path: {}".format(translation_file_path))
    pdataset = ProjectionDataset(translation_file_path, src_vecs, trg_vecs)

    # Set up model (args.model was validated above, so one branch always fires)
    if args.model == 'rnn_blse':
        model = RNN_BLSE(src_vecs, trg_vecs, pdataset, dataset, cross_dataset,
                         projection_loss=args.proj_loss, output_dim=output_dim,
                         batch_size=args.batch_size, to_cuda=args.to_cuda,
                         src_syn1=synonyms1, src_syn2=synonyms2, src_neg=neg,
                         trg_syn1=cross_syn1, trg_syn2=cross_syn2,
                         trg_neg=cross_neg)
    elif args.model == 'rnn_attn_blse':
        model = Rnn_Attn_BLSE(src_vecs, trg_vecs, pdataset, dataset, cross_dataset,
                              projection_loss=args.proj_loss, output_dim=output_dim,
                              to_cuda=args.to_cuda, batch_size=args.batch_size,
                              src_syn1=synonyms1, src_syn2=synonyms2, src_neg=neg,
                              trg_syn1=cross_syn1, trg_syn2=cross_syn2,
                              trg_neg=cross_neg)

    if torch.cuda.is_available() and args.to_cuda:
        print("cuda is available")
        model.cuda()
    else:
        print("cuda is not available")

    # Loss Functions (removed a dead `proj_criterion = nn.MSELoss()` that was
    # unconditionally overwritten below)
    class_criterion = nn.CrossEntropyLoss()
    if args.proj_loss == 'mse':
        proj_criterion = nn.MSELoss()
    elif args.proj_loss == 'cosine':
        proj_criterion = cosine_loss
    else:
        print("no projection criterion supported: {}".format(args.proj_loss))
        exit(1)

    # Optimizer
    optim = torch.optim.Adam(model.parameters(), args.learning_rate)

    # Fit model; `with` guarantees the report file is closed even if training
    # raises.
    with open(results_file_name, "w+") as results_file:
        trainer = Trainer(model, args.alpha, optim, args.learning_rate,
                          class_criterion, proj_criterion, args.epochs,
                          args.batch_size, results_file, weight_dir,
                          args.to_cuda)
        best_model_file_path = trainer.train(pdataset._Xtrain, pdataset._ytrain,
                                             dataset._Xtrain, dataset._ytrain)

        # Get best dev f1 and weights
        print("looking in dir: {}".format(weight_dir))
        best_f1, best_params = get_best_model_params(best_model_file_path)
        best_model = torch.load(best_model_file_path)
        state_dict = best_model.state_dict()
        model.load_state_dict(state_dict)
        print()
        print('Dev set')
        print('best dev f1: {0:.3f}'.format(best_f1))
        print('parameters: epochs {0} batch size {1} alpha {2} learning rate {3}'.format(*best_params))
        results_file.write('\n')
        results_file.write('Dev set\n')
        results_file.write('best dev f1: {0:.3f}\n'.format(best_f1))
        results_file.write(
            'parameters: epochs {0} batch size {1} alpha {2}\n'.format(*best_params))

        # Evaluate on test set
        model.eval()
        model.evaluate(cross_dataset._Xtest, cross_dataset._ytest,
                       results_file=results_file, src=False)
        model.confusion_matrix(cross_dataset._Xtest, cross_dataset._ytest,
                               src=False, results_file=results_file)
def test_embeddings(file, threshold, file_type):
    """Benchmark LSTM, BiLSTM and CNN classifiers on the fine-grained
    emotion dataset using the embeddings in `file`.

    file: path to a whitespace-separated embedding file (word followed by
        its vector components, one word per line).
    threshold: passed to Fine_Grained_Emotion_Dataset to filter examples.
    file_type: currently unused in this function.

    Returns (names, all_emo_results, all_emo_std_devs, averaged_results,
    averaged_std_devs, dim): per-model per-emotion means and std devs over
    5 runs, plus the micro-averaged summaries and the embedding dimension.
    """
    emotions = [
        "anger", "anticipation", "disgust", "fear", "joy", "sadness",
        "surprise", "trust"
    ]

    # Import dataset where each test example is the words in the tweet
    dataset = Fine_Grained_Emotion_Dataset('data',
                                           None,
                                           rep=words,
                                           threshold=threshold)

    # Per-emotion label counts for train and test splits.
    print('Basic statistics')
    table = []
    for i, emo in enumerate(emotions):
        train = dataset._ytrain[:, i].sum()
        test = dataset._ytest[:, i].sum()
        table.append((emo, train, test))
    print(tabulate.tabulate(table, headers=['emotion', '#train', '#test']))

    #### Get Parameters ####
    # Longest sentence (for padding) and word-frequency vocabulary over all
    # three splits.
    max_length = 0
    vocab = {}
    for sent in list(dataset._Xtrain) + list(dataset._Xdev) + list(
            dataset._Xtest):
        if len(sent) > max_length:
            max_length = len(sent)
        for w in sent:
            if w not in vocab:
                vocab[w] = 1
            else:
                vocab[w] += 1

    # Keep only the embedding rows for in-vocabulary words.
    # NOTE(review): the file handle from open(file) is never closed, and a
    # malformed line only skips on ValueError (e.g. non-numeric components).
    wordvecs = {}
    print('Importing vectors')
    for line in open(file):
        try:
            split = line.split()
            word = split[0]
            vec = np.array(split[1:], dtype='float32')
            if word in vocab:
                wordvecs[word] = vec
        except ValueError:
            pass
    # NOTE(review): `dim` is taken from the *last* parsed line; this raises
    # NameError if the embedding file is empty, and assumes all vectors have
    # equal length — confirm for the files used.
    dim = len(vec)
    oov = len(vocab) - len(wordvecs)
    print('OOV: {0}'.format(oov))

    # Add vectors for <unk>
    add_unknown_words(wordvecs, vocab, min_df=1, dim=dim)
    W, word_idx_map = get_W(wordvecs, dim=dim)

    # TODO: change this so I don't have to import vectors I don't need
    # Re-load the embedding file and then overwrite the internals with the
    # vocabulary-restricted matrix built above.
    vecs = WordVecs(file)
    vecs._matrix = W
    vecs._w2idx = word_idx_map
    vecs.vocab_length, vecs.vector_size = W.shape
    ave_dataset = Fine_Grained_Emotion_Dataset('data', vecs, rep=ave_vecs)

    # Get padded word indexes for all X
    Xtrain = np.array([
        get_idx_from_sent(' '.join(sent),
                          word_idx_map,
                          max_l=max_length,
                          k=dim) for sent in dataset._Xtrain
    ])
    Xdev = np.array([
        get_idx_from_sent(' '.join(sent),
                          word_idx_map,
                          max_l=max_length,
                          k=dim) for sent in dataset._Xdev
    ])
    Xtest = np.array([
        get_idx_from_sent(' '.join(sent),
                          word_idx_map,
                          max_l=max_length,
                          k=dim) for sent in dataset._Xtest
    ])

    #### Test Models ####
    names = ['LSTM', 'BiLSTM', 'CNN']

    # Keep all mean and standard deviations of each emotion over datasets here
    all_emo_results = []
    all_emo_std_devs = []

    # Keep all mean and standard deviations of the averaged emotions here
    averaged_results = []
    averaged_std_devs = []

    # TEST EACH MODEL
    for name in names:
        print('Getting best parameters')
        dev_params_file = 'dev_params/' + str(W.shape[1]) + '_params.txt'
        best_dim, best_dropout, best_epoch, best_f1 = get_dev_params(
            name, dev_params_file, Xtrain, dataset._ytrain, Xdev,
            dataset._ydev, wordvecs, W)
        print('Testing {0}'.format(name))

        # Keep the results for the 5 runs over the dataset
        model_results = []
        model_average_results = []

        # 5 runs to get average and standard deviation
        for i, it in enumerate(range(5)):
            print('Run: {0}'.format(i + 1))

            # create and train a new classifier for each iteration
            if name == 'LSTM':
                model = create_LSTM(wordvecs,
                                    dim=best_dim,
                                    output_dim=8,
                                    dropout=best_dropout,
                                    weights=W,
                                    train=True)
            elif name == 'BiLSTM':
                model = create_BiLSTM(wordvecs,
                                      dim=best_dim,
                                      output_dim=8,
                                      dropout=best_dropout,
                                      weights=W,
                                      train=True)
            elif name == 'CNN':
                # NOTE(review): the CNN ignores the tuned dim/dropout and
                # uses only the embedding matrix and input length — confirm
                # this is intentional.
                model = create_cnn(W, Xtrain.shape[1])
            h = model.fit(Xtrain,
                          dataset._ytrain,
                          validation_data=[Xdev, dataset._ydev],
                          nb_epoch=best_epoch,
                          verbose=0)
            pred = model.predict(Xtest)
            # Binarize the sigmoid outputs per label.
            pred = np.array([cutoff(x) for x in pred])
            y = dataset._ytest

            # Per-emotion binary accuracy/precision/recall/F1.
            emo_results = []
            for j in range(len(emotions)):
                emo_y = y[:, j]
                emo_pred = pred[:, j]
                mm = MyMetrics(emo_y,
                               emo_pred,
                               one_hot=False,
                               average='binary')
                acc = mm.accuracy()
                precision, recall, f1 = mm.get_scores()
                emo_results.append([acc, precision, recall, f1])
            emo_results = np.array(emo_results)
            model_results.append(emo_results)
            # print('F1 scores')
            # for emo, result in zip(emotions, emo_results):
            #     a, p, r, f = result
            #     print('{0}: {1:.3f}'.format(emo, f))
            ave_acc, ave_prec, ave_rec, mac_f1 = emo_results.mean(axis=0)
            mic_prec, mic_rec, mic_f1 = micro_f1(dataset._ytest, pred)
            model_average_results.append((ave_acc, mic_prec, mic_rec, mic_f1))
            print(
                'acc: {0:.3f} micro-prec:{1:.3f} micro-rec:{2:.3f} micro-f1:{3:.3f}'
                .format(ave_acc, mic_prec, mic_rec, mic_f1))
            print()

        # Aggregate the 5 runs: per-emotion mean/std and overall mean/std.
        model_results = np.array(model_results)
        model_average_results = np.array(model_average_results)
        average_model_results = model_results.mean(axis=0)
        model_std_dev_results = model_results.std(axis=0)
        overall_avg = model_average_results.mean(axis=0)
        overall_std = model_average_results.std(axis=0)
        all_emo_results.append(average_model_results)
        all_emo_std_devs.append(model_std_dev_results)
        averaged_results.append(overall_avg)
        averaged_std_devs.append(overall_std)

    return names, all_emo_results, all_emo_std_devs, averaged_results, averaged_std_devs, dim
def main():
    """Train a BLSE model on the source language, restore the best dev-set
    checkpoint and evaluate it on the target-language test set."""
    parser = argparse.ArgumentParser()
    # All CLI options as (flags, keyword-arguments) pairs.
    for flags, kwargs in [
        (('-sl', '--source_lang'),
         dict(help="source language: es, ca, eu, en (default: en)",
              default='en')),
        (('-tl', '--target_lang'),
         dict(help="target language: es, ca, eu, en (default: es)",
              default='es')),
        (('-bi', '--binary'),
         dict(help="binary or 4-class (default: True)",
              default=True, type=str2bool)),
        (('-e', '--epochs'),
         dict(help="training epochs (default: 200)", default=200, type=int)),
        (('-a', '--alpha'),
         dict(help="trade-off between projection and classification objectives (default: .001)",
              default=.001, type=float)),
        (('-pl', '--proj_loss'),
         dict(help="projection loss: mse, cosine (default: cosine)",
              default='mse')),
        (('-bs', '--batch_size'),
         dict(help="classification batch size (default: 50)",
              default=20, type=int)),
        (('-sv', '--src_vecs'),
         dict(help=" source language vectors (default: GoogleNewsVecs )",
              default='google.txt')),
        (('-tv', '--trg_vecs'),
         dict(help=" target language vectors (default: SGNS on Wikipedia)",
              default='sg-300-es.txt')),
        (('-tr', '--trans'),
         dict(help='translation pairs (default: Bing Liu Sentiment Lexicon Translations)',
              default='lexicons/bingliu/en-es.txt')),
        (('-da', '--dataset'),
         dict(help="dataset to train and test on (default: opener_sents)",
              default='opener_sents')),
        (('-sd', '--savedir'),
         dict(help="where to dump weights during training (default: ./models)",
              default='models/blse')),
    ]:
        parser.add_argument(*flags, **kwargs)
    args = parser.parse_args()

    # import datasets (representation will depend on final classifier)
    print('importing datasets')
    src_data = General_Dataset(os.path.join('datasets', args.source_lang,
                                            args.dataset),
                               None, binary=args.binary, rep=words,
                               one_hot=False)
    trg_data = General_Dataset(os.path.join('datasets', args.target_lang,
                                            args.dataset),
                               None, binary=args.binary, rep=words,
                               one_hot=False)

    # Import monolingual vectors
    print('importing word embeddings')
    src_vecs = WordVecs(args.src_vecs)
    trg_vecs = WordVecs(args.trg_vecs)

    # Get sentiment synonyms and antonyms to check how they move during training
    src_syn1, src_syn2, src_neg = get_syn_ant(args.source_lang, src_vecs)
    trg_syn1, trg_syn2, trg_neg = get_syn_ant(args.target_lang, trg_vecs)

    # Import translation pairs
    proj_pairs = ProjectionDataset(args.trans, src_vecs, trg_vecs)

    # 2-class vs 4-class output layer and a tag used in file names.
    output_dim, b = (2, 'bi') if args.binary else (4, '4cls')

    # Set up model
    blse = BLSE(src_vecs, trg_vecs, proj_pairs, src_data, trg_data,
                projection_loss=args.proj_loss,
                output_dim=output_dim,
                src_syn1=src_syn1, src_syn2=src_syn2, src_neg=src_neg,
                trg_syn1=trg_syn1, trg_syn2=trg_syn2, trg_neg=trg_neg)

    # If there's no savedir, create it
    os.makedirs(args.savedir, exist_ok=True)

    # Fit model
    blse.fit(proj_pairs._Xtrain, proj_pairs._ytrain,
             src_data._Xtrain, src_data._ytrain,
             weight_dir=args.savedir,
             batch_size=args.batch_size,
             alpha=args.alpha,
             epochs=args.epochs)

    # Get best dev f1 and weights
    best_f1, best_params, best_weights = get_best_run(args.savedir)
    blse.load_weights(best_weights)
    print()
    print('Dev set')
    print('best dev f1: {0:.3f}'.format(best_f1))
    print('parameters: epochs {0} batch size {1} alpha {2}'.format(*best_params))

    # Evaluate on test set: once to stdout, once into a prediction file.
    blse.evaluate(trg_data._Xtest, trg_data._ytest, src=False)
    pred_name = '{0}-{1}-alpha{2}-epoch{3}-batch{4}.txt'.format(
        args.dataset, b, args.alpha, best_params[0], args.batch_size)
    blse.evaluate(trg_data._Xtest, trg_data._ytest, src=False,
                  outfile=os.path.join('predictions', args.target_lang,
                                       'blse', pred_name))
    blse.confusion_matrix(trg_data._Xtest, trg_data._ytest, src=False)
    blse.plot()
def _eval_muse_setting(train, test, lang, header, suffix, dataset_name):
    """Tune C on the dev split, fit a LinearSVC on `train`, then report
    accuracy and macro-F1 on the target-language `test` split.

    Writes predictions to predictions/<lang>/muse/<dataset>-<suffix>.txt and
    prints a short report headed by `header` ('-binary-' or '-fine-').
    """
    best_c, best_f1 = get_best_C(train, test)
    clf = LinearSVC(C=best_c)
    clf.fit(train._Xtrain, train._ytrain)
    cpred = clf.predict(test._Xtest)
    cf1 = macro_f1(test._ytest, cpred)
    print_prediction(
        clf, test,
        os.path.join('predictions', lang, 'muse',
                     '{0}-{1}.txt'.format(dataset_name, suffix)))
    print(header)
    print('Acc: {0:.3f}'.format(clf.score(test._Xtest, test._ytest)))
    print('Macro F1: {0:.3f}'.format(cf1))
    print()


def main():
    """Cross-lingual sentiment baseline on MUSE bilingual embeddings.

    For each target language (ca, eu, es) trains a LinearSVC on English
    averaged MUSE vectors and evaluates on the target-language data, in the
    binary and/or 4-class setting depending on the -bi flag.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-vec_dir',
                        default='../deployment/MUSE/',
                        help=" directory that hold MUSE vectors")
    parser.add_argument('-dataset',
                        default='opener',
                        help="dataset to train and test on (default: opener)")
    parser.add_argument(
        '-bi',
        help='List of booleans. True is only binary, False is only 4 class. True False is both. (default: [True, False])',
        default=[True, False],
        nargs='+',
        type=str2bool)
    args = parser.parse_args()

    # Loop over the three languages
    for lang in ['ca', 'eu', 'es']:
        print('################ {0} ##############'.format(lang))

        # Import monolingual vectors (aligned English/target MUSE spaces)
        print('importing word embeddings')
        src_vecs = WordVecs(
            os.path.join(args.vec_dir, 'en-{0}'.format(lang), 'muse-en.txt'))
        trg_vecs = WordVecs(
            os.path.join(args.vec_dir, 'en-{0}'.format(lang),
                         'muse-{0}.txt'.format(lang)))

        # Import datasets (representation will depend on final classifier)
        print('importing datasets')
        en_path = os.path.join('datasets', 'en', args.dataset)
        trg_path = os.path.join('datasets', lang, args.dataset)
        binary_dataset = General_Dataset(en_path, src_vecs, binary=True,
                                         rep=ave_vecs, one_hot=False,
                                         lowercase=False)
        binary_cross_dataset = General_Dataset(trg_path, trg_vecs, binary=True,
                                               rep=ave_vecs, one_hot=False,
                                               lowercase=False)
        fine_dataset = General_Dataset(en_path, src_vecs, binary=False,
                                       rep=ave_vecs, one_hot=False,
                                       lowercase=False)
        fine_cross_dataset = General_Dataset(trg_path, trg_vecs, binary=False,
                                             rep=ave_vecs, one_hot=False,
                                             lowercase=False)

        # Train linear SVM classifier for each requested setting; the two
        # branches previously duplicated the whole tune/fit/report sequence.
        if True in args.bi:
            _eval_muse_setting(binary_dataset, binary_cross_dataset, lang,
                               '-binary-', 'bi', args.dataset)
        if False in args.bi:
            _eval_muse_setting(fine_dataset, fine_cross_dataset, lang,
                               '-fine-', '4cls', args.dataset)
30, 60 ]: clf = LinearSVC(C=c) clf.fit(dataset._Xtrain, dataset._ytrain) pred = clf.predict(dataset._Xdev) f1 = per_class_f1(dataset._ydev, pred).mean() if f1 > best_f1: best_f1 = f1 best_c = c return best_c, best_f1 if __name__ == '__main__': embeddingdir = '/home/jeremy/NS/Keep/Temp/Exps/EMBEDDINGS' amazon_vecs = WordVecs( os.path.join(embeddingdir, 'SubjQuant/amazon-sg-300.txt')) twitter_vecs = WordVecs( os.path.join(embeddingdir, 'twitter_embeddings.txt')) pdataset = ProjectionDataset('lexicons/general_vocab.txt', amazon_vecs, twitter_vecs) books = Book_Dataset(amazon_vecs, rep=ave_vecs, one_hot=False, binary=True) dvd = DVD_Dataset(amazon_vecs, rep=ave_vecs, one_hot=False, binary=True) electronics = Electronics_Dataset(amazon_vecs, rep=ave_vecs, binary=True, one_hot=False) kitchen = Kitchen_Dataset(amazon_vecs, rep=ave_vecs, binary=True,
def run_model_on_datasets_with_embeddings(embedding_file, file_type):
    """Train and evaluate the transformer model on each enabled benchmark
    dataset using the given pretrained embeddings.

    embedding_file: the word embeddings file
    file_type: word2vec, glove

    Returns (datasetNames, results, std_devs, dim): per-dataset mean metrics
    and standard deviations over hp.run_exps_amount runs, plus an appended
    'overall' row averaging across datasets, and the embedding dimension.
    """
    print('importing word embedding vectors...')
    vecs = WordVecs(embedding_file, file_type)  # load the word2vec dictionary.
    dim = vecs.vector_size  # dimensionality of the word embeddings

    # For collecting results to return
    results = []
    std_devs = []
    datasetNames = [
        # 'sst_fine',
        # 'sst_binary',
        # 'opener',
        # 'sentube_auto',
        'sentube_tablets',
        'semeval',
    ]

    # train & test the model on every dataset above
    for datasetName in datasetNames:
        # dataset_load_start = datetime.now()
        if datasetName == 'sst_fine':
            dataset = Stanford_Sentiment_Dataset('datasets/stanford_sentanalysis',
                                                 None,
                                                 one_hot=True,
                                                 binary=False,
                                                 rep=words)
        elif datasetName == 'sst_binary':
            dataset = Stanford_Sentiment_Dataset('datasets/stanford_sentanalysis',
                                                 None,
                                                 one_hot=True,
                                                 binary=True,
                                                 rep=words)
        elif datasetName == 'opener':
            dataset = General_Dataset('datasets/opener',
                                      None,
                                      one_hot=True,
                                      rep=words)
        elif datasetName == 'sentube_auto':
            dataset = General_Dataset('datasets/SenTube/auto',
                                      None,
                                      rep=words,
                                      binary=True,
                                      one_hot=True)
        elif datasetName == 'sentube_tablets':
            dataset = General_Dataset('datasets/SenTube/tablets',
                                      None,
                                      rep=words,
                                      binary=True,
                                      one_hot=True)
        elif datasetName == 'semeval':
            dataset = Semeval_Dataset('datasets/semeval',
                                      None,
                                      rep=words,
                                      one_hot=True)
        print('Loading & Testing on {}:'.format(datasetName))

        # if hp.lowercase_all_sentences:
        #     for sent in dataset._Xtrain:
        #         for word in sent:
        #             if word != word.lower():
        #                 print("Word has an uppercase character:", word.decode('utf-8'))

        # find out the max length of sentences in the dataset and construct the vocab frequency dict.
        max_length = 0
        vocab = {}
        for sent in list(dataset._Xtrain) + list(dataset._Xdev) + list(dataset._Xtest):
            if len(sent) > max_length:
                max_length = len(sent)
            for w in sent:
                if w not in vocab:
                    vocab[w] = 1
                else:
                    vocab[w] += 1

        # create a dict of words that are in our word2vec embeddings
        # wordvecs: String -> embedding_vec
        wordvecs = {}
        for w in vecs._w2idx.keys():
            if w in vocab:
                wordvecs[w] = vecs[w]

        # Assign random w2v vectors to the unknown words. These are random uniformly distrubuted vectors of size dim.
        add_unknown_words(vecs, wordvecs, vocab, min_df=1, dim=dim)
        W, word_idx_map = get_W(wordvecs, dim=dim)  # Get the w2v index map for out final vocab

        print('Converting dataset to being right padded...')
        dataset = convert_dataset(dataset, word_idx_map, datasetName, max_length)
        output_dim = dataset._ytest.shape[1]  # number of classes (labels are one-hot)

        # Test model hp.run_exps_amount times and get averages and std dev.
        dataset_results = []
        for i in range(1, hp.run_exps_amount + 1):
            tf.reset_default_graph()  # Clears the current loaded tensorflow graph.
            # NOTE(review): the Variable and the placeholder below share the
            # name "embedding_table", and the result of w2i.assign(...) is an
            # op that is never run here — confirm createAndTrainTransformer
            # actually feeds/runs the embedding initialization.
            w2i = tf.Variable(tf.constant(0.0, shape=[W.shape[0], W.shape[1]]),
                              trainable=False,
                              name="embedding_table")
            wordIndxToVec_tensor = tf.placeholder(tf.float32,
                                                  [W.shape[0], W.shape[1]],
                                                  name="embedding_table")  # [vobab_size x word_embedding_dim]
            w2i.assign(wordIndxToVec_tensor)

            start_time = datetime.now()  # Print time for logging.
            clf, best_mm_val, best_mm_test = createAndTrainTransformer(
                dataset, W, wordIndxToVec_tensor, output_dim, datasetName,
                max_length)
            print("Finished run #", i,
                  "Time taken: " + str(datetime.now() - start_time))

            mm = best_mm_test
            acc, precision, recall, micro_f1 = mm.get_scores()
            dataset_results.append([acc, precision, recall, micro_f1])
            if hp.run_exps_amount == 1:
                acc, precision, recall, micro_f1 = mm.get_scores()
                # add twice so the average is the same... avoid running multiple runs this way.
                dataset_results.append([acc, precision, recall, micro_f1])
            if hp.run_exps_amount != 1:
                # Print the metrics for this run, unless we're running experiment only once.
                # NOTE(review): the run's result is appended twice here so the
                # std dev of this single run is zero by construction.
                this_run_result = []
                this_run_result.append([acc, precision, recall, micro_f1])
                this_run_result.append([acc, precision, recall, micro_f1])
                this_run_result = np.array(this_run_result)
                this_run_ave_results = this_run_result.mean(axis=0)
                this_run_std_results = this_run_result.std(axis=0)
                printMetrics(this_run_ave_results, this_run_std_results,
                             datasetName)

        # Get the average and std deviation over 10 runs with 10 random seeds
        dataset_results = np.array(dataset_results)
        ave_results = dataset_results.mean(axis=0)
        std_results = dataset_results.std(axis=0)
        printMetrics(ave_results, std_results, datasetName)
        results.append(ave_results)
        std_devs.append(std_results)

    # Append an 'overall' row: the mean across all datasets.
    results.append(list(np.array(results).mean(axis=0)))
    std_devs.append(list(np.array(std_devs).mean(axis=0)))
    datasetNames.append('overall')
    return datasetNames, results, std_devs, dim
parser.add_argument('-lr', '--learning_rate', default=0.001, type=float) parser.add_argument('-wd', '--weight_decay', default=0.0, type=float) parser.add_argument('-cuda', default=True, type=str2bool) parser.add_argument('-seed', default=123, type=int) args = parser.parse_args() print_args(args) args.cuda = args.cuda and torch.cuda.is_available() np.random.seed(args.seed) torch.manual_seed(args.seed) if args.cuda: torch.cuda.manual_seed(args.seed) print('Importing embeddings...') src_vecs = WordVecs(args.src_embeddings) trg_vecs = WordVecs(args.trg_embeddings) pdataset = ProjectionDataset( 'lexicons/bingliu_{0}_{1}.txt'.format(args.src_lang, args.trg_lang), src_vecs, trg_vecs) print('Importing datasets...') # Get training, dev, and test data if args.binary: train_data, dev_data, test_data = open_dataset( os.path.join('annotation', args.src_dataset, args.src_lang)) train_data = [(l, r, t, y) for l, r, t, y in train_data if y in [0, 2]] train_data = [(l, r, t, y) if y == 0 else (l, r, t, 1) for l, r, t, y in train_data] dev_data = [(l, r, t, y) for l, r, t, y in dev_data if y in [0, 2]]
def train_model_with_different_params(params):
    # Train a certain model (rnn_blse / rnn_attn_blse) for a certain target language
    # for different combinations of hyper parameters.
    #
    # `params` must provide: model, binary, source_lang, target_lang, dataset,
    # savedir, src_vecs, trans, epochs, and the search grids proj_losses,
    # alphas, learning_rates, batch_sizes. Keeps only the best checkpoint and
    # results file on disk; writes a running report to
    # results/best_params_report_<model>_<lang>_<bi|4cls>.txt.
    if params.model not in ['rnn_attn_blse', 'rnn_blse']:
        print("no such model: {}".format(params.model))
        exit(1)

    # If there's no savedir, create it
    os.makedirs(params.savedir, exist_ok=True)

    if params.binary:
        output_dim = 2
        b = 'bi'
    else:
        output_dim = 4
        b = '4cls'
    weight_dir = "{}/{}/{}-{}-{}".format(params.savedir, params.model,
                                         params.dataset, params.target_lang,
                                         b)

    best_params_file_name = "results/best_params_report_{}_{}_{}.txt".format(
        params.model, params.target_lang, b)
    best_params_file = open(best_params_file_name, "w+")
    best_params_file.write("Start parameter search:\n")
    best_params_file.write("Model: {}\n".format(params.model))
    best_params_file.write("is_binary: {}\n".format(params.binary))
    best_params_file.write("target_lang: {}\n".format(params.target_lang))

    # Running best and bookkeeping for on-disk cleanup of superseded runs.
    # NOTE(review): if no combination yields f1 > 0.0, best_params stays None
    # and rest_of_scores stays empty — the summary section below would then
    # raise; confirm this cannot happen in practice.
    best_f1 = 0.0
    best_params = None
    old_file_name = None
    old_results_file_name = None
    rest_of_scores = []

    print('importing datasets')
    dataset = General_Dataset(os.path.join('datasets', params.source_lang,
                                           params.dataset),
                              None,
                              binary=params.binary,
                              rep=words,
                              one_hot=False)
    cross_dataset = General_Dataset(os.path.join('datasets',
                                                 params.target_lang,
                                                 params.dataset),
                                    None,
                                    binary=params.binary,
                                    rep=words,
                                    one_hot=False)

    # Import monolingual vectors
    print('importing word embeddings')
    trg_vecs_file_path = "embeddings/original/sg-300-{}.txt".format(
        params.target_lang)
    print("trg_vecs_file_path: {}".format(trg_vecs_file_path))
    src_vecs = WordVecs(params.src_vecs)
    trg_vecs = WordVecs(trg_vecs_file_path)

    # Get sentiment synonyms and antonyms to check how they move during training
    synonyms1, synonyms2, neg = get_syn_ant(params.source_lang, src_vecs)
    cross_syn1, cross_syn2, cross_neg = get_syn_ant(params.target_lang,
                                                    trg_vecs)

    # Import translation pairs
    translation_file_path = "lexicons/{}/en-{}.txt".format(
        params.trans, params.target_lang)
    print("translation_file_path: {}".format(translation_file_path))
    pdataset = ProjectionDataset(translation_file_path, src_vecs, trg_vecs)

    # Exhaustive grid search over the four hyper-parameter axes.
    for proj_loss in params.proj_losses:
        for alpha in params.alphas:
            for learning_rate in params.learning_rates:
                for batch_size in params.batch_sizes:
                    best_model_file_path, acc, prec, rec, f1, results_file_name = train_model(
                        params.model, dataset, cross_dataset, src_vecs,
                        trg_vecs, synonyms1, synonyms2, neg, cross_syn1,
                        cross_syn2, cross_neg, pdataset, weight_dir,
                        proj_loss, alpha, learning_rate, batch_size,
                        output_dim, b, params)
                    if f1 > best_f1:
                        # New best combination: report it, delete the files of
                        # the previous best, and remember this run's files.
                        print()
                        print("Found new set of best hyper params:")
                        print("f1: {0:.3f}".format(f1))
                        print("acc: {0:.3f}".format(acc))
                        print("prec: {0:.3f}".format(prec))
                        print("rec: {0:.3f}".format(rec))
                        print('model: {0}'.format(params.model))
                        print('is_binary: {0}'.format(params.binary))
                        print('epochs: {0}'.format(params.epochs))
                        print('proj_loss: {0}'.format(proj_loss))
                        print('alpha (projection loss coef): {0}'.format(
                            alpha))
                        print('batch size: {0}'.format(batch_size))
                        print('learning rate: {0}'.format(learning_rate))
                        print('weight_dir: {0}'.format(weight_dir))
                        print('best_model_file_path: {0}'.format(
                            best_model_file_path))
                        print()
                        best_params_file.write("\n")
                        best_params_file.write(
                            "Found new set of best hyper params:\n")
                        best_params_file.write(
                            "f1 {0:.3f}:\n".format(f1))
                        best_params_file.write(
                            "acc {0:.3f}:\n".format(acc))
                        best_params_file.write(
                            "prec {0:.3f}:\n".format(prec))
                        best_params_file.write(
                            "rec {0:.3f}:\n".format(rec))
                        best_params_file.write('model: {0}\n'.format(
                            params.model))
                        best_params_file.write('is_binary: {0}\n'.format(
                            params.binary))
                        best_params_file.write('epochs: {0}\n'.format(
                            params.epochs))
                        best_params_file.write(
                            'proj_loss: {0}\n'.format(proj_loss))
                        best_params_file.write(
                            "alpha (projection loss coef): {0}\n".format(
                                alpha))
                        best_params_file.write(
                            'batch size: {0}\n'.format(batch_size))
                        best_params_file.write(
                            'learning: {0}\n'.format(learning_rate))
                        best_params_file.write(
                            'weight_dir: {0}\n'.format(weight_dir))
                        best_params_file.write(
                            'best_model_file_path: {0}\n'.format(
                                best_model_file_path))
                        if old_file_name != None:
                            os.remove(old_file_name)
                        if old_results_file_name != None:
                            os.remove(old_results_file_name)
                        # NOTE(review): params.model is the model *name*
                        # string (validated against a string list above), so
                        # this writes that string over the checkpoint file
                        # train_model produced — looks like a bug; confirm the
                        # intent was to keep the file train_model saved.
                        torch.save(params.model, best_model_file_path)
                        old_file_name = best_model_file_path
                        old_results_file_name = results_file_name
                        best_f1 = f1
                        rest_of_scores = [acc, prec, rec]
                        best_params = [
                            proj_loss, alpha, learning_rate, batch_size
                        ]
                    else:
                        # Not an improvement: drop this run's artifacts.
                        os.remove(results_file_name)
                        os.remove(best_model_file_path)

    # Final summary of the search, to stdout and to the report file.
    print("")
    print("Done parameters search")
    print("best f1: {0:.3f}".format(best_f1))
    print("its acc: {0:.3f}".format(rest_of_scores[0]))
    print("its prec: {0:.3f}".format(rest_of_scores[1]))
    print("its rec: {0:.3f}".format(rest_of_scores[2]))
    print("best_params:")
    print('model: {0}'.format(params.model))
    print('is_binary: {0}'.format(params.binary))
    print('proj_loss: {0}'.format(best_params[0]))
    print('alpha (projection loss coef): {0}'.format(best_params[1]))
    print('learning rate: {0}'.format(best_params[2]))
    print('batch size: {0}'.format(best_params[3]))
    print("")
    best_params_file.write("\n")
    best_params_file.write("Done parameters search\n")
    best_params_file.write("best f1: {0:.3f}\n".format(best_f1))
    best_params_file.write("its acc: {0:.3f}\n".format(rest_of_scores[0]))
    best_params_file.write("its prec: {0:.3f}\n".format(rest_of_scores[1]))
    best_params_file.write("its rec: {0:.3f}\n".format(rest_of_scores[2]))
    best_params_file.write('model: {0}\n'.format(params.model))
    best_params_file.write('is_binary: {0}\n'.format(params.binary))
    best_params_file.write('proj_loss: {0}\n'.format(best_params[0]))
    best_params_file.write("alpha (projection loss coef): {0}\n".format(
        best_params[1]))
    best_params_file.write('learning: {0}\n'.format(best_params[2]))
    best_params_file.write('batch size: {0}\n'.format(best_params[3]))
    best_params_file.close()
def test_embeddings(embedding_file, file_type):
    """
    Tang et al. (2014) embeddings and classification approach on a number of
    benchmark datasets.

    Trains an L2-regularized logistic regression on conv_tweet
    representations of each dataset, tuning C on the dev set, and reports
    accuracy/precision/recall/f1 on the test set.

    :param embedding_file: path to the pretrained embedding file
    :param file_type: embedding file format (forwarded to WordVecs)
    :return: (names, results, dim) where names gains a final 'overall'
             entry, results is a list of [acc, prec, rec, f1] rows whose
             last row is the column-wise mean, and dim is the embedding
             dimensionality.
    """
    print('importing vectors...')
    vecs = WordVecs(embedding_file, file_type)
    dim = vecs.vector_size

    print('Importing datasets...')
    st_fine = Stanford_Sentiment_Dataset('datasets/stanford_sentanalysis',
                                         None,
                                         one_hot=False,
                                         binary=False,
                                         rep=words)
    st_binary = Stanford_Sentiment_Dataset('datasets/stanford_sentanalysis',
                                           None,
                                           one_hot=False,
                                           binary=True,
                                           rep=words)
    opener_dataset = General_Dataset('datasets/opener',
                                     vecs,
                                     one_hot=False,
                                     rep=words)
    sentube_auto_dataset = General_Dataset('datasets/SenTube/auto',
                                           vecs._w2idx,
                                           rep=words,
                                           binary=True,
                                           one_hot=False)
    sentube_tablets_dataset = General_Dataset('datasets/SenTube/tablets',
                                              vecs._w2idx,
                                              rep=words,
                                              binary=True,
                                              one_hot=False)
    semeval_dataset = Semeval_Dataset('datasets/semeval',
                                      vecs._w2idx,
                                      rep=words,
                                      one_hot=False)

    datasets = [
        st_fine, st_binary, opener_dataset, sentube_auto_dataset,
        sentube_tablets_dataset, semeval_dataset
    ]
    names = [
        'sst_fine', 'sst_binary', 'opener', 'sentube_auto',
        'sentube_tablets', 'semeval'
    ]

    # Collect results here
    results = []
    for name, dataset in zip(names, datasets):
        print('Testing on {0}...'.format(name))
        Xtrain = np.array(
            [conv_tweet(' '.join(t), vecs) for t in dataset._Xtrain])
        Xtest = np.array(
            [conv_tweet(' '.join(t), vecs) for t in dataset._Xtest])
        Xdev = np.array([conv_tweet(' '.join(t), vecs) for t in dataset._Xdev])

        # get best parameters on dev set; the second value returned by
        # get_best_C is not used by the sklearn classifier below
        best_C, _ = get_best_C(Xtrain, dataset._ytrain, Xdev, dataset._ydev)
        clf = LogisticRegression(C=best_C)
        clf.fit(Xtrain, dataset._ytrain)
        pred = clf.predict(Xtest)

        predictions_file = os.path.join('predictions', 'joint', name,
                                        'pred.txt')
        print_prediction(predictions_file, pred)

        labels = sorted(set(dataset._ytrain))
        # binary F1 only makes sense with exactly two classes
        average = 'binary' if len(labels) == 2 else 'micro'
        mm = MyMetrics(dataset._ytest,
                       pred,
                       one_hot=False,
                       labels=labels,
                       average=average)
        acc, precision, recall, f1 = mm.get_scores()
        results.append([acc, precision, recall, f1])

    # final row/name is the mean over all datasets
    results.append(list(np.array(results).mean(axis=0)))
    names.append('overall')
    return names, results, dim
def main():
    """
    Train and evaluate a BLE bilingual sentiment model.

    Loads English train data and target-language test data, projects the
    source embeddings toward the target space via a translation lexicon
    while training the classifier, selects the best checkpoint with
    get_best_run, and evaluates (and dumps predictions/plots) on the
    target-language test set.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-l', help="target language: es, ca, eu", default='es')
    parser.add_argument('-bi',
                        help="binary or 4-class",
                        default=False,
                        type=str2bool)
    parser.add_argument('-epoch', default=300, type=int)
    parser.add_argument('-alpha', default=.5, type=float)
    parser.add_argument('-batch_size', default=200, type=int)
    parser.add_argument('-src_vecs', default='embeddings/original/google.txt')
    parser.add_argument('-trg_vecs',
                        default='embeddings/original/sg-300-es.txt')
    parser.add_argument(
        '-trans',
        help='translation pairs',
        default=
        'lexicons/bingliu_en_es.one-2-one_AND_Negators_Intensifiers_Diminishers.txt'
    )
    parser.add_argument('-dataset', default='opener')
    args = parser.parse_args()

    # import datasets (representation will depend on final classifier)
    print('importing datasets')
    dataset = General_Dataset(os.path.join('datasets', 'en', args.dataset),
                              None,
                              binary=args.bi,
                              rep=words,
                              one_hot=False)
    cross_dataset = General_Dataset(os.path.join('datasets', args.l,
                                                 args.dataset),
                                    None,
                                    binary=args.bi,
                                    rep=words,
                                    one_hot=False)

    # Import monolingual vectors
    print('importing word embeddings')
    src_vecs = WordVecs(args.src_vecs)
    trg_vecs = WordVecs(args.trg_vecs)

    # Get sentiment synonyms and antonyms to check how they move during
    # training
    synonyms1, synonyms2, neg = get_syn_ant('en', src_vecs)
    cross_syn1, cross_syn2, cross_neg = get_syn_ant(args.l, trg_vecs)

    # Import translation pairs
    pdataset = ProjectionDataset(args.trans, src_vecs, trg_vecs)

    # initialize classifier; final argument is 2 for binary, 4 for 4-class
    if args.bi:
        ble = BLE(src_vecs, trg_vecs, pdataset, dataset, cross_dataset,
                  synonyms1, synonyms2, neg, cross_syn1, cross_syn2,
                  cross_neg, 2)
    else:
        ble = BLE(src_vecs, trg_vecs, pdataset, dataset, cross_dataset,
                  synonyms1, synonyms2, neg, cross_syn1, cross_syn2,
                  cross_neg, 4)

    # train model
    print('training model')
    print('Parameters:')
    print('lang: {0}'.format(args.l))
    print('binary: {0}'.format(args.bi))
    print('epoch: {0}'.format(args.epoch))
    print('alpha: {0}'.format(args.alpha))
    print('batchsize: {0}'.format(args.batch_size))
    print('src vecs: {0}'.format(args.src_vecs))
    print('trg_vecs: {0}'.format(args.trg_vecs))
    print('trans dict: {0}'.format(args.trans))
    print('dataset: {0}'.format(args.dataset))

    # weight checkpoints get written under models/<dataset>-<lang>-<bi|4cls>
    if args.bi:
        b = 'bi'
    else:
        b = '4cls'
    weight_dir = os.path.join('models',
                              '{0}-{1}-{2}'.format(args.dataset, args.l, b))
    ble.fit(pdataset._Xtrain,
            pdataset._ytrain,
            dataset._Xtrain,
            dataset._ytrain,
            weight_dir=weight_dir,
            alpha=args.alpha,
            epochs=args.epoch,
            batch_size=args.batch_size)

    # get the best weights saved during training and reload them
    best_f1, best_params, best_weights = get_best_run(weight_dir)
    epochs, batch_size, alpha = best_params
    ble.load_weights(best_weights)

    # evaluate: plot synonym/antonym movement, print test scores, and dump
    # predictions to file (macro averaging for the 4-class setting)
    if args.bi:
        ble.plot(outfile=os.path.join(
            'figures', 'syn-ant', args.l, 'ble',
            '{0}-bi-alpha{1}-epoch{2}-batch{3}.pdf'.format(
                args.dataset, alpha, epochs, batch_size)))
        ble.evaluate(cross_dataset._Xtest, cross_dataset._ytest, src=False)
        ble.evaluate(cross_dataset._Xtest,
                     cross_dataset._ytest,
                     src=False,
                     outfile=os.path.join(
                         'predictions', args.l, 'ble',
                         '{0}-bi-alpha{1}-epoch{2}-batch{3}.txt'.format(
                             args.dataset, alpha, epochs, batch_size)))
    else:
        ble.plot(outfile=os.path.join(
            'figures', 'syn-ant', args.l, 'ble',
            '{0}-4cls-alpha{1}-epoch{2}-batch{3}.pdf'.format(
                args.dataset, alpha, epochs, batch_size)))
        ble.evaluate(cross_dataset._Xtest,
                     cross_dataset._ytest,
                     average='macro',
                     src=False)
        ble.evaluate(cross_dataset._Xtest,
                     cross_dataset._ytest,
                     average='macro',
                     src=False,
                     outfile=os.path.join(
                         'predictions', args.l, 'ble',
                         '{0}-4cls-alpha{1}-epoch{2}-batch{3}.txt'.format(
                             args.dataset, alpha, epochs, batch_size)))
def test_embeddings(file, file_type):
    """
    Train and evaluate a CNN sentiment classifier on several benchmark
    datasets using the given pretrained embeddings.

    For each dataset: builds a vocabulary, pads/converts the data, finds the
    best hyperparameters on dev, trains 5 runs with checkpointing, reloads
    the best checkpoint per run, and scores it on the test set.

    :param file: path to the pretrained embedding file
    :param file_type: embedding file format (currently unused; WordVecs is
                      called with its default format)
    :return: (names, results, std_devs, dim) where results/std_devs hold,
             per dataset, the mean and standard deviation of
             [acc, precision, recall, f1] over the 5 runs.
    """
    print('Importing vecs...')
    vecs = WordVecs(file)

    print('Importing datasets...')
    st_fine = Stanford_Sentiment_Dataset('datasets/stanford_sentanalysis',
                                         None,
                                         one_hot=True,
                                         binary=False,
                                         rep=words)
    st_binary = Stanford_Sentiment_Dataset('datasets/stanford_sentanalysis',
                                           None,
                                           one_hot=True,
                                           binary=True,
                                           rep=words)
    opener_dataset = General_Dataset('datasets/opener',
                                     vecs,
                                     one_hot=True,
                                     rep=word_reps)
    twitter_dataset = Semeval_Dataset('datasets/twitter',
                                      vecs._w2idx,
                                      rep=word_reps,
                                      one_hot=True)
    sentube_auto_dataset = General_Dataset('datasets/SenTube/auto',
                                           vecs._w2idx,
                                           rep=word_reps,
                                           binary=True,
                                           one_hot=True)
    sentube_tablets_dataset = General_Dataset('datasets/SenTube/tablets',
                                              vecs._w2idx,
                                              rep=word_reps,
                                              binary=True,
                                              one_hot=True)
    semeval_dataset = Semeval_Dataset('datasets/semeval',
                                      vecs,
                                      rep=words,
                                      one_hot=True)

    # NOTE(review): twitter_dataset is loaded but not evaluated — kept as in
    # the original.
    datasets = [
        st_fine, st_binary, opener_dataset, sentube_auto_dataset,
        sentube_tablets_dataset, semeval_dataset
    ]
    names = [
        'sst_fine', 'sst_binary', 'opener', 'sentube_auto',
        'sentube_tablets', 'semeval'
    ]
    dim = vecs.vector_size

    # Collect per-dataset results here.
    # BUG FIX: these lists used to be re-initialized inside the dataset loop
    # and nothing was ever appended to them, so the function always returned
    # empty results.
    results = []
    std_devs = []

    for name, dataset in zip(names, datasets):
        print('Testing on {0}...'.format(name))

        # longest sentence and word frequencies over train/dev/test
        max_length = 0
        vocab = {}
        for sent in list(dataset._Xtrain) + list(dataset._Xdev) + list(
                dataset._Xtest):
            if len(sent) > max_length:
                max_length = len(sent)
            for w in sent:
                if w not in vocab:
                    vocab[w] = 1
                else:
                    vocab[w] += 1

        # keep only pretrained vectors for words that occur; random vectors
        # are added for the remaining vocabulary
        wordvecs = {}
        for w in vecs._w2idx.keys():
            if w in vocab:
                wordvecs[w] = vecs[w]
        add_unknown_words(wordvecs, vocab, min_df=1, k=dim)
        W, word_idx_map = get_W(wordvecs, k=dim)

        print('Converting and Padding dataset...')
        dataset = convert_dataset(dataset, word_idx_map, max_length)
        output_dim = dataset._ytest.shape[1]

        # Get best Dev params
        dev_params_file = 'dev_params/' + str(W.shape[1]) + '_cnn.dev.txt'
        best_dim, best_dropout, best_epoch, best_f1 = get_dev_params(
            name, dev_params_file, max_length, dataset._Xtrain,
            dataset._ytrain, dataset._Xdev, dataset._ydev, W)

        # 5 independent training runs; aggregate their test scores below
        dataset_results = []
        for i in range(5):
            checkpoint = ModelCheckpoint(
                'models/cnn/' + name + '/run' + str(i + 1) +
                '/weights.{epoch:03d}-{val_acc:.4f}.hdf5',
                monitor='val_acc',
                verbose=1,
                save_best_only=True,
                mode='auto')
            clf = create_cnn(W,
                             max_length,
                             dim=best_dim,
                             dropout=best_dropout,
                             output_dim=output_dim)
            clf.fit(dataset._Xtrain,
                    dataset._ytrain,
                    validation_data=[dataset._Xdev, dataset._ydev],
                    epochs=best_epoch,
                    verbose=1,
                    callbacks=[checkpoint])

            # reload the checkpoint with the highest validation accuracy,
            # parsed out of the weight file name
            base_dir = 'models/cnn/' + name + '/run' + str(i + 1)
            weights = os.listdir(base_dir)
            best_val = 0
            best_weights = ''
            for weight in weights:
                # regex dots escaped (the originals matched any character)
                val_acc = re.sub(r'weights\.[0-9]*-', '', weight)
                val_acc = re.sub(r'\.hdf5', '', val_acc)
                val_acc = float(val_acc)
                if val_acc > best_val:
                    best_val = val_acc
                    best_weights = weight
            clf = load_model(os.path.join(base_dir, best_weights))

            pred = clf.predict(dataset._Xtest, verbose=1)
            classes = clf.predict_classes(dataset._Xtest, verbose=1)
            prediction_file = 'predictions/cnn/' + name + '/run' + str(
                i + 1) + '/pred.txt'
            w2idx_file = 'predictions/cnn/' + name + '/w2idx.pkl'
            print_prediction(prediction_file, classes)
            with open(w2idx_file, 'wb') as out:
                pickle.dump(word_idx_map, out)

            labels = sorted(set(dataset._ytrain.argmax(1)))
            # binary F1 only makes sense with exactly two classes
            average = 'binary' if len(labels) == 2 else 'micro'
            mm = MyMetrics(dataset._ytest, pred, labels=labels,
                           average=average)
            acc, precision, recall, micro_f1 = mm.get_scores()
            dataset_results.append([acc, precision, recall, micro_f1])

        # BUG FIX: aggregate the 5 runs instead of discarding them
        run_scores = np.array(dataset_results)
        results.append(list(run_scores.mean(axis=0)))
        std_devs.append(list(run_scores.std(axis=0)))

    return names, results, std_devs, dim
# Command-line configuration for the model.
parser = argparse.ArgumentParser()
parser.add_argument("--NUM_LAYERS", "-nl", default=1, type=int)
parser.add_argument("--HIDDEN_DIM", "-hd", default=100, type=int)
parser.add_argument("--BATCH_SIZE", "-bs", default=50, type=int)
parser.add_argument("--EMBEDDING_DIM", "-ed", default=300, type=int)
parser.add_argument("--TRAIN_EMBEDDINGS", "-te", action="store_true")
parser.add_argument("--AUXILIARY_TASK", "-aux", default="negation_scope")
parser.add_argument("--EMBEDDINGS", "-emb",
                    default="../../embeddings/google.txt")
args = parser.parse_args()
print(args)

# Load the pretrained embeddings (swap in GloVe or fastText here if desired)
embeddings = WordVecs(args.EMBEDDINGS)
w2idx = embeddings._w2idx

# A single vocabulary shared across all tasks
vocab = Vocab(train=True)

# Fold the pretrained words into the shared vocabulary, shifting every
# pretrained index up by one so index 0 stays reserved for the UNK token.
with_unk = {word: idx + 1 for word, idx in embeddings._w2idx.items()}
vocab.update(with_unk)

# Import datasets
# This will update vocab with words not found in embeddings
def main():
    """
    Barista bilingual-embedding baseline: train an English LinearSVC on
    averaged bilingual embeddings and evaluate it directly on the es/ca/eu
    test sets, in binary and/or 4-class mode depending on -bi.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-dataset',
                        default='opener',
                        help="dataset to train and test on (default: opener)")
    parser.add_argument(
        '-bi',
        help=
        'List of booleans. True is only binary, False is only 4 class. True False is both. (default: [True, False])',
        default=[True, False],
        nargs='+',
        type=str2bool)
    args = parser.parse_args()

    langs = ['es', 'ca', 'eu']
    for lang in langs:
        print('#### {0} ####'.format(lang))

        # First pass (token representation) only builds the joint
        # English/target vocabulary so WordVecs can restrict what it loads.
        en = General_Dataset(os.path.join('datasets', 'en', args.dataset),
                             None,
                             one_hot=False,
                             rep=words)
        cross_dataset = General_Dataset(os.path.join('datasets', lang,
                                                     args.dataset),
                                        None,
                                        one_hot=False,
                                        rep=words)
        # BUG FIX: the original did `vocab = en.vocab.update(...)`; for a
        # set/dict-style vocab, update() returns None, so WordVecs received
        # vocab=None instead of the union. Update in place and pass the
        # vocab object itself (identical if update() returns self).
        en.vocab.update(cross_dataset.vocab)
        vocab = en.vocab
        vecs = WordVecs(
            'embeddings/barista/sg-300-window4-negative20_en_{0}.txt'.format(
                lang),
            vocab=vocab)

        # Second pass: averaged-vector representations for the classifier
        en = General_Dataset(os.path.join('datasets', 'en', args.dataset),
                             vecs,
                             one_hot=False,
                             rep=ave_vecs,
                             lowercase=False)
        en_binary = General_Dataset(os.path.join('datasets', 'en',
                                                 args.dataset),
                                    vecs,
                                    one_hot=False,
                                    rep=ave_vecs,
                                    binary=True,
                                    lowercase=False)
        cross_dataset = General_Dataset(os.path.join('datasets', lang,
                                                     args.dataset),
                                        vecs,
                                        one_hot=False,
                                        rep=ave_vecs,
                                        lowercase=False)
        binary_cross_dataset = General_Dataset(os.path.join(
            'datasets', lang, args.dataset),
                                               vecs,
                                               one_hot=False,
                                               rep=ave_vecs,
                                               binary=True,
                                               lowercase=False)

        if True in args.bi:
            print('-binary-')
            # tune C against the cross-lingual data
            best_c, best_f1 = get_best_C(en_binary, binary_cross_dataset)
            clf = LinearSVC(C=best_c)
            clf.fit(en_binary._Xtrain, en_binary._ytrain)
            acc, f1 = scores(clf, binary_cross_dataset, 'binary')
            print_prediction(
                clf, binary_cross_dataset,
                os.path.join('predictions', lang, 'barista',
                             '{0}-bi.txt'.format(args.dataset)))
            print('acc: {0:.3f}'.format(acc))
            print('f1: {0:.3f}'.format(f1))

        if False in args.bi:
            print('-fine-')
            best_c, best_f1 = get_best_C(en, cross_dataset)
            clf = LinearSVC(C=best_c)
            clf.fit(en._Xtrain, en._ytrain)
            acc, f1 = scores(clf, cross_dataset)
            print_prediction(
                clf, cross_dataset,
                os.path.join('predictions', lang, 'barista',
                             '{0}-4cls.txt'.format(args.dataset)))
            print('acc: {0:.3f}'.format(acc))
            print('f1: {0:.3f}'.format(f1))