def test():
    glv, glv_w2i, glv_vocab = load_embedding("data/glove.txt")
    words_sorted = compute_word_bias(glv, glv_w2i, glv_vocab)
    for n in [100, 500, 1000]:
        my_cluster(glv, glv_w2i, 1, glv_vocab, words_sorted, n)

    hard_debias()  # Creates the hard_debias word vector embedding
    hd_glv, hd_glv_w2i, hd_glv_vocab = load_embedding("hard_debias.txt")
    for n in [100, 500, 1000]:
        my_cluster(hd_glv, hd_glv_w2i, 1, hd_glv_vocab, words_sorted, n)

    embedding_filepath = './data/glove.txt'
    male_filepath = './data/male_words.txt'
    female_filepath = './data/female_words.txt'
    pairs_filepath = './data/definitional_pairs.json'
    dbl_glv, dbl_w2i, dbl_vocab = double_hard_debias(embedding_filepath,
                                                     male_filepath,
                                                     female_filepath,
                                                     pairs_filepath)
    for n in [100, 500, 1000]:
        my_cluster(dbl_glv, dbl_w2i, 1, dbl_vocab, words_sorted, n)


# test()
def __init__(self, vocab_size, embedding_dim, hidden_dim, tag2idx, batch_size,
             use_gpu, idx2word, emb_path):
    super(AttentionModel, self).__init__()
    self.vocab_size = vocab_size
    self.embedding_dim = embedding_dim
    self.hidden_dim = hidden_dim
    self.tag2idx = tag2idx
    self.target_size = len(tag2idx)
    self.lstm = nn.LSTM(embedding_dim,
                        hidden_dim // 2,
                        num_layers=1,
                        bidirectional=True,
                        batch_first=True)
    self.hidden2tags = nn.Linear(hidden_dim, self.target_size)
    self.batch_size = batch_size
    self.use_gpu = use_gpu
    self.idx2word = idx2word
    self.emb_path = emb_path

    # pretrained embeddings
    emb_vectors = load_embedding(self.emb_path, self.idx2word)
    self.embeds = nn.Embedding.from_pretrained(
        torch.from_numpy(emb_vectors).float(), freeze=True)

    self.dropout = torch.nn.Dropout(0.5)
    self.query = nn.Parameter(torch.randn(self.hidden_dim), requires_grad=True)
def main():
    device = torch.device(
        'cuda' if torch.cuda.is_available() and args.use_gpu else 'cpu')

    try:
        _, dataset, embed, model_type, model_name = args.model_path.split('/')
        model = DelhateEnsemble.load_model(args.model_path)
    except FileNotFoundError:
        raise

    embedding, dim = utils.load_embedding(model.embed_corpus)
    test_data = utils.load_dataset(args.dataset,
                                   'test',
                                   embedding,
                                   labeled=True,
                                   pad=model.seq_length)

    model.to(device)
    y_pred, y_true = model.evaluate(test_data, device=device)

    print('pred:', Counter(y_pred))
    print('true:', Counter(y_true))

    report = classification_report(y_true,
                                   y_pred,
                                   target_names=['H', 'O', 'N'],
                                   digits=3)
    conf_mat = confusion_matrix(y_true, y_pred)

    model_name = model_name.replace('.pt', '')
    out_path = f'metrics/{dataset.upper()}/{embed}/{model_type}'
    os.makedirs(out_path, exist_ok=True)

    with open(f'{out_path}/{model_name}_{args.dataset}.txt', 'w') as f:
        f.write(report)
        f.write('\n')
        f.write('\n'.join(' '.join(str(x) for x in y) for y in conf_mat))
        f.write('\n')
def main():
    args = docopt(__doc__)
    enable_all_pools = args['--enable-all-pools']
    hidden = int(args['--hidden'])
    dropout = float(args['--dropout'])
    device = torch.device(int(args['--device']))
    print(f"{device} will be used")

    ratio = 0.8
    valid_dset = QueryDataset(split='valid',
                              ratio=ratio,
                              equally_handle_foreign_authors=False)
    valid_loader = DataLoader(valid_dset,
                              batch_size=1,
                              num_workers=1,
                              shuffle=False)

    embedding_mode, embedding = load_embedding(args['--embedding'], False, device)
    classifier = Classifier(embedding,
                            hidden,
                            dropout,
                            args['--deepset'],
                            equally_handle_foreign_authors=False,
                            enable_all_pools=enable_all_pools)
    classifier.load_state_dict(torch.load(args['--classifier']))
    classifier.eval()
    if torch.cuda.is_available():
        classifier.to(device)

    thresholds = [0.05 * i for i in range(1, 20)]
    for thres in thresholds:
        test_classifier(valid_loader, classifier, device, thres)
def test_load_embedding(self):
    print '======================================='
    print '\n\nload_embedding:'
    lines = self.loadcorpus()
    train_features, train_labels, f_map, _, c_map = utils.generate_corpus_char(
        lines,
        if_shrink_c_feature=True,
        c_thresholds=5,
        if_shrink_w_feature=False)
    f_set = {v for v in f_map}
    # map: returns a new list based on the old list
    # reduce: accumulates values and combines them with new values
    dt_f_set = functools.reduce(lambda x, y: x | y,
                                map(lambda t: set(t), train_features), f_set)
    f_map = utils.shrink_features(f_map, train_features, 5)
    f_map, embedding_tensor, in_doc_words = utils.load_embedding(
        '/datastore/liu121/nosqldb2/acl_hscrf/skipgram',
        ' ',
        f_map,
        dt_f_set,
        'unk',
        200,
        shrink_to_corpus=True,
        embsave_filePath='/datastore/liu121/nosqldb2/acl_hscrf/pkl/analysis_table.pkl')
def __init__(self, args):
    self.batch_size = args.batch_size
    self.hidden_size = args.hidden_size
    self.emb_size = args.emb_size
    self.emb_trainable = args.emb_trainable
    self.load_glove = args.load_glove
    self.num_max_epochs = args.num_max_epochs
    self.learning_rate = args.learning_rate

    # Load data
    data_loader = None
    if args.dataset == 'moviereview':
        data_loader = MRDataLoader
    elif args.dataset == 'senti140':
        data_loader = S140DataLoader
    else:
        print 'wrong data'
        sys.exit(1)

    loader = data_loader(data_path='../data/%s/' % (args.dataset),
                         pad_size=20,
                         max_vocab=100000)
    loader.read_data()

    self.num_class = loader.num_class
    self.vocab = loader.vocab
    self.vocab_rev = {w: i for i, w in enumerate(loader.vocab)}
    self.vocab_size = len(loader.vocab)

    # Data iterators
    self.train_iter = PaddingDatawithTarget(loader.train)
    self.test_iter = PaddingDatawithTarget(loader.test)

    # Load glove
    if self.load_glove:
        # self.emb = load_glove(
        #     emb_path='../data/glove.6B/',
        #     emb_filename='glove.6B.300d.txt',  # 'test.txt', #
        #     vocab=self.vocab,
        #     emb_size=self.emb_size)
        # self.emb_size = self.emb.shape[1]
        # NOTE: switched to loading a binary file from a different directory
        self.emb = load_embedding(
            emb_path='/data/word2vec/',
            emb_filename='glove.42B.300d.w2v.bin',  # 'test.txt', #
            vocab=self.vocab,
            emb_size=self.emb_size)
        self.emb_size = self.emb.shape[1]

    print ' '.join([self.vocab[w] for w in loader.train['X'][0]])
    print loader.train['length'][0], loader.train['Y'][0]
    print ' '.join([self.vocab[w] for w in loader.train['X'][1]])
    print loader.train['length'][1], loader.train['Y'][1]
    print loader.train['Y'][:10]
    print loader.test['Y'][:10]
    # import pdb; pdb.set_trace()

    self.sess = None
def __init__(self, args):
    self.out_path = args.out_path
    self.data_path = args.data_path
    self.target = args.target
    self.emb = load_embedding(emb_path='/data/word2vec/',
                              emb_filename='glove.42B.300d.w2v.bin')
    self.emb_size = len(self.emb['the'])
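# A minimal sketch (not the repo's actual implementation) of a load_embedding() with the
# signature used above, assuming the .bin file is in word2vec binary format and that
# gensim is available; the returned object is dict-like, so self.emb['the'] yields a vector.
import os

from gensim.models import KeyedVectors


def load_embedding_sketch(emb_path, emb_filename):
    """Load a word2vec-format binary embedding file and return a keyed vector lookup."""
    return KeyedVectors.load_word2vec_format(os.path.join(emb_path, emb_filename),
                                             binary=True)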
def __init__(self, args):
    super().__init__()
    self.word_embeddings = nn.Embedding(args.vocab_size,
                                        args.embedding_size,
                                        padding_idx=0)
    self.dropout = nn.Dropout(args.embedding_dropout_prob)

    embedding = load_embedding(args)
    self.word_embeddings.weight.requires_grad = not args.fix_embedding  # False
    self.word_embeddings.weight.data.copy_(torch.from_numpy(embedding))
def init():
    path = config.data_path
    config.embedding_file = os.path.join(path, config.embedding_file)
    config.embedding_vocab = os.path.join(path, config.embedding_vocab)
    config.train_file = os.path.join(path, config.train_file)
    config.test_file = os.path.join(path, config.test_file)

    # Configure logging
    if config.log_file is None:
        logging.basicConfig(level=logging.DEBUG,
                            format='%(asctime)s %(message)s',
                            datefmt='%m-%d %H:%M')
    else:
        if not os.path.exists(config.save_path):
            os.makedirs(config.save_path)
        logging.basicConfig(filename=config.log_file,
                            filemode='a',
                            level=logging.DEBUG,
                            format='%(asctime)s %(message)s',
                            datefmt='%m-%d %H:%M')

    # Load data
    # data = (sentences, relations, e1_pos, e2_pos)
    train_data = utils.load_data(config.train_file)
    test_data = utils.load_data(config.test_file)

    logging.info('train data: %d' % len(train_data[0]))
    logging.info('test data: %d' % len(test_data[0]))

    # Build vocab
    word_dict = utils.build_dict(train_data[0] + test_data[0])
    logging.info('total words: %d' % len(word_dict))

    embeddings = utils.load_embedding(config, word_dict)

    # Log parameters
    flags = config.__dict__['__flags']
    flag_str = "\n"
    for k in flags:
        flag_str += "\t%s:\t%s\n" % (k, flags[k])
    logging.info(flag_str)

    # Vectorize data
    # vec = (sents_vec, relations, e1_vec, e2_vec, dist1, dist2)
    max_len_train = len(max(train_data[0], key=lambda x: len(x)))
    max_len_test = len(max(test_data[0], key=lambda x: len(x)))
    max_len = max(max_len_train, max_len_test)
    config.max_len = max_len

    train_vec = utils.vectorize(train_data, word_dict, max_len)
    test_vec = utils.vectorize(test_data, word_dict, max_len)

    return embeddings, train_vec, test_vec
def __init__(self, vocab_size, embedding_dim, hidden_dim, tag2idx, batch_size,
             use_gpu, idx2word, emb_path):
    super(bilstm_crf, self).__init__()
    self.vocab_size = vocab_size
    self.embedding_dim = embedding_dim
    self.hidden_dim = hidden_dim
    self.tag2idx = tag2idx
    self.target_size = len(tag2idx)
    self.lstm = nn.LSTM(embedding_dim,
                        hidden_dim // 2,
                        num_layers=1,
                        bidirectional=True,
                        batch_first=True)
    self.hidden2tags = nn.Linear(hidden_dim, self.target_size)
    self.batch_size = batch_size
    self.use_gpu = use_gpu
    self.idx2word = idx2word
    self.emb_path = emb_path

    # pretrained embeddings
    emb_vectors = load_embedding(self.emb_path, self.idx2word)
    self.embeds = nn.Embedding.from_pretrained(
        torch.from_numpy(emb_vectors).float(), freeze=True)  # V x D
def main():
    device = torch.device(
        'cuda' if torch.cuda.is_available() and args.use_gpu else 'cpu')

    rnn_str = args.rnn_type if args.rnn_type else 'cnn'
    weak_str = '_weak' if args.weak_loss else ''
    out_path = f'models/{args.dataset}/{args.embed_corpus}/delhate_{rnn_str}{weak_str}'
    os.makedirs(out_path, exist_ok=True)

    embedding, dim = utils.load_embedding(args.embed_corpus)

    labeled = not args.weak_loss
    train_data = utils.load_dataset(args.dataset, 'train', embedding, labeled,
                                    args.pad)

    model = DelhateEnsemble(n_models=args.n_models,
                            seq_length=train_data.padded_seq,
                            embed_corpus=args.embed_corpus,
                            embed_dim=dim,
                            n_classes=train_data.n_classes,
                            n_filters=args.n_filters,
                            filter_width=args.filter_width,
                            pool_size=args.pool_size,
                            n_hidden=args.n_hidden,
                            rnn_type=args.rnn_type,
                            dropout=args.dropout)

    if args.weak_loss:
        loss_fn = lambda x, y: utils.weak_loss(x, y, weight=args.class_weight)
    else:
        loss_fn = F.cross_entropy

    model.train_models(train_data,
                       loss_fn=loss_fn,
                       lr=args.learn_rate,
                       n_samples=args.n_samples,
                       use_val=args.use_val,
                       early_stop=args.early_stop,
                       batch_size=args.batch_size,
                       EPOCHS=args.epochs,
                       device=device)

    model.save_model(f'{out_path}/{args.model_name}.pt')
def main(argv=None):
    if FLAGS.non_linearity == 'tanh':
        non_linearity = tf.nn.tanh
    elif FLAGS.non_linearity == 'sigmoid':
        non_linearity = tf.nn.sigmoid
    else:
        non_linearity = tf.nn.relu

    train_url = os.path.join(FLAGS.data_dir, 'train.feat')
    test_url = os.path.join(FLAGS.data_dir, 'test.feat')
    vocab_url = os.path.join(FLAGS.data_dir, 'vocab.new')
    model_url = os.path.join(FLAGS.model_dir, '')

    train(
        train_url=train_url,
        test_url=test_url,
        vocab_url=vocab_url,
        model_url=model_url,
        non_linearity=non_linearity,
        embedding_url=FLAGS.embedding_file,
        training_epochs=FLAGS.training_epochs,
        alternate_epochs=FLAGS.alternate_epochs,
        vocab_size=FLAGS.vocab_size,
        embedding_size=FLAGS.embedding_size,
        n_hidden=FLAGS.n_hidden,
        n_topic=FLAGS.n_topic,
        n_sample=FLAGS.n_sample,
        learning_rate=FLAGS.learning_rate,
        batch_size=FLAGS.batch_size,
        is_training=True,
        mix_num=FLAGS.mix_num,
    )

    # ------------------ print top words ----------------------------
    with tf.Session() as sess:
        saver = tf.train.Saver()
        saver.restore(sess, model_url)
        # print the names of all trainable variables
        for v in tf.trainable_variables():
            print(v.name, v.shape)
        embedding_table = utils.load_embedding(
            embedding_url, embedding_size, vocab,
            FLAGS.data_dir + '/vocab_embedding-{}.pkl'.format(embedding_size))
        TopicWords(sess, vocab_url, embedding_table)
def main():
    args = docopt(__doc__)
    device = torch.device(int(args['--device']))
    print(f"{device} will be used")
    threshold = float(args['--threshold'])
    answer_path = args['--answer-path']
    query_path = args['--query-path']
    hidden = int(args['--hidden'])
    dropout = float(args['--dropout'])
    enable_all_pools = args['--enable-all-pools']

    if os.path.exists(answer_path):
        warnings.warn(
            'Answer file already exists. Please delete it before running the code; otherwise, lines will be appended.'
        )

    testset = QueryTestset(query_path)
    testloader = DataLoader(testset, batch_size=1, num_workers=1, shuffle=False)

    embedding_mode, embedding = load_embedding(args['--embedding'], None, device)
    classifier = Classifier(embedding,
                            hidden,
                            dropout,
                            args['--deepset'],
                            False,
                            enable_all_pools=enable_all_pools).to(device)
    classifier.load_state_dict(torch.load(args['--classifier']))
    classifier.eval()

    with torch.no_grad():
        for collab in testloader:
            score = classifier(collab.to(device))
            if score >= threshold:
                answer = True
            else:
                answer = False
            with open(answer_path, 'a') as f:
                f.write(str(answer) + '\n')
def hard_debias(path_to_embedding="Double-Hard Debias/embeddings/glove.txt",
                path_to_def_pairs="Hard Debias/Data/definitional_pairs.json"):
    word_vectors, word_indices, vocab = load_embedding(path_to_embedding)
    word_vectors = np.asarray(word_vectors)

    with open(path_to_def_pairs) as f:
        set_of_pairs = json.load(f)

    mu_list = calculate_mu(set_of_pairs, word_vectors, word_indices)
    gender_subspace = calculate_gender_direction(set_of_pairs,
                                                 mu_list,
                                                 word_vectors,
                                                 word_indices,
                                                 num_components=1)
    gender_direction = gender_subspace[0]

    ### Subtracting Gender Bias from each Word Vector
    for i in range(len(word_vectors)):
        word_vectors[i] = word_vectors[i] - np.dot(
            word_vectors[i], gender_direction) * gender_direction

    word_vectors = normalize(word_vectors)
    recreate_embedding(word_vectors, vocab, "hard_debias")
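# Hedged illustration of the projection step above (not from the original repo): after
# w <- w - (w . g) * g with a unit-norm gender direction g, every vector is orthogonal
# to g. A quick numpy check on random data confirms this; the sizes here are arbitrary.
import numpy as np

rng = np.random.default_rng(0)
g = rng.normal(size=300)
g /= np.linalg.norm(g)                   # unit-norm "gender direction"
W = rng.normal(size=(5, 300))            # toy word vectors
W_debiased = W - np.outer(W @ g, g)      # vectorized form of the loop above
assert np.allclose(W_debiased @ g, 0.0)  # no remaining component along g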
def main(args):
    df_gold = pd.read_csv(args.goldstandard, index_col=0)
    for emb_path in args.embedding:
        print("=" * 78)
        print("Processing embedding file:", emb_path)
        print("-" * 78)
        df_embedding = load_embedding(emb_path, as_dataframe=True)

        # align embedding and gold standard
        df = df_gold.join(df_embedding, how='inner')
        # df = pd.merge(df_gold, df_embedding, left_index=True,
        #               right_index=True, how='inner')

        le = preprocessing.LabelEncoder()
        y = le.fit_transform(df['top'].values)
        # First column is the label column
        X = df[df.columns[1:]].values
        print("N examples", X.shape[0])
        print("N targets", len(le.classes_))

        if args.normalize:
            print("Normalizing...")
            X = preprocessing.normalize(X, norm='l2')

        # SVM with default parameters
        clf = svm.SVC(kernel=args.kernel)
        print("Running {}-fold cross-validated SVM with {} kernel...".format(
            args.cv, args.kernel))
        scores = cross_val_score(clf, X, y, cv=args.cv)
        print("Accuracy scores", scores)
        print("Accuracy mean/std:", scores.mean(), scores.std())
        print("=" * 78)
def clip_embedding_matrix(embedding_file, input_files, output_dir, embedding_name):
    vocab_file = os.path.join(output_dir, 'vocab.txt')
    clipped_file = os.path.join(output_dir, embedding_name)

    # load all files and build the vocabulary
    all_texts = load_all_texts(input_files)
    tokenizer = Tokenizer(num_words=None, lower=False)
    tokenizer.fit_on_texts(all_texts)
    logger.info("the size of vocabulary is {}".format(len(tokenizer.word_counts)))

    # load word vectors and build the embedding matrix
    embeddings_index = load_embedding(embedding_file)
    embedding_matrix = build_matrix(embeddings_index, tokenizer.word_index)
    logger.info("the shape of embedding matrix is {}".format(embedding_matrix.shape))

    # save embedding matrix
    np.save(clipped_file, embedding_matrix)

    # save vocabulary
    words = [word + '\n' for word in list(tokenizer.word_index.keys())]
    with open(vocab_file, 'w', encoding='utf-8') as f:
        f.writelines(words)
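# A hedged sketch of a build_matrix() helper consistent with the call above (the real
# implementation is not shown here): rows are indexed by the Keras tokenizer's word
# indices, and words missing from embeddings_index keep an all-zero row.
import numpy as np


def build_matrix_sketch(embeddings_index, word_index):
    """Assemble a (vocab_size + 1) x dim matrix; row 0 stays reserved for padding."""
    dim = len(next(iter(embeddings_index.values())))
    matrix = np.zeros((len(word_index) + 1, dim), dtype=np.float32)
    for word, idx in word_index.items():
        vector = embeddings_index.get(word)
        if vector is not None:
            matrix[idx] = vector
    return matrix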
test_summary_dir = os.path.join(out_dir, "summaries", timestamp, "test")
test_summary_writer = tf.summary.FileWriter(test_summary_dir, sess.graph)
test_summary_writer.add_graph(sess.graph)

checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints", timestamp))
checkpoint_prefix = os.path.join(checkpoint_dir, "model")
if not os.path.exists(checkpoint_dir):
    os.makedirs(checkpoint_dir)
saver = tf.train.Saver(tf.global_variables(), max_to_keep=num_checkpoints)

merged_summary = tf.summary.merge_all()
log('summary', logfile=logpath, is_verbose=is_verbose)

"""Loading pretrained embedding"""
if use_pretrained_model:
    load_embedding(sess, word_to_index, word_embeddings, embeddingpath,
                   embedding_size, vocab_size)

# Get a batch with the dataloader and transform it into tokens
batches = dataloader_train.get_batches(batch_size, num_epochs=num_epochs)
batches_eval = dataloader_eval.get_batches(batch_size, num_epochs=num_epochs)

for num_batch, batch in enumerate(batches):
    log("starting batch", num_batch, logfile=logpath, is_verbose=is_verbose)
    batch = word_to_index_transform(word_to_index, batch)
    # Defining input and target sequences
    batch_input, batch_target = batch[:, :-1], batch[:, 1:]
    # Run the session
    _, logits, out_loss, computed_perplexity = sess.run(
def main(args):
    model_class = get_model_class(args.model)
    model_class.add_config(argparser)
    args = argparser.parse_args()
    say(args)

    args.run_id = random.randint(0, 10**9)
    args.run_path = "{}/{}".format(args.run_dir, args.run_id)
    #if not os.path.exists(args.run_dir):
    #    os.makedirs(args.run_dir)
    #assert os.path.isdir(args.run_dir)
    #assert not os.path.exists(args.run_path)
    #os.makedirs(args.run_path)

    say("\nRun ID: {}\nRun Path: {}\n\n".format(args.run_id, args.run_path))

    train_corpus_path = os.path.dirname(args.train) + "/corpus.tsv.gz"
    train_corpus = Corpus(
        [tuple([train_corpus_path, os.path.dirname(args.train)])])

    valid_corpus_path = os.path.dirname(args.eval) + "/corpus.tsv.gz"
    valid_corpus = Corpus(
        [tuple([valid_corpus_path, os.path.dirname(args.eval)])])

    say("Corpus loaded.\n")

    embs = load_embedding(args.embedding) if args.embedding else None
    embedding_layer = EmbeddingLayer(args.n_d, ['<s>', '</s>'], embs)

    model = model_class(embedding_layer, args)
    if args.cuda:
        model.cuda()

    say("\n{}\n\n".format(model))
    print model.state_dict().keys()

    needs_grad = lambda x: x.requires_grad
    optimizer = optim.Adam(filter(needs_grad, model.parameters()), lr=args.lr)

    if args.load_model:
        print "Loading pretrained model"
        model.load_state_dict(torch.load(args.load_model))
    else:
        print "Training will begin from scratch"

    best_dev = 0
    iter_cnt = 0

    current_dev = evaluate(iter_cnt, args.eval + "/dev", model, valid_corpus, args)
    evaluate(iter_cnt, args.eval + "/test", model, valid_corpus, args, False)

    for epoch in range(args.max_epoch):
        iter_cnt = train(iter_cnt, model, train_corpus, args, optimizer)
        current_dev = evaluate(iter_cnt, args.eval + "/dev", model, valid_corpus,
                               args)
        if current_dev > best_dev:
            best_dev = current_dev
            evaluate(iter_cnt, args.eval + "/test", model, valid_corpus, args,
                     False)
        say("\n")
        if args.save_model:
            torch.save(model.state_dict(), args.save_model)
test_summary_dir = os.path.join(out_dir, "summaries", timestamp, "test")
test_summary_writer = tf.summary.FileWriter(test_summary_dir, sess.graph)
test_summary_writer.add_graph(sess.graph)

checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints", timestamp))
checkpoint_prefix = os.path.join(checkpoint_dir, "model")
if not os.path.exists(checkpoint_dir):
    os.makedirs(checkpoint_dir)
saver = tf.train.Saver(tf.global_variables(), max_to_keep=num_checkpoints)

merged_summary = tf.summary.merge_all()
log('summary', logfile=logpath, is_verbose=is_verbose)

"""Loading pretrained embedding"""
if use_pretrained_model:
    load_embedding(sess, word_to_index, word_embeddings,
                   './wordembeddings.word2vec', embedding_size, vocab_size)

# Get a batch with the dataloader and transform it into tokens
sess.run(tf.global_variables_initializer())
batches = dataloader_train.get_batches(batch_size, num_epochs=num_epochs)
batches_eval = dataloader_eval.get_batches(batch_size, num_epochs=num_epochs)

for num_batch, batch in enumerate(batches):
    print(num_batch)
    log("starting batch", num_batch, logfile=logpath, is_verbose=is_verbose)
    batch = word_to_index_transform(word_to_index, batch)
    # Defining input and target sequences
def run_experiment(experiment_type, data_folder, save_model_folder,
                   save_results_folder):
    """
    Runs experiments and saves results

    Parameters
    ----------
    experiment_type
    data_folder
    save_model_folder
    save_results_folder
    """

    def set_experiment_variables(hidden_state_size=512,
                                 down_project_size=None,
                                 load_embeddings=False):
        tf.flags.DEFINE_integer("hidden_state_size", hidden_state_size,
                                "hidden state size (default 512)")
        tf.flags.DEFINE_integer(
            "down_project_size", down_project_size,
            "Down projection size. Should be used with a hidden_state_size of 1024 (default None)"
        )
        tf.flags.DEFINE_boolean(
            "load_embeddings", load_embeddings,
            "Whether to use pretrained embeddings or not (default False)")

    if experiment_type == 'A':
        set_experiment_variables(512, None, False)
    elif experiment_type == 'B':
        set_experiment_variables(512, None, True)
    elif experiment_type == 'C':
        set_experiment_variables(1024, 512, True)

    print("\nExperiment Arguments:")
    for key in FLAGS.flag_values_dict():
        if key == 'f':
            continue
        print("{:<22}: {}".format(key.upper(), FLAGS[key].value))
    print(" ")

    data_processing = DataProcessing(FLAGS.sentence_length,
                                     FLAGS.max_vocabulary_size)
    train_corpus = data_processing.preprocess_dataset(data_folder,
                                                      'sentences.train')
    validation_corpus = data_processing.preprocess_dataset(
        data_folder, 'sentences.eval')
    test_corpus = data_processing.preprocess_dataset(data_folder,
                                                     'sentences_test.txt')
    continuation_corpus = data_processing.preprocess_dataset(
        data_folder, 'sentences.continuation', pad_to_sentence_length=False)

    print(f'Number of train sentences is \t\t{len(train_corpus)}')
    print(f'Number of validation sentences is \t{len(validation_corpus)}')
    print(f'Number of test sentences is \t\t{len(test_corpus)}')
    print(f'Number of continuation sentences is \t{len(continuation_corpus)}')
    print(" ")

    best_perplexity = None
    best_model = None

    with tf.Graph().as_default():
        with tf.Session() as session:
            # Create a variable to contain a counter for the global training step.
            global_step = tf.Variable(1, name='global_step', trainable=False)

            lstm = LSTMCell(FLAGS.embedding_size,
                            FLAGS.hidden_state_size,
                            FLAGS.sentence_length,
                            FLAGS.max_vocabulary_size,
                            down_project_size=FLAGS.down_project_size,
                            pad_symbol=data_processing.vocab['<pad>'])

            if FLAGS.load_embeddings:
                load_embedding(session, data_processing.vocab,
                               lstm.input_embeddings,
                               data_folder + '/wordembeddings-dim100.word2vec',
                               FLAGS.embedding_size, len(data_processing.vocab))

            ####
            # Set optimizer and clip all gradients to values in [-5, 5]
            ####
            with tf.name_scope('train'):
                optimizer = tf.train.AdamOptimizer()
                gvs = optimizer.compute_gradients(lstm.loss)
                capped_gvs = [(tf.clip_by_value(grad, -5., 5.), var)
                              for grad, var in gvs]
                train_step = optimizer.apply_gradients(capped_gvs,
                                                       global_step=global_step)

            saver = tf.train.Saver(tf.global_variables(),
                                   max_to_keep=FLAGS.num_checkpoints)
            session.run(tf.global_variables_initializer())
            summaries_merged = tf.summary.merge(lstm.summaries)

            ####
            # Create checkpoint directory
            ####
            timestamp = str(int(time.time()))
            out_dir = os.path.abspath(
                os.path.join(save_model_folder, "runs", timestamp))
            checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints"))
            if not os.path.exists(checkpoint_dir):
                os.makedirs(checkpoint_dir)

            ####
            # Start training for the specified epochs
            ####
            print('Start training...')
            for epoch in range(FLAGS.num_epochs):
                for sentences_batch in get_batches(train_corpus,
                                                   batch_size=FLAGS.batch_size):
                    # run a single step
                    train_batch(sentences_batch, lstm, train_step, global_step,
                                session, summaries_merged)

                    current_step = tf.train.global_step(session, global_step)
                    if current_step % FLAGS.checkpoint_every == 0:
                        perplexities = dev_step(
                            get_batches(validation_corpus,
                                        batch_size=FLAGS.batch_size,
                                        do_shuffle=False), lstm, global_step,
                            session)
                        average_perplexity = np.mean(perplexities)

                        model_name = "model_experiment-{}_epoch-{}_val-perplexity-{}".format(
                            experiment_type, epoch + 1, average_perplexity)
                        path = saver.save(session,
                                          os.path.join(checkpoint_dir, model_name))
                        print("Saved model checkpoint to {}".format(path))

                        if best_perplexity is None or best_perplexity > average_perplexity:
                            best_perplexity = average_perplexity
                            best_model = model_name

                print('Done with epoch', epoch + 1)

            if best_model is None:
                raise Exception(
                    "Model has not been saved. Run for at least one epoch")

            print('Restoring best model', best_model)
            saver.restore(session, os.path.join(checkpoint_dir, best_model))

            # evaluate on the test set
            perplexities = dev_step(get_batches(test_corpus,
                                                batch_size=FLAGS.batch_size,
                                                do_shuffle=False),
                                    lstm,
                                    global_step,
                                    session,
                                    verbose=0)
            print('Perplexity on test_set is', np.mean(perplexities))

            filename = "group25.perplexity{}".format(experiment_type)
            savefile = os.path.join(save_results_folder, filename)
            print('Saving results to', savefile)
            with open(savefile, 'w') as f:
                f.writelines(str(i) + '\n' for i in perplexities)

            if experiment_type == 'C':
                continuation_sentences = continue_sentences(
                    continuation_corpus, session, lstm, data_processing)
                filename = "group25.continuation"
                savefile = os.path.join(save_results_folder, filename)
                print('Saving results to', savefile)
                with open(savefile, 'w') as f:
                    f.writelines(str(i) + '\n' for i in continuation_sentences)

    print('Done')
    FLAGS.validation_file, FLAGS.test_file)

# Build Dictionary
print("Build Dictionary...")
word2id, id2word, user2id, id2user, poi2id, id2poi, post2id, id2post = utils.build_dic(
    train, validation, test)

# Convert Data to Index
print("Converting Data...")
train, validation, test, maximum_document_length = utils.converting(
    train, validation, test, word2id, user2id, poi2id, post2id)

# Load pretrained embedding
print("Load pretrained word embedding...")
_word_embedding = utils.load_embedding(FLAGS.embedding_file, word2id,
                                       FLAGS.wordembedding_dim)

# Load Visual Feature
print("Loading Visual Feature Matrix...")
with open(FLAGS.visual_features) as f:
    _visual_feature = np.load(f)["array"]  # Load visual feature

print("word dict size: " + str(len(word2id)))
print("user dict size: " + str(len(user2id)))
print("poi dict size: " + str(len(poi2id)))
print("Train/Validation/Test: {:d}/{:d}/{:d}".format(len(train), len(validation),
                                                     len(test)))
print(
    "=================================================================================="
def test(dataloader, model, device):
    print(f'{date()}## Start the testing!')
    start_time = time.perf_counter()
    test_loss = calculate_mse(model, dataloader, device)
    end_time = time.perf_counter()
    print(f"{date()}## Test end, test mse is {test_loss:.6f}, "
          f"time used {end_time - start_time:.0f} seconds.")


if __name__ == '__main__':
    config = Config()
    print(f'{date()}## Load word2vec and data...')
    word_emb, word_dict = load_embedding(config.word2vec_file)

    # Train
    train_dataset = MPCNDataset(config.train_file, word_dict, config)
    valid_dataset = MPCNDataset(config.valid_file, word_dict, config,
                                retain_rui=False)
    train_dlr = DataLoader(train_dataset, batch_size=config.batch_size,
                           shuffle=True)
    valid_dlr = DataLoader(valid_dataset, batch_size=config.batch_size)

    os.makedirs(os.path.dirname(config.saved_model), exist_ok=True)  # create the dir if it doesn't exist
    MPCN_model = MPCN(config, word_emb,
                      fusion_mode=config.fusion_mode).to(config.device)
def main(args):
    model_class = get_model_class(args.model)
    model_class.add_config(argparser)
    args = argparser.parse_args()

    args.run_id = datetime.datetime.now().strftime("%Y_%m_%d_%H_%M_%S")
    root_dir = os.path.dirname(os.path.realpath(__file__))

    # only use generated run_path if none provided by user
    if args.run_path is None:
        args.run_path = os.path.join(root_dir, args.run_dir, args.run_id)
    if not os.path.exists(args.run_path):
        os.makedirs(args.run_path)

    global outputManager
    outputManager = OutputManager(args.run_path)
    outputManager.say(args)

    #if not os.path.exists(args.run_dir):
    #    os.makedirs(args.run_dir)
    #assert os.path.isdir(args.run_dir)
    #assert not os.path.exists(args.run_path)
    #os.makedirs(args.run_path)

    outputManager.say("\nRun ID: {}\nRun Path: {}\n\n".format(
        args.run_id, args.run_path))

    train_corpus_path = os.path.dirname(args.train) + "/corpus.tsv.gz"
    train_corpus = Corpus(
        [tuple([train_corpus_path, os.path.dirname(args.train)])])

    valid_corpus_path = os.path.dirname(args.eval) + "/corpus.tsv.gz"
    valid_corpus = Corpus(
        [tuple([valid_corpus_path, os.path.dirname(args.eval)])])

    outputManager.say("Corpus loaded.\n")

    embs = load_embedding(args.embedding) if args.embedding else None
    embedding_layer = EmbeddingLayer(args.n_d, ['<s>', '</s>'], embs)

    model = model_class(embedding_layer, args)
    if args.cuda:
        model.cuda()

    outputManager.say("\n{}\n\n".format(model))
    outputManager.say(model.state_dict().keys())

    needs_grad = lambda x: x.requires_grad
    optimizer = optim.Adam(filter(needs_grad, model.parameters()), lr=args.lr)
    outputManager.say(optimizer.state_dict())

    if args.load_model:
        outputManager.say("Loading pretrained model")
        model.load_state_dict(torch.load(args.load_model))
    else:
        outputManager.say("Training will begin from scratch")

    best_dev = 0
    iter_cnt = 0

    current_dev = evaluate(iter_cnt, args.eval + "/dev", model, valid_corpus, args)
    evaluate(iter_cnt, args.eval + "/test", model, valid_corpus, args, False)

    for epoch in range(args.max_epoch):
        iter_cnt = train(iter_cnt, model, train_corpus, args, optimizer)
        current_dev = evaluate(iter_cnt, args.eval + "/dev", model, valid_corpus,
                               args)
        if current_dev > best_dev:
            best_dev = current_dev
            evaluate(iter_cnt, args.eval + "/test", model, valid_corpus, args,
                     False)
        outputManager.say("\n")
        if args.save_model:
            torch.save(model.state_dict(), args.save_model)
            torch.save(model, args.save_model + '-complete')
def main():
    train_path = os.path.join(data_path, cfg.train_file)
    test_path = os.path.join(data_path, cfg.test_file)
    data_train = pre.load_data(train_path)
    data_test = pre.load_data(test_path)

    word_dict, length_voc = pre.build_voc(data_train[0] + data_test[0])

    emd_vec_path = os.path.join(data_path, cfg.embedding_file)
    emd_word_path = os.path.join(data_path, cfg.embedding_vocab)
    embeddings, vec_dim = pre.load_embedding(emd_vec_path, emd_word_path,
                                             word_dict)

    max_length = max(len(max(data_train[0], key=lambda x: len(x))),
                     len(max(data_test[0], key=lambda x: len(x))))

    cfg.length_voc = length_voc
    cfg.max_length = max_length
    cfg.sentence_vec_dim = vec_dim
    cfg.embedding = embeddings

    train_vec = pre.dataset2id(data_train, word_dict, max_length)
    test_vec = pre.dataset2id(data_test, word_dict, max_length)

    train_batch_manager = pre.Manager_batch(train_vec, cfg.batch_size)
    test_batch_manager = pre.Manager_batch(test_vec, cfg.batch_size)

    with tf.Graph().as_default():
        with tf.name_scope("Train"):
            with tf.variable_scope("Model", reuse=False):
                train_model = Model(cfg, is_Training=True)
        with tf.name_scope("Test"):
            with tf.variable_scope("Model", reuse=True):
                valid_model = Model(cfg, is_Training=False)
            with tf.variable_scope("Model", reuse=True):
                test_model = Model(cfg, is_Training=False)

        tf_config = tf.ConfigProto()
        tf_config.gpu_options.allow_growth = True
        save = tf.train.Supervisor(logdir=cfg.save_path,
                                   global_step=train_model.global_steps)
        verbose = False

        with save.managed_session(config=tf_config) as sess:
            logging.info("training.....")
            best_score = 0
            best_f1 = 0
            if cfg.train:
                for epoch in range(cfg.num_epoches):
                    train_iter = train_batch_manager.iter_batch(shuffle=True)
                    test_iter = test_batch_manager.iter_batch(shuffle=False)
                    run_epoch(sess, train_model, train_iter,
                              is_training=True, verbose=verbose)
                    test_acc, f1 = run_epoch(sess, valid_model, test_iter,
                                             is_training=False, verbose=verbose)
                    if test_acc > best_score:
                        best_score = test_acc
                        best_f1 = f1
                        if cfg.save_path:
                            save.saver.save(sess, cfg.save_path,
                                            global_step=save.global_step)
                    #logging.info('')
                    logging.info("\033[1;31;40mEpoch: %d Test: accuracy %.2f%% " %
                                 (epoch + 1, test_acc * 100))
                    print("f1:", f1)
                    logging.info("\033[0m")

                logging.info("\033[1;31;40mThe best accuracy score is %.2f%%" %
                             (best_score * 100))
                print("best f1: ", best_f1)

            if cfg.test:
                ckpt = tf.train.get_checkpoint_state(cfg.save_path)
                save.saver.restore(sess, ckpt.model_checkpoint_path)
                test_iter = test_batch_manager.iter_batch(shuffle=False)
                test_acc = evaluation(sess, test_model, test_iter)
                print('accuracy: %.2f%%' % (test_acc * 100))
def __init__(self, prefix="./dataset/EHR", mode="sds"):
    test_filepath = os.path.join(prefix, "test/data_moresymp.txt")

    assert mode in ["sus", "sds", "mix", "pmi", "gpmi"]
    self.mode = mode

    # maps path
    if mode in ["sds", "mix"]:
        self.dise2symp = np.load(os.path.join(prefix, "dise2symp.npy"),
                                 allow_pickle=True).item()
        self.symp2dise = np.load(os.path.join(prefix, "symp2dise.npy"),
                                 allow_pickle=True).item()
    if mode in ["sus", "mix"]:
        self.user2symp = np.load(os.path.join(prefix, "user2symp.npy"),
                                 allow_pickle=True).item()
        self.symp2user = np.load(os.path.join(prefix, "symp2user.npy"),
                                 allow_pickle=True).item()

    # load embeddings
    self.symp_embs, self.dise_embs = load_embedding("ckpt/GNN.pt")
    self.num_symp = self.symp_embs.shape[0]

    if mode in ["pmi", "gpmi"]:
        # init a PMI matrix that has shape M x M;
        # we'd better make it a sparse matrix.
        # if we pick the graphPMI (gpmi) method, we need an additional S-D PMI matrix.

        # read data
        self.pmi_ss_path = os.path.join(prefix, "pmi_ss_mat.npz")
        self.pmi_sd_path = os.path.join(prefix, "pmi_sd_mat.npz")
        self.symp_count_path = os.path.join(prefix, "sympcount.npy")
        self.dise_count_path = os.path.join(prefix, "disecount.npy")
        self.dise2symp_path = os.path.join(prefix, "dise2symp.npy")

        if os.path.exists(self.pmi_ss_path):
            print("Load PMI Mat from", self.pmi_ss_path)
            self.symp2symp = sparse.load_npz(self.pmi_ss_path)
            # self.symp2dise = sparse.load_npz(self.pmi_sd_path)
            self.symp2dise = np.load(os.path.join(prefix, "symp2dise.npy"),
                                     allow_pickle=True).item()
            self.sympcount = np.load(self.symp_count_path,
                                     allow_pickle=True).item()
            self.disecount = np.load(self.dise_count_path,
                                     allow_pickle=True).item()
            self.symp2symp.setdiag(0)
        else:
            print("Build PMI Mat.")
            self.build_pmi_matrix(prefix)
            self.symp2symp.setdiag(0)

        # build symp count array
        c_ar, i_ar = [], []
        for k, v in self.sympcount.items():
            c_ar.append(v)
            i_ar.append(int(k))
        sympcount_mat = sparse.csr_matrix((c_ar, (i_ar, [0] * len(i_ar))))
        self.sympcount_ar = sympcount_mat.toarray().flatten()
        self.num_all_symp = self.sympcount_ar.sum()

        # build dise count array
        c_ar, i_ar = [], []
        for k, v in self.disecount.items():
            c_ar.append(v)
            i_ar.append(int(k))
        disecount_mat = sparse.csr_matrix((c_ar, (i_ar, [0] * len(i_ar))))
        self.disecount_ar = disecount_mat.toarray().flatten()
        self.num_all_dise = self.disecount_ar.sum()

        self.dise2symp = np.load(self.dise2symp_path, allow_pickle=True).item()
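# Hedged sketch (not the repo's build_pmi_matrix) of how a sparse symptom-symptom PMI
# matrix like the one loaded above is commonly built from co-occurrence counts:
# PMI(i, j) = log( p(i, j) / (p(i) * p(j)) ), stored only where co-occurrence is nonzero.
# Assumes a symmetric co-occurrence matrix, so row sums serve as marginals for both axes.
import numpy as np
from scipy import sparse


def build_pmi_sketch(cooc):
    """cooc: (M, M) sparse co-occurrence count matrix; returns a sparse PMI matrix."""
    cooc = cooc.tocoo()
    total = cooc.sum()
    marginals = np.asarray(cooc.sum(axis=1)).flatten() / total
    p_joint = cooc.data / total
    pmi = np.log(p_joint / (marginals[cooc.row] * marginals[cooc.col]))
    return sparse.csr_matrix((pmi, (cooc.row, cooc.col)), shape=cooc.shape)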
parser.add_argument('--nflips', type=int, default=0, help='number of flips')
parser.add_argument('--temperature', type=float, default=.8,
                    help='RNN temperature')
parser.add_argument('--lr', type=float, default=0.0001,
                    help='learning rate, default=0.0001')
parser.add_argument('--warm-start', action='store_true')
args = parser.parse_args()
batch_size = args.batch_size

# set sample sizes
nb_train_samples = np.int(np.floor(args.nsamples / batch_size)) * batch_size  # num training samples
nb_val_samples = nb_train_samples  # num validation samples

# seed weight initialization
random.seed(seed)
np.random.seed(seed)

embedding, idx2word, word2idx, glove_idx2idx = load_embedding(nb_unknown_words)
vocab_size, embedding_size = embedding.shape
oov0 = vocab_size - nb_unknown_words

idx2word = process_vocab(idx2word, vocab_size, oov0, nb_unknown_words)
X_train, X_test, Y_train, Y_test = load_split_data(nb_val_samples, seed)

# print a sample recipe to make sure everything looks right
print('Random head, description:')
i = 811
prt('H', Y_train[i], idx2word)
prt('D', X_train[i], idx2word)

# save model initialization parameters
model_params = (dict(
    vocab_size=vocab_size,
    embedding_size=embedding_size,
def train():
    ''' Train function '''
    args = get_args()

    # Load data
    dataset = SemEvalDataset(args.train_filename, max_len=args.seq_len)
    dataloader = DataLoader(dataset, args.batch_size, True,
                            num_workers=args.num_workers)
    dataset_val = SemEvalDataset(args.test_filename, max_len=args.seq_len,
                                 d=(dataset.d, dataset.rel_d))
    dataloader_val = DataLoader(dataset_val, args.batch_size, True,
                                num_workers=args.num_workers)

    args.word_embedding = load_embedding(args.embedding_filename,
                                         args.embedding_wordlist_filename,
                                         dataset.d)
    args.vac_len_word = len(dataset.d.word2id)
    args.vac_len_rel = len(dataset.rel_d.word2id)
    args.dw = args.word_embedding.shape[1]

    for arg in vars(args):
        print("{} = {}".format(arg, getattr(args, arg)))

    # Build models
    writer = SummaryWriter()
    model = CNN(args)
    loss_func = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=args.lr)

    best_eval_acc = 0.
    best_eval_f1 = 0.

    for i in range(args.nepoch):
        # Training
        total_loss = 0.
        total_acc = 0.
        total_f1 = 0.
        ntrain_batch = 0
        model.train()
        for (seq, e1, e2, dist1, dist2, r) in dataloader:
            ntrain_batch += 1
            seq = Variable(seq)
            e1 = Variable(e1)
            e2 = Variable(e2)
            dist1 = Variable(dist1)
            dist2 = Variable(dist2)
            r = Variable(r)
            r = r.view(r.size(0))

            pred = model(seq, dist1, dist2, e1, e2)
            l = loss_func(pred, r)
            acc = accuracy(pred, r)
            f1 = F1(pred, r)
            total_acc += acc
            total_f1 += f1
            total_loss += l.item()

            optimizer.zero_grad()
            l.backward()
            optimizer.step()

        writer.add_scalar('train/loss', l.item(), i)
        writer.add_scalar('train/accuracy', total_acc / ntrain_batch, i)
        writer.add_scalar('train/f1', total_f1 / ntrain_batch, i)
        print("Epoch: {}, Training loss: {:.4}, acc: {:.4}, f1: {:.4}".format(
            i, total_loss / ntrain_batch, total_acc / ntrain_batch,
            total_f1 / ntrain_batch))

        # Evaluation
        if i % args.eval_every == args.eval_every - 1:
            val_total_acc = 0.
            val_total_f1 = 0.
            nval_batch = 0
            model.eval()
            for (seq, e1, e2, dist1, dist2, r) in dataloader_val:
                nval_batch += 1
                seq = Variable(seq)
                e1 = Variable(e1)
                e2 = Variable(e2)
                dist1 = Variable(dist1)
                dist2 = Variable(dist2)
                r = Variable(r)
                r = r.view(r.size(0))

                pred = model(seq, dist1, dist2, e1, e2)
                acc = accuracy(pred, r)
                f1 = F1(pred, r)
                val_total_acc += acc
                val_total_f1 += f1

            best_eval_acc = max(best_eval_acc, val_total_acc / nval_batch)
            best_eval_f1 = max(best_eval_f1, val_total_f1 / nval_batch)

            # Write the stats to tensorboard
            writer.add_scalar('test/accuracy', val_total_acc / nval_batch, i)
            writer.add_scalar('test/F1', val_total_f1 / nval_batch, i)
            print("Epoch: {}, Val acc: {:.4f}, F1: {:.4f}".format(
                i, val_total_acc / nval_batch, val_total_f1 / nval_batch))

    print('Best acc: {}'.format(best_eval_acc))
    print('Best F1: {}'.format(best_eval_f1))

    torch.save(model.state_dict(), args.model_file)
    writer.close()
else:
    torch.set_default_tensor_type(torch.FloatTensor)
torch.set_printoptions(precision=9)
torch.set_num_threads(1)

# Load command line options
options = vars(args)
writer.add_text('Text', 'Hyper-parameters: {}'.format(options), 0)

# Load supervision pairs and convert to dict
f_supervision = options["supervision_file"]
train_hyper2hypo, train_hypo2hyper = load_directional_supervision(f_supervision)

# Load embedding files and word <-> index map
f_embed = options["embed_file"]
embedding, index2word, word2index, vocab_size, embed_dim = load_embedding(f_embed)
print("=== Finish loading embedding ===")
options["embedding"] = embedding
options["index2word"] = index2word
options["word2index"] = word2index
options["vocabSize"] = vocab_size
options["embedSize"] = embed_dim

# Construct training set and training data loader
if options["use_pair_feature"]:
    print("!!! Using pair features")
    f_pair_feature_key = options["pair_feature_prefix"] + "edge.keys3.tsv"
    f_pair_feature_value = options["pair_feature_prefix"] + "edge.values3.scaled.npy"
    pair_features = load_pair_features(f_pair_feature_key, f_pair_feature_value)
    train_data = DirectionalTripletsWithPairFeature(options["embedding"],
                                                    train_hyper2hypo,
                                                    pair_features)
else:
def main(config, training_set, testing_set):
    training_set.set_preprocess_fn(scheduler_preprocess)
    training_set.set_special_tokens(['<pad>', '<unk>'])
    testing_set.set_preprocess_fn(scheduler_preprocess)
    testing_set.set_special_tokens(['<pad>', '<unk>'])

    scheduler_model = VanillaSeq2SeqEncoder(config.batch_size, config.vocab_size,
                                            config.embedding_size,
                                            config.hidden_size)
    _ = scheduler_model()
    scheduler_model.optimize(config.learning_rate)
    tf.summary.scalar("cost", scheduler_model.mse)

    nthreads_intra = config.nthreads // 2
    nthreads_inter = config.nthreads - config.nthreads // 2

    with tf.Session(config=tf.ConfigProto(
            inter_op_parallelism_threads=nthreads_inter,
            intra_op_parallelism_threads=nthreads_intra)) as sess:
        timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        writer = tf.summary.FileWriter('./logs/' + timestamp + '/train/',
                                       sess.graph)
        test_writer = tf.summary.FileWriter('./logs/' + timestamp + '/test/',
                                            sess.graph)
        saver = tf.train.Saver()

        sess.run(tf.global_variables_initializer())

        # Load word2vec pretrained embeddings
        load_embedding(sess, training_set.word_to_index,
                       scheduler_model.word_embeddings, config.embedding_path,
                       config.embedding_size, config.vocab_size)

        for epoch in range(config.n_epochs):
            if not epoch % config.test_every:
                # Testing phase
                success = 0
                total = 0
                for k in range(0, len(testing_set), config.batch_size):
                    if k + config.batch_size < len(testing_set):
                        batch_endings1, batch_endings2, correct_ending = testing_set.get(
                            k, config.batch_size, random=True)
                        total += config.batch_size
                        shuffled_batch1, labels1 = scheduler_get_labels(batch_endings1)
                        shuffled_batch2, labels2 = scheduler_get_labels(batch_endings2)
                        probabilities1 = sess.run(
                            'scheduler/order_probability:0',
                            {'scheduler/x:0': shuffled_batch1,
                             'scheduler/optimize/label:0': labels1})
                        probabilities2 = sess.run(
                            'scheduler/order_probability:0',
                            {'scheduler/x:0': shuffled_batch2,
                             'scheduler/optimize/label:0': labels2})
                        for b in range(config.batch_size):
                            if probabilities1[b][np.where(labels1[b] == 1)[0][0]] > \
                                    probabilities2[b][np.where(labels2[b] == 1)[0][0]]:
                                if correct_ending[b] == 0:
                                    success += 1
                            else:
                                if correct_ending[b] == 1:
                                    success += 1
                accuracy = float(success) / float(total)
                accuracy_summary = tf.Summary()
                accuracy_summary.value.add(tag='accuracy', simple_value=accuracy)
                test_writer.add_summary(accuracy_summary, epoch)

            for k in range(0, len(training_set), config.batch_size):
                if k + config.batch_size < len(training_set):
                    summary_op = tf.summary.merge_all()
                    batch = training_set.get(k, config.batch_size, random=True)
                    shuffled_batch, labels = scheduler_get_labels(batch)
                    probabilities, _, computed_mse, summary = sess.run(
                        ['scheduler/order_probability:0',
                         'scheduler/optimize/optimizer',
                         'scheduler/optimize/mse:0',
                         summary_op],
                        {'scheduler/x:0': shuffled_batch,
                         'scheduler/optimize/label:0': labels})
                    writer.add_summary(summary, epoch * len(training_set) + k)

            if not epoch % config.save_model_every:
                model_path = './builds/' + timestamp
                saver.save(sess, model_path, global_step=epoch)

            training_set.shuffle_lines()

        if not epoch % config.save_model_every:
            model_path = './builds/' + timestamp + '/model'
            saver.save(sess, model_path, global_step=epoch)
vec_size_alpha = 300
runs = 2
np.random.seed(42)
seeds = np.random.permutation(list(range(runs)))
reference_p = ['0', 'u']
plot = 0
#alphas = np.concatenate([np.arange(-10,-3,0.5), np.arange(-3, 3, 0.1), np.arange(3,10,0.5)])
stats = 0
#############################################

if plot == 0 and stats == 0:
    D_glove, V_glove, D_pretrained_glove, V_pretrained_glove = load_embedding()


# https://gitlab.com/praj88/deepsentiment/blob/master/train_code/utility_functions.py
# Combine and split the data into train and test
def read_senti(path):
    # read dictionary into df
    df_data_sentence = pd.read_table(path + 'dictionary.txt')
    df_data_sentence_processed = df_data_sentence['Phrase|Index'].str.split(
        '|', expand=True)
    df_data_sentence_processed = df_data_sentence_processed.rename(
        columns={0: 'Phrase', 1: 'phrase_ids'})