def main():
    start = time.time()
    query = sys.argv[1]
    glove = utils.load_glove()
    quest = utils.init_babi_deploy(
        os.path.join(os.path.dirname(os.path.realpath(__file__)),
                     'data', 'corpus', 'babi.txt'),
        query)
    dmn = dmn_basic.DMN_basic(babi_train_raw=quest,
                              babi_test_raw=[],
                              word2vec=glove,
                              word_vector_size=50,
                              dim=40,
                              mode='deploy',
                              answer_module='feedforward',
                              input_mask_mode="sentence",
                              memory_hops=5,
                              l2=0,
                              normalize_attention=False,
                              answer_vec='index',
                              debug=False)
    dmn.load_state(
        'states/dmn_basic.mh5.n40.bs10.babi1.epoch2.test1.20454.state')
    prediction = dmn.step_deploy()
    prediction = prediction[0][0]
    for ind in prediction.argsort()[::-1]:
        if ind < dmn.answer_size:
            print(dmn.ivocab[ind])
            break
    print('Time taken:', time.time() - start)
def init_config(task_id, restore=None, strong_supervision=None, l2_loss=None,
                num_runs=None):
    global config, word2vec
    if config.word2vec_init:
        if not word2vec:
            word2vec = utils.load_glove()
    else:
        word2vec = {}
    # config.strong_supervision = True
    config.l2 = l2_loss if l2_loss is not None else 0.001
    config.strong_supervision = strong_supervision if strong_supervision is not None else False
    num_runs = num_runs if num_runs is not None else '1'
    if task_id is not None:
        if ',' in task_id:
            tn = get_task_num(task_id.split(','), num_runs.split(','))
            loop_model(tn, restore)
        elif '-' in task_id:
            st_en = task_id.split('-')
            if len(st_en) < 2:
                raise ValueError(
                    "task id should be of the form x,y,z,t or x-y or x")
            # np.arange expects numeric bounds, so cast the split strings to int
            st = int(st_en[0])
            en = int(st_en[-1])
            tn = get_task_num(np.arange(st, en), num_runs.split(','))
            loop_model(tn, restore)
        else:
            config.task_id = task_id
            run_model(config, word2vec, int(num_runs[0]), restore)
def get_data(vocabs=""):
    print("==> Load Word Embedding")
    word_embedding = utils.load_glove(use_index=True)
    validation_data = []
    training_data = []
    if not vocabs:
        non_words = utils.load_file(p.non_word, False)
        for w in non_words:
            w_ = w.replace('\n', '').split(' ')
            validation_data.append(int(w_[-1]))
        training_data = utils.sub(range(len(word_embedding)), validation_data)
    else:
        vocabs_set = utils.load_file(vocabs)
        print("vc", len(vocabs_set))
        # .items() works in both Python 2 and 3 (iteritems is Python 2 only)
        training_data = [w for _, w in vocabs_set.items()]
        tm = range(len(word_embedding))
        validation_data = list(utils.sub(set(tm), set(training_data)))
    # pad the training set up to a multiple of the compression batch size
    length = (int(math.ceil(len(training_data) * 1.0 / p.compression_batch_size))
              * p.compression_batch_size - len(training_data))
    print('before', 'vd', len(validation_data), 'td', len(training_data))
    if length:
        add_on = np.random.choice(validation_data, length)
        training_data += add_on.tolist()
        validation_data = utils.sub(set(validation_data), set(add_on))
    print('vd', len(validation_data), 'td', len(training_data))
    # utils.save_file(p.glove_path, training_data)
    return word_embedding, training_data, validation_data
def build_word_mappings(x_train, nlp, glove_dir):
    """Generate word to count, word to index, and word to vector mappings."""
    # Map each token to the # of times it appears in the corpus.
    tokens = [
        item for t in nlp(' '.join(x_train.values),
                          disable=['parser', 'tagger', 'ner'])
        for item in [t.text.strip()] if item
    ]
    w2count = dict(filter(lambda x: x[1] > 4, Counter(tokens).items()))
    save_pickle(tokens, 'tokens')
    save_pickle(w2count, 'w2count')

    # Construct w2idx dict and i2w list.
    w2idx = {
        k: i for i, (k, v) in enumerate(
            sorted(w2count.items(), key=lambda x: x[1], reverse=True), 2)
    }
    w2idx['<PAD>'] = 0
    w2idx['<UNK>'] = 1
    i2w = [k for k, v in sorted(w2idx.items(), key=lambda x: x[1])]
    save_pickle(w2idx, 'w2idx')
    save_pickle(i2w, 'i2w')

    # Load word vectors and filter to include words in our vocab.
    w2vec = load_glove(300, glove_dir)
    w2vec = {k: v for k, v in w2vec.items() if k in w2idx}
    save_pickle(w2vec, 'w2vec')
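The mappings produced here are typically stitched into a dense embedding matrix before a model is built. A minimal sketch of that step, assuming the `w2idx` and `w2vec` dicts saved above and 300-dimensional vectors; `build_embedding_matrix` is a hypothetical helper, and the random fallback for out-of-vocabulary words mirrors what `build_word_vocabulary` further down does:

import numpy as np

def build_embedding_matrix(w2idx, w2vec, dim=300, scale=0.4):
    """Hypothetical helper: one row per vocabulary index, random vectors for OOV words."""
    matrix = np.zeros((len(w2idx), dim), dtype=np.float32)
    for word, idx in w2idx.items():
        if word in w2vec:
            matrix[idx] = w2vec[word]
        elif word != '<PAD>':  # keep the padding row at zero
            matrix[idx] = np.random.randn(dim) * scale
    return matrix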
def build_embedding():
    vocabs = utils.load_file(vocabs_path, use_pickle=False)
    word_embedding = utils.load_glove()
    embedding = []
    for w in vocabs:
        w = w.replace('\n', '')
        if w in word_embedding:
            embedding.append(word_embedding[w])
    utils.save_file(embedding_path, embedding)
def build_word_vocabulary(self, text_keys, word_count_threshold=0):
    """Borrowed this implementation from @karpathy's neuraltalk."""
    print("Building word vocabulary starts.\n")
    all_sentences = []
    for k in text_keys:
        all_sentences.extend(self.raw_train[k])
    word_counts = {}
    for sentence in all_sentences:
        for w in self.line_to_words(sentence, eos=False, downcase=True):
            word_counts[w] = word_counts.get(w, 0) + 1
    vocab = [
        w for w in word_counts
        if word_counts[w] >= word_count_threshold and w not in self.word2idx
    ]
    print("Vocabulary Size %d (<pad> <unk> <eos> excluded) using word_count_threshold %d.\n"
          % (len(vocab), word_count_threshold))

    # build index and vocabularies
    for idx, w in enumerate(vocab):
        self.word2idx[w] = idx + self.offset
        self.idx2word[idx + self.offset] = w
    print("word2idx size: %d, idx2word size: %d.\n"
          % (len(self.word2idx), len(self.idx2word)))

    # Make glove embedding.
    print("Loading glove embedding at path : %s.\n" % self.glove_embedding_path)
    glove_full = load_glove(self.glove_embedding_path)
    print("Glove Loaded, building word2idx, idx2word mapping.\n")
    glove_matrix = np.zeros([len(self.idx2word), self.embedding_dim])
    glove_keys = glove_full.keys()
    for i in tqdm(range(len(self.idx2word))):
        w = self.idx2word[i]
        w_embed = (glove_full[w] if w in glove_keys
                   else np.random.randn(self.embedding_dim) * 0.4)
        glove_matrix[i, :] = w_embed
    self.vocab_embedding = glove_matrix
    print("vocab embedding size is :", glove_matrix.shape)

    print("Saving cache files at ./cache.\n")
    if not os.path.exists("./cache"):
        os.makedirs("./cache")
    # pickle needs binary file handles when writing the cache files
    pickle.dump(self.word2idx, open(self.word2idx_path, 'wb'))
    pickle.dump(self.idx2word, open(self.idx2word_path, 'wb'))
    pickle.dump(glove_matrix, open(self.vocab_embedding_path, 'wb'))
    print("Building vocabulary done.\n")
def main():
    glove = load_glove()
    vector = []
    with open("data/corpus/cricket.txt", 'r') as f:
        for line in f:
            line = line.strip()
            # print(line)
            l = get_word_vecs(line, glove)
            measure = centroid(l)
            vector.append((line, measure))
    query = sys.argv[1]
    query_measure = centroid(get_word_vecs(query, glove))
    print(get_most_relevant(vector, query_measure))
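`get_word_vecs`, `centroid`, and `get_most_relevant` are defined elsewhere in the project. As a rough sketch of what the latter two could look like, assuming a sentence is represented by the mean of its word vectors and ranked by cosine similarity against the query centroid (hypothetical implementations, not the project's own):

import numpy as np

def centroid(word_vecs):
    """Mean of the sentence's word vectors; zero vector if nothing was in GloVe (50-d assumed)."""
    return np.mean(word_vecs, axis=0) if len(word_vecs) else np.zeros(50)

def get_most_relevant(sentence_measures, query_measure):
    """Return the (sentence, centroid) pair most cosine-similar to the query centroid."""
    def cosine(a, b):
        denom = np.linalg.norm(a) * np.linalg.norm(b)
        return float(np.dot(a, b) / denom) if denom else 0.0
    return max(sentence_measures, key=lambda sm: cosine(sm[1], query_measure))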
def _init_embeddings(self):
    self.user_matrix = tf.get_variable(name='user_matrix',
                                       shape=[self.total_users, self.F],
                                       initializer=self.weight_initializer,
                                       dtype=tf.float32)
    self.item_matrix = tf.get_variable(name='item_matrix',
                                       shape=[self.total_items, self.F],
                                       initializer=self.weight_initializer,
                                       dtype=tf.float32)
    self.word_matrix = tf.get_variable(name='word_matrix',
                                       shape=[self.V, self.W],
                                       initializer=tf.constant_initializer(
                                           load_glove(self.V, self.W)),
                                       dtype=tf.float32)
def mi_mlps_ptb(args): # load data s_train, p_train = load_data('penn_treebank_dataset', 'train') s_dev, p_dev = load_data('penn_treebank_dataset', 'dev') s_test, p_test = load_data('penn_treebank_dataset', 'test') sentences = s_train + s_dev + s_test parsed = p_train + p_dev + p_test doc_id, sen_id, global_graph = construct_graph(parsed) s_train, p_train, s_dev, p_dev, s_test, p_test = [], [], [], [], [], [] # load embeddings graph_emb = graph_embeddings(args, global_graph, doc_id, sen_id) bert_emb = load_glove(args, sentences) # bert_emb = load_elmo(args, sentences) # bert_emb_paths = bert_embeddings(args, sentences) # bert_emb = np.load(bert_emb_paths[0], allow_pickle=True) # initialize mi mir, mig, mib = [], [], [] for l in range(args.bert_layers_num): mib.append([]) for s in range(len(sentences)): mir.append(0.) mig.append(0.) for l in range(args.bert_layers_num): mib[l].append(0.) if args.baselines: print('3.1 start to calculate baselines of MI...') # calculate MI baselines for r in range(args.repeat): tmp_mir = mine_probe(args, graph_emb, bert_emb, len(sentences), 'lower') tmp_mig = mine_probe(args, graph_emb, bert_emb, len(sentences), 'upper') # get sum value mir = [mir[s]+tmp_mir[s] for s in range(len(tmp_mir))] mig = [mig[s]+tmp_mig[s] for s in range(len(tmp_mig))] print('3.2 start to calculate BERT hidden states of MI...') for r in range(args.repeat): tmp_mib = mine_probe(args, graph_emb, bert_emb, len(sentences), args.bert_layers_num - 1) mib[-1] = [mib[-1][s]+tmp_mib[s] for s in range(len(tmp_mib))] mib_layers = sum(mib[-1]) / (len(mib[-1]) * args.repeat) print('MI(G, Glove): {} |'.format(mib_layers))
def init_config(task_id):
    global config, word2vec, model
    if config.word2vec_init:
        if not word2vec:
            word2vec = utils.load_glove()
    else:
        word2vec = {}
    config.batch_size = 10
    config.strong_supervision = False
    # config.train_mode = False
    config.task_id = task_id
    if config.reset:
        tf.reset_default_graph()
    if model is None:
        model = Model(config, word2vec)
    else:
        model.config = config
        model.init_global()
    config.reset = True
    main(model)
def train(): with tf.device('/cpu:0'): x_text, y = data_helpers.load_data_and_labels(FLAGS.train_path) # Build vocabulary # Example: x_text[3] = "A misty <e1>ridge</e1> uprises from the <e2>surge</e2>." # ['a misty ridge uprises from the surge <UNK> <UNK> ... <UNK>'] # => # [27 39 40 41 42 1 43 0 0 ... 0] # dimension = FLAGS.max_sentence_length vocab_processor = tf.contrib.learn.preprocessing.VocabularyProcessor( FLAGS.max_sentence_length) x = np.array(list(vocab_processor.fit_transform(x_text))) print("Text Vocabulary Size: {:d}".format(len( vocab_processor.vocabulary_))) print("x = {0}".format(x.shape)) print("y = {0}".format(y.shape)) print("") # Randomly shuffle data to split into train and test(dev) np.random.seed(10) shuffle_indices = np.random.permutation(np.arange(len(y))) x_shuffled = x[shuffle_indices] y_shuffled = y[shuffle_indices] # Split train/test set # TODO: This is very crude, should use cross-validation dev_sample_index = -1 * int(FLAGS.dev_sample_percentage * float(len(y))) x_train, x_dev = x_shuffled[:dev_sample_index], x_shuffled[ dev_sample_index:] y_train, y_dev = y_shuffled[:dev_sample_index], y_shuffled[ dev_sample_index:] print("Train/Dev split: {:d}/{:d}\n".format(len(y_train), len(y_dev))) with tf.Graph().as_default(): session_conf = tf.ConfigProto( allow_soft_placement=FLAGS.allow_soft_placement, log_device_placement=FLAGS.log_device_placement) session_conf.gpu_options.allow_growth = FLAGS.gpu_allow_growth sess = tf.Session(config=session_conf) with sess.as_default(): model = AttLSTM(sequence_length=x_train.shape[1], num_classes=y_train.shape[1], vocab_size=len(vocab_processor.vocabulary_), embedding_size=FLAGS.embedding_dim, hidden_size=FLAGS.hidden_size, l2_reg_lambda=FLAGS.l2_reg_lambda) # Define Training procedure global_step = tf.Variable(0, name="global_step", trainable=False) optimizer = tf.train.AdadeltaOptimizer(FLAGS.learning_rate, FLAGS.decay_rate, 1e-6) gvs = optimizer.compute_gradients(model.loss) capped_gvs = [(tf.clip_by_value(grad, -1.0, 1.0), var) for grad, var in gvs] train_op = optimizer.apply_gradients(capped_gvs, global_step=global_step) # Output directory for models and summaries timestamp = str(int(time.time())) out_dir = os.path.abspath( os.path.join(os.path.curdir, "runs", timestamp)) print("Writing to {}\n".format(out_dir)) # Summaries for loss and accuracy loss_summary = tf.summary.scalar("loss", model.loss) acc_summary = tf.summary.scalar("accuracy", model.accuracy) # Train Summaries train_summary_op = tf.summary.merge([loss_summary, acc_summary]) train_summary_dir = os.path.join(out_dir, "summaries", "train") train_summary_writer = tf.summary.FileWriter( train_summary_dir, sess.graph) # Dev summaries dev_summary_op = tf.summary.merge([loss_summary, acc_summary]) dev_summary_dir = os.path.join(out_dir, "summaries", "dev") dev_summary_writer = tf.summary.FileWriter(dev_summary_dir, sess.graph) # Checkpoint directory. 
Tensorflow assumes this directory already exists so we need to create it checkpoint_dir = os.path.abspath( os.path.join(out_dir, "checkpoints")) checkpoint_prefix = os.path.join(checkpoint_dir, "model") if not os.path.exists(checkpoint_dir): os.makedirs(checkpoint_dir) saver = tf.train.Saver(tf.global_variables(), max_to_keep=FLAGS.num_checkpoints) # Write vocabulary vocab_processor.save(os.path.join(out_dir, "vocab")) # Initialize all variables sess.run(tf.global_variables_initializer()) # Pre-trained word2vec if FLAGS.embedding_path: pretrain_W = utils.load_glove(FLAGS.embedding_path, FLAGS.embedding_dim, vocab_processor) sess.run(model.W_text.assign(pretrain_W)) print("Success to load pre-trained word2vec model!\n") # Generate batches batches = data_helpers.batch_iter(list(zip(x_train, y_train)), FLAGS.batch_size, FLAGS.num_epochs) # Training loop. For each batch... best_f1 = 0.0 # For save checkpoint(model) for batch in batches: x_batch, y_batch = zip(*batch) # Train feed_dict = { model.input_text: x_batch, model.input_y: y_batch, model.emb_dropout_keep_prob: FLAGS.emb_dropout_keep_prob, model.rnn_dropout_keep_prob: FLAGS.rnn_dropout_keep_prob, model.dropout_keep_prob: FLAGS.dropout_keep_prob } _, step, summaries, loss, accuracy = sess.run([ train_op, global_step, train_summary_op, model.loss, model.accuracy ], feed_dict) train_summary_writer.add_summary(summaries, step) # Training log display if step % FLAGS.display_every == 0: time_str = datetime.datetime.now().isoformat() print("{}: step {}, loss {:g}, acc {:g}".format( time_str, step, loss, accuracy)) # Evaluation if step % FLAGS.evaluate_every == 0: print("\nEvaluation:") feed_dict = { model.input_text: x_dev, model.input_y: y_dev, model.emb_dropout_keep_prob: 1.0, model.rnn_dropout_keep_prob: 1.0, model.dropout_keep_prob: 1.0 } summaries, loss, accuracy, predictions = sess.run([ dev_summary_op, model.loss, model.accuracy, model.predictions ], feed_dict) dev_summary_writer.add_summary(summaries, step) time_str = datetime.datetime.now().isoformat() f1 = f1_score(np.argmax(y_dev, axis=1), predictions, labels=np.array(range(1, 19)), average="macro") print("{}: step {}, loss {:g}, acc {:g}".format( time_str, step, loss, accuracy)) print( "[UNOFFICIAL] (2*9+1)-Way Macro-Average F1 Score (excluding Other): {:g}\n" .format(f1)) # Model checkpoint if best_f1 < f1: best_f1 = f1 path = saver.save(sess, checkpoint_prefix + "-{:.3g}".format(best_f1), global_step=step) print("Saved model checkpoint to {}\n".format(path))
## necessary imports
import tensorflow as tf
from utils import spacy_cleaner
from utils import load_glove
from utils import build_vocab
from utils import check_coverage
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## loading the dataset
train = pd.read_csv("sentiment_analysis/train.csv")
test = pd.read_csv("sentiment_analysis/test.csv")

## loading the GloVe vectors
embedding_index = load_glove('sentiment_analysis/glove.6B.100d.txt')

## building the vocabulary
train['clean_text'] = [spacy_cleaner(t) for t in train.tweet]
sentences = train['clean_text'].map(lambda z: z.split())
vocab_step1 = build_vocab(sentences)

## checking the coverage
oov = check_coverage(vocab_step1, embedding_index)

## inspection: the top-20 out-of-vocabulary words, which are then modified
## so that the pre-trained word embeddings are used as fully as possible
print(oov[:20])
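Nearly every snippet in this collection relies on some `load_glove` helper, with a signature that varies from file to file. For the plain-text GloVe file used above (one token followed by its vector per line), a minimal sketch of such a loader might be (a hypothetical reading, not the actual utility from `utils`):

import numpy as np

def load_glove(path):
    """Parse a GloVe text file into a {word: vector} dict (hypothetical sketch)."""
    embedding_index = {}
    with open(path, encoding='utf-8') as f:
        for line in f:
            parts = line.rstrip().split(' ')
            word, vector = parts[0], np.asarray(parts[1:], dtype='float32')
            embedding_index[word] = vector
    return embedding_index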
def train(): vocab = read_vocab(FLAGS.vocab_data) glove = load_glove("data/glove.6B.{}d.txt".format(FLAGS.emb_size), FLAGS.emb_size, vocab) train = Dataset(filepath=FLAGS.train_data) valid = Dataset(filepath=FLAGS.valid_data) with tf.Graph().as_default(): session_conf = tf.ConfigProto( allow_soft_placement=FLAGS.allow_soft_placement, log_device_placement=FLAGS.log_device_placement) sess = tf.Session(config=session_conf) with sess.as_default(): dmn = DyMemNet(hid_size=FLAGS.hid_size, vocab_size=len(vocab), emb_size=FLAGS.emb_size, num_classes=FLAGS.num_classes, num_hops=FLAGS.num_hops, pretrained_embs=glove, l2_reg_lambda=FLAGS.l2_reg_lambda) # Define training procedure global_step = tf.Variable(0, name="global_step", trainable=False) optimizer = tf.train.AdamOptimizer(FLAGS.learning_rate) grads_and_vars = optimizer.compute_gradients(dmn.loss) capped_grads_and_vars = [ (tf.clip_by_norm(grad, FLAGS.max_grad_norm), var) for grad, var in grads_and_vars ] train_op = optimizer.apply_gradients(capped_grads_and_vars, global_step=global_step) acc, acc_op = tf.metrics.accuracy(labels=dmn.labels, predictions=dmn.predictions, name="metrics/acc") metrics_vars = tf.get_collection(tf.GraphKeys.LOCAL_VARIABLES, scope="metrics") metrics_init_op = tf.variables_initializer(var_list=metrics_vars) # Output directory for models and summaries timestamp = str(int(time.time())) out_dir = os.path.abspath( os.path.join(os.path.curdir, "runs", timestamp)) print("writing to {}\n".format(out_dir)) # Summaries for loss and accuracy loss_summary = tf.summary.scalar("loss", dmn.loss) acc_summary = tf.summary.scalar("accuracy", dmn.accuracy) # Train summaries train_summary_op = tf.summary.merge([loss_summary, acc_summary]) train_summary_dir = os.path.join(out_dir, "summaries", "train") train_summary_writer = tf.summary.FileWriter( train_summary_dir, sess.graph) # Valid summaries valid_step = 0 valid_summary_op = tf.summary.merge([loss_summary, acc_summary]) valid_summary_dir = os.path.join(out_dir, "summaries", "valid") valid_summary_writer = tf.summary.FileWriter( valid_summary_dir, sess.graph) # Checkpoint directory. 
Tensorflow assumes this directory already exists so we need to create it checkpoint_dir = os.path.abspath( os.path.join(out_dir, "checkpoints")) checkpoint_prefix = os.path.join(checkpoint_dir, "model") if not os.path.exists(checkpoint_dir): os.makedirs(checkpoint_dir) saver = tf.train.Saver(tf.global_variables(), max_to_keep=FLAGS.num_checkpoints) # initialize all variables best_valid_acc = 0.0 sess.run(tf.global_variables_initializer()) sess.run(tf.local_variables_initializer()) # training and validating loop for epoch in range(FLAGS.num_epochs): print('-' * 100) print('\n{}> epoch: {}\n'.format( datetime.datetime.now().isoformat(), epoch)) sess.run(metrics_init_op) # Training process for batch in train.bacth_iter(FLAGS.batch_size, desc="Training", shuffle=True): labels, contexts, queries = zip(*batch) contexts, num_sents = normalize(contexts) feed_dict = { dmn.context_placeholder: contexts, dmn.query_placeholder: queries, dmn.num_sents: num_sents, dmn.labels: labels, dmn.dropout_keep_prob: FLAGS.dropout_keep_prob } _, step, summaries, loss, accuracy, _ = sess.run([ train_op, global_step, train_summary_op, dmn.loss, dmn.accuracy, acc_op ], feed_dict) train_summary_writer.add_summary(summaries, step) print("\ntraining accuracy = {:.2f}\n".format( sess.run(acc) * 100)) sess.run(metrics_init_op) # Validating process for batch in valid.bacth_iter(FLAGS.batch_size, desc="Validating", shuffle=False): valid_step += 1 labels, contexts, queries = zip(*batch) contexts, num_sents = normalize(contexts) feed_dict = { dmn.context_placeholder: contexts, dmn.query_placeholder: queries, dmn.num_sents: num_sents, dmn.labels: labels, dmn.dropout_keep_prob: 1.0 } summaries, loss, accuracy, _ = sess.run( [valid_summary_op, dmn.loss, dmn.accuracy, acc_op], feed_dict) valid_summary_writer.add_summary(summaries, global_step=valid_step) valid_acc = sess.run(acc) * 100 print("\nvalidating accuracy = {:.2f}\n".format(valid_acc)) print("previous best validating accuracy = {:.2f}\n".format( best_valid_acc)) # model checkpoint if valid_acc > best_valid_acc: best_valid_acc = valid_acc path = saver.save(sess, checkpoint_prefix) print("saved model checkpoint to {}\n".format(path)) print("{} optimization finished!\n".format( datetime.datetime.now())) print("best validating accuracy = {:.2f}\n".format(best_valid_acc))
def main(): print("Loading wordvecs...") if utils.exists("glove", "glove.840B.300d.txt", "gutenberg"): words, wordvecs = utils.load_glove("glove", "glove.840B.300d.txt", "gutenberg") else: words, wordvecs = utils.load_glove("glove", "glove.840B.300d.txt", "gutenberg", set(map(clean_word, gutenberg.words()))) wordvecs_norm = wordvecs / np.linalg.norm(wordvecs, axis=1).reshape(-1, 1) print("Loading corpus...") # Convert corpus into normed wordvecs, replacing any words not in vocab with zero vector sentences = [[wordvecs_norm[words[clean_word(word)]] if clean_word(word) in words.keys() else np.zeros(WORD_DIM) for word in sentence] for sentence in gutenberg.sents()] print("Processing corpus...") # Pad sentences shorter than SEQUENCE_LENGTH with zero vectors and truncate sentences longer than SEQUENCE_LENGTH s_train = list(map(pad_or_truncate, sentences)) np.random.shuffle(s_train) # Truncate to multiple of BATCH_SIZE s_train = s_train[:int(len(s_train) / BATCH_SIZE) * BATCH_SIZE] s_train_idxs = np.arange(len(s_train)) print("Generating graph...") network = NlpGan(learning_rate=LEARNING_RATE, d_dim_state=D_DIM_STATE, g_dim_state=G_DIM_STATE, dim_in=WORD_DIM, sequence_length=SEQUENCE_LENGTH) plotter = Plotter([2, 1], "Loss", "Accuracy") plotter.plot(0, 0, 0, 0) plotter.plot(0, 0, 0, 1) plotter.plot(0, 0, 1, 0) plotter.plot(0, 1, 1, 0) #d_vars = [var for var in tf.trainable_variables() if 'discriminator' in var.name] saver = tf.train.Saver() with tf.Session() as sess: #eval(sess, network, words, wordvecs_norm, saver) sess.run(tf.global_variables_initializer()) #resume(sess, saver, plotter, "GAN_9_SEQUENCELENGTH_10", 59) d_loss, g_loss = 0.0, 0.0 for epoch in range(0, 10000000): print("Epoch %d" % epoch) np.random.shuffle(s_train_idxs) for batch in range(int(len(s_train_idxs) / BATCH_SIZE)): # select next random batch of sentences s_batch_real = [s_train[x] for x in s_train_idxs[batch:batch + BATCH_SIZE]] # shape (BATCH_SIZE, SEQUENCE_LENGTH, WORD_DIM) # reshape to (SEQUENCE_LENGTH, BATCH_SIZE, WORD_DIM) while preserving sentence order s_batch_real = np.array(s_batch_real).swapaxes(0, 1) if d_loss - g_loss > MAX_LOSS_DIFF and False: output_dict = sess.run( network.get_fetch_dict('d_loss', 'd_train', 'g_loss'), network.get_feed_dict(inputs=s_batch_real, input_dropout=D_KEEP_PROB) ) elif g_loss - d_loss > MAX_LOSS_DIFF and False: output_dict = sess.run( network.get_fetch_dict('d_loss', 'g_loss', 'g_train'), network.get_feed_dict(inputs=s_batch_real, input_dropout=D_KEEP_PROB) ) else: output_dict = sess.run( network.get_fetch_dict('d_loss', 'd_train', 'g_loss', 'g_train'), network.get_feed_dict(inputs=s_batch_real, input_dropout=D_KEEP_PROB, instance_variance=INSTANCE_VARIANCE) ) d_loss, g_loss = output_dict['d_loss'], output_dict['g_loss'] if batch % 10 == 0: print("Finished training batch %d / %d" % (batch, int(len(s_train) / BATCH_SIZE))) print("Discriminator Loss: %f" % output_dict['d_loss']) print("Generator Loss: %f" % output_dict['g_loss']) plotter.plot(epoch + (batch / int(len(s_train) / BATCH_SIZE)), d_loss, 0, 0) plotter.plot(epoch + (batch / int(len(s_train) / BATCH_SIZE)), g_loss, 0, 1) if batch % 100 == 0: eval = sess.run( network.get_fetch_dict('g_outputs', 'd_accuracy'), network.get_feed_dict(inputs=s_batch_real, input_dropout=1.0, instance_variance=INSTANCE_VARIANCE) ) # reshape g_outputs to (BATCH_SIZE, SEQUENCE_LENGTH, WORD_DIM) while preserving sentence order generated = eval['g_outputs'].swapaxes(0, 1) for sentence in generated[:3]: for wordvec in sentence: norm = 
np.linalg.norm(wordvec) word, similarity = nearest_neighbor(words, wordvecs_norm, wordvec / norm) print("{}({:4.2f})".format(word, similarity)) print('\n---------') print("Total Accuracy: %f" % eval['d_accuracy']) plotter.plot(epoch + (batch / int(len(s_train) / BATCH_SIZE)), eval['d_accuracy'], 1, 0) saver.save(sess, './checkpoints/{}.ckpt'.format(SAVE_NAME), global_step=epoch) plotter.save(SAVE_NAME)
def main(restore=False): word_embedding = None word_embedding_file = 'squad/word_embedding.pkl' if u.check_file(word_embedding_file): word_embedding = utils.load_file(word_embedding_file) else: print("==> Load vectors ...") vocabs = None ft_vc = 'squad/vocabs_fine_tuned.pkl' if u.check_file(ft_vc): vocabs = u.load_file(ft_vc) else: raise ValueError("Please check vocabs fine-tuned file") word2vec = utils.load_glove() word_embedding = np.zeros((len(vocabs), p.embed_size)) for index, v in enumerate(vocabs): if v in word2vec: word_embedding[index] = word2vec[v] else: word_embedding[index] = pr.create_vector( v, word2vec, p.embed_size) del word2vec utils.save_file('squad/word_embedding.pkl', word_embedding) print("==> Done vectors ") # init word embedding contexts, contexts_len, questions, questions_len, answers, answers_len, start, end = utils.load_file( 'squad/doc_train_idx.pkl') data_len = int(np.floor(0.9 * len(contexts))) train = contexts[:data_len], contexts_len[:data_len], questions[:data_len], \ questions_len[:data_len], answers[:data_len], answers_len[:data_len], \ start[:data_len], end[:data_len] dev = contexts[data_len:], contexts_len[data_len:], questions[data_len:], \ questions_len[data_len:], answers[data_len:], answers_len[data_len:], \ start[data_len:], end[data_len:] config = Config() config.strong_supervision = True model = ModelSquad(config) model.set_data(train, dev, word_embedding, np.shape(questions)[1], np.shape(contexts)[1], np.shape(answers)[1], len(word_embedding)) model.set_encoding() model.init_ops() # tf.reset_default_graph() print('Start training DMN on squad') # model.init_data_node() best_overall_val_loss = float('inf') # create model tconfig = tf.ConfigProto(allow_soft_placement=True) with tf.device('/%s' % p.device): print('==> initializing variables') init = tf.global_variables_initializer() saver = tf.train.Saver() with tf.Session(config=tconfig) as session: sum_dir = 'summaries/train_squad/' + time.strftime("%Y-%m-%d %H %M") if not utils.check_file(sum_dir): os.makedirs(sum_dir) train_writer = tf.summary.FileWriter(sum_dir, session.graph) session.run(init) if restore: print('==> restoring weights') saver.restore(session, 'weights/squad.weights') print('==> starting test') start = time.time() valid_loss, valid_accuracy = model.run_epoch(session, model.valid) print('Validation loss: {}'.format(valid_loss)) print('Validation accuracy: {}'.format(valid_accuracy)) print('Total time: {}'.format(time.time() - start))
def _define_global(glove_file):
    global glove
    glove = load_glove(glove_file, verbose=1)
def make_data(): """data pre-processing""" global SD # load data print('loading data: Multi-Domain Sentiment Dataset v2') texts, s_labels, d_labels = load_mdsd(domains=DOMAINS) # build vocabulary for words print('building vocabulary') texts_tokens = [] lens = [] for text in texts: words = word_tokenize(text) for idx, word in enumerate(words): if word.isdigit(): words[idx] = '<NUM>' # replace number token with <NUM> texts_tokens.append(words) lens.append(len(words)) maxlen = int(np.percentile(lens, 95)) print('maxlen:', maxlen) counter = Counter() for words in texts_tokens: counter.update(words) word2index = {'<PAD>': 0, '<UNK>': 1} for idx, word_count in enumerate(counter.most_common(SD.max_words)): if word_count[1] >= SD.min_count: # min_count word2index[word_count[ 0]] = idx + 2 # starting from 2, 0 used as <PAD>, 1 used as <OOV> n_words = len(word2index) print('n_words:', n_words) # data encode print('data encoding') seqs = [] for words in texts_tokens: seqs.append([word2index.get(word, 1) for word in words]) seqs_padded = pad_sequences(seqs, maxlen=maxlen, padding='post', truncating='post') s_labels = np.asarray(s_labels, dtype=int) d_labels = np.asarray(d_labels, dtype=int) # domain & train/val/test split print('labeled data: domain & train/val/test splitting') X_train, ys_train, yd_train = [], [], [] X_val, ys_val, yd_val = [], [], [] X_test_byd, ys_test_byd, yd_test_byd = {}, {}, {} for d_id, d_name in enumerate(DOMAINS): print(d_name, 'splitting') seqs_padded_ofd = seqs_padded[(d_labels == d_id) & (s_labels != -1)] slabels_ofd = s_labels[(d_labels == d_id) & (s_labels != -1)] print(' * all:', seqs_padded_ofd.shape, slabels_ofd.shape) (X_train_ofd, X_val_ofd, X_test_ofd), (y_train_ofd, y_val_ofd, y_test_ofd) = _tvt_split(seqs_padded_ofd, slabels_ofd) # train data (add this domain) X_train.extend(X_train_ofd) ys_train.extend(y_train_ofd) yd_train.extend([d_id] * len(X_train_ofd)) # val data X_val.extend(X_val_ofd) ys_val.extend(y_val_ofd) yd_val.extend([d_id] * len(X_val_ofd)) # test data X_test_byd[d_id] = X_test_ofd ys_test_byd[d_id] = to_categorical(y_test_ofd, num_classes=2) yd_test_byd[d_id] = to_categorical([d_id] * len(X_test_ofd), num_classes=len(DOMAINS)) X_train = np.asarray(X_train, dtype='int') ys_train = to_categorical(ys_train, num_classes=2) yd_train = to_categorical(yd_train, num_classes=len(DOMAINS)) X_val = np.asarray(X_val, dtype='int') ys_val = to_categorical(ys_val, num_classes=2) yd_val = to_categorical(yd_val, num_classes=len(DOMAINS)) # combine test data from different domains X_test = np.concatenate([X_test_byd[idx] for idx in range(len(DOMAINS))]) ys_test = np.concatenate([ys_test_byd[idx] for idx in range(len(DOMAINS))]) yd_test = np.concatenate([yd_test_byd[idx] for idx in range(len(DOMAINS))]) # shuffle train data indices = list(range(len(X_train))) np.random.shuffle(indices) X_train = X_train[indices] ys_train = ys_train[indices] yd_train = yd_train[indices] print('combined labeled data:') print(' - train:', X_train.shape, ys_train.shape, yd_train.shape) print(' - val:', X_val.shape, ys_val.shape, yd_val.shape) print(' - test:', X_test.shape, ys_test.shape, yd_test.shape) for d_id, d_name in enumerate(DOMAINS): print(' - test for {}:'.format(d_name[:3]), X_test_byd[d_id].shape, ys_test_byd[d_id].shape, yd_test_byd[d_id].shape) # embeddings print('loading word embeddings from glove') embeddings = load_glove(embedding_dim=SD.embed_dim, desired=word2index.keys(), corpus_size=SD.glove_corpus) print('processing embedding matrix') embedding_mat = 
get_embedding_mat(embeddings, word2index, SD.embed_dim, idx_from=2) SD.wv_weights = [embedding_mat] # inject data into SharedData for other functions SD.maxlen = maxlen SD.n_words = n_words SD.word2index = word2index SD.X_train, SD.ys_train, SD.yd_train = X_train, ys_train, yd_train SD.X_val, SD.ys_val, SD.yd_val = X_val, ys_val, yd_val SD.X_test, SD.ys_test, SD.yd_test = X_test, ys_test, yd_test SD.X_test_byd, SD.ys_test_byd, SD.yd_test_byd = X_test_byd, ys_test_byd, yd_test_byd
def mi_bert_amr(args, uncontext=False): # load data & embeddings s_train = load_data('amr_dataset', 'train') s_dev = load_data('amr_dataset', 'dev') s_test = load_data('amr_dataset', 'test') amr_s = s_train + s_dev + s_test print(amr_s[45672], amr_s[599]) graph_emb, bert_emb_paths = get_embeddings(args, amr_s) # bert_emb_paths = load_elmos(args, amr_s, dataset='amr') s_num = len(graph_emb) if uncontext: bert_emb = load_glove(args, amr_s, dataset='amr') # bert_emb = load_elmo(args, amr_s, dataset='amr') else: bert_emb = np.load(bert_emb_paths[0], allow_pickle=True) print('2.1 start to calculate baselines of MI...') # initialize mi mir, mig, mib = [], [], [] for l in range(args.bert_layers_num): mib.append([]) if args.baselines: print('3.1 start to calculate baselines of MI...') # calculate MI baselines for r in range(args.repeat): tmp_mir = mine_probe(args, graph_emb, bert_emb, s_num, 'lower') tmp_mig = mine_probe(args, graph_emb, bert_emb, s_num, 'upper') # get sum value if len(mir) == 0: mir = tmp_mir else: mir = [mir[s]+tmp_mir[s] for s in range(len(tmp_mir))] if len(mig) == 0: mig = tmp_mig else: mig = [mig[s]+tmp_mig[s] for s in range(len(tmp_mig))] print('2.2 start to calculate BERT hidden states of MI...') # calculate MI of BERT if uncontext: for r in range(args.repeat): tmp_mib = mine_probe(args, graph_emb, bert_emb, s_num, args.bert_layers_num-1) if len(mib[-1]) == 0: mib[-1] = tmp_mib else: mib[-1] = [mib[-1][s]+tmp_mib[s] for s in range(len(tmp_mib))] mib_layers = sum(mib[-1]) / (len(mib[-1]) * args.repeat) print('MI(G, Glove): {} |'.format(mib_layers)) else: for l in range(args.bert_layers_num): bert_emb = np.load(bert_emb_paths[l], allow_pickle=True) for r in range(args.repeat): tmp_mib = mine_probe(args, graph_emb, bert_emb, s_num, l) if len(mib[l]) == 0: mib[l] = tmp_mib else: mib[l] = [mib[l][s]+tmp_mib[s] for s in range(len(tmp_mib))] # compute average values for all results mir = [mi/args.repeat for mi in mir] mig = [mi/args.repeat for mi in mig] for l in range(args.bert_layers_num): mib[l] = [mi/args.repeat for mi in mib[l]] # print general results results = {'lower:': mir, 'upper': mig, 'bert': mib} print('\n', results, '\n') mib_layers = [sum(mib[l])/len(mib[l]) for l in range(len(mib)) if len(mib)] print('MI(G, R): {} | MI(G, G): {}| MI(G, BERT): {} |'.format(sum( mir)/len(mir), sum(mig)/len(mig), mib_layers)) return
def train(): word_dict = load_vocab(FLAGS.vocab_data) glove = load_glove("../glove.6B.{}d.txt".format(FLAGS.embedding_size), FLAGS.embedding_size, word_dict) train = Dataset(filepath=FLAGS.train_data, num_class=FLAGS.num_class, sequence_length=FLAGS.sequence_length) valid = Dataset(filepath=FLAGS.valid_data, num_class=FLAGS.num_class, sequence_length=FLAGS.sequence_length) with tf.Graph().as_default(): session_conf = tf.compat.v1.ConfigProto( allow_soft_placement=FLAGS.allow_soft_placement, log_device_placement=FLAGS.log_device_placement) sess = tf.compat.v1.Session(config=session_conf) with sess.as_default(): rcnn = TextRCNN(vocab_size=len(word_dict), embedding_size=FLAGS.embedding_size, sequence_length=FLAGS.sequence_length, num_class=FLAGS.num_class, cell_type=FLAGS.cell_type, hidden_size=FLAGS.hidden_size, pretrained_embeddings=glove, l2_reg_lambda=FLAGS.l2_reg_lambda) # Define training procedure global_step = tf.compat.v1.Variable(0, name="global_step", trainable=False) train_op = tf.compat.v1.train.AdamOptimizer( FLAGS.learning_rate).minimize(rcnn.loss, global_step=global_step) acc, acc_op = tf.compat.v1.metrics.accuracy( labels=rcnn.labels, predictions=rcnn.predictions, name="metrics/acc") metrics_vars = tf.compat.v1.get_collection( tf.compat.v1.GraphKeys.LOCAL_VARIABLES, scope="metrics") metrics_init_op = tf.compat.v1.variables_initializer( var_list=metrics_vars) # Output directory for models and summaries timestamp = str(int(time.time())) out_dir = os.path.abspath( os.path.join(os.path.curdir, "runs", timestamp)) print("writing to {}\n".format(out_dir)) # Summaries for loss and accuracy loss_summary = tf.compat.v1.summary.scalar("loss", rcnn.loss) acc_summary = tf.compat.v1.summary.scalar("accuracy", rcnn.accuracy) # Train summaries train_summary_op = tf.compat.v1.summary.merge( [loss_summary, acc_summary]) train_summary_dir = os.path.join(out_dir, "summaries", "train") train_summary_writer = tf.compat.v1.summary.FileWriter( train_summary_dir, sess.graph) # Valid summaries valid_step = 0 valid_summary_op = tf.compat.v1.summary.merge( [loss_summary, acc_summary]) valid_summary_dir = os.path.join(out_dir, "summaries", "valid") valid_summary_writer = tf.compat.v1.summary.FileWriter( valid_summary_dir, sess.graph) # Checkpoint directory. 
Tensorflow assumes this directory already exists so we need to create it checkpoint_dir = os.path.abspath( os.path.join(out_dir, "checkpoints")) checkpoint_prefix = os.path.join(checkpoint_dir, "model") if not os.path.exists(checkpoint_dir): os.makedirs(checkpoint_dir) saver = tf.compat.v1.train.Saver(tf.compat.v1.global_variables(), max_to_keep=FLAGS.num_checkpoints) # initialize all variables best_valid_acc = 0.0 sess.run(tf.compat.v1.global_variables_initializer()) sess.run(tf.compat.v1.local_variables_initializer()) # training and validating loop for epoch in range(FLAGS.num_epoch): print('-' * 100) print('\n{}> epoch: {}\n'.format( datetime.datetime.now().isoformat(), epoch)) sess.run(metrics_init_op) # Training process for batch in train.bacth_iter(FLAGS.batch_size, desc="Training", shuffle=True): labels, docs = zip(*batch) padded_docs, _, _ = vectorize(docs, FLAGS.sequence_length) feed_dict = { rcnn.inputs: padded_docs, rcnn.labels: labels, rcnn.dropout_keep_prob: FLAGS.dropout_keep_prob } _, step, summaries, loss, accuracy, _ = sess.run([ train_op, global_step, train_summary_op, rcnn.loss, rcnn.accuracy, acc_op ], feed_dict) train_summary_writer.add_summary(summaries, step) print("\ntraining accuracy = {:.2f}\n".format( sess.run(acc) * 100)) sess.run(metrics_init_op) # Validating process for batch in valid.bacth_iter(FLAGS.batch_size, desc="Validating", shuffle=False): valid_step += 1 labels, docs = zip(*batch) padded_docs, _, _ = vectorize(docs, FLAGS.sequence_length) feed_dict = { rcnn.inputs: padded_docs, rcnn.labels: labels, rcnn.dropout_keep_prob: 1.0 } summaries, loss, accuracy, _ = sess.run( [valid_summary_op, rcnn.loss, rcnn.accuracy, acc_op], feed_dict) valid_summary_writer.add_summary(summaries, global_step=valid_step) valid_acc = sess.run(acc) * 100 print("\nvalidating accuracy = {:.2f}\n".format(valid_acc)) # model checkpoint if valid_acc > best_valid_acc: best_valid_acc = valid_acc print("current best validating accuracy = {:.2f}\n".format( best_valid_acc)) path = saver.save(sess, checkpoint_prefix) print("saved model checkpoint to {}\n".format(path)) print("{} optimization finished!\n".format( datetime.datetime.now())) print("best validating accuracy = {:.2f}\n".format(best_valid_acc))
def mi_bert_ptb(args, npeet=False, uncontext=False): # load data s_train, p_train = load_data('penn_treebank_dataset', 'train') s_dev, p_dev = load_data('penn_treebank_dataset', 'dev') s_test, p_test = load_data('penn_treebank_dataset', 'test') sentences = s_train + s_dev + s_test parsed = p_train + p_dev + p_test doc_id, sen_id, global_graph = construct_graph(parsed) s_train, p_train, s_dev, p_dev, s_test, p_test = [], [], [], [], [], [] # load embeddings graph_emb = graph_embeddings(args, global_graph, doc_id, sen_id) if uncontext: bert_emb = load_glove(args, sentences) # bert_emb = load_elmo(args, sentences) else: bert_emb_paths = bert_embeddings(args, sentences) # bert_emb_paths = load_elmos(args, sentences) bert_emb = np.load(bert_emb_paths[0], allow_pickle=True) # initialize mi mir, mig, mib = [], [], [] for l in range(args.bert_layers_num): mib.append([]) for s in range(len(sentences)): mir.append(0.) mig.append(0.) for l in range(args.bert_layers_num): mib[l].append(0.) if args.baselines: print('3.1 start to calculate baselines of MI...') # calculate MI baselines for r in range(args.repeat): tmp_mir = mine_probe(args, graph_emb, bert_emb, len(sentences), 'lower') tmp_mig = mine_probe(args, graph_emb, bert_emb, len(sentences), 'upper') # get sum value mir = [mir[s]+tmp_mir[s] for s in range(len(tmp_mir))] mig = [mig[s]+tmp_mig[s] for s in range(len(tmp_mig))] print('3.2 start to calculate BERT hidden states of MI...') if uncontext: for r in range(args.repeat): tmp_mib = mine_probe(args, graph_emb, bert_emb, len(sentences), args.bert_layers_num - 1) mib[-1] = [mib[-1][s]+tmp_mib[s] for s in range(len(tmp_mib))] mib_layers = sum(mib[-1]) / (len(mib[-1]) * args.repeat) print('MI(G, Glove): {} |'.format(mib_layers)) else: # calculate MI of BERT for l in range(args.bert_layers_num): bert_emb = np.load(bert_emb_paths[l], allow_pickle=True) for r in range(args.repeat): tmp_mib = mine_probe(args, graph_emb, bert_emb, len(sentences), l) mib[l] = [mib[l][s]+tmp_mib[s] for s in range(len(tmp_mib))] # compute average values for all results mir = [mi/args.repeat for mi in mir] mig = [mi/args.repeat for mi in mig] for l in range(args.bert_layers_num): mib[l] = [mi/args.repeat for mi in mib[l]] mib_layers = [sum(mib[l])/len(mib[l]) for l in range(len(mib))] # print general results results = {'lower:': mir, 'upper': mig, 'bert': mib} # print('\n', results, '\n') print('MI(G, R): {} | MI(G, G): {}| MI(G, BERT): {} |'.format(sum( mir)/len(mir), sum(mig)/len(mig), mib_layers)) return
def main(_): vocab = read_vocab('data/yelp-2015-w2i.pkl') glove_embs = load_glove('glove.6B.{}d.txt'.format(FLAGS.emb_size), FLAGS.emb_size, vocab) data_reader = DataReader(train_file='data/yelp-2015-train.pkl', dev_file='data/yelp-2015-dev.pkl', test_file='data/yelp-2015-test.pkl') config = tf.ConfigProto(allow_soft_placement=FLAGS.allow_soft_placement) with tf.Session(config=config) as sess: model = Model(cell_dim=FLAGS.cell_dim, att_dim=FLAGS.att_dim, vocab_size=len(vocab), emb_size=FLAGS.emb_size, num_classes=FLAGS.num_classes, dropout_rate=FLAGS.dropout_rate, pretrained_embs=glove_embs) loss = loss_fn(model.labels, model.logits) train_op, global_step = train_fn(loss) batch_acc, total_acc, acc_update, metrics_init = eval_fn( model.labels, model.logits) summary_op = tf.summary.merge_all() sess.run(tf.global_variables_initializer()) train_writer.add_graph(sess.graph) saver = tf.train.Saver(max_to_keep=FLAGS.num_checkpoints) print('\n{}> Start training'.format(datetime.now())) epoch = 0 valid_step = 0 test_step = 0 train_test_prop = len(data_reader.train_data) / len( data_reader.test_data) test_batch_size = int(FLAGS.batch_size / train_test_prop) best_acc = float('-inf') while epoch < FLAGS.num_epochs: epoch += 1 print('\n{}> Epoch: {}'.format(datetime.now(), epoch)) sess.run(metrics_init) for batch_docs, batch_labels in data_reader.read_train_set( FLAGS.batch_size, shuffle=True): _step, _, _loss, _acc, _ = sess.run( [global_step, train_op, loss, batch_acc, acc_update], feed_dict=model.get_feed_dict(batch_docs, batch_labels, training=True)) if _step % FLAGS.display_step == 0: _summary = sess.run(summary_op, feed_dict=model.get_feed_dict( batch_docs, batch_labels)) train_writer.add_summary(_summary, global_step=_step) print('Training accuracy = {:.2f}'.format( sess.run(total_acc) * 100)) sess.run(metrics_init) for batch_docs, batch_labels in data_reader.read_valid_set( test_batch_size): _loss, _acc, _ = sess.run([loss, batch_acc, acc_update], feed_dict=model.get_feed_dict( batch_docs, batch_labels)) valid_step += 1 if valid_step % FLAGS.display_step == 0: _summary = sess.run(summary_op, feed_dict=model.get_feed_dict( batch_docs, batch_labels)) valid_writer.add_summary(_summary, global_step=valid_step) print('Validation accuracy = {:.2f}'.format( sess.run(total_acc) * 100)) sess.run(metrics_init) for batch_docs, batch_labels in data_reader.read_test_set( test_batch_size): _loss, _acc, _ = sess.run([loss, batch_acc, acc_update], feed_dict=model.get_feed_dict( batch_docs, batch_labels)) test_step += 1 if test_step % FLAGS.display_step == 0: _summary = sess.run(summary_op, feed_dict=model.get_feed_dict( batch_docs, batch_labels)) test_writer.add_summary(_summary, global_step=test_step) test_acc = sess.run(total_acc) * 100 print('Testing accuracy = {:.2f}'.format(test_acc)) if test_acc > best_acc: best_acc = test_acc saver.save(sess, FLAGS.checkpoint_dir) print('Best testing accuracy = {:.2f}'.format(test_acc)) print("{} Optimization Finished!".format(datetime.now())) print('Best testing accuracy = {:.2f}'.format(best_acc))
def main(restore=False): word_embedding = None word_embedding_file = 'squad/word_embedding.pkl' if u.check_file(word_embedding_file): word_embedding = utils.load_file(word_embedding_file) else: print("==> Load vectors ...") vocabs = None ft_vc = 'squad/vocabs_fine_tuned.pkl' if u.check_file(ft_vc): vocabs = u.load_file(ft_vc) else: raise ValueError("Please check vocabs fine-tuned file") word2vec = utils.load_glove() word_embedding = np.zeros((len(vocabs), p.embed_size)) for index, v in enumerate(vocabs): if v in word2vec: word_embedding[index] = word2vec[v] else: word_embedding[index] = pr.create_vector( v, word2vec, p.embed_size) del word2vec utils.save_file('squad/word_embedding.pkl', word_embedding) print("==> Done vectors ") # init word embedding contexts, contexts_len, questions, questions_len, answers, answers_len, start, end = utils.load_file( 'squad/doc_train_idx%s.pkl' % p.doc_suffix) data_len = int(np.floor(0.9 * len(contexts))) train = contexts[:data_len], contexts_len[:data_len], questions[:data_len], \ questions_len[:data_len], start[:data_len], end[:data_len] dev = contexts[data_len:], contexts_len[data_len:], questions[data_len:], \ questions_len[data_len:], start[data_len:], end[data_len:] config = Config() # config.strong_supervision = True model = SquadSkim(config) model.set_data(train, dev, word_embedding, np.shape(contexts)[1], np.shape(questions)[1]) # model.set_encoding() model.init_ops() # tf.reset_default_graph() print('Start training DMN on squad') # model.init_data_node() best_overall_val_loss = float('inf') # create model tconfig = tf.ConfigProto(allow_soft_placement=True) with tf.device('/%s' % p.device): print('==> initializing variables') init = tf.global_variables_initializer() saver = tf.train.Saver() with tf.Session(config=tconfig) as session: sum_dir = 'summaries/train_squad/' + time.strftime("%Y-%m-%d %H %M") if not utils.check_file(sum_dir): os.makedirs(sum_dir) train_writer = tf.summary.FileWriter(sum_dir, session.graph) session.run(init) best_val_epoch = 0 prev_epoch_loss = float('inf') best_val_loss = float('inf') best_val_accuracy = 0.0 if restore: print('==> restoring weights') saver.restore(session, 'weights/squad.weights') print('==> starting training') for epoch in range(config.max_epochs): print('Epoch {}'.format(epoch)) start = time.time() train_loss, train_accuracy = model.run_epoch( session, model.train, epoch, train_writer, train_op=model.train_step, train=True) valid_loss, valid_accuracy = model.run_epoch(session, model.valid) print('Training loss: {}'.format(train_loss)) print('Validation loss: {}'.format(valid_loss)) print('Training accuracy: {}'.format(train_accuracy)) print('Validation accuracy: {}'.format(valid_accuracy)) if valid_loss < best_val_loss: best_val_loss = valid_loss best_val_epoch = epoch if best_val_loss < best_overall_val_loss: print('Saving weights') best_overall_val_loss = best_val_loss saver.save(session, 'weights/squad.weights') # anneal if train_loss > prev_epoch_loss * model.config.anneal_threshold: model.config.lr /= model.config.anneal_by print('annealed lr to %f' % model.config.lr) if best_val_accuracy < valid_accuracy: best_val_accuracy = valid_accuracy prev_epoch_loss = train_loss if epoch - best_val_epoch > config.early_stopping: break print('Total time: {}'.format(time.time() - start)) print('Best validation accuracy:', best_val_accuracy)
def main(_): # load the word_to_index encoded vocabulary vocab = read_vocab(FLAGS.vocab) # create embedding matrix of size (vocab,emb_size) glove_embs = load_glove(FLAGS.embedding_file, FLAGS.emb_size, vocab) print('input embeddings shape: ', glove_embs.shape) # read data data_reader = DataReader(train_file=FLAGS.train_data_file, dev_file=FLAGS.dev_data_file, test_file=FLAGS.test_data_file, num_classes=FLAGS.num_classes) config = tf.ConfigProto(allow_soft_placement=FLAGS.allow_soft_placement) tf.reset_default_graph() sess = tf.Session(config=config) model = Model(cell_dim=FLAGS.cell_dim, att_dim=FLAGS.att_dim, vocab_size=len(vocab), emb_size=FLAGS.emb_size, num_classes=FLAGS.num_classes, dropout_rate=FLAGS.dropout_rate, pretrained_embs=glove_embs) # calculate loss loss = loss_fn(model.labels, model.logits) total_loss, loss_update = tf.metrics.mean(loss, name='metrics/losss') # calculates gradients train_op, global_step = train_fn(loss) # calculates metrics and merges all batch_acc, total_acc, acc_update, metrics_init = eval_fn( model.labels, model.logits) summary_op = tf.summary.merge_all() summary_total = tf.summary.merge([ tf.summary.scalar('total_batch_accuracy', total_acc), tf.summary.scalar("total_batch_loss", total_loss) ]) sess.run(tf.global_variables_initializer()) # The graph described by sess.graph will be displayed by TensorBoard train_writer.add_graph(sess.graph) # save all variables saver = tf.train.Saver(max_to_keep=FLAGS.num_checkpoints) print('\n{}> Start training'.format(datetime.now())) epoch = 0 valid_step = 0 test_step = 0 train_test_prop = len(data_reader.train_data) / len(data_reader.test_data) test_batch_size = int(FLAGS.batch_size / train_test_prop) best_acc = float('-inf') while epoch < FLAGS.num_epochs: epoch += 1 print('\n{}> Epoch: {}'.format(datetime.now(), epoch)) # we newly initialize metrics tensors each epoch, each evaluation sess.run(metrics_init) # each data point/doc in batch contains a list of sentences, encoded with index for batch_docs, batch_labels in data_reader.read_train_set( FLAGS.batch_size, shuffle=True): # do a batch _step, _, _loss, _acc = sess.run( [global_step, train_op, loss, batch_acc], feed_dict=model.get_feed_dict(batch_docs, batch_labels, training=True)) # each display_step steps evaluate metric variables and add to train_writer, training is false to disables dropout if _step % FLAGS.display_step == 0: # _summary, _loss_save, _acc_save, _, _ = sess.run( [summary_op, loss, batch_acc, acc_update, loss_update], feed_dict=model.get_feed_dict(batch_docs, batch_labels)) train_writer.add_summary(_summary, global_step=_step) last_step_epoch = _step # evaluate avg batch metrics total_acc_train, total_loss_train, summary_total_train = sess.run( [total_acc, total_loss, summary_total]) train_writer.add_summary(summary_total_train, global_step=last_step_epoch) mlflow.log_metrics( { 'avg_batch_accuracy': total_acc_train, 'avg_batch_loss': total_loss_train }, step=last_step_epoch) print('Avg training accuracy = {:.2f}'.format(total_acc_train)) print('Avg training loss = {:.2f}'.format(total_loss_train)) # we newly initialize metrics tensors each epoch, each evaluation sess.run(metrics_init) # for each epoch calculate metrics for valid set for batch_docs, batch_labels in data_reader.read_valid_set( test_batch_size): _loss, _acc, _, _ = sess.run( [loss, batch_acc, acc_update, loss_update], feed_dict=model.get_feed_dict(batch_docs, batch_labels)) total_acc_valid, total_loss_valid, summary_total_valid = sess.run( [total_acc, total_loss, summary_total]) 
        # the summary just computed for the validation pass goes to the valid writer
        valid_writer.add_summary(summary_total_valid, global_step=last_step_epoch)
        mlflow.log_metrics(
            {
                'avg_valid_accuracy': total_acc_valid,
                'avg_valid_loss': total_loss_valid
            }, step=last_step_epoch)
        print('Avg validation accuracy = {:.2f}'.format(total_acc_valid))
        print('Avg validation loss = {:.2f}'.format(total_loss_valid))

        # we newly initialize metrics tensors each epoch, each evaluation
        sess.run(metrics_init)
        # for each epoch calculate metrics for test set
        for batch_docs, batch_labels in data_reader.read_test_set(test_batch_size):
            _loss, _acc, _, _ = sess.run(
                [loss, batch_acc, acc_update, loss_update],
                feed_dict=model.get_feed_dict(batch_docs, batch_labels))
        total_acc_test, total_loss_test, summary_total_test = sess.run(
            [total_acc, total_loss, summary_total])
        test_writer.add_summary(summary_total_test, global_step=last_step_epoch)
        mlflow.log_metrics(
            {
                'avg_test_accuracy': total_acc_test,
                'avg_test_loss': total_loss_test
            }, step=last_step_epoch)
        print('Avg test accuracy = {:.2f}'.format(total_acc_test))
        print('Avg test loss = {:.2f}'.format(total_loss_test))

        # keep track of best test accuracy; if this epoch improved, save all variables
        if total_acc_test > best_acc:
            best_acc = total_acc_test
            saver.save(sess, FLAGS.checkpoint_dir)
            print('Best testing accuracy = {:.2f}'.format(total_acc_test))

    print("{} Optimization Finished!".format(datetime.now()))
    print('Best testing accuracy = {:.2f}'.format(best_acc))
maxlen = 50
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(texts)
word_index = tokenizer.word_index
sequences = tokenizer.texts_to_sequences(texts)
sequences = pad_sequences(sequences, maxlen=maxlen)

tokenizer_tag = Tokenizer()
tokenizer_tag.fit_on_texts(tags)
tags = tokenizer_tag.texts_to_sequences(tags)
tags = np.array(list(map(lambda x: x[0], tags)))
tags = to_categorical(tags)

# load embedding
emb_matrix = load_glove(word_index)

# Get test/problem/treatment matrix: info_matrix
# info_matrix: (m, 3, maxlen), one-hot indicating the entity property of each token
targets = ['test_info', 'problem_info', 'treatment_info']
info_matrix = np.zeros((sequences.shape[0], 3, maxlen))
for i, target in enumerate(targets):
    for k, j in train_df[target].str.extract(r'(\d+)\|(\d+)').iterrows():
        if not pd.isnull(j[0]):
            info_matrix[k, i, int(j[0]) - 1:int(j[1])] = 1

# Shuffle the data
np.random.seed(2019)
index = np.random.permutation(len(sequences))
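To make the span encoding concrete: a cell like '3|5' in train_df['problem_info'] means tokens 3 through 5 (1-based, end inclusive) belong to a problem entity, so positions 2 to 4 of that channel are set to 1. A small self-contained check of the indexing, using hypothetical data rather than the real dataframe:

import numpy as np
import pandas as pd

maxlen = 10
df = pd.DataFrame({'problem_info': ['3|5', None]})  # hypothetical spans for two rows
info_row = np.zeros((len(df), maxlen))
for k, j in df['problem_info'].str.extract(r'(\d+)\|(\d+)').iterrows():
    if not pd.isnull(j[0]):
        info_row[k, int(j[0]) - 1:int(j[1])] = 1
print(info_row[0])  # -> [0. 0. 1. 1. 1. 0. 0. 0. 0. 0.]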
train_dataset = entity_linking_v3(train_part, t)
valid_dataset = entity_linking_v3(valid_part, t)
batch_size = 1

# prepare the embedding data
embedding_file = 'embedding/miniembedding_baike_link.npy'
# embedding_file = 'embedding/miniembedding_engineer_qq_att.npy'
if os.path.exists(embedding_file):
    embedding_matrix = np.load(embedding_file)
else:
    # embedding = '/home/zhukaihua/Desktop/nlp/embedding/baike'
    embedding = '/home/zhu/Desktop/word_embedding/sgns.baidubaike.bigram-char'
    # embedding = '/home/zhukaihua/Desktop/nlp/embedding/Tencent_AILab_ChineseEmbedding.txt'
    embedding_matrix = load_glove(embedding, t.num_words + 100, t)
    np.save(embedding_file, embedding_matrix)

train_batch_size = 1
valid_batch_size = 1
model = EntityLink_v3(vocab_size=embedding_matrix.shape[0],
                      encoder_size=128,
                      dropout=0.5,
                      init_embedding=embedding_matrix)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
train_dataloader = DataLoader(train_dataset,
                              collate_fn=collate_fn_linking_v3,
from model_bi_attention import SentenceSelector
import options
import pdb

options = options.CoqaOptions()
torch.cuda.set_device(0)
device = torch.device('cuda:{}'.format(options.gpu))

print("Reading data pickles")
train_data = utils.unpickler(options.data_pkl_path, options.train_pkl_name)
dev_data = utils.unpickler(options.data_pkl_path, options.dev_pkl_name)
glove = utils.load_glove(options.data_pkl_path, options.glove_store)
# pdb.set_trace()

print("Building model")
model = SentenceSelector(options, glove, device)
model.to(device)
print("===============================")
print("Model:")
print(model)
print("===============================")
criterion = nn.CrossEntropyLoss()
def train():
    with tf.device('/cpu:0'):
        train_text, train_y, train_pos1, train_pos2, train_x_text_clean, train_sentence_len = \
            data_helpers.load_data_and_labels(FLAGS.train_path)
    with tf.device('/cpu:0'):
        test_text, test_y, test_pos1, test_pos2, test_x_text_clean, test_sentence_len = \
            data_helpers.load_data_and_labels(FLAGS.test_path)

    # Build vocabulary
    # Example: x_text[3] = "A misty <e1>ridge</e1> uprises from the <e2>surge</e2>."
    #   ['a misty ridge uprises from the surge <UNK> <UNK> ... <UNK>']
    #   => [27 39 40 41 42 1 43 0 0 ... 0]
    #   dimension = FLAGS.max_sentence_length
    vocab_processor = tf.contrib.learn.preprocessing.VocabularyProcessor(
        FLAGS.max_sentence_length)
    vocab_processor.fit(train_text + test_text)
    train_x = np.array(list(vocab_processor.transform(train_text)))
    test_x = np.array(list(vocab_processor.transform(test_text)))
    train_text = np.array(train_text)
    print("train_text", train_text[0:2])
    test_text = np.array(test_text)
    print("\nText Vocabulary Size: {:d}".format(len(vocab_processor.vocabulary_)))
    print("train_x = {0}".format(train_x.shape))  # (8000, 90)
    print("train_y = {0}".format(train_y.shape))  # (8000, 19)
    print("test_x = {0}".format(test_x.shape))    # (2717, 90)
    print("test_y = {0}".format(test_y.shape))    # (2717, 19)

    # Example: pos1[3] = [-2 -1 0 1 2 3 4 999 999 999 ... 999]
    #                    [95 96 97 98 99 100 101 999 999 999 ... 999]
    #   => [11 12 13 14 15 16 21 17 17 17 ... 17]
    #   dimension = MAX_SENTENCE_LENGTH
    pos_vocab_processor = tf.contrib.learn.preprocessing.VocabularyProcessor(
        FLAGS.max_sentence_length)
    pos_vocab_processor.fit(train_pos1 + train_pos2 + test_pos1 + test_pos2)
    train_p1 = np.array(list(pos_vocab_processor.transform(train_pos1)))
    train_p2 = np.array(list(pos_vocab_processor.transform(train_pos2)))
    test_p1 = np.array(list(pos_vocab_processor.transform(test_pos1)))
    test_p2 = np.array(list(pos_vocab_processor.transform(test_pos2)))
    print("\nPosition Vocabulary Size: {:d}".format(len(pos_vocab_processor.vocabulary_)))
    print("train_p1 = {0}".format(train_p1.shape))  # (8000, 90)
    print("test_p1 = {0}".format(test_p1.shape))    # (2717, 90)
    print("")

    # No random shuffle or train/dev split is performed here; the provided
    # train/test files are used as-is.
    # TODO: This is very crude; cross-validation would be better.

    with tf.Graph().as_default():
        session_conf = tf.ConfigProto(
            allow_soft_placement=FLAGS.allow_soft_placement,
            log_device_placement=FLAGS.log_device_placement)
        session_conf.gpu_options.allow_growth = FLAGS.gpu_allow_growth
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            cnn = TextCNN(
                sequence_length=FLAGS.max_sentence_length,                    # 90
                num_classes=train_y.shape[1],                                 # 19
                text_vocab_size=len(vocab_processor.vocabulary_),             # 19151
                text_embedding_size=FLAGS.text_embedding_size,                # 300
                pos_vocab_size=len(pos_vocab_processor.vocabulary_),          # 162
                pos_embedding_size=FLAGS.pos_embedding_dim,                   # 50
                filter_sizes=list(map(int, FLAGS.filter_sizes.split(","))),   # 2,3,4,5
                num_filters=FLAGS.num_filters,                                # 128
                l2_reg_lambda=FLAGS.l2_reg_lambda,                            # 1e-5
                use_elmo=(FLAGS.embeddings == 'elmo'))

            # Define training procedure
            global_step = tf.Variable(0, name="global_step", trainable=False)
            optimizer = tf.train.AdadeltaOptimizer(FLAGS.learning_rate, FLAGS.decay_rate, 1e-6)
            gvs = optimizer.compute_gradients(cnn.loss)
            capped_gvs = [(tf.clip_by_value(grad, -1.0, 1.0), var) for grad, var in gvs]
            train_op = optimizer.apply_gradients(capped_gvs, global_step=global_step)

            # Output directory for models and summaries
            timestamp = str(int(time.time()))
            out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp))
            print("\nWriting to {}\n".format(out_dir))

            # Logger
            logger = Logger(out_dir)

            # Summaries for loss and accuracy
            loss_summary = tf.summary.scalar("loss", cnn.loss)
            acc_summary = tf.summary.scalar("accuracy", cnn.accuracy)

            # Train summaries
            train_summary_op = tf.summary.merge([loss_summary, acc_summary])
            train_summary_dir = os.path.join(out_dir, "summaries", "train")
            train_summary_writer = tf.summary.FileWriter(train_summary_dir, sess.graph)

            # Checkpoint directory. TensorFlow assumes this directory already exists,
            # so we need to create it.
            checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints"))
            checkpoint_prefix = os.path.join(checkpoint_dir, "model")
            if not os.path.exists(checkpoint_dir):
                os.makedirs(checkpoint_dir)
            saver = tf.train.Saver(tf.global_variables(), max_to_keep=FLAGS.num_checkpoints)

            # Write vocabularies
            vocab_processor.save(os.path.join(out_dir, "vocab"))
            pos_vocab_processor.save(os.path.join(out_dir, "pos_vocab"))

            # Initialize all variables
            sess.run(tf.global_variables_initializer())

            # Load pre-trained word embeddings into the text embedding matrix
            if FLAGS.embeddings == "word2vec":
                pretrain_W = utils.load_word2vec('resource/GoogleNews-vectors-negative300.bin',
                                                 FLAGS.embedding_size, vocab_processor)
                sess.run(cnn.W_text.assign(pretrain_W))
                print("Successfully loaded the pre-trained word2vec model!\n")
            elif FLAGS.embeddings == "glove100":
                pretrain_W = utils.load_glove('resource/glove.6B.100d.txt',
                                              FLAGS.embedding_size, vocab_processor)
                sess.run(cnn.W_text.assign(pretrain_W))
                print("Successfully loaded the pre-trained glove100 model!\n")
            elif FLAGS.embeddings == "glove300":
                pretrain_W = utils.load_glove('resource/glove.840B.300d.txt',
                                              FLAGS.embedding_size, vocab_processor)
                sess.run(cnn.W_text.assign(pretrain_W))
                print("Successfully loaded the pre-trained glove300 model!\n")

            # Generate batches
            train_batches = data_helpers.batch_iter(
                list(zip(train_x, train_y, train_text, train_p1, train_p2)),
                FLAGS.batch_size, FLAGS.num_epochs)

            # Training loop. For each batch...
            best_f1 = 0.0  # Best F1 seen so far; used to decide when to save a checkpoint
            for train_batch in train_batches:
                train_bx, train_by, train_btxt, train_bp1, train_bp2 = zip(*train_batch)
                feed_dict = {
                    cnn.input_text: train_bx,
                    cnn.input_y: train_by,
                    cnn.input_x_text: list(train_btxt),
                    cnn.input_p1: train_bp1,
                    cnn.input_p2: train_bp2,
                    cnn.dropout_keep_prob: FLAGS.dropout_keep_prob
                }
                _, step, summaries, loss, accuracy = sess.run(
                    [train_op, global_step, train_summary_op, cnn.loss, cnn.accuracy],
                    feed_dict)
                train_summary_writer.add_summary(summaries, step)

                # Training log display
                if step % FLAGS.display_every == 0:
                    logger.logging_train(step, loss, accuracy)

                # Evaluation
                if step % FLAGS.evaluate_every == 0:
                    print("\nEvaluation:")
                    # Generate batches
                    test_batches = data_helpers.batch_iter(
                        list(zip(test_x, test_y, test_text, test_p1, test_p2)),
                        FLAGS.batch_size, 1, shuffle=False)
                    # Evaluation loop. For each batch...
                    losses = 0.0
                    accuracy = 0.0
                    predictions = []
                    iter_cnt = 0
                    for test_batch in test_batches:
                        test_bx, test_by, test_btxt, test_bp1, test_bp2 = zip(*test_batch)
                        feed_dict = {
                            cnn.input_text: test_bx,
                            cnn.input_y: test_by,
                            cnn.input_x_text: list(test_btxt),
                            cnn.input_p1: test_bp1,
                            cnn.input_p2: test_bp2,
                            cnn.dropout_keep_prob: 1.0
                        }
                        loss, acc, pred = sess.run(
                            [cnn.loss, cnn.accuracy, cnn.predictions], feed_dict)
                        losses += loss
                        accuracy += acc
                        predictions += pred.tolist()
                        iter_cnt += 1
                    losses /= iter_cnt
                    accuracy /= iter_cnt
                    predictions = np.array(predictions, dtype='int')
                    logger.logging_eval(step, losses, accuracy, predictions)

                    # Model checkpoint
                    if best_f1 < logger.best_f1:
                        best_f1 = logger.best_f1
                        path = saver.save(sess,
                                          checkpoint_prefix + "-{:.3g}".format(best_f1),
                                          global_step=step)
                        print("Saved model checkpoint to {}\n".format(path))
if __name__ == '__main__':
    DATA_ROOT = os.path.join(os.path.dirname(__file__), os.environ["data_dir"])
    EMBEDDING_PATH = os.path.join(os.path.dirname(__file__), os.environ["glove_dir"])
    train_path = os.path.join(DATA_ROOT, 'train.txt')
    valid_path = os.path.join(DATA_ROOT, 'valid.txt')
    test_path = os.path.join(DATA_ROOT, 'test.txt')

    print('Loading data...')
    x_train, y_train = load_data_and_labels(train_path)
    x_valid, y_valid = load_data_and_labels(valid_path)
    x_test, y_test = load_data_and_labels(test_path)
    print(len(x_train), 'train sequences')
    print(len(x_valid), 'valid sequences')
    print(len(x_test), 'test sequences')

    # Use pre-trained word embeddings
    embeddings = load_glove(EMBEDDING_PATH)

    model = Sequence(cell_type=os.environ['cell_type'],
                     embeddings=embeddings,
                     initial_vocab=embeddings.keys())
    # print(model.trainable_weights)
    model.fit(x_train, y_train, x_valid, y_valid, epochs=30)

    print('Testing the model...')
    print(model.score(x_test, y_test))
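# Since embeddings.keys() is passed as the initial vocabulary, load_glove here is
# expected to return a word-to-vector mapping. A minimal sketch of such a loader,
# assuming EMBEDDING_PATH points at a standard whitespace-separated GloVe .txt file;
# this is an illustrative stand-in, not the project's actual implementation.
import numpy as np

def load_glove_sketch(path):
    """Hypothetical loader: returns {word: np.ndarray} from a GloVe text file."""
    vectors = {}
    with open(path, encoding='utf-8') as f:
        for line in f:
            parts = line.rstrip().split(' ')
            vectors[parts[0]] = np.asarray(parts[1:], dtype='float32')
    return vectors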
def _define_global(glove_file):
    global glove6b300d
    glove6b300d = load_glove(glove_file, verbose=0)
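# Brief usage sketch: _define_global caches the GloVe table in a module-level global
# so later calls in the module can reuse it without reloading. The file name below is
# illustrative, and the .get() lookup assumes load_glove returns a dict-like mapping.
_define_global('glove.6B.300d.txt')        # hypothetical call site
vector = glove6b300d.get('language')       # look up a cached word vector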
def train():
    with tf.device('/cpu:0'):
        train_text, train_y, train_e1, train_e2, train_pos1, train_pos2, \
            train_rw, train_rw_pos, train_rw_cate = data_helpers.load_data_and_labels(FLAGS.train_path)
    with tf.device('/cpu:0'):
        test_text, test_y, test_e1, test_e2, test_pos1, test_pos2, \
            test_rw, test_rw_pos, test_rw_cate = data_helpers.load_data_and_labels(FLAGS.test_path)

    # words = data_helpers.relation_words([train_between_e, test_between_e])
    # train_relation_words_between_entity = data_helpers.relation_words_between_entity(train_between_e, words)
    # test_relation_words_between_entity = data_helpers.relation_words_between_entity(test_between_e, words)

    # Build vocabulary
    # Example: x_text[3] = "A misty <e1>ridge</e1> uprises from the <e2>surge</e2>."
    #   ['a misty ridge uprises from the surge <UNK> <UNK> ... <UNK>']
    #   => [27 39 40 41 42 1 43 0 0 ... 0]
    #   dimension = MAX_SENTENCE_LENGTH
    vocab_processor = tf.contrib.learn.preprocessing.VocabularyProcessor(
        FLAGS.max_sentence_length)
    vocab_processor.fit(train_text + test_text)
    train_x = np.array(list(vocab_processor.transform(train_text)))
    test_x = np.array(list(vocab_processor.transform(test_text)))
    train_text = np.array(train_text)
    test_text = np.array(test_text)
    print("\nText Vocabulary Size: {:d}".format(len(vocab_processor.vocabulary_)))
    print("train_x = {0}".format(train_x.shape))
    print("train_y = {0}".format(train_y.shape))
    print("test_x = {0}".format(test_x.shape))
    print("test_y = {0}".format(test_y.shape))

    # Vocabulary for relation words (max length 6)
    vocab_processor2 = tf.contrib.learn.preprocessing.VocabularyProcessor(6)
    vocab_processor2.fit(train_rw + test_rw)
    train_rw_x = np.array(list(vocab_processor2.transform(train_rw)))
    test_rw_x = np.array(list(vocab_processor2.transform(test_rw)))
    train_rw_text = np.array(train_rw)
    test_rw_text = np.array(test_rw)

    # Vocabulary for relation-word POS tags (max length 6)
    vocab_processor2pos = tf.contrib.learn.preprocessing.VocabularyProcessor(6)
    vocab_processor2pos.fit(train_rw_pos + test_rw_pos)
    train_rw_pos_x = np.array(list(vocab_processor2pos.transform(train_rw_pos)))
    test_rw_pos_x = np.array(list(vocab_processor2pos.transform(test_rw_pos)))
    train_rw_pos_text = np.array(train_rw_pos)
    test_rw_pos_text = np.array(test_rw_pos)

    # Example: pos1[3] = [-2 -1 0 1 2 3 4 999 999 999 ... 999]
    #                    [95 96 97 98 99 100 101 999 999 999 ... 999]
    #   => [11 12 13 14 15 16 21 17 17 17 ... 17]
    #   dimension = MAX_SENTENCE_LENGTH
    pos_vocab_processor = tf.contrib.learn.preprocessing.VocabularyProcessor(
        FLAGS.max_sentence_length)
    pos_vocab_processor.fit(train_pos1 + train_pos2 + test_pos1 + test_pos2)
    train_p1 = np.array(list(pos_vocab_processor.transform(train_pos1)))
    train_p2 = np.array(list(pos_vocab_processor.transform(train_pos2)))
    test_p1 = np.array(list(pos_vocab_processor.transform(test_pos1)))
    test_p2 = np.array(list(pos_vocab_processor.transform(test_pos2)))
    print("\nPosition Vocabulary Size: {:d}".format(len(pos_vocab_processor.vocabulary_)))
    print("train_p1 = {0}".format(train_p1.shape))
    print("test_p1 = {0}".format(test_p1.shape))
    print("")

    with tf.Graph().as_default():
        session_conf = tf.ConfigProto(
            allow_soft_placement=FLAGS.allow_soft_placement,
            log_device_placement=FLAGS.log_device_placement)
        session_conf.gpu_options.allow_growth = FLAGS.gpu_allow_growth
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            model = EntityAttentionLSTM(
                sequence_length=train_x.shape[1],
                rw_length=6,
                num_classes=train_y.shape[1],
                vocab_size=len(vocab_processor.vocabulary_),
                rw_vocab_size=len(vocab_processor2.vocabulary_),
                rw_pos_vocab_size=len(vocab_processor2pos.vocabulary_),
                embedding_size=FLAGS.embedding_size,
                pos_vocab_size=len(pos_vocab_processor.vocabulary_),
                pos_embedding_size=FLAGS.pos_embedding_size,
                hidden_size=FLAGS.hidden_size,
                num_heads=FLAGS.num_heads,
                attention_size=FLAGS.attention_size,
                use_elmo=(FLAGS.embeddings == 'elmo'),
                l2_reg_lambda=FLAGS.l2_reg_lambda)

            # Define training procedure
            global_step = tf.Variable(0, name="global_step", trainable=False)
            optimizer = tf.train.AdadeltaOptimizer(FLAGS.learning_rate, FLAGS.decay_rate, 1e-6)
            gvs = optimizer.compute_gradients(model.loss)
            capped_gvs = [(tf.clip_by_value(grad, -1.0, 1.0), var) for grad, var in gvs]
            train_op = optimizer.apply_gradients(capped_gvs, global_step=global_step)

            # Output directory for models and summaries
            timestamp = str(int(time.time()))
            out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp))
            print("\nWriting to {}\n".format(out_dir))

            # Logger
            logger = Logger(out_dir)

            # Summaries for loss and accuracy
            loss_summary = tf.summary.scalar("loss", model.loss)
            acc_summary = tf.summary.scalar("accuracy", model.accuracy)

            # Train summaries
            train_summary_op = tf.summary.merge([loss_summary, acc_summary])
            train_summary_dir = os.path.join(out_dir, "summaries", "train")
            train_summary_writer = tf.summary.FileWriter(train_summary_dir, sess.graph)

            # Checkpoint directory. TensorFlow assumes this directory already exists,
            # so we need to create it.
            checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints"))
            checkpoint_prefix = os.path.join(checkpoint_dir, "model")
            if not os.path.exists(checkpoint_dir):
                os.makedirs(checkpoint_dir)
            saver = tf.train.Saver(tf.global_variables(), max_to_keep=FLAGS.num_checkpoints)

            # Write vocabularies
            vocab_processor.save(os.path.join(out_dir, "vocab"))
            vocab_processor2.save(os.path.join(out_dir, "rw_vocab"))
            vocab_processor2pos.save(os.path.join(out_dir, "rw_pos_vocab"))
            pos_vocab_processor.save(os.path.join(out_dir, "pos_vocab"))

            # Initialize all variables
            sess.run(tf.global_variables_initializer())

            # Load pre-trained word embeddings into the embedding matrices
            if FLAGS.embeddings == "word2vec":
                pretrain_W = utils.load_word2vec('resource/GoogleNews-vectors-negative300.bin',
                                                 FLAGS.embedding_size, vocab_processor)
                sess.run(model.W_text.assign(pretrain_W))
                print("Successfully loaded the pre-trained word2vec model!\n")
            elif FLAGS.embeddings == "glove100":
                pretrain_W = utils.load_glove('resource/glove.6B.100d.txt',
                                              FLAGS.embedding_size, vocab_processor)
                sess.run(model.W_text.assign(pretrain_W))
                print("Successfully loaded the pre-trained glove100 model!\n")
            elif FLAGS.embeddings == "glove300":
                pretrain_W = utils.load_glove('resource/glove.840B.300d.txt',
                                              FLAGS.embedding_size, vocab_processor)
                pretrain_rw_W = utils.load_glove('resource/glove.840B.300d.txt',
                                                 FLAGS.embedding_size, vocab_processor2)
                sess.run(model.W_text.assign(pretrain_W))
                sess.run(model.W_rw_text.assign(pretrain_rw_W))
                print("Successfully loaded the pre-trained glove300 model!\n")

            # Generate batches
            train_batches = data_helpers.batch_iter(
                list(zip(train_x, train_y, train_text, train_e1, train_e2,
                         train_p1, train_p2, train_rw_x, train_rw_text,
                         train_rw_pos_x, train_rw_pos_text, train_rw_cate)),
                FLAGS.batch_size, FLAGS.num_epochs)

            # Training loop. For each batch...
            best_f1 = 0.0  # Best F1 seen so far; used to decide when to save a checkpoint
            for train_batch in train_batches:
                (train_bx, train_by, train_btxt, train_be1, train_be2,
                 train_bp1, train_bp2, train_brw_x, train_brw_text,
                 train_brw_pos_x, train_brw_pos_text, train_brw_cate) = zip(*train_batch)
                feed_dict = {
                    model.input_x: train_bx,
                    model.input_y: train_by,
                    model.input_text: train_btxt,
                    model.input_e1: train_be1,
                    model.input_e2: train_be2,
                    model.input_p1: train_bp1,
                    model.input_p2: train_bp2,
                    model.input_rw_x: train_brw_x,
                    model.input_rw_text: train_brw_text,
                    model.input_rw_pos_x: train_brw_pos_x,
                    model.input_rw_pos_text: train_brw_pos_text,
                    model.input_rw_cate: train_brw_cate,
                    model.emb_dropout_keep_prob: FLAGS.emb_dropout_keep_prob,
                    model.rnn_dropout_keep_prob: FLAGS.rnn_dropout_keep_prob,
                    model.dropout_keep_prob: FLAGS.dropout_keep_prob
                }
                _, step, summaries, loss, accuracy = sess.run(
                    [train_op, global_step, train_summary_op, model.loss, model.accuracy],
                    feed_dict)
                train_summary_writer.add_summary(summaries, step)

                # Training log display
                if step % FLAGS.display_every == 0:
                    logger.logging_train(step, loss, accuracy)

                # Evaluation
                if step % FLAGS.evaluate_every == 0:
                    print("\nEvaluation:")
                    # Generate batches
                    test_batches = data_helpers.batch_iter(
                        list(zip(test_x, test_y, test_text, test_e1, test_e2,
                                 test_p1, test_p2, test_rw_x, test_rw_text,
                                 test_rw_pos_x, test_rw_pos_text, test_rw_cate)),
                        FLAGS.batch_size, 1, shuffle=False)
                    # Evaluation loop. For each batch...
                    losses = 0.0
                    accuracy = 0.0
                    predictions = []
                    iter_cnt = 0
                    for test_batch in test_batches:
                        (test_bx, test_by, test_btxt, test_be1, test_be2,
                         test_bp1, test_bp2, test_brw_x, test_brw_text,
                         test_brw_pos_x, test_brw_pos_text, test_brw_cate) = zip(*test_batch)
                        feed_dict = {
                            model.input_x: test_bx,
                            model.input_y: test_by,
                            model.input_text: test_btxt,
                            model.input_e1: test_be1,
                            model.input_e2: test_be2,
                            model.input_p1: test_bp1,
                            model.input_p2: test_bp2,
                            model.input_rw_x: test_brw_x,
                            model.input_rw_text: test_brw_text,
                            model.input_rw_pos_x: test_brw_pos_x,
                            model.input_rw_pos_text: test_brw_pos_text,
                            model.input_rw_cate: test_brw_cate,
                            model.emb_dropout_keep_prob: 1.0,
                            model.rnn_dropout_keep_prob: 1.0,
                            model.dropout_keep_prob: 1.0
                        }
                        loss, acc, pred = sess.run(
                            [model.loss, model.accuracy, model.predictions], feed_dict)
                        losses += loss
                        accuracy += acc
                        predictions += pred.tolist()
                        iter_cnt += 1
                    losses /= iter_cnt
                    accuracy /= iter_cnt
                    predictions = np.array(predictions, dtype='int')
                    logger.logging_eval(step, losses, accuracy, predictions)

                    # Model checkpoint
                    if best_f1 < logger.best_f1:
                        best_f1 = logger.best_f1
                        path = saver.save(sess,
                                          checkpoint_prefix + "-{:.3g}".format(best_f1),
                                          global_step=step)
                        print("Saved model checkpoint to {}\n".format(path))
parser.add_argument('--save_every', type=int, default=1, help='save state every x epochs')
parser.add_argument('--prefix', type=str, default="", help='optional prefix of network name')
parser.add_argument('--no-shuffle', dest='shuffle', action='store_false')
parser.add_argument('--babi_test_id', type=int, default=-1, help='babi_id of the test set')
parser.set_defaults(shuffle=True)
args = parser.parse_args()

assert args.word_vector_size in [50, 100, 200, 300]

network_name = args.prefix + '%s.mh%d.n%d.bs%d%s.babi%s' % (
    args.network, args.memory_hops, args.dim, args.batch_size,
    ".na" if args.normalize_attention else "", args.babi_id)

babi_train_raw, babi_test_raw = utils.get_babi_raw(args.babi_id, args.babi_test_id)
word2vec = utils.load_glove(args.word_vector_size)

args_dict = dict(args._get_kwargs())
args_dict['babi_train_raw'] = babi_train_raw
args_dict['babi_test_raw'] = babi_test_raw
args_dict['word2vec'] = word2vec

# Initialize the network class
if args.network == 'dmn_batch':
    import dmn_batch
    dmn = dmn_batch.DMN_batch(**args_dict)
elif args.network == 'dmn_basic':
    import dmn_basic
    if (args.batch_size != 1):
def main(_):
    vocab = read_vocab('data/ICLR_Review_all_with_decision-w2i.pkl')
    glove_embs = load_glove('glove.6B.{}d.txt'.format(FLAGS.emb_size), FLAGS.emb_size, vocab)
    data_reader = DataReader(
        train_file='data/ICLR_Review_all_with_decision-train.pkl',
        dev_file='data/ICLR_Review_all_with_decision-dev.pkl',
        test_file='data/ICLR_Review_all_with_decision-test.pkl')

    config = tf.ConfigProto(allow_soft_placement=FLAGS.allow_soft_placement)
    with tf.Session(config=config) as sess:
        model = Model(cell_dim=FLAGS.cell_dim,
                      att_dim=FLAGS.att_dim,
                      vocab_size=len(vocab),
                      emb_size=FLAGS.emb_size,
                      num_classes=FLAGS.num_classes,
                      dropout_rate=FLAGS.dropout_rate,
                      pretrained_embs=glove_embs)

        loss = loss_fn(model.labels, model.logits)
        train_op, global_step = train_fn(loss)
        batch_acc, total_acc, acc_update, metrics_init, predictions = eval_fn(
            model.labels, model.logits)
        summary_op = tf.summary.merge_all()

        sess.run(tf.global_variables_initializer())
        train_writer.add_graph(sess.graph)
        saver = tf.train.Saver(max_to_keep=FLAGS.num_checkpoints)

        print('\n{}> Start training'.format(datetime.now()))

        result_save_folder = str(datetime.now())
        output_folder = os.path.join('.', 'output')
        create_folder_if_not_exists(output_folder)
        stats_graph_folder = os.path.join(output_folder, result_save_folder)  # Folder where graphs are saved
        create_folder_if_not_exists(stats_graph_folder)

        epoch = 0
        valid_step = 0
        test_step = 0
        train_test_prop = len(data_reader.train_data) / len(data_reader.test_data)
        test_batch_size = int(FLAGS.batch_size / train_test_prop)
        best_acc = float('-inf')

        while epoch < FLAGS.num_epochs:
            epoch += 1
            print('\n{}> Epoch: {}'.format(datetime.now(), epoch))

            # Training pass
            sess.run(metrics_init)
            all_labels = []
            all_y_pred = []
            for batch_docs, batch_labels in data_reader.read_train_set(FLAGS.batch_size, shuffle=True):
                _step, _, _loss, _acc, _, y_pred_batch = sess.run(
                    [global_step, train_op, loss, batch_acc, acc_update, predictions],
                    feed_dict=model.get_feed_dict(batch_docs, batch_labels, training=True))
                all_labels += batch_labels
                all_y_pred += y_pred_batch.tolist()
                if _step % FLAGS.display_step == 0:
                    _summary = sess.run(summary_op,
                                        feed_dict=model.get_feed_dict(batch_docs, batch_labels))
                    train_writer.add_summary(_summary, global_step=_step)
            print('Training accuracy = {:.2f}'.format(sess.run(total_acc) * 100))
            save_results(all_labels, all_y_pred, stats_graph_folder, 'train', epoch)

            # Validation pass
            sess.run(metrics_init)
            all_valid_labels = []
            all_valid_y_pred = []
            for batch_docs, batch_labels in data_reader.read_valid_set(test_batch_size):
                _loss, _acc, _, valid_y_pred_batch = sess.run(
                    [loss, batch_acc, acc_update, predictions],
                    feed_dict=model.get_feed_dict(batch_docs, batch_labels))
                all_valid_labels += batch_labels
                all_valid_y_pred += valid_y_pred_batch.tolist()
                valid_step += 1
                if valid_step % FLAGS.display_step == 0:
                    _summary = sess.run(summary_op,
                                        feed_dict=model.get_feed_dict(batch_docs, batch_labels))
                    valid_writer.add_summary(_summary, global_step=valid_step)
            print('Validation accuracy = {:.2f}'.format(sess.run(total_acc) * 100))
            # save_optimized_presicion(all_valid_labels, all_valid_y_pred, stats_graph_folder, 'valid', epoch)
            # save_distance_measure(all_valid_labels, all_valid_y_pred, stats_graph_folder, 'valid', epoch)
            save_results(all_valid_labels, all_valid_y_pred, stats_graph_folder, 'valid', epoch)

            # Test pass
            sess.run(metrics_init)
            all_test_labels = []
            all_test_y_pred = []
            for batch_docs, batch_labels in data_reader.read_test_set(test_batch_size):
                _loss, _acc, _, test_y_pred_batch = sess.run(
                    [loss, batch_acc, acc_update, predictions],
                    feed_dict=model.get_feed_dict(batch_docs, batch_labels))
                all_test_labels += batch_labels
                all_test_y_pred += test_y_pred_batch.tolist()
                test_step += 1
                if test_step % FLAGS.display_step == 0:
                    _summary = sess.run(summary_op,
                                        feed_dict=model.get_feed_dict(batch_docs, batch_labels))
                    test_writer.add_summary(_summary, global_step=test_step)
            test_acc = sess.run(total_acc) * 100
            print('Testing accuracy = {:.2f}'.format(test_acc))
            # save_optimized_presicion(all_test_labels, all_test_y_pred, stats_graph_folder, 'test', epoch)
            # save_distance_measure(all_test_labels, all_test_y_pred, stats_graph_folder, 'test', epoch)
            save_results(all_test_labels, all_test_y_pred, stats_graph_folder, 'test', epoch)

            if test_acc > best_acc:
                best_acc = test_acc
                saver.save(sess, FLAGS.checkpoint_dir)
                print('Best testing accuracy = {:.2f}'.format(best_acc))

        print("{} Optimization Finished!".format(datetime.now()))
        print('Best testing accuracy = {:.2f}'.format(best_acc))
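# A minimal restore-and-predict sketch for completeness: it reuses the saver, model,
# and data_reader built in main() and loads the best checkpoint saved above from
# FLAGS.checkpoint_dir. How the restored model is consumed downstream is an
# assumption, not part of the original code.
with tf.Session(config=config) as sess:
    saver.restore(sess, FLAGS.checkpoint_dir)   # load the best weights saved during training
    for batch_docs, batch_labels in data_reader.read_test_set(test_batch_size):
        y_pred = sess.run(predictions,
                          feed_dict=model.get_feed_dict(batch_docs, batch_labels))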