def main(_):
    # Do what you need to load datasets from FLAGS.data_dir
    train_data, val_data = load_preprocess_data(
        FLAGS.data_dir, FLAGS.max_context_len, FLAGS.max_question_len)
    embed_path = FLAGS.embed_path or pjoin(
        "data", "squad", "glove.trimmed.{}.npz".format(FLAGS.embedding_size))
    vocab_path = FLAGS.vocab_path or pjoin(FLAGS.data_dir, "vocab.dat")
    vocab, rev_vocab = initialize_vocab(vocab_path)
    embeddings = tf.constant(load_embeddings(embed_path), tf.float32)

    encoder = Encoder(FLAGS.state_size, FLAGS.summary_flag,
                      FLAGS.max_context_len, FLAGS.max_question_len)
    decoder = Decoder(FLAGS.state_size, FLAGS.summary_flag)
    qa = QASystem(encoder, decoder, FLAGS, embeddings, rev_vocab)

    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    file_handler = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt"))
    logging.getLogger().addHandler(file_handler)

    print(vars(FLAGS))
    with open(os.path.join(FLAGS.log_dir, "flags.json"), 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    with tf.Session() as sess:
        load_train_dir = get_normalized_train_dir(FLAGS.train_dir)
        initialize_model(sess, qa, load_train_dir)
        save_train_dir = get_normalized_train_dir(FLAGS.train_dir)
        qa.train(sess, train_data, val_data, save_train_dir)
def main(_):
    # ====== Fill the model name ======
    train_dir = "train/test"
    # =================================
    vocab, rev_vocab = initialize_vocab(FLAGS.vocab_path)
    embed_path = FLAGS.embed_path or pjoin(
        "data", "squad", "glove.trimmed.{}.npz".format(FLAGS.embedding_size))

    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    file_handler = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt"))
    logging.getLogger().addHandler(file_handler)

    print(vars(FLAGS))

    # ========= Load Dataset =========
    train_data, val_data = load_and_preprocess_data(
        FLAGS.data_dir, FLAGS.max_context_len, FLAGS.max_question_len,
        size=FLAGS.train_size)

    # ========= Model-specific =========
    # You must change the following code to adjust to your model
    embed_path = FLAGS.embed_path or pjoin(
        "data", "squad", "glove.trimmed.{}.npz".format(FLAGS.embedding_size))
    vocab_path = FLAGS.vocab_path or pjoin(FLAGS.data_dir, "vocab.dat")
    vocab, rev_vocab = initialize_vocab(vocab_path)
    embedding = tf.constant(load_embeddings(embed_path), dtype=tf.float32)

    encoder = Encoder(FLAGS.state_size, FLAGS.max_context_len,
                      FLAGS.max_question_len, FLAGS.embedding_size,
                      FLAGS.summary_flag, FLAGS.filter_flag)
    decoder = Decoder(FLAGS.state_size, FLAGS.max_context_len,
                      FLAGS.max_question_len, FLAGS.output_size,
                      FLAGS.summary_flag)
    qa = QASystem(encoder, decoder, FLAGS, embedding, rev_vocab)

    with tf.Session() as sess:
        train_dir = get_normalized_train_dir(train_dir)
        qa = initialize_model(sess, qa, train_dir)
        output_list, output_dict = generate_answers(sess, qa, val_data, rev_vocab)
        store_result(output_list, output_dict, train_dir)
def main():
    # read pre-trained embeddings
    embeddings = load_embeddings(embedding_path, 'word2vec')
    test_accus = []  # collect test accuracy for each fold

    for i in xrange(n_folds):
        fold = i + 1
        logging.info('Fold {} of {}...'.format(fold, n_folds))

        # read data
        train_data, train_labels, test_data, test_labels, seq_len, vocab_size = \
            load_data_MR(data_path, fold=fold)

        # update train directory according to fold number
        train_dir = base_train_dir + '/' + str(fold)
        # create train directory if it does not exist
        if not os.path.exists(train_dir):
            os.makedirs(train_dir)
        # create log file handler
        file_handler = logging.FileHandler(pjoin(train_dir, "log.txt"))
        logging.getLogger().addHandler(file_handler)

        # check whether the model has been trained; if not, create a new one
        if os.path.exists(train_dir + '/model.json'):
            # load json and create model
            json_file = open(train_dir + '/model.json', 'r')
            loaded_model_json = json_file.read()
            json_file.close()
            model = model_from_json(loaded_model_json)
            # load weights into the new model
            model.load_weights(train_dir + "/model.h5")
            model.compile(loss={'output': 'binary_crossentropy'},
                          optimizer=Adadelta(lr=base_lr, epsilon=1e-6,
                                             decay=decay_rate),
                          metrics=["accuracy"])
            print("Loaded model from disk!")
        else:
            model = setup_model(embeddings, seq_len, vocab_size)
            print("Created a new model!")

        # train the model
        test_accu = train(model, train_data, train_labels, test_data,
                          test_labels, embeddings, train_dir)

        # log test accuracy result
        logging.info("\nTest Accuracy for fold {}: {}".format(fold, test_accu))
        test_accus.append(test_accu)

    # write log of test accuracy for all folds (folds are 1-based)
    test_accu_log = open(base_train_dir + "/final_test_accuracy.txt", 'w')
    test_accu_log.write('\n'.join([
        'Fold {} Test Accuracy: {}'.format(fold, test_accu)
        for fold, test_accu in enumerate(test_accus, 1)
    ]))
    test_accu_log.write('\nAvg test acc: {}'.format(np.mean(test_accus)))
def main(_):
    vocab, rev_vocab = initialize_vocab(FLAGS.vocab_path)
    embed_path = FLAGS.embed_path or pjoin(
        "data", "squad", "glove.trimmed.{}.npz".format(FLAGS.embedding_size))

    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    file_handler = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt"))
    logging.getLogger().addHandler(file_handler)

    print(vars(FLAGS))
    with open(os.path.join(FLAGS.log_dir, "flags.json"), 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    # ========= Load Dataset =========
    # You can change this code to load dataset in your own way
    dev_dirname = os.path.dirname(os.path.abspath(FLAGS.dev_path))
    dev_filename = os.path.basename(FLAGS.dev_path)
    context_data, question_data, question_uuid_data = prepare_dev(
        dev_dirname, dev_filename, vocab)
    dataset = (context_data, question_data, question_uuid_data)

    # ========= Model-specific =========
    # You must change the following code to adjust to your model
    embed_path = FLAGS.embed_path or pjoin(
        "data", "squad", "glove.trimmed.{}.npz".format(FLAGS.embedding_size))
    vocab_path = FLAGS.vocab_path or pjoin(FLAGS.data_dir, "vocab.dat")
    vocab, rev_vocab = initialize_vocab(vocab_path)
    embedding = tf.constant(load_embeddings(embed_path), dtype=tf.float32)

    encoder = Encoder(FLAGS.state_size, FLAGS.max_context_len,
                      FLAGS.max_question_len, FLAGS.embedding_size,
                      FLAGS.summary_flag, FLAGS.filter_flag)
    decoder = Decoder(FLAGS.state_size, FLAGS.max_context_len,
                      FLAGS.max_question_len, FLAGS.output_size,
                      FLAGS.summary_flag)
    qa = QASystem(encoder, decoder, FLAGS, embedding, rev_vocab)

    with tf.Session() as sess:
        train_dir = get_normalized_train_dir(FLAGS.train_dir)
        initialize_model(sess, qa, train_dir)
        answers = generate_answers(sess, qa, dataset, rev_vocab)

        # write to json file in the root dir
        with io.open('dev-prediction.json', 'w', encoding='utf-8') as f:
            f.write(unicode(json.dumps(answers, ensure_ascii=False)))
def __init__(self, token_vocab: Vocabulary, tag_vocab: Vocabulary,
             embeddings: Dict, encoder: Dict, tag_projection: Dict):
    super(NeuralCrf, self).__init__()
    self._embeddings = load_embeddings(**embeddings, token_vocab=token_vocab)
    self._encoder = load_object_from_dict(encoder)
    self._tag_projection = load_object_from_dict(tag_projection)
    self.token_vocab = token_vocab
    self.tag_vocab = tag_vocab
    self.num_tags = len(self.tag_vocab)
    assert self.num_tags == self._tag_projection.out_features
    self.crf = ConditionalRandomField(self.num_tags)
    self.metrics = {
        'accuracy': Accuracy(),
        'accuracy_per_label': AccuracyPerLabel(self.num_tags, self.tag_vocab),
        'loss': Average()
    }
def __init__(self, token_vocab: Vocabulary, tag_vocab: Vocabulary,
             embeddings: Dict, encoder: Dict, tag_projection: Dict):
    super(SimpleTagger, self).__init__()
    self._embeddings = load_embeddings(**embeddings, token_vocab=token_vocab)
    self._encoder = load_object_from_dict(encoder)
    self._tag_projection = load_object_from_dict(tag_projection)
    self.token_vocab = token_vocab
    self.tag_vocab = tag_vocab
    self.num_tags = len(self.tag_vocab)
    assert self.num_tags == self._tag_projection.out_features
    self.loss = torch.nn.CrossEntropyLoss(
        ignore_index=self.tag_vocab.pad_token_id)
    self.metrics = {
        'accuracy': Accuracy(),
        'accuracy_per_label': AccuracyPerLabel(self.num_tags, self.tag_vocab),
        'loss': Average()
    }
def main(args):
    # Set up logging
    args.save_dir = util.get_save_dir(args.save_dir, args.name, training=False)
    log = util.get_logger(args.save_dir, args.name)
    log.info('Args: {}'.format(dumps(vars(args), indent=4, sort_keys=True)))
    device, gpu_ids = util.get_available_devices()
    args.batch_size *= max(1, len(gpu_ids))

    # Get embeddings
    log.info('Loading embeddings...')
    embeddings = util.load_embeddings(args)

    # Get model
    log.info('Building model...')
    model = (BiDAF(embeddings=embeddings, hidden_size=args.hidden_size)
             if not args.use_slqa
             else SLQA(embeddings=embeddings, hidden_size=args.hidden_size))
    model = nn.DataParallel(model, gpu_ids)
    log.info('Loading checkpoint from {}...'.format(args.load_path))
    model = util.load_model(model, args.load_path, gpu_ids, return_step=False)
    model = model.to(device)
    model.eval()

    # Get data loader
    log.info('Building dataset...')
    record_file = vars(args)['{}_record_file'.format(args.split)]
    dataset = SQuAD(record_file, args.use_squad_v2)
    data_loader = data.DataLoader(dataset,
                                  batch_size=args.batch_size,
                                  shuffle=False,
                                  num_workers=args.num_workers,
                                  collate_fn=collate_fn)

    # Evaluate
    log.info('Evaluating on {} split...'.format(args.split))
    nll_meter = util.AverageMeter()
    pred_dict = {}  # Predictions for TensorBoard
    sub_dict = {}   # Predictions for submission
    eval_file = vars(args)['{}_eval_file'.format(args.split)]
    with open(eval_file, 'r') as fh:
        gold_dict = json_load(fh)
    with torch.no_grad(), \
            tqdm(total=len(dataset)) as progress_bar:
        for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in data_loader:
            # Setup for forward
            cw_idxs = cw_idxs.to(device)
            qw_idxs = qw_idxs.to(device)
            batch_size = cw_idxs.size(0)
            if args.use_char_emb:
                cc_idxs = cc_idxs.to(device)
                qc_idxs = qc_idxs.to(device)
            else:
                cc_idxs = None
                qc_idxs = None

            # Forward
            log_p1, log_p2 = model(cw_idxs, cc_idxs, qw_idxs, qc_idxs)
            y1, y2 = y1.to(device), y2.to(device)
            loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2)
            nll_meter.update(loss.item(), batch_size)

            # Get F1 and EM scores
            p1, p2 = log_p1.exp(), log_p2.exp()
            starts, ends = util.discretize(p1, p2, args.max_ans_len,
                                           args.use_squad_v2)

            # Log info
            progress_bar.update(batch_size)
            if args.split != 'test':
                # No labels for the test set, so NLL would be invalid
                progress_bar.set_postfix(NLL=nll_meter.avg)

            idx2pred, uuid2pred = util.convert_tokens(gold_dict,
                                                      ids.tolist(),
                                                      starts.tolist(),
                                                      ends.tolist(),
                                                      args.use_squad_v2)
            pred_dict.update(idx2pred)
            sub_dict.update(uuid2pred)

    # Log results (except for test set, since it does not come with labels)
    if args.split != 'test':
        results = util.eval_dicts(gold_dict, pred_dict, args.use_squad_v2)
        results_list = [('NLL', nll_meter.avg),
                        ('F1', results['F1']),
                        ('EM', results['EM'])]
        if args.use_squad_v2:
            results_list.append(('AvNA', results['AvNA']))
        results = OrderedDict(results_list)

        # Log to console
        results_str = ', '.join('{}: {:05.2f}'.format(k, v)
                                for k, v in results.items())
        log.info('{} {}'.format(args.split.title(), results_str))

        # Log to TensorBoard
        tbx = SummaryWriter(args.save_dir)
        util.visualize(tbx,
                       pred_dict=pred_dict,
                       eval_path=eval_file,
                       step=0,
                       split=args.split,
                       num_visuals=args.num_visuals)

    # Write submission file
    sub_path = join(args.save_dir, args.split + '_' + args.sub_file)
    log.info('Writing submission file to {}...'.format(sub_path))
    with open(sub_path, 'w', newline='', encoding='utf-8') as csv_fh:
        csv_writer = csv.writer(csv_fh, delimiter=',')
        csv_writer.writerow(['Id', 'Predicted'])
        for uuid in sorted(sub_dict):
            csv_writer.writerow([uuid, sub_dict[uuid]])
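# util.discretize above is referenced but not defined in this file. The helper
# below is a hypothetical, minimal NumPy sketch of the usual span-selection step
# for SQuAD-style models (assumed behaviour, not necessarily the project's
# implementation): pick the (start, end) pair maximizing
# p_start[i] * p_end[j] subject to i <= j < i + max_len.
import numpy as np

def discretize_span(p_start, p_end, max_len=15):
    """Return (start, end) maximizing p_start[i] * p_end[j] with i <= j < i + max_len."""
    best, best_score = (0, 0), -1.0
    for i, ps in enumerate(p_start):
        # only consider end positions inside the allowed window
        window = p_end[i:i + max_len]
        j = int(np.argmax(window))
        score = ps * window[j]
        if score > best_score:
            best_score, best = score, (i, i + j)
    return best

# Example over a toy 6-token context:
#   p_start = np.array([0.1, 0.6, 0.1, 0.1, 0.05, 0.05])
#   p_end   = np.array([0.05, 0.1, 0.6, 0.15, 0.05, 0.05])
#   discretize_span(p_start, p_end)  # -> (1, 2)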
tf.app.flags.DEFINE_float("keep_prob", 1.0, "Keep probability for dropout.")
tf.app.flags.DEFINE_integer('checkpoint', 1000,
                            'number of batches until checkpoint.')
tf.app.flags.DEFINE_integer('num_copies', 1,
                            'number of copies for associative RNN.')
tf.app.flags.DEFINE_integer('num_read_keys', 0,
                            'number of additional read keys for associative RNN.')
tf.app.flags.DEFINE_string("result_file", None, "Where to write results.")
tf.app.flags.DEFINE_string("moru_ops", 'max,mul,keep,replace,diff,min,forget',
                           "operations of moru cell.")
tf.app.flags.DEFINE_string("moru_op_biases", None,
                           "biases of moru operations at beginning of training. "
                           "Defaults to 0 for each.")
tf.app.flags.DEFINE_integer("moru_op_ctr", None,
                            "Size of op ctr. By default ops are controlled by "
                            "current input and previous state. Given a positive "
                            "integer, an additional recurrent op ctr is "
                            "introduced in MORUCell.")
tf.app.flags.DEFINE_boolean('eval', False, 'only evaluation')
tf.app.flags.DEFINE_string('model_path', '/tmp/snli-model',
                           'path to load/save the model')
tf.app.flags.DEFINE_string('device', '/gpu:0', 'device to run on')

FLAGS = tf.app.flags.FLAGS

kwargs = None
if FLAGS.embedding_format == "glove":
    kwargs = {"vocab_size": 2196017, "dim": 300}

print("Loading embeddings...")
e = util.load_embeddings(FLAGS.embedding_file, FLAGS.embedding_format)
print("Done.")

import json
print("Configuration: ")
print(json.dumps(FLAGS.__flags, sort_keys=True, indent=2,
                 separators=(',', ': ')))

training(e, FLAGS)
def main(args):
    """Main entry point for training."""
    # Set up logging and devices
    args.save_dir = util.get_save_dir(args.save_dir, args.name, training=True)
    log = util.get_logger(args.save_dir, args.name)
    tbx = SummaryWriter(args.save_dir)
    device, args.gpu_ids = util.get_available_devices()
    log.info('Args: %s', dumps(vars(args), indent=4, sort_keys=True))
    args.batch_size *= max(1, len(args.gpu_ids))
    _init_random_seed(args, log)

    # Get embeddings
    log.info('Loading embeddings...')
    embeddings = util.load_embeddings(args)

    # Get model
    log.info('Building model...')
    model = (BiDAF(embeddings=embeddings,
                   hidden_size=args.hidden_size,
                   drop_prob=args.drop_prob)
             if not args.use_slqa
             else SLQA(embeddings=embeddings,
                       hidden_size=args.hidden_size,
                       drop_prob=args.drop_prob))
    model = nn.DataParallel(model, args.gpu_ids)
    if args.load_path:
        log.info('Loading checkpoint from %s...', args.load_path)
        model, step = util.load_model(model, args.load_path, args.gpu_ids)
    else:
        step = 0
    model = model.to(device)
    model.train()
    ema = util.EMA(model, args.ema_decay)

    # Get saver
    saver = util.CheckpointSaver(args.save_dir,
                                 max_checkpoints=args.max_checkpoints,
                                 metric_name=args.metric_name,
                                 maximize_metric=args.maximize_metric,
                                 log=log)

    # Get optimizer and scheduler
    optimizer = optim.Adadelta(model.parameters(), args.lr,
                               weight_decay=args.l2_wd)
    scheduler = sched.LambdaLR(optimizer, lambda s: 1.)  # Constant LR

    # Get data loader
    log.info('Building dataset...')
    train_dataset = SQuAD(args.train_record_file, args.use_squad_v2)
    train_loader = data.DataLoader(train_dataset,
                                   batch_size=args.batch_size,
                                   shuffle=True,
                                   num_workers=args.num_workers,
                                   collate_fn=collate_fn)
    dev_dataset = SQuAD(args.dev_record_file, args.use_squad_v2)
    dev_loader = data.DataLoader(dev_dataset,
                                 batch_size=args.batch_size,
                                 shuffle=False,
                                 num_workers=args.num_workers,
                                 collate_fn=collate_fn)

    # Train
    log.info('Training...')
    steps_till_eval = args.eval_steps
    epoch = step // len(train_dataset)
    while epoch != args.num_epochs:
        epoch += 1
        log.info('Starting epoch %s...', epoch)
        with torch.enable_grad(), \
                tqdm(total=len(train_loader.dataset)) as progress_bar:
            for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, dummy_ids in train_loader:
                # Setup for forward
                cw_idxs = cw_idxs.to(device)
                qw_idxs = qw_idxs.to(device)
                batch_size = cw_idxs.size(0)
                if args.use_char_emb:
                    cc_idxs = cc_idxs.to(device)
                    qc_idxs = qc_idxs.to(device)
                else:
                    cc_idxs = None
                    qc_idxs = None
                optimizer.zero_grad()

                # Forward
                log_p1, log_p2 = model(cw_idxs, cc_idxs, qw_idxs, qc_idxs)
                y1, y2 = y1.to(device), y2.to(device)
                loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2)
                loss_val = loss.item()

                # Backward
                loss.backward()
                nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
                optimizer.step()
                scheduler.step(step // batch_size)
                ema(model, step // batch_size)

                # Log info
                step += batch_size
                progress_bar.update(batch_size)
                progress_bar.set_postfix(epoch=epoch, NLL=loss_val)
                tbx.add_scalar('train/NLL', loss_val, step)
                tbx.add_scalar('train/LR', optimizer.param_groups[0]['lr'], step)

                steps_till_eval -= batch_size
                if steps_till_eval <= 0:
                    steps_till_eval = args.eval_steps

                    # Evaluate and save checkpoint
                    log.info('Evaluating at step %s...', step)
                    ema.assign(model)
                    results, pred_dict = evaluate(model, dev_loader, device,
                                                  args.dev_eval_file,
                                                  args.max_ans_len,
                                                  args.use_squad_v2,
                                                  args.use_char_emb)
                    saver.save(step, model, results[args.metric_name], device)
                    ema.resume(model)

                    # Log to console
                    results_str = ', '.join('{}: {:05.2f}'.format(k, v)
                                            for k, v in results.items())
                    log.info('Dev %s', results_str)

                    # Log to TensorBoard
                    log.info('Visualizing in TensorBoard...')
                    for k, v in results.items():
                        tbx.add_scalar('dev/{}'.format(k), v, step)
                    util.visualize(tbx,
                                   pred_dict=pred_dict,
                                   eval_path=args.dev_eval_file,
                                   step=step,
                                   split='dev',
                                   num_visuals=args.num_visuals)
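# util.EMA is used above but not defined in this file. The class below is a
# hypothetical, minimal sketch of the usual exponential-moving-average trick
# (assumed behaviour, not the project's util.EMA): keep a shadow copy of every
# parameter, update it as shadow = decay * shadow + (1 - decay) * param after
# each optimizer step, and temporarily swap the shadow weights in for evaluation.
class SimpleEMA(object):
    def __init__(self, model, decay=0.999):
        self.decay = decay
        self.shadow = {name: p.data.clone()
                       for name, p in model.named_parameters() if p.requires_grad}
        self.backup = {}

    def __call__(self, model, step=None):
        # Update the shadow weights after an optimizer step.
        for name, p in model.named_parameters():
            if p.requires_grad:
                self.shadow[name].mul_(self.decay).add_(p.data, alpha=1 - self.decay)

    def assign(self, model):
        # Swap the shadow weights in (e.g. before evaluation).
        for name, p in model.named_parameters():
            if p.requires_grad:
                self.backup[name] = p.data.clone()
                p.data.copy_(self.shadow[name])

    def resume(self, model):
        # Restore the original training weights.
        for name, p in model.named_parameters():
            if p.requires_grad:
                p.data.copy_(self.backup[name])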
def parse_args():
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--prepare', action='store_true')
    parser.add_argument('--train', action='store_true')
    parser.add_argument('--test', action='store_true')
    return parser.parse_args()


if __name__ == '__main__':
    args = parse_args()
    if args.prepare:
        filter_glove()
    if args.train or args.test:
        weight_matrix, word_idx = load_embeddings(
            PathConfig.filtered_glove_path)
    if args.train:
        train_trees = load_trees(PathConfig.train_path)
        dev_trees = load_trees(PathConfig.dev_path)
    if args.test:
        test_trees = load_trees(PathConfig.test_path)
    if args.train:
        tree_lstm_model = TreeLSTMModel(weight_matrix, word_idx, ModelConfig)
        train_set = tree_lstm_model.compiler.build_loom_inputs(train_trees)
        dev_feed_dict = tree_lstm_model.compiler.build_feed_dict(dev_trees)
state = np.zeros((1, model.config.state_size * model.config.n_layers))
sent = 0
with open('output/generated_{}_{:%Y%m%d_%H%M%S}.txt'.format(
        country_code, datetime.now()), 'w') as f:
    f.write(word + ' ')
    while sent < n_sents:
        w_input, state = model.random_search(sess, w_input, state,
                                             top_n, word_in_vocab)
        w_char = helper.id2tok[w_input]
        print(w_char, end=' ')
        if w_char == '<EOS>':
            f.write('\n')
            sent += 1
        else:
            f.write(w_char + ' ')
        w_input = np.array([[w_input]])
        word_in_vocab = True
print('"')
print('Generated text saved to output')


if __name__ == "__main__":
    country_code = 'FRA'
    helper, data_raw = util.load_and_preprocess_data(
        data_path='../input/un-general-debates.csv', country_code=country_code)
    embedding = util.load_embeddings(country_code)
    assert embedding is not None, \
        'No pretrained embeddings found. Use skipgram.py to train word embeddings'
    # do_train(country_code, embedding, helper, data_raw)
    generate_w_random_search(country_code, embedding, helper, 'palestine', 20, 5)
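# model.random_search above is not defined in this snippet. The helper below is
# a hypothetical, minimal sketch of the "top-n" sampling step such a method
# typically performs (an assumption, not the project's implementation): keep
# only the n most probable next-word ids and sample among them in proportion to
# their probabilities.
import numpy as np

def sample_top_n(probs, top_n=5, rng=np.random):
    """Sample a token id from the top_n most probable entries of `probs`."""
    probs = np.asarray(probs, dtype=np.float64)
    top_ids = np.argsort(probs)[-top_n:]                # indices of the n largest probs
    top_probs = probs[top_ids] / probs[top_ids].sum()   # renormalize over the kept ids
    return int(rng.choice(top_ids, p=top_probs))

# Example with a toy 6-word vocabulary:
#   sample_top_n([0.05, 0.4, 0.1, 0.3, 0.1, 0.05], top_n=2)  # returns 1 or 3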
def do_train(train_bodies, train_stances, dimension, embedding_path, config,
             max_headline_len=None, max_body_len=None, verbose=False,
             include_stopwords=True, similarity_metric_feature=None,
             weight_embeddings=False, idf=False):
    logging.info("Loading training and dev data ...")
    fnc_data, fnc_data_train, fnc_data_dev = util.load_and_preprocess_fnc_data(
        train_bodies, train_stances, include_stopwords,
        similarity_metric_feature)
    logging.info("%d training examples", len(fnc_data_train.headlines))
    logging.info("%d dev examples", len(fnc_data_dev.headlines))
    if max_headline_len is None:
        max_headline_len = fnc_data_train.max_headline_len
    if max_body_len is None:
        max_body_len = fnc_data_train.max_body_len
    logging.info("Max headline length: %d", max_headline_len)
    logging.info("Max body length: %d", max_body_len)

    # For convenience, create the word indices map over the entire dataset
    logging.info("Building word-to-index map ...")
    corpus = ([w for bod in fnc_data.bodies for w in bod] +
              [w for headline in fnc_data.headlines for w in headline])
    word_indices = util.process_corpus(corpus)

    logging.info("Building embedding matrix ...")
    embeddings, known_words = util.load_embeddings(
        word_indices=word_indices, dimension=dimension,
        embedding_path=embedding_path, weight_embeddings=weight_embeddings)

    logging.info("Vectorizing data ...")
    # Vectorize and assemble the training data
    headline_vectors = util.vectorize(fnc_data_train.headlines, word_indices,
                                      known_words, max_headline_len)
    body_vectors = util.vectorize(fnc_data_train.bodies, word_indices,
                                  known_words, max_body_len)

    if config.method == "arora":
        headlines_pc = util.arora_embeddings_pc(headline_vectors, embeddings)
        bodies_pc = util.arora_embeddings_pc(body_vectors, embeddings)
    else:
        headlines_pc = None
        bodies_pc = None

    if config.method == "vanilla_bag_of_words":
        logging.info("Precomputing training sentence embeddings ...")
        train_emb = embeddings
        if idf:
            train_emb = util.idf_embeddings(word_indices,
                                            headline_vectors + body_vectors,
                                            train_emb)
        headlines_emb = util.sentence_embeddings(headline_vectors, dimension,
                                                 max_headline_len, train_emb)
        bodies_emb = util.sentence_embeddings(body_vectors, dimension,
                                              max_body_len, train_emb)
        training_data = [headlines_emb, bodies_emb, fnc_data_train.stances]
    else:
        training_data = [headline_vectors, body_vectors, fnc_data_train.stances]
    if similarity_metric_feature:
        training_data.append(fnc_data_train.sim_scores)
    training_data = zip(*training_data)

    # Vectorize and assemble the dev data; note that we use the training
    # maximum length
    dev_headline_vectors = util.vectorize(fnc_data_dev.headlines, word_indices,
                                          known_words, max_headline_len)
    dev_body_vectors = util.vectorize(fnc_data_dev.bodies, word_indices,
                                      known_words, max_body_len)

    if config.method == "vanilla_bag_of_words":
        logging.info("Precomputing dev sentence embeddings ...")
        test_emb = embeddings
        if idf:
            # TODO(akshayka): Experiment with using whole corpus as
            # documents vs just training vs just testing
            test_emb = util.idf_embeddings(
                word_indices,
                headline_vectors + dev_headline_vectors + body_vectors +
                dev_body_vectors,
                test_emb)
        dev_headlines_emb = util.sentence_embeddings(dev_headline_vectors,
                                                     dimension,
                                                     max_headline_len,
                                                     test_emb)
        dev_bodies_emb = util.sentence_embeddings(dev_body_vectors, dimension,
                                                  max_body_len, test_emb)
        dev_data = [dev_headlines_emb, dev_bodies_emb, fnc_data_dev.stances]
    else:
        dev_data = [dev_headline_vectors, dev_body_vectors,
                    fnc_data_dev.stances]
    if similarity_metric_feature:
        dev_data.append(fnc_data_dev.sim_scores)
    dev_data = zip(*dev_data)

    with tf.Graph().as_default():
        logging.info("Building model...")
        start = time.time()
        model = FNCModel(config, max_headline_len, max_body_len, embeddings,
                         headlines_pc=headlines_pc, bodies_pc=bodies_pc,
                         verbose=verbose)
        logging.info("took %.2f seconds", time.time() - start)

        init = tf.global_variables_initializer()
        saver = tf.train.Saver()

        with tf.Session() as session:
            session.run(init)
            logging.info('Fitting ...')
            model.fit(session, saver, training_data, dev_data)
            logging.info('Outputting ...')
            output = model.output(session, dev_data)
            indices_to_words = {word_indices[w]: w for w in word_indices}
            # TODO(akshayka): Please code-review this. In particular,
            # please validate whether dev_headline_vectors is an equivalent
            # representation of output[0][0], and dev_body_vectors for
            # output[0][1]
            headlines = [' '.join(
                util.word_indices_to_words(h, indices_to_words))
                for h in dev_headline_vectors]
            bodies = [' '.join(
                util.word_indices_to_words(b, indices_to_words))
                for b in dev_body_vectors]
            output = zip(headlines, bodies, output[1], output[2])
            with open(model.config.eval_output, 'w') as f, open(
                    model.config.error_output, "w") as g:
                for headline, body, label, prediction in output:
                    f.write("%s\t%s\tgold:%d\tpred:%d\n\n" % (
                        headline, body, label, prediction))
                    if label != prediction:
                        g.write("%s\t%s\tgold:%d\tpred:%d\n\n" % (
                            headline, body, label, prediction))
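# util.sentence_embeddings above is not shown in this file. The function below
# is a hypothetical, minimal sketch of the "vanilla bag of words" sentence
# embedding it appears to compute (assumed behaviour, with an assumed padding
# id of 0): average the embedding vectors of the non-padding tokens in each
# sequence.
import numpy as np

def average_sentence_embeddings(sequences, embeddings, pad_id=0):
    """sequences: [num_examples, max_len] int array of word ids.
    embeddings: [vocab_size, dimension] float array.
    Returns [num_examples, dimension] mean-of-word-vector embeddings."""
    sequences = np.asarray(sequences)
    mask = (sequences != pad_id).astype(np.float64)           # 1.0 for real tokens
    summed = embeddings[sequences] * mask[:, :, None]         # zero out padding rows
    counts = np.maximum(mask.sum(axis=1, keepdims=True), 1)   # avoid divide-by-zero
    return summed.sum(axis=1) / counts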
def _add_seq2seq(self):
    """Add the whole sequence-to-sequence model to the graph."""
    hps = self._hps
    vsize = self._vocab.size()  # size of the vocabulary

    with tf.variable_scope('seq2seq'):
        # Some initializers
        self.rand_unif_init = tf.random_uniform_initializer(
            -hps.rand_unif_init_mag, hps.rand_unif_init_mag, seed=123)
        self.trunc_norm_init = tf.truncated_normal_initializer(
            stddev=hps.trunc_norm_init_std)

        with tf.variable_scope('embedding'):
            if hps.pretrained_embeddings:
                word2vec = load_embeddings(hps.embeddings_path,
                                           self._vocab.word2id,
                                           hps.rand_unif_init_mag)
                self.embedding = tf.get_variable(
                    'embedding', [vsize, hps.emb_dim],
                    dtype=tf.float32,
                    initializer=tf.constant_initializer(word2vec))
                # self.assign_embedding = tf.assign(self.embedding, word2vec)
            else:
                self.embedding = tf.get_variable(
                    'embedding', [vsize, hps.emb_dim],
                    dtype=tf.float32,
                    initializer=self.trunc_norm_init)
            if hps.mode == "train":
                self._add_emb_vis(self.embedding)  # add to tensorboard

            # tensor with shape (batch_size, max_enc_steps, emb_size)
            emb_enc_inputs = tf.nn.embedding_lookup(self.embedding,
                                                    self._enc_batch)
            if self._hps.hier:
                enc_batch_sections = tf.unstack(self._enc_batch_sections, axis=1)
                sec_emb_enc_inputs = [
                    tf.nn.embedding_lookup(self.embedding, section)
                    for section in enc_batch_sections
                ]
            # list length max_dec_steps containing shape (batch_size, emb_size)
            emb_dec_inputs = [
                tf.nn.embedding_lookup(self.embedding, x)
                for x in tf.unstack(self._dec_batch, axis=1)
            ]

        # Hierarchical attention model
        if self._hps.hier:
            with tf.variable_scope('encoder'), tf.device(self._next_device()):
                sec_enc_outs = []
                states_fw = []
                states_bw = []
                states = []

                # level 1, encode words to sections
                with tf.variable_scope("word_level_encoder",
                                       reuse=tf.AUTO_REUSE) as scope:
                    encoder_outputs_words = []
                    cell_fw = tf.contrib.rnn.LSTMCell(
                        self._hps.hidden_dim,
                        initializer=self.rand_unif_init,
                        state_is_tuple=True)
                    cell_bw = tf.contrib.rnn.LSTMCell(
                        self._hps.hidden_dim,
                        initializer=self.rand_unif_init,
                        state_is_tuple=True)
                    fw_st, bw_st = None, None
                    if self._hps.use_do:  # dropout
                        cell_fw = tf.contrib.rnn.DropoutWrapper(
                            cell_fw, output_keep_prob=1.0 - self._hps.do_prob)
                        cell_bw = tf.contrib.rnn.DropoutWrapper(
                            cell_bw, output_keep_prob=1.0 - self._hps.do_prob)
                    for i in range(self._hps.num_sections):
                        encoder_tmp_output, (fw_st, bw_st) = \
                            tf.nn.bidirectional_dynamic_rnn(
                                cell_fw, cell_bw,
                                inputs=sec_emb_enc_inputs[i],
                                dtype=tf.float32,
                                sequence_length=self._batch_sections_len[:, i],
                                swap_memory=True,
                                initial_state_bw=bw_st,
                                initial_state_fw=fw_st)
                        # concatenate the forwards and backwards states
                        encoder_tmp_output = tf.concat(
                            axis=2, values=encoder_tmp_output
                        )  # shape=[batch x seq_len x hidden_size]
                        encoder_outputs_words.append(encoder_tmp_output)
                        # instead of concatenating the fw and bw states, we use a ff network
                        combined_state = self._reduce_states(fw_st, bw_st)
                        states.append(combined_state)
                        scope.reuse_variables()

                # level 2, encode sections to doc
                encoder_outputs_words = tf.stack(
                    encoder_outputs_words, axis=1
                )  # shape [batch x num_sections x seq_len x hidden_size]
                shapes = encoder_outputs_words.shape
                encoder_outputs_words = tf.reshape(
                    encoder_outputs_words,
                    (shapes[0].value, -1, shapes[-1].value)
                )  # shape=[batch x (seq_len * num_sections) x hidden_size]

                doc_sections_h = tf.stack(
                    [s.h for s in states],
                    axis=1)  # [batch x num_sections x hidden_size]
                doc_sections_c = tf.stack(
                    [s.c for s in states],
                    axis=1)  # [batch x num_sections x hidden_size]

                with tf.variable_scope("section_level_encoder"):
                    if FLAGS.section_level_encoder == 'RNN':
                        cell_fw_1 = tf.contrib.rnn.LSTMCell(
                            self._hps.hidden_dim,
                            initializer=self.rand_unif_init,
                            state_is_tuple=True)
                        cell_bw_1 = tf.contrib.rnn.LSTMCell(
                            self._hps.hidden_dim,
                            initializer=self.rand_unif_init,
                            state_is_tuple=True)
                        if self._hps.use_do:
                            cell_fw_1 = tf.contrib.rnn.DropoutWrapper(
                                cell_fw_1,
                                output_keep_prob=1.0 - self._hps.do_prob)
                            cell_bw_1 = tf.contrib.rnn.DropoutWrapper(
                                cell_bw_1,
                                output_keep_prob=1.0 - self._hps.do_prob)
                        encoder_output_sections, (fw_st_2, bw_st_2) = \
                            tf.nn.bidirectional_dynamic_rnn(
                                cell_fw_1, cell_bw_1,
                                inputs=doc_sections_h,
                                sequence_length=self._doc_sec_lens,
                                dtype=tf.float32,
                                swap_memory=True)
                        encoder_output_sections = tf.concat(
                            axis=2, values=encoder_output_sections)
                        doc_sections_state = self._reduce_states(fw_st_2, bw_st_2)
                    else:
                        if FLAGS.section_level_encoder == 'AVG':
                            # average section cells
                            doc_sections_state_h = tf.reduce_mean(
                                doc_sections_h, axis=1)
                            doc_sections_state_c = tf.reduce_mean(
                                doc_sections_c, axis=1)
                        elif FLAGS.section_level_encoder == 'FF':
                            # use a feedforward network to combine section cells
                            doc_sections_state_h = tf.reshape(
                                doc_sections_h,
                                [doc_sections_h.shape[0].eval(), -1])
                            doc_sections_state_h = tf.layers.dense(
                                inputs=doc_sections_state_h,
                                units=self._hps.hidden,
                                activation=tf.nn.relu)
                            doc_sections_state_c = tf.reshape(
                                doc_sections_c,
                                [doc_sections_c.shape[0].eval(), -1])
                            doc_sections_state_c = tf.layers.dense(
                                inputs=doc_sections_state_c,
                                units=self._hps.hidden,
                                activation=tf.nn.relu)
                        else:
                            raise AttributeError(
                                'FLAGS.section_level_encoder={} is not a valid option'
                                .format(FLAGS.section_level_encoder))
                        doc_sections_state = tf.contrib.rnn.LSTMStateTuple(
                            doc_sections_state_c, doc_sections_state_h)
                        encoder_output_sections = doc_sections_h

        elif not self._hps.multi_layer_encoder:
            with tf.variable_scope('encoder'):
                with tf.variable_scope('word_level_encoder'):
                    cell_fw = tf.contrib.rnn.LSTMCell(
                        self._hps.hidden_dim,
                        initializer=self.rand_unif_init,
                        state_is_tuple=True)
                    cell_bw = tf.contrib.rnn.LSTMCell(
                        self._hps.hidden_dim,
                        initializer=self.rand_unif_init,
                        state_is_tuple=True)
                    (encoder_outputs, (fw_st, bw_st)) = \
                        tf.nn.bidirectional_dynamic_rnn(
                            cell_fw, cell_bw,
                            inputs=emb_enc_inputs,
                            dtype=tf.float32,
                            sequence_length=self._enc_lens,
                            swap_memory=True)
                    # concatenate the forwards and backwards states
                    encoder_outputs = tf.concat(axis=2, values=encoder_outputs)

        # stack n layers of lstms for encoder
        elif self._hps.multi_layer_encoder:
            # TODO: check
            for layer_i in xrange(self._hps.enc_layers):
                with tf.variable_scope('encoder%d' % layer_i), \
                        tf.device(self._next_device()):
                    cell_fw = tf.contrib.rnn.LSTMCell(
                        self._hps.hidden_dim,
                        initializer=self.rand_unif_init,
                        state_is_tuple=True)
                    cell_bw = tf.contrib.rnn.LSTMCell(
                        self._hps.hidden_dim,
                        initializer=self.rand_unif_init,
                        state_is_tuple=True)
                    if self._hps.use_do:  # add dropout
                        cell_fw = tf.contrib.rnn.DropoutWrapper(
                            cell_fw, output_keep_prob=1.0 - self._hps.do_prob)
                        cell_bw = tf.contrib.rnn.DropoutWrapper(
                            cell_bw, output_keep_prob=1.0 - self._hps.do_prob)
                    emb_enc_inputs, (fw_st, bw_st) = \
                        tf.nn.bidirectional_dynamic_rnn(
                            cell_fw, cell_bw,
                            inputs=emb_enc_inputs,
                            dtype=tf.float32,
                            sequence_length=self._enc_lens,
                            swap_memory=True)
                    emb_enc_inputs = tf.concat(axis=2, values=emb_enc_inputs)
            encoder_outputs = emb_enc_inputs

        if self._hps.hier:
            self._enc_sec_states = encoder_output_sections
            self._enc_states = encoder_outputs_words
        else:
            self._enc_states = encoder_outputs
            self._enc_sec_states = None

        # convert the encoder bidirectional hidden state to the decoder state
        # (unidirectional) by an MLP
        if self._hps.hier:
            self._dec_in_state = doc_sections_state
        else:
            with tf.variable_scope('encoder'):
                with tf.variable_scope('word_level_encoder'):
                    self._dec_in_state = self._reduce_states(fw_st, bw_st)

        # Add the decoder
        with tf.variable_scope('decoder'), tf.device(self._next_device()):
            cell = tf.contrib.rnn.LSTMCell(self._hps.hidden_dim,
                                           state_is_tuple=True,
                                           initializer=self.rand_unif_init)
            # We need to pass in the previous step's coverage vector each time
            prev_coverage = self.prev_coverage \
                if hps.mode == "decode" and self._hps.coverage else None

            if self._hps.hier:
                decoder_outputs, self._dec_out_state, self.attn_dists, \
                    self.p_gens, self.coverage, self.attn_dists_sec = \
                    self.attn_decoder(
                        emb_dec_inputs,
                        self._dec_in_state,
                        self._enc_states,
                        cell,
                        self._enc_sec_states,
                        num_words_section=self._batch_sections_len,
                        enc_padding_mask=self._enc_padding_mask,
                        enc_section_padding_mask=self._enc_section_padding_mask,
                        initial_state_attention=(self._hps.mode == "decode"),
                        pointer_gen=self._hps.pointer_gen,
                        use_coverage=self._hps.coverage,
                        prev_coverage=prev_coverage,
                        temperature=self._hps.temperature)
            else:
                decoder_outputs, self._dec_out_state, self.attn_dists, \
                    self.p_gens, self.coverage, _ = \
                    self.attn_decoder(
                        emb_dec_inputs,
                        self._dec_in_state,
                        self._enc_states,
                        cell,
                        encoder_section_states=None,
                        num_words_section=None,
                        enc_padding_mask=self._enc_padding_mask,
                        initial_state_attention=(self._hps.mode == "decode"),
                        pointer_gen=self._hps.pointer_gen,
                        use_coverage=self._hps.coverage,
                        prev_coverage=prev_coverage)

        # Project decoder output to vocabulary
        with tf.variable_scope('output_projection'), \
                tf.device(self._next_device()):
            if self._hps.output_weight_sharing:
                # share weights of embedding layer with projection
                # self.embedding is in shape [vsize, hps.emb_dim]
                w_proj = tf.get_variable(
                    'w_proj', [self._hps.emb_dim, self._hps.hidden_dim],
                    dtype=tf.float32, initializer=self.trunc_norm_init)
                w = tf.tanh(tf.transpose(
                    tf.matmul(self.embedding, w_proj)))  # shape = [vsize, hps.hidden_dim]
                # w_t = tf.transpose(w)
                b = tf.get_variable('b', [vsize],
                                    dtype=tf.float32,
                                    initializer=self.trunc_norm_init)
            else:
                w = tf.get_variable('w', [self._hps.hidden_dim, vsize],
                                    dtype=tf.float32,
                                    initializer=self.trunc_norm_init)
                # w_t = tf.transpose(w)
                b = tf.get_variable('b', [vsize],
                                    dtype=tf.float32,
                                    initializer=self.trunc_norm_init)

            # vocabulary score at each decoder step
            vocab_scores = []
            for i, output in enumerate(decoder_outputs):
                if i > 0:
                    tf.get_variable_scope().reuse_variables()
                vocab_scores.append(tf.nn.xw_plus_b(output, w, b))  # apply the linear layer

            # the final vocab distribution for each decoder time step
            # shape of each element is [batch_size, vsize]
            vocab_dists = [tf.nn.softmax(s) for s in vocab_scores]

        # pointing / generating
        if FLAGS.pointer_gen:
            final_dists = self._calc_final_dist(vocab_dists, self.attn_dists)
            # log_dists = [tf.log(dist) for dist in final_dists]
        else:
            # log_dists = [tf.log(dist) for dist in vocab_dists]
            final_dists = vocab_dists

        # Calculate losses
        if self._hps.mode in ['train', 'eval']:
            with tf.variable_scope('loss'), tf.device(self._next_device()):
                if FLAGS.pointer_gen:
                    # Calculate the loss per step.
                    # This is fiddly; we use tf.gather_nd to pick out the gold target words.
                    # loss_per_step will be a list of length max_dec_steps containing shape (batch_size)
                    loss_per_step = []
                    batch_nums = tf.range(0, limit=hps.batch_size)  # shape (batch_size)
                    for dec_step, dist in enumerate(final_dists):
                        # The indices of the target words, shape (batch_size)
                        targets = self._target_batch[:, dec_step]
                        indices = tf.stack((batch_nums, targets), axis=1)  # shape (batch_size, 2)
                        # shape (batch_size): loss on this step for each batch element
                        gold_probs = tf.gather_nd(dist, indices)
                        losses = -tf.log(gold_probs)
                        loss_per_step.append(losses)

                    # Apply dec_padding_mask and get loss
                    self._loss = _mask_and_avg(loss_per_step,
                                               self._dec_padding_mask)
                else:  # baseline model
                    # this applies softmax internally
                    self._loss = tf.contrib.seq2seq.sequence_loss(
                        tf.stack(vocab_scores, axis=1),
                        self._target_batch,
                        self._dec_padding_mask)

                tf.summary.scalar('loss', self._loss)

                # Calculate coverage loss from the attention distributions
                if self._hps.coverage:
                    with tf.variable_scope('coverage_loss'):
                        self._coverage_loss = _coverage_loss(
                            self.attn_dists, self._dec_padding_mask)
                        tf.summary.scalar('coverage_loss', self._coverage_loss)
                    self._total_loss = (self._loss +
                                        self._hps.cov_loss_wt * self._coverage_loss)
                    tf.summary.scalar('total_loss', self._total_loss)

        # ---------------------------/

        if self._hps.mode == "decode":
            # final_dists is a singleton list containing shape
            # (batch_size, extended_vsize)
            assert len(final_dists) == 1
            final_dists = final_dists[0]
            # take the k largest probs. note batch_size=beam_size in decode mode
            topk_probs, self._topk_ids = tf.nn.top_k(final_dists,
                                                     hps.batch_size * 2)
            self._topk_log_probs = tf.log(topk_probs)
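# The tf.gather_nd trick and _mask_and_avg above operate on TF tensors inside
# the graph. The snippet below is a hypothetical NumPy sketch of the same
# per-step loss computation (an illustration of the indexing pattern, not the
# project's _mask_and_avg): pick each example's probability of its gold token,
# take the negative log, then average over non-padding decoder steps.
import numpy as np

def masked_nll(step_dists, targets, dec_padding_mask):
    """step_dists: [max_dec_steps, batch, vocab] probabilities.
    targets: [batch, max_dec_steps] gold token ids.
    dec_padding_mask: [batch, max_dec_steps] with 1.0 for real (non-pad) tokens."""
    batch = targets.shape[0]
    losses = []
    for dec_step, dist in enumerate(step_dists):
        gold_probs = dist[np.arange(batch), targets[:, dec_step]]  # shape (batch,)
        losses.append(-np.log(gold_probs))
    losses = np.stack(losses, axis=1)                              # [batch, max_dec_steps]
    per_example = (losses * dec_padding_mask).sum(axis=1) / dec_padding_mask.sum(axis=1)
    return per_example.mean()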