class Evaluate(object):
    def __init__(self, model_file_path):
        self.vocab = Vocab(config.vocab_path, config.vocab_size)
        self.batcher = Batcher(config.eval_data_path, self.vocab, mode='eval',
                               batch_size=config.batch_size, single_pass=True)
        self.model_file_path = model_file_path
        time.sleep(5)  # give the batcher's background threads time to fill the queue
        self.model = Model(model_file_path, is_eval=True)

    def eval_one_batch(self, batch):
        enc_batch, enc_padding_mask, enc_lens, enc_batch_extend_vocab, extra_zeros, c_t_1, coverage = \
            get_input_from_batch(batch, use_cuda)
        dec_batch, dec_padding_mask, max_dec_len, dec_lens_var, target_batch = \
            get_output_from_batch(batch, use_cuda)

        with torch.no_grad():
            encoder_outputs, encoder_feature, encoder_hidden = self.model.encoder(enc_batch, enc_lens)
            s_t_1 = self.model.reduce_state(encoder_hidden)

            step_losses = []
            for di in range(min(max_dec_len, config.max_dec_steps)):
                y_t_1 = dec_batch[:, di]  # Teacher forcing
                final_dist, s_t_1, c_t_1, attn_dist, p_gen, next_coverage = self.model.decoder(
                    y_t_1, s_t_1, encoder_outputs, encoder_feature, enc_padding_mask,
                    c_t_1, extra_zeros, enc_batch_extend_vocab, coverage, di)
                target = target_batch[:, di]
                gold_probs = torch.gather(final_dist, 1, target.unsqueeze(1)).squeeze()
                step_loss = -torch.log(gold_probs + config.eps)
                if config.is_coverage:
                    step_coverage_loss = torch.sum(torch.min(attn_dist, coverage), 1)
                    step_loss = step_loss + config.cov_loss_wt * step_coverage_loss
                    coverage = next_coverage

                step_mask = dec_padding_mask[:, di]
                step_loss = step_loss * step_mask
                step_losses.append(step_loss)

            sum_step_losses = torch.sum(torch.stack(step_losses, 1), 1)
            batch_avg_loss = sum_step_losses / dec_lens_var
            loss = torch.mean(batch_avg_loss)

        return loss.item()

    def run_eval(self):
        batch = self.batcher.next_batch()
        loss_list = []
        while batch is not None:
            loss = self.eval_one_batch(batch)
            loss_list.append(loss)
            batch = self.batcher.next_batch()
        return np.mean(loss_list)
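# Self-contained sketch of the masked NLL step used in eval_one_batch above, run on
# dummy tensors. All shapes and values here are assumptions for illustration; they are
# not the real model's outputs.
import torch

batch_size, vocab_size, eps = 2, 5, 1e-12
final_dist = torch.softmax(torch.randn(batch_size, vocab_size), dim=1)  # per-step word dist
target = torch.tensor([1, 3])          # gold token ids at this decoder step
step_mask = torch.tensor([1.0, 0.0])   # second sequence already ended (padding)

gold_probs = torch.gather(final_dist, 1, target.unsqueeze(1)).squeeze(1)
step_loss = -torch.log(gold_probs + eps) * step_mask  # padded steps contribute 0
print(step_loss)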
def main():
    args = get_args()
    vocab = Vocab(args.vocab_path, args.vocab_size)  # create a vocabulary
    hps = get_hps()
    if args.data_path != "":
        batcher = Batcher(args.data_path, vocab, hps, args.single_pass)
        batch = batcher.next_batch()
    else:
        with open(args.json_path) as f:
            art = json.load(f)
        article = neologdn.normalize(art['body'])
        abstract = neologdn.normalize(art['title'])
        m = MeCab('-Owakati')
        parsed_article = m.parse(article)
        abs_words = m.parse(abstract).split()
        ex = B.Example(parsed_article, abs_words, vocab, hps)
        b = B.Batch([ex], hps, vocab)
def fit_tfidf_vectorizer(hps, vocab):
    if not os.path.exists(os.path.join(FLAGS.actual_log_root, 'tfidf_vectorizer')):
        os.makedirs(os.path.join(FLAGS.actual_log_root, 'tfidf_vectorizer'))

    # The model is configured with max_dec_steps=1 because we only ever run one step of
    # the decoder at a time (to do beam search). Note that the batcher is initialized
    # with max_dec_steps equal to e.g. 100 because the batches need to contain the
    # full summaries.
    decode_model_hps = hps._replace(max_dec_steps=1, batch_size=1)
    batcher = Batcher(FLAGS.data_path, vocab, decode_model_hps,
                      single_pass=FLAGS.single_pass)

    all_sentences = []
    while True:
        batch = batcher.next_batch()  # 1 example repeated across batch
        if batch is None:  # finished decoding dataset in single_pass mode
            break
        all_sentences.extend(batch.raw_article_sents[0])

    stemmer = PorterStemmer()

    class StemmedTfidfVectorizer(TfidfVectorizer):
        def build_analyzer(self):
            # super() should target the subclass, so TfidfVectorizer's own
            # build_analyzer is not skipped in the MRO
            analyzer = super(StemmedTfidfVectorizer, self).build_analyzer()
            return lambda doc: (stemmer.stem(w) for w in analyzer(doc))

    tfidf_vectorizer = StemmedTfidfVectorizer(analyzer='word', stop_words='english',
                                              ngram_range=(1, 3), max_df=0.7)
    tfidf_vectorizer.fit_transform(all_sentences)
    return tfidf_vectorizer
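# Self-contained demo of the stemmed TF-IDF idea above on toy sentences (the sentences
# are made up for illustration; requires scikit-learn and nltk).
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer

stemmer = PorterStemmer()

class StemmedTfidfVectorizer(TfidfVectorizer):
    def build_analyzer(self):
        analyzer = super(StemmedTfidfVectorizer, self).build_analyzer()
        return lambda doc: (stemmer.stem(w) for w in analyzer(doc))

sentences = ["the runner was running quickly",
             "runners run in races",
             "a completely unrelated sentence"]
vec = StemmedTfidfVectorizer(stop_words='english', ngram_range=(1, 3), max_df=0.7)
matrix = vec.fit_transform(sentences)
# "running", "runners", and "run" all collapse onto the stem "run",
# so they share one column of the TF-IDF matrix
print(matrix.shape)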
def train_generator(args, load_recent=True):
    '''Train the generator via classical approach'''
    logging.debug('Batcher...')
    batcher = Batcher(args.data_dir, args.batch_size, args.seq_length)

    logging.debug('Vocabulary...')
    with open(os.path.join(args.save_dir_gen, 'config.pkl'), 'w') as f:
        cPickle.dump(args, f)
    with open(os.path.join(args.save_dir_gen, 'real_beer_vocab.pkl'), 'w') as f:
        cPickle.dump((batcher.chars, batcher.vocab), f)

    logging.debug('Creating generator...')
    generator = Generator(args, is_training=True)

    with tf.Session(config=tf.ConfigProto(allow_soft_placement=True,
                                          log_device_placement=True)) as sess:
        tf.initialize_all_variables().run()
        saver = tf.train.Saver(tf.all_variables())

        if load_recent:
            ckpt = tf.train.get_checkpoint_state(args.save_dir_gen)
            if ckpt and ckpt.model_checkpoint_path:
                saver.restore(sess, ckpt.model_checkpoint_path)

        for epoch in xrange(args.num_epochs):
            # Anneal learning rate
            new_lr = args.learning_rate * (args.decay_rate ** epoch)
            sess.run(tf.assign(generator.lr, new_lr))
            batcher.reset_batch_pointer()
            state = generator.initial_state.eval()

            for batch in xrange(batcher.num_batches):
                start = time.time()
                x, y = batcher.next_batch()
                feed = {generator.input_data: x,
                        generator.targets: y,
                        generator.initial_state: state}
                # train_loss, state, _ = sess.run([generator.cost, generator.final_state, generator.train_op], feed)
                train_loss, _ = sess.run([generator.cost, generator.train_op], feed)
                end = time.time()
                print '{}/{} (epoch {}), train_loss = {:.3f}, time/batch = {:.3f}' \
                    .format(epoch * batcher.num_batches + batch,
                            args.num_epochs * batcher.num_batches,
                            epoch, train_loss, end - start)
                if (epoch * batcher.num_batches + batch) % args.save_every == 0:
                    checkpoint_path = os.path.join(args.save_dir_gen, 'model.ckpt')
                    saver.save(sess, checkpoint_path,
                               global_step=epoch * batcher.num_batches + batch)
                    print 'Generator model saved to {}'.format(checkpoint_path)
def test_batcher():
    batcher = Batcher(hps.data_path, vocab, hps, hps.single_pass)
    while True:
        start = time.time()
        batch = batcher.next_batch()
        print('elapse:', time.time() - start)
def train(params):
    data_loader = Batcher(params)
    params.vocab_size = data_loader.vocab_size

    if not os.path.isdir(params.save_dir):
        os.makedirs(params.save_dir)
    with open(os.path.join(params.save_dir, 'config.pkl'), 'wb') as f:
        cPickle.dump(params, f)
    with open(os.path.join(params.save_dir, 'chars_vocab.pkl'), 'wb') as f:
        cPickle.dump((data_loader.chars, data_loader.vocab), f)

    model = Model(params)

    with tf.Session() as sess:
        summaries = tf.summary.merge_all()
        writer = tf.summary.FileWriter(
            os.path.join(params.log_dir, time.strftime("%Y-%m-%d-%H-%M-%S")))
        writer.add_graph(sess.graph)

        sess.run(tf.global_variables_initializer())
        saver = tf.train.Saver(tf.global_variables(), max_to_keep=50)

        for e in range(params.num_epochs):
            sess.run(tf.assign(model.lr, params.learning_rate * (0.97 ** e)))
            data_loader.reset_batch_pointer()
            state = sess.run(model.initial_state)

            for b in range(data_loader.num_batches):
                start = time.time()
                x, y = data_loader.next_batch()
                feed = {model.input_data: x, model.targets: y}
                for i, (c, h) in enumerate(model.initial_state):
                    feed[c] = state[i].c
                    feed[h] = state[i].h
                # Run the train op once, fetching the summaries in the same call.
                # (The original ran sess.run twice here, training on every batch twice.)
                summ, train_loss, state, _ = sess.run(
                    [summaries, model.cost, model.final_state, model.train_op], feed)
                writer.add_summary(summ, e * data_loader.num_batches + b)
                end = time.time()
                logging.info(
                    "Epoch #{e} / Batch #{b} -- Loss {train_loss:.3f} "
                    "Time {time_diff:.3f}".format(e=e, b=b, train_loss=train_loss,
                                                  time_diff=end - start))

            if e % params.save_every == 0 or e == params.num_epochs - 1:
                checkpoint_path = os.path.join(params.save_dir, 'model.ckpt')
                saver.save(sess, checkpoint_path, global_step=e)
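# A minimal sketch of the character-level Batcher interface the loop above assumes
# (chars, vocab, vocab_size, num_batches, reset_batch_pointer, next_batch). This is a
# guess at the contract, not the project's actual Batcher: next_batch returns (x, y)
# with y shifted one step, the usual language-model target.
import numpy as np

class MiniCharBatcher(object):
    def __init__(self, text, batch_size, seq_length):
        self.chars = sorted(set(text))
        self.vocab = {c: i for i, c in enumerate(self.chars)}
        self.vocab_size = len(self.chars)
        data = np.array([self.vocab[c] for c in text])
        self.num_batches = len(data) // (batch_size * seq_length)
        data = data[:self.num_batches * batch_size * seq_length]
        xdata, ydata = data, np.roll(data, -1)  # targets are inputs shifted by one
        self.x_batches = np.split(xdata.reshape(batch_size, -1), self.num_batches, 1)
        self.y_batches = np.split(ydata.reshape(batch_size, -1), self.num_batches, 1)
        self.pointer = 0

    def reset_batch_pointer(self):
        self.pointer = 0

    def next_batch(self):
        x, y = self.x_batches[self.pointer], self.y_batches[self.pointer]
        self.pointer += 1
        return x, y

b = MiniCharBatcher("hello world, hello batcher" * 10, batch_size=2, seq_length=5)
print(b.num_batches, b.next_batch()[0].shape)  # e.g. 26 (2, 5)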
def generate_batch(self, mode):
    # mode: train/test/val
    hps = self._hps
    hps['mode'] = mode
    batcher = Batcher(hps['data_path'] + '/{}.bin'.format(mode),
                      self._vocab, hps, single_pass=True)
    while True:
        batch = batcher.next_batch()
        feed_dict = self.make_feed_dict(batch)
        yield ([feed_dict['enc_batch'], feed_dict['dec_batch']],
               feed_dict['target_batch'])
def get_decode_results(sess, model, vocab, hps, data_path):
    eval_batcher = Batcher(data_path, vocab, hps, True)
    total_loss = 0.0
    total_correct_preds = 0.0
    original_comments = []
    gold_labels = []
    attention_scores = []
    labelvalues = np.array(["male", "female"])
    predicted_labels = []
    probabilities = np.array([])
    n = 0
    while True:
        try:
            eval_batch = eval_batcher.next_batch()
            if eval_batch is None:
                break
            eval_results = model.run_eval_step(sess, eval_batch)
            batch = eval_batch
            batch_size = FLAGS.batch_size
            loss = eval_results['loss']
            correct_predictions = eval_results['correct_predictions']
            predictions = eval_results['predictions']
            predicted_labels = np.concatenate((predicted_labels,
                                               labelvalues[predictions]))
            probabilities = np.concatenate((probabilities, eval_results['probs']))
            gold_labels += batch.original_labels
            original_comments += batch.original_comments
            attention_scores += list(eval_results['attention_scores'])
            total_loss += loss * batch_size
            total_correct_preds += correct_predictions
            n += batch_size
        except StopIteration:
            break
    eval_loss = total_loss / n
    accuracy = total_correct_preds / n
    return (eval_loss, accuracy, original_comments, gold_labels, predicted_labels,
            attention_scores, np.array(probabilities, dtype=str))
def get_eval_loss(sess, model, vocab, hps, data_path):
    eval_batcher = Batcher(data_path, vocab, hps, True)
    total_loss = 0.0
    total_ce_loss = 0.0
    total_correct_preds = 0.0
    preds = []
    truey = []
    n = 0
    while True:
        try:
            eval_batch = eval_batcher.next_batch()
            if eval_batch is None:
                break
            eval_results = model.run_eval_step(sess, eval_batch)
            batch_size = FLAGS.batch_size
            loss = eval_results['loss']
            ce_loss = eval_results['ce_loss']
            correct_predictions = eval_results['correct_predictions']
            predictions = eval_results['predictions']
            true_labels = eval_batch.labels
            preds += list(predictions)
            truey += list(true_labels)
            total_loss += loss * batch_size
            total_ce_loss += ce_loss * batch_size
            total_correct_preds += correct_predictions
            n += batch_size
        except StopIteration:
            break
    eval_loss = total_loss / n
    eval_ce_loss = total_ce_loss / n
    accuracy = total_correct_preds / n
    print "Precision: {:.4f}  Recall: {:.4f}  F1: {:.4f}  (n={})".format(
        precision_score(truey, preds), recall_score(truey, preds),
        f1_score(truey, preds), n)
    return eval_loss, eval_ce_loss, accuracy
def train(self, images, labels, load_model=True):
    train_log_dir = self.log_dir
    if not tf.gfile.Exists(train_log_dir):
        tf.gfile.MakeDirs(train_log_dir)

    with tf.Graph().as_default() as graph:
        inputs = tf.placeholder(dtype=tf.float32,
                                shape=(self.batch_size, self.width, self.height, 1),
                                name="inputs")
        outputs = tf.placeholder(dtype=tf.float32,
                                 shape=(self.batch_size, 1, 1, self.num_classes),
                                 name="outputs")
        predictions = self.build_vgg16(inputs)

        # Specify the loss function:
        tf.losses.softmax_cross_entropy(outputs, predictions)
        total_loss = tf.losses.get_total_loss()
        tf.summary.scalar('losses/total_loss', total_loss)

        # Specify the optimization scheme:
        optimizer = tf.train.AdadeltaOptimizer(learning_rate=self.learning_rate)

        # create_train_op ensures that when we evaluate it to get the loss,
        # the update_ops are done and the gradient updates are computed.
        train_tensor = slim.learning.create_train_op(total_loss, optimizer)
        tf.logging.set_verbosity(tf.logging.INFO)

        # prepare
        saver = tf.train.Saver()
        sv = tf.train.Supervisor(
            logdir=self.log_dir,
            is_chief=True,
            saver=saver,
            summary_op=None,
            save_summaries_secs=60,  # save summaries for tensorboard every 60 secs
            save_model_secs=60)      # checkpoint every 60 secs
        summary_writer = sv.summary_writer
        tf.logging.info("Preparing or waiting for session...")
        sess_context_manager = sv.prepare_or_wait_for_session(config=utils.get_config())
        tf.logging.info("Created session.")

        # Actually runs training.
        with sess_context_manager as sess:
            batcher = Batcher(images, labels, self.batch_size)
            epoch = 0
            turn = 0
            total_turn = 0
            while True:
                real_images, real_labels, finished = batcher.next_batch()
                if finished:
                    epoch += 1
                    turn = 0
                real_labels = np.eye(self.num_classes)[real_labels]
                real_labels = np.reshape(
                    real_labels, [real_labels.shape[0], 1, 1, real_labels.shape[1]])
                feed_dict = {"inputs:0": real_images, "outputs:0": real_labels}
                _, loss, r = sess.run([train_tensor, total_loss, predictions], feed_dict)
                turn += 1
                total_turn += 1
                if turn % 100 == 0:
                    tf.logging.info("epoch: %d\tturn: %d/%d"
                                    % (epoch, turn, batcher.batch_count))
                    tf.logging.info("total loss: %f" % loss)
                    summary_writer.flush()
class BeamSearch(object):
    def __init__(self, model_file_path, data_path, data_class='val'):
        self.data_class = data_class
        if self.data_class not in ['val', 'test']:
            raise ValueError("data_class must be 'val' or 'test'.")

        # model_file_path e.g. --> ../log/{MODE NAME}/best_model/model_best_XXXXX
        model_name = os.path.basename(model_file_path)
        # log_root e.g. --> ../log/{MODE NAME}/
        log_root = os.path.dirname(os.path.dirname(model_file_path))
        # _decode_dir e.g. --> ../log/{MODE NAME}/decode_model_best_XXXXX/
        self._decode_dir = os.path.join(log_root, 'decode_%s' % (model_name))
        self._rouge_ref_dir = os.path.join(self._decode_dir, 'rouge_ref')
        self._rouge_dec_dir = os.path.join(self._decode_dir, 'rouge_dec_dir')
        self._result_path = os.path.join(self._decode_dir, 'result_%s_%s.txt'
                                         % (model_name, self.data_class))
        # remove the result file if it exists
        if os.path.isfile(self._result_path):
            os.remove(self._result_path)
        for p in [self._decode_dir, self._rouge_ref_dir, self._rouge_dec_dir]:
            if not os.path.exists(p):
                os.mkdir(p)

        self.vocab = Vocab(config.vocab_path, config.vocab_size)
        self.batcher = Batcher(data_path, self.vocab, mode='decode',
                               batch_size=config.beam_size, single_pass=True)
        time.sleep(5)
        self.model = Model(model_file_path, is_eval=True)

    def sort_beams(self, beams):
        return sorted(beams, key=lambda h: h.avg_log_prob, reverse=True)

    def beam_search(self, batch):
        # batch should have only one example
        enc_batch, enc_padding_mask, enc_lens, enc_batch_extend_vocab, extra_zeros, c_t_0, coverage_t_0 = \
            get_input_from_batch(batch, use_cuda)

        encoder_outputs, encoder_feature, encoder_hidden = self.model.encoder(enc_batch, enc_lens)
        s_t_0 = self.model.reduce_state(encoder_hidden)

        dec_h, dec_c = s_t_0  # 1 x 2H
        dec_h = dec_h.squeeze()
        dec_c = dec_c.squeeze()

        # decoder batch preparation: it has beam_size examples, initially all identical
        beams = [Beam(tokens=[self.vocab.word2id(data.START_DECODING)],
                      log_probs=[0.0],
                      state=(dec_h[0], dec_c[0]),
                      context=c_t_0[0],
                      coverage=(coverage_t_0[0] if config.is_coverage else None))
                 for _ in range(config.beam_size)]
        results = []
        steps = 0
        while steps < config.max_dec_steps and len(results) < config.beam_size:
            latest_tokens = [h.latest_token for h in beams]
            latest_tokens = [t if t < self.vocab.size() else self.vocab.word2id(data.UNKNOWN_TOKEN)
                             for t in latest_tokens]
            y_t_1 = Variable(torch.LongTensor(latest_tokens))
            if use_cuda:
                y_t_1 = y_t_1.cuda()
            all_state_h = []
            all_state_c = []
            all_context = []
            for h in beams:
                state_h, state_c = h.state
                all_state_h.append(state_h)
                all_state_c.append(state_c)
                all_context.append(h.context)
            s_t_1 = (torch.stack(all_state_h, 0).unsqueeze(0),
                     torch.stack(all_state_c, 0).unsqueeze(0))
            c_t_1 = torch.stack(all_context, 0)

            coverage_t_1 = None
            if config.is_coverage:
                all_coverage = [h.coverage for h in beams]
                coverage_t_1 = torch.stack(all_coverage, 0)

            final_dist, s_t, c_t, attn_dist, p_gen, coverage_t = self.model.decoder(
                y_t_1, s_t_1, encoder_outputs, encoder_feature, enc_padding_mask,
                c_t_1, extra_zeros, enc_batch_extend_vocab, coverage_t_1, steps)

            log_probs = torch.log(final_dist)
            topk_log_probs, topk_ids = torch.topk(log_probs, config.beam_size * 2)

            dec_h, dec_c = s_t
            dec_h = dec_h.squeeze()
            dec_c = dec_c.squeeze()

            all_beams = []
            num_orig_beams = 1 if steps == 0 else len(beams)
            for i in range(num_orig_beams):
                h = beams[i]
                state_i = (dec_h[i], dec_c[i])
                context_i = c_t[i]
                coverage_i = (coverage_t[i] if config.is_coverage else None)

                for j in range(config.beam_size * 2):  # for each of the top 2*beam_size hyps:
                    new_beam = h.extend(token=topk_ids[i, j].item(),
                                        log_prob=topk_log_probs[i, j].item(),
                                        state=state_i,
                                        context=context_i,
                                        coverage=coverage_i)
                    all_beams.append(new_beam)

            beams = []
            for h in self.sort_beams(all_beams):
                if h.latest_token == self.vocab.word2id(data.STOP_DECODING):
                    if steps >= config.min_dec_steps:
                        results.append(h)
                else:
                    beams.append(h)
                if len(beams) == config.beam_size or len(results) == config.beam_size:
                    break
            steps += 1

        if len(results) == 0:
            results = beams

        beams_sorted = self.sort_beams(results)
        return beams_sorted[0]

    def decode(self):
        start = time.time()
        counter = 0
        bleu_scores = []
        batch = self.batcher.next_batch()
        while batch is not None:
            # Run beam search to get the best Hypothesis
            best_summary = self.beam_search(batch)

            # Extract the output ids from the hypothesis and convert back to words
            output_ids = [int(t) for t in best_summary.tokens[1:]]
            decoded_words = data.outputids2words(
                output_ids, self.vocab,
                (batch.art_oovs[0] if config.pointer_gen else None))

            # Remove the [STOP] token from decoded_words, if necessary
            try:
                fst_stop_idx = decoded_words.index(data.STOP_DECODING)
                decoded_words = decoded_words[:fst_stop_idx]
            except ValueError:
                pass

            original_articles = batch.original_articles[0]
            original_abstracts = batch.original_abstracts_sents[0]
            reference = original_abstracts[0].strip().split()
            bleu = nltk.translate.bleu_score.sentence_bleu([reference], decoded_words,
                                                           weights=(0.5, 0.5))
            bleu_scores.append(bleu)

            # write_for_rouge(original_abstracts, decoded_words, counter,
            #                 self._rouge_ref_dir, self._rouge_dec_dir)
            write_for_result(original_articles, original_abstracts, decoded_words,
                             self._result_path, self.data_class)

            counter += 1
            if counter % 1000 == 0:
                print('%d examples in %d sec' % (counter, time.time() - start))
                start = time.time()
            batch = self.batcher.next_batch()

        '''
        # uncomment this if you successfully install `pyrouge`
        print("Decoder has finished reading dataset for single_pass.")
        print("Now starting ROUGE eval...")
        results_dict = rouge_eval(self._rouge_ref_dir, self._rouge_dec_dir)
        rouge_log(results_dict, self._decode_dir)
        '''

        if self.data_class == 'val':
            print('Average BLEU score:', np.mean(bleu_scores))
            with open(self._result_path, "a") as f:
                print('Average BLEU score:', np.mean(bleu_scores), file=f)

    def get_processed_path(self):
        # ../log/{MODE NAME}/decode_model_best_XXXXX/result_model_best_2800_{data_class}.txt
        input_path = self._result_path
        temp = os.path.splitext(input_path)
        # ../log/{MODE NAME}/decode_model_best_XXXXX/result_model_best_2800_{data_class}_processed.txt
        output_path = temp[0] + "_processed" + temp[1]
        return input_path, output_path
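# A minimal, self-contained sketch of the beam-search loop above with the model stripped
# out. The transition table, ids, and scores are toy assumptions for illustration only;
# ranking by average log probability mirrors sort_beams.
import math

def toy_beam_search(step_log_probs, beam_size, max_steps, stop_id):
    # step_log_probs(token_id) -> list of (next_id, log_prob) candidates
    beams = [([0], 0.0)]  # (token sequence, summed log prob); 0 plays the START role
    results = []
    for _ in range(max_steps):
        candidates = []
        for tokens, score in beams:
            for next_id, lp in step_log_probs(tokens[-1]):
                candidates.append((tokens + [next_id], score + lp))
        # keep the best hypotheses, ranked by average log prob (as in sort_beams)
        candidates.sort(key=lambda h: h[1] / len(h[0]), reverse=True)
        beams = []
        for tokens, score in candidates:
            if tokens[-1] == stop_id:
                results.append((tokens, score))  # finished hypothesis
            else:
                beams.append((tokens, score))    # still growing
            if len(beams) == beam_size or len(results) == beam_size:
                break
        if len(results) == beam_size:
            break
    results = results or beams
    return max(results, key=lambda h: h[1] / len(h[0]))

# toy transition table: from any token, 1 and 2 continue, 9 stops
table = lambda t: [(1, math.log(0.5)), (2, math.log(0.3)), (9, math.log(0.2))]
print(toy_beam_search(table, beam_size=3, max_steps=5, stop_id=9))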
from data import Vocab
from batcher import Batcher
import config
import data
import os

FLAGS = config.FLAGS
vocab_in, vocab_out = data.load_dict_data(FLAGS)
batcher_train = Batcher(FLAGS.data_path, vocab_in, vocab_out, FLAGS,
                        data_file='train.txt.tags')

epoch = 0
while True:
    print(epoch)
    while batcher_train.c_epoch == epoch:
        batch = batcher_train.next_batch()
    epoch += 1
print("done")
class Train(object):
    def __init__(self):
        self.vocab = Vocab(config.vocab_path, config.vocab_size)
        self.batcher = Batcher(config.train_data_path, self.vocab,
                               batch_size=config.batch_size)
        train_dir = os.path.join(config.log_root)
        if not os.path.exists(train_dir):
            os.mkdir(train_dir)
        self.model_dir = os.path.join(train_dir, 'model')
        if not os.path.exists(self.model_dir):
            os.mkdir(self.model_dir)

    def save_model(self, loss, iter_step, name=None):
        state = {
            'iter': iter_step,
            'encoder_state_dict': self.model.encoder.state_dict(),
            'decoder_state_dict': self.model.decoder.state_dict(),
            'reduce_state_dict': self.model.reduce_state.state_dict(),
            'optimizer': self.optimizer.state_dict(),
            'current_loss': loss
        }
        if name is None:
            name = 'model_{}_{}'.format(iter_step, loss)
        model_save_path = os.path.join(self.model_dir, name)
        torch.save(state, model_save_path)
        print('saved loss:', loss)
        print('******************')

    def setup_train(self, model_file_path=None):
        # Initialize the model
        self.model = Model(model_file_path)
        # Collect all model parameters
        params = list(self.model.encoder.parameters()) + list(self.model.decoder.parameters()) + \
            list(self.model.reduce_state.parameters())
        # Define the optimizer
        self.optimizer = optim.Adam(params, lr=config.adam_lr)
        # self.optimizer = optim.Adagrad(params, lr=0.15, initial_accumulator_value=0.1, eps=1e-10)
        # Initialize iteration count and loss
        start_iter, start_loss = 0, 0
        # If an existing model path was given, load it and continue training
        if model_file_path is not None:
            print('loading saved model:', model_file_path)
            state = torch.load(model_file_path,
                               map_location=lambda storage, location: storage)
            start_iter = state['iter']
            start_loss = state['current_loss']
            if not config.is_coverage:
                self.optimizer.load_state_dict(state['optimizer'])
                if USE_CUDA:
                    for state in self.optimizer.state.values():
                        for k, v in state.items():
                            if torch.is_tensor(v):
                                state[k] = v.to(DEVICE)
        return start_iter, start_loss

    def train_one_batch(self, batch):
        # enc_batch is the source sequence containing UNKs
        # c_t_1 is the initial context vector
        # extra_zeros: probabilities for OOV words, [batch_size, batch.max_art_oovs]
        enc_batch, enc_padding_mask, enc_lens, enc_batch_extend_vocab, extra_zeros, c_t_1, coverage = \
            get_input_from_batch(batch)
        # dec_batch is the plain summary sequence (with UNKs); target_batch is the target
        # sequence without UNKs: OOV words get id len(vocab) + their relative position
        dec_batch, dec_padding_mask, max_dec_len, dec_lens_var, target_batch = \
            get_output_from_batch(batch)

        self.optimizer.zero_grad()

        # [batch, seq_lens, 2*hid_dim], [batch*max(seq_lens), 2*hid_dim], [2, batch, hid_dim]
        encoder_outputs, encoder_feature, encoder_hidden = self.model.encoder(enc_batch, enc_lens)
        # The bidirectional hidden state [2, batch, hid_dim] is reduced to
        # [1, batch, hid_dim] to serve as the decoder's initial hidden state
        s_t_1 = self.model.reduce_state(encoder_hidden)  # (h, c)

        step_losses = []
        for di in range(min(max_dec_len, config.max_dec_steps)):
            # One summary word: the token at the same position across the batch
            y_t_1 = dec_batch[:, di]
            # final_dist holds a probability for every word in the extended vocabulary,
            # i.e. it is larger than the preset vocab size
            final_dist, s_t_1, c_t_1, attn_dist, p_gen, next_coverage = self.model.decoder(
                y_t_1, s_t_1, encoder_outputs, encoder_feature, enc_padding_mask,
                c_t_1, extra_zeros, enc_batch_extend_vocab, coverage, di)
            # Encoding of the next summary word, [B]
            target = target_batch[:, di]
            target_i = target.unsqueeze(1)  # [B, 1]
            # Pick out the predicted probability of each target word
            gold_probs = torch.gather(final_dist, 1, target_i).squeeze()
            # Per-word prediction loss; the torch.abs calls here are the author's guard
            # against negative values leaking through
            step_loss = -torch.log(torch.abs(gold_probs) + 1e-8)
            if config.is_coverage:
                # Sum the element-wise min of the step-t attention vector and the
                # accumulated attention of steps 1..t-1 as an extra coverage loss that
                # suppresses repetition: it pushes the current attention to stay below
                # the accumulated attention (a large accumulated value suggests the word
                # was probably generated already).
                step_coverage_loss = torch.sum(
                    torch.min(torch.abs(attn_dist), torch.abs(coverage)), 1)
                # The \lambda weight controls how strongly the repetition penalty counts
                step_loss = step_loss + config.cov_loss_wt * torch.abs(step_coverage_loss)
                # The coverage vector accumulates from here on
                coverage = next_coverage
            # Masked positions do not contribute to the loss
            step_mask = dec_padding_mask[:, di]
            step_loss = torch.abs(step_loss) * torch.abs(step_mask)
            step_losses.append(step_loss)

        # Overall loss of the sequence
        sum_losses = torch.abs(torch.sum(torch.stack(step_losses, 1), 1))
        batch_avg_loss = sum_losses / (torch.abs(dec_lens_var) + 1)
        # Overall loss of the whole batch
        loss = torch.mean(batch_avg_loss)

        loss.backward()
        # self.norm = clip_grad_norm_(self.model.encoder.parameters(), config.max_grad_norm)
        # clip_grad_norm_(self.model.decoder.parameters(), config.max_grad_norm)
        # clip_grad_norm_(self.model.reduce_state.parameters(), config.max_grad_norm)
        self.optimizer.step()
        return loss.item()

    def trainIters(self, model_file_path=None):
        # Training setup
        iter_data, loss = self.setup_train(model_file_path)
        start = time.time()
        # Total data size; loop over the data iter_loop times
        data_size = 80000
        i = 0
        min_loss = 10000
        cum_loss = 0
        while iter_data < data_size * config.iter_loop:
            # Fetch the next batch
            batch = self.batcher.next_batch()
            iter_data += batch.batch_size
            loss = self.train_one_batch(batch)
            cum_loss += loss
            i += 1
            if i % 10 == 0:
                avg_loss = cum_loss / 10
                print('cum_loss over 10 batches:', cum_loss)
                print('steps %d, seconds for %d,' % (i, time.time() - start))
                print('avg_loss over 10 batches:', avg_loss)
                start = time.time()
                cum_loss = 0
                # Only start saving after step 100, since the run may be a reload
                if avg_loss < min_loss and i > 100:
                    min_loss = avg_loss
                    self.save_model(avg_loss, i, name='best_model2')
tokens = [t if t != '\s' else ' ' for t in chars]
tokens = ''.join(tokens)
print(tokens)


if __name__ == "__main__":
    message_start = "Select the run mode for the NN:\n\t1. Greedy Search\n\t2. Beam Search\n-> "
    mode = input(message_start)
    message_iterations = "Specify the number of iterations (min 2500, more is suggested) -> "
    max_iterations = input(message_iterations)

    iteration = 0
    while iteration < int(max_iterations) + 1:
        batch = batcher.next_batch()
        model.train_on_batch(batch.input, batch.target)
        if iteration % 500 == 0:
            print('Names generated after iteration {}:'.format(iteration))
            if int(mode) == 1:
                for i in range(3):
                    make_name(model, vocab, hps)
            else:
                # "for i in range(3)" only makes sense with greedy search, because the
                # greedy decoder here is non-deterministic; with beam search it would
                # print the same name three times, since the first hypothesis is always
                # the most probable
                make_name_beam(model, vocab, hps)
            print("")
        iteration += 1  # the flattened original never incremented, looping forever
        fd = fd.cuda()
    final_lists.append(fd)

    attn_lists = []
    for j in range(max_dec_steps):
        ad = Variable(torch.rand(batch_size, attn_len))
        if use_cuda:
            ad = ad.cuda()
        attn_lists.append(ad)
    return final_lists, attn_lists


final_dists, attn_dists = forward(attn_len=args.max_enc_steps,
                                  max_dec_steps=args.max_dec_steps,
                                  batch_size=args.batch_size,
                                  extended_vsize=args.extended_vsize,
                                  use_cuda=False,
                                  mode=args.mode,
                                  pointer_gen=args.pointer_gen,
                                  use_coverage=args.coverage)

for n in range(args.num_steps):
    print("step:", n)
    batch = dataloader.next_batch()
    batch = batch2var(batch, use_cuda=False)
    loss = loss_function(final_dists, attn_dists, batch)
    print(loss)
    log('loss', loss.item(), step=n)  # loss.data[0] is deprecated in modern PyTorch
    if n % args.check_n == 0:
        save_checkpoint({'step': n + 1}, is_best=False)
    if n > 20:
        break
class Train(object):
    def __init__(self):
        self.vocab = Vocab(config.vocab_path, config.vocab_size)
        self.batcher = Batcher(config.train_data_path, self.vocab, mode='train',
                               batch_size=config.batch_size, single_pass=False)
        time.sleep(5)

        if not os.path.exists(config.log_root):
            os.makedirs(config.log_root)

        self.model_dir = os.path.join(config.log_root, 'train_model')
        if not os.path.exists(self.model_dir):
            os.mkdir(self.model_dir)
        self.eval_log = os.path.join(config.log_root, 'eval_log')
        if not os.path.exists(self.eval_log):
            os.mkdir(self.eval_log)
        self.summary_writer = tf.compat.v1.summary.FileWriter(self.eval_log)

    def save_model(self, running_avg_loss, iter, mode):
        state = {
            'iter': iter,
            'encoder_state_dict': self.model.encoder.state_dict(),
            'decoder_state_dict': self.model.decoder.state_dict(),
            'reduce_state_dict': self.model.reduce_state.state_dict(),
            'optimizer': self.optimizer.state_dict(),
            'current_loss': running_avg_loss
        }
        if mode == 'train':
            save_model_dir = self.model_dir
        else:
            best_model_dir = os.path.join(config.log_root, 'best_model')
            if not os.path.exists(best_model_dir):
                os.mkdir(best_model_dir)
            save_model_dir = best_model_dir
            if len(os.listdir(save_model_dir)) > 0:
                shutil.rmtree(save_model_dir)
                time.sleep(2)
                os.mkdir(save_model_dir)
        train_model_path = os.path.join(save_model_dir, 'model_best_%d' % (iter))
        torch.save(state, train_model_path)
        return train_model_path

    def setup_train(self, model_file_path=None, emb_v_path=None, emb_list_path=None,
                    vocab=None, log=None):
        self.model = Model(model_file_path)
        if model_file_path is None:
            set_embedding(self.model, emb_v_path=emb_v_path, emb_list_path=emb_list_path,
                          vocab=self.vocab, use_cuda=use_cuda, log=log)

        params = list(self.model.encoder.parameters()) + list(self.model.decoder.parameters()) + \
            list(self.model.reduce_state.parameters())
        initial_lr = config.lr_coverage if config.is_coverage else config.lr
        if config.mode == 'MLE':
            self.optimizer = Adagrad(params, lr=0.15, initial_accumulator_value=0.1)
        else:
            self.optimizer = Adam(params, lr=initial_lr)

        start_iter, start_loss = 0, 0
        if model_file_path is not None:
            state = torch.load(model_file_path,
                               map_location=lambda storage, location: storage)
            start_iter = state['iter']
            start_loss = state['current_loss']
        return start_iter, start_loss

    def train_one_batch(self, batch, alpha, beta):
        enc_batch, enc_padding_mask, enc_lens, enc_batch_extend_vocab, extra_zeros, c_t_1, coverage = \
            get_input_from_batch(batch, use_cuda)
        dec_batch, dec_padding_mask, max_dec_len, dec_lens_var, target_batch = \
            get_output_from_batch(batch, use_cuda)

        self.optimizer.zero_grad()

        encoder_outputs, encoder_feature, encoder_hidden = self.model.encoder(enc_batch, enc_lens)
        s_t_1 = self.model.reduce_state(encoder_hidden)
        nll_list = []

        gen_summary = torch.LongTensor(
            config.batch_size * [config.sample_size * [[2]]])  # B x S x 1
        if use_cuda:
            gen_summary = gen_summary.cuda()
        preds_y = gen_summary.squeeze(2)  # B x S

        for di in range(min(config.max_dec_steps, dec_batch.size(1))):
            # Select the current input word
            p1 = np.random.uniform()
            if p1 < alpha:  # use ground truth word
                y_t_1 = dec_batch[:, di]
            else:           # use decoded word
                y_t_1 = preds_y[:, 0]

            final_dist, s_t_1, c_t_1, attn_dist, p_gen, next_coverage = self.model.decoder(
                y_t_1, s_t_1, encoder_outputs, encoder_feature, enc_padding_mask,
                c_t_1, extra_zeros, enc_batch_extend_vocab, coverage, di)

            # Select the current output word
            p2 = np.random.uniform()
            if p2 < beta:  # sample the ground truth word
                target = target_batch[:, di]
                sampled_batch = torch.stack(config.sample_size * [target], 1)  # B x S
            else:          # randomly sample a word with the given probabilities
                sampled_batch = torch.multinomial(final_dist, config.sample_size,
                                                  replacement=True)  # B x S

            # Compute the NLL
            probs = torch.gather(final_dist, 1, sampled_batch).squeeze()
            step_nll = -torch.log(probs + config.eps)

            if config.is_coverage:
                step_coverage_loss = torch.sum(torch.min(attn_dist, coverage), 1)
                step_nll = step_nll + config.cov_loss_wt * step_coverage_loss
                coverage = next_coverage
            nll_list.append(step_nll)

            # Store the decoded words in preds_y
            preds_y = gen_preds(sampled_batch, use_cuda)
            # Add the decoded words into gen_summary (mixed ground truth and decoded words)
            gen_summary = torch.cat((gen_summary, preds_y.unsqueeze(2)), 2)  # B x S x L

        # compute the REINFORCE score
        nll = torch.sum(torch.stack(nll_list, 2), 2)  # B x S
        all_rewards, avg_reward = compute_reward(batch, gen_summary, self.vocab,
                                                 config.mode, use_cuda)  # B x S, 1
        batch_loss = torch.sum(nll * all_rewards, dim=1)  # B
        loss = torch.mean(batch_loss)

        loss.backward()

        self.norm = clip_grad_norm_(self.model.encoder.parameters(), config.max_grad_norm)
        clip_grad_norm_(self.model.decoder.parameters(), config.max_grad_norm)
        clip_grad_norm_(self.model.reduce_state.parameters(), config.max_grad_norm)

        self.optimizer.step()
        return loss.item(), avg_reward.item()

    def trainIters(self, n_iters, model_file_path=None):
        if config.mode not in ["MLE", "RL", "GTI", "SO", "SIO", "DAGGER", "DAGGER*"]:
            print("\nTRAINING MODE ERROR\n")
            raise ValueError

        # log file path
        log_path = os.path.join(config.log_root, 'log')
        log = open(log_path, 'w')
        print_log("==============================", file=log)

        iter, running_avg_loss = self.setup_train(model_file_path,
                                                  emb_v_path=config.emb_v_path,
                                                  emb_list_path=config.vocab_path,
                                                  vocab=self.vocab, log=log)
        min_val_loss = np.inf
        alpha = config.alpha
        beta = config.beta
        k1 = config.k1
        k2 = config.k2
        delay = iter  # set to 0 in the original code (wyu-du)

        print("\nLog root is %s" % config.log_root)
        print_log("Train mode is %s" % config.mode, file=log)
        print_log("k1: %s, k2: %s" % (config.k1, config.k2), file=log)
        print_log("==============================", file=log)

        cur_time = time.time()
        while iter < n_iters:
            if config.mode == 'RL':
                alpha = 0.
                beta = 0.
            elif config.mode == 'GTI':
                alpha = 1.
                beta = 0.
            elif config.mode == 'SO':
                alpha = 1.
                beta = k2 / (k2 + np.exp((iter - delay) / k2))
            elif config.mode == 'SIO':
                alpha *= k1
                if alpha < 0.01:
                    beta = k2 / (k2 + np.exp((iter - delay) / k2))
                else:
                    beta = 1.
                    delay += 1
            elif config.mode == 'DAGGER':
                alpha *= k1
                beta = 1.
            elif config.mode == 'DAGGER*':
                alpha = config.alpha
                beta = 1.
            else:
                alpha = 1.
                beta = 1.

            batch = self.batcher.next_batch()
            loss, avg_reward = self.train_one_batch(batch, alpha, beta)
            running_avg_loss = calc_running_avg_loss(loss, running_avg_loss,
                                                     self.summary_writer, iter)
            iter += 1

            if iter % config.print_interval == 0:
                print_log('steps %d, current_loss: %f, avg_reward: %f, alpha: %f, beta: %f, delay: %d' %
                          (iter, loss, avg_reward, alpha, beta, delay), file=log)

            if iter % config.save_model_iter == 0:
                model_file_path = self.save_model(running_avg_loss, iter, mode='train')
                evl_model = Evaluate(model_file_path)
                val_avg_loss = evl_model.run_eval()
                if val_avg_loss < min_val_loss:
                    min_val_loss = val_avg_loss
                    best_model_file_path = self.save_model(running_avg_loss, iter, mode='eval')
                    print_log('Save best model at %s' % best_model_file_path, file=log)
                print_log('steps %d, train_loss: %f, val_loss: %f, time: %ds' %
                          (iter, loss, val_avg_loss, time.time() - cur_time), file=log)
                # write val_loss into tensorboard
                loss_sum = tf.compat.v1.Summary()
                loss_sum.value.add(tag='val_avg_loss', simple_value=val_avg_loss)
                self.summary_writer.add_summary(loss_sum, global_step=iter)
                self.summary_writer.flush()
                cur_time = time.time()
        log.close()
class Train(object):
    def __init__(self):
        self.vocab = Vocab(config.vocab_path, config.vocab_size)
        self.batcher = Batcher(config.train_data_path, self.vocab, mode='train',
                               batch_size=config.batch_size, single_pass=False)
        time.sleep(15)

        train_dir = os.path.join(config.log_root, 'train_%d' % (int(time.time())))
        if not os.path.exists(train_dir):
            os.mkdir(train_dir)

        self.model_dir = os.path.join(train_dir, 'model')
        if not os.path.exists(self.model_dir):
            os.mkdir(self.model_dir)

        self.summary_writer = tf.summary.FileWriter(train_dir)

    def save_model(self, running_avg_loss, iter):
        state = {
            'iter': iter,
            'encoder_state_dict': self.model.encoder.state_dict(),
            'decoder_state_dict': self.model.decoder.state_dict(),
            'reduce_state_dict': self.model.reduce_state.state_dict(),
            'optimizer': self.optimizer.state_dict(),
            # this key must match the 'current_loss' read back in setup_train
            # (the original saved it as 'current_closs', breaking checkpoint reload)
            'current_loss': running_avg_loss
        }
        model_save_path = os.path.join(self.model_dir,
                                       'model_%d_%d' % (iter, int(time.time())))
        torch.save(state, model_save_path)

    def setup_train(self, model_file_path=None):
        self.model = Model(model_file_path)

        params = list(self.model.encoder.parameters()) + list(self.model.decoder.parameters()) + \
            list(self.model.reduce_state.parameters())
        initial_lr = config.lr_coverage if config.is_coverage else config.lr
        self.optimizer = Adagrad(params, lr=initial_lr,
                                 initial_accumulator_value=config.adagrad_init_acc)

        start_iter, start_loss = 0, 0
        if model_file_path is not None:
            state = torch.load(model_file_path,
                               map_location=lambda storage, location: storage)
            start_iter = state['iter']
            start_loss = state['current_loss']

            if not config.is_coverage:
                self.optimizer.load_state_dict(state['optimizer'])
                if use_cuda:
                    for state in self.optimizer.state.values():
                        for k, v in state.items():
                            if torch.is_tensor(v):
                                state[k] = v.cuda()
        return start_iter, start_loss

    def train_one_batch(self, batch):
        enc_batch, enc_padding_mask, enc_lens, enc_batch_extend_vocab, extra_zeros, c_t_1, coverage, wr_attention = \
            get_input_from_batch(batch, use_cuda)
        dec_batch, dec_padding_mask, max_dec_len, dec_lens_var, target_batch = \
            get_output_from_batch(batch, use_cuda)

        self.optimizer.zero_grad()

        encoder_outputs, encoder_feature, encoder_hidden = self.model.encoder(enc_batch, enc_lens)
        s_t_1 = self.model.reduce_state(encoder_hidden)

        step_losses = []
        for di in range(min(max_dec_len, config.max_dec_steps)):
            y_t_1 = dec_batch[:, di]  # Teacher forcing
            final_dist, s_t_1, c_t_1, attn_dist, p_gen, next_coverage = self.model.decoder(
                y_t_1, s_t_1, encoder_outputs, encoder_feature, enc_padding_mask,
                c_t_1, extra_zeros, enc_batch_extend_vocab, coverage, di, wr_attention)
            target = target_batch[:, di]
            gold_probs = torch.gather(final_dist, 1, target.unsqueeze(1)).squeeze()
            step_loss = -torch.log(gold_probs + config.eps)
            if config.is_coverage:
                step_coverage_loss = torch.sum(torch.min(attn_dist, coverage), 1)
                step_loss = step_loss + config.cov_loss_wt * step_coverage_loss
                coverage = next_coverage

            step_mask = dec_padding_mask[:, di]
            step_loss = step_loss * step_mask
            step_losses.append(step_loss)

        sum_losses = torch.sum(torch.stack(step_losses, 1), 1)
        batch_avg_loss = sum_losses / dec_lens_var
        loss = torch.mean(batch_avg_loss)

        loss.backward()

        self.norm = clip_grad_norm_(self.model.encoder.parameters(), config.max_grad_norm)
        clip_grad_norm_(self.model.decoder.parameters(), config.max_grad_norm)
        clip_grad_norm_(self.model.reduce_state.parameters(), config.max_grad_norm)

        self.optimizer.step()
        return loss.item()

    def trainIters(self, n_iters, model_file_path=None):
        iter, running_avg_loss = self.setup_train(model_file_path)
        start = time.time()
        while iter < n_iters:
            batch = self.batcher.next_batch()
            loss = self.train_one_batch(batch)
            print("loss: ", loss, "step:", iter)
            if math.isnan(loss):
                exit()
            running_avg_loss = calc_running_avg_loss(loss, running_avg_loss,
                                                     self.summary_writer, iter)
            iter += 1

            if iter % 100 == 0:
                self.summary_writer.flush()
            print_interval = 1000
            if iter % print_interval == 0:
                print('steps %d, seconds for %d batch: %.2f , loss: %f'
                      % (iter, print_interval, time.time() - start, loss))
                start = time.time()
            if iter % 5000 == 0:
                self.save_model(running_avg_loss, iter)
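# Hedged sketch of the calc_running_avg_loss helper these loops call, in the shape
# commonly seen in pointer-generator forks: an exponential moving average, clipped so an
# early spike does not wreck the TensorBoard plot. The decay and clip values are
# assumptions, not taken from this project.
def calc_running_avg_loss_sketch(loss, running_avg_loss, decay=0.99):
    if running_avg_loss == 0:  # on the first iteration, just take the loss
        running_avg_loss = loss
    else:
        running_avg_loss = running_avg_loss * decay + (1 - decay) * loss
    return min(running_avg_loss, 12)  # clip (assumed value)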
class Train(object):
    def __init__(self):
        self.vocab = Vocab(config.vocab_path, config.vocab_size)
        self.batcher = Batcher(config.train_data_path, self.vocab, mode='train',
                               batch_size=config.batch_size, single_pass=False)
        time.sleep(15)

        stamp = time.strftime("%Y%m%d_%H%M%S", time.localtime())
        train_dir = os.path.join(config.log_root, 'train_{}'.format(stamp))
        if not os.path.exists(train_dir):
            os.mkdir(train_dir)

        self.model_dir = os.path.join(train_dir, 'model')
        if not os.path.exists(self.model_dir):
            os.mkdir(self.model_dir)

        self.summary_writer = tf.summary.FileWriter(train_dir)

    def save_model(self, running_avg_loss, iter_step):
        """Save a model checkpoint."""
        state = {
            'iter': iter_step,
            'encoder_state_dict': self.model.encoder.state_dict(),
            'decoder_state_dict': self.model.decoder.state_dict(),
            'reduce_state_dict': self.model.reduce_state.state_dict(),
            'optimizer': self.optimizer.state_dict(),
            'current_loss': running_avg_loss
        }
        stamp = time.strftime("%Y%m%d_%H%M%S", time.localtime())
        model_save_path = os.path.join(self.model_dir,
                                       'model_{}_{}'.format(iter_step, stamp))
        torch.save(state, model_save_path)

    def setup_train(self, model_file_path=None):
        """Build or load the model; initialize the step count, loss, and optimizer."""
        # Initialize the model
        self.model = Model(model_file_path)
        # Collect all model parameters
        params = list(self.model.encoder.parameters()) + list(self.model.decoder.parameters()) + \
            list(self.model.reduce_state.parameters())
        total_params = sum([param[0].nelement() for param in params])
        print('The number of params of model: %.3f million\n' % (total_params / 1e6))
        # Define the optimizer (AdagradCustom instead of Adam)
        # self.optimizer = optim.Adam(params, lr=config.adam_lr)
        initial_lr = config.lr_coverage if config.is_coverage else config.lr
        self.optimizer = AdagradCustom(params, lr=initial_lr,
                                       initial_accumulator_value=config.adagrad_init_acc)
        # Initialize iteration count and loss
        start_iter, start_loss = 0, 0
        # If an existing model path was given, load it and continue training
        if model_file_path is not None:
            state = torch.load(model_file_path,
                               map_location=lambda storage, location: storage)
            start_iter = state['iter']
            start_loss = state['current_loss']
            if not config.is_coverage:
                self.optimizer.load_state_dict(state['optimizer'])
                if USE_CUDA:
                    for state in self.optimizer.state.values():
                        for k, v in state.items():
                            if torch.is_tensor(v):
                                state[k] = v.to(DEVICE)
        return start_iter, start_loss

    def train_one_batch(self, batch):
        """
        Train one batch and return its loss.
        enc_batch: torch.Size([16, 400]), encodings of 16 articles, padded to 400 words;
            OOV words are encoded as 0;
        enc_padding_mask: torch.Size([16, 400]), 0 at pad positions, 1 elsewhere;
        enc_lens: numpy.ndarray, the word count of each article;
        enc_batch_extend_vocab: torch.Size([16, 400]), article encodings where OOV words
            get ids beyond the vocabulary;
        extra_zeros: torch.Size([16, n_article_oovs]) zero tensor;
        c_t_1: torch.Size([16, 512]) zero tensor;
        coverage: Variable(torch.zeros(batch_size, max_enc_seq_len)) if is_coverage==True
            else None; filled in later steps in coverage mode.
        ----------------------------------------
        dec_batch: torch.Size([16, 100]), summary encodings with the START symbol and PAD;
        dec_padding_mask: torch.Size([16, 100]), 0 at pad positions, 1 elsewhere;
        max_dec_len: scalar, the number of summary words, excluding pad;
        dec_lens_var: torch.Size([16]), summary word counts;
        target_batch: torch.Size([16, 100]), target summary encodings with STOP and PAD.
        """
        enc_batch, enc_padding_mask, enc_lens, enc_batch_extend_vocab, extra_zeros, c_t_1, coverage = \
            get_input_from_batch(batch)
        dec_batch, dec_padding_mask, max_dec_len, dec_lens_var, target_batch = \
            get_output_from_batch(batch)

        self.optimizer.zero_grad()

        # [B, max(seq_lens), 2*hid_dim], [B*max(seq_lens), 2*hid_dim], tuple([2, B, hid_dim], [2, B, hid_dim])
        encoder_outputs, encoder_feature, encoder_hidden = self.model.encoder(enc_batch, enc_lens)
        s_t_1 = self.model.reduce_state(encoder_hidden)  # (h,c) = ([1, B, hid_dim], [1, B, hid_dim])

        step_losses = []
        for di in range(min(max_dec_len, config.max_dec_steps)):
            # One summary word: the token at the same position across the batch
            y_t_1 = dec_batch[:, di]
            final_dist, s_t_1, c_t_1, attn_dist, p_gen, next_coverage = self.model.decoder(
                y_t_1, s_t_1, encoder_outputs, encoder_feature, enc_padding_mask,
                c_t_1, extra_zeros, enc_batch_extend_vocab, coverage, di)
            # Encoding of the next summary word; final_dist holds a probability for
            # every word in the extended vocabulary, i.e. beyond the preset 50_000
            target = target_batch[:, di]
            # Pick out the probability of each target word
            gold_probs = torch.gather(final_dist, 1, target.unsqueeze(1)).squeeze()
            # Maximize gold_probs, i.e. minimize step_loss (hence the minus sign)
            step_loss = -torch.log(gold_probs + config.eps)
            if config.is_coverage:
                step_coverage_loss = torch.sum(torch.min(attn_dist, coverage), 1)
                step_loss = step_loss + config.cov_loss_wt * step_coverage_loss
                coverage = next_coverage
            step_mask = dec_padding_mask[:, di]
            step_loss = step_loss * step_mask
            step_losses.append(step_loss)

        sum_losses = torch.sum(torch.stack(step_losses, 1), 1)
        batch_avg_loss = sum_losses / dec_lens_var
        loss = torch.mean(batch_avg_loss)

        loss.backward()

        self.norm = clip_grad_norm_(self.model.encoder.parameters(), config.max_grad_norm)
        clip_grad_norm_(self.model.decoder.parameters(), config.max_grad_norm)
        clip_grad_norm_(self.model.reduce_state.parameters(), config.max_grad_norm)

        self.optimizer.step()
        return loss.item()

    def trainIters(self, n_iters, model_file_path=None):
        # Training setup
        iter_step, running_avg_loss = self.setup_train(model_file_path)
        start = time.time()
        while iter_step < n_iters:
            # Fetch the next batch
            batch = self.batcher.next_batch()
            loss = self.train_one_batch(batch)
            running_avg_loss = calc_running_avg_loss(loss, running_avg_loss,
                                                     self.summary_writer, iter_step)
            iter_step += 1

            if iter_step % 100 == 0:
                self.summary_writer.flush()
                print('steps %d, seconds for %d steps: %.2f, loss: %f'
                      % (iter_step, 100, time.time() - start, loss))
                start = time.time()
            # Save the model every 1000 iterations
            if iter_step % 1000 == 0:
                self.save_model(running_avg_loss, iter_step)
class Evaluate(object):
    def __init__(self, model_file_path):
        self.vocab = Vocab(config.vocab_path, config.vocab_size)
        self.batcher = Batcher(config.eval_data_path, self.vocab, mode='eval',
                               batch_size=config.batch_size, single_pass=True)
        time.sleep(15)
        model_name = os.path.basename(model_file_path)

        eval_dir = os.path.join(config.log_root, 'eval_%s' % (model_name))
        if not os.path.exists(eval_dir):
            os.mkdir(eval_dir)
        self.summary_writer = tf.summary.FileWriter(eval_dir)

        self.model = Model(model_file_path, is_eval=True)

    def eval_one_batch(self, batch):
        enc_batch, enc_padding_mask, enc_lens, enc_batch_extend_vocab, extra_zeros, c_t_1, coverage = \
            get_input_from_batch(batch, use_cuda)
        dec_batch, dec_padding_mask, max_dec_len, dec_lens_var, target_batch = \
            get_output_from_batch(batch, use_cuda)

        encoder_outputs, encoder_feature, encoder_hidden = self.model.encoder(enc_batch, enc_lens)
        s_t_1 = self.model.reduce_state(encoder_hidden)

        step_losses = []
        for di in range(min(max_dec_len, config.max_dec_steps)):
            y_t_1 = dec_batch[:, di]  # Teacher forcing
            final_dist, s_t_1, c_t_1, attn_dist, p_gen, next_coverage = self.model.decoder(
                y_t_1, s_t_1, encoder_outputs, encoder_feature, enc_padding_mask,
                c_t_1, extra_zeros, enc_batch_extend_vocab, coverage, di)
            target = target_batch[:, di]
            gold_probs = torch.gather(final_dist, 1, target.unsqueeze(1)).squeeze()
            step_loss = -torch.log(gold_probs + config.eps)
            if config.is_coverage:
                step_coverage_loss = torch.sum(torch.min(attn_dist, coverage), 1)
                step_loss = step_loss + config.cov_loss_wt * step_coverage_loss
                coverage = next_coverage

            step_mask = dec_padding_mask[:, di]
            step_loss = step_loss * step_mask
            step_losses.append(step_loss)

        sum_step_losses = torch.sum(torch.stack(step_losses, 1), 1)
        batch_avg_loss = sum_step_losses / dec_lens_var
        loss = torch.mean(batch_avg_loss)

        return loss.item()  # loss.data[0] is deprecated in modern PyTorch

    def run_eval(self):
        running_avg_loss, iter = 0, 0
        start = time.time()
        batch = self.batcher.next_batch()
        while batch is not None:
            loss = self.eval_one_batch(batch)
            running_avg_loss = calc_running_avg_loss(loss, running_avg_loss,
                                                     self.summary_writer, iter)
            iter += 1

            if iter % 100 == 0:
                self.summary_writer.flush()
            print_interval = 1000
            if iter % print_interval == 0:
                print('steps %d, seconds for %d batch: %.2f , loss: %f'
                      % (iter, print_interval, time.time() - start, running_avg_loss))
                start = time.time()
            batch = self.batcher.next_batch()
out_bid = tf.nn.softmax(tf.matmul(output, softmax_w), name='out_bid')
for i, next_i in enumerate(next_state):
    tf.identity(next_i.c, name='next_c_{}'.format(i))
    tf.identity(next_i.h, name='next_h_{}'.format(i))

cost = tf.losses.softmax_cross_entropy(out_bid_target, out_bid_logit)
train_step = tf.train.AdamOptimizer(0.001).minimize(cost)

batch = Batcher(n_examples, batch_size)
cost_batch = Batcher(n_examples, 10000)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    saver = tf.train.Saver(max_to_keep=20)
    for i in range(n_iterations):
        x_batch, y_batch = batch.next_batch([X_train, y_train])
        if i % display_step == 0:
            x_cost, y_cost = cost_batch.next_batch([X_train, y_train])
            c_train = sess.run(cost, feed_dict={seq_in: x_cost, seq_out: y_cost,
                                                keep_prob: 1.0})
            c_valid = sess.run(cost, feed_dict={seq_in: X_val, seq_out: y_val,
                                                keep_prob: 1.0})
            print('{}. c_train={} c_valid={}'.format(i, c_train, c_valid))
            sys.stdout.flush()
            saver.save(sess, model_path, global_step=i)
        sess.run(train_step, feed_dict={seq_in: x_batch, seq_out: y_batch,
                                        keep_prob: 0.8})
    saver.save(sess, model_path, global_step=n_iterations)
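# A minimal sketch of the index-based Batcher these TF loops assume: constructed with
# (n_examples, batch_size), and next_batch slices every array in the list it is given.
# This is a guess at the contract (e.g. sequential rather than shuffled order), not the
# project's actual implementation.
import numpy as np

class MiniBatcher(object):
    def __init__(self, n_examples, batch_size):
        self.n, self.batch_size, self.i = n_examples, batch_size, 0

    def next_batch(self, arrays):
        # take the next batch_size indices, wrapping around at the end of the data
        idx = np.arange(self.i, self.i + self.batch_size) % self.n
        self.i = (self.i + self.batch_size) % self.n
        return [a[idx] for a in arrays]

X = np.arange(20).reshape(10, 2)
y = np.arange(10)
b = MiniBatcher(n_examples=10, batch_size=4)
x_batch, y_batch = b.next_batch([X, y])
print(x_batch.shape, y_batch)  # (4, 2) [0 1 2 3]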
class Train(object):
    def __init__(self):
        self.vocab = Vocab(config.vocab_path, config.vocab_size)
        self.batcher = Batcher(config.train_data_path, self.vocab, mode='train',
                               batch_size=config.batch_size, single_pass=False)
        time.sleep(5)
        if not os.path.exists(config.log_root):
            os.mkdir(config.log_root)
        self.model_dir = os.path.join(config.log_root, 'train_model')
        if not os.path.exists(self.model_dir):
            os.mkdir(self.model_dir)
        self.eval_log = os.path.join(config.log_root, 'eval_log')
        if not os.path.exists(self.eval_log):
            os.mkdir(self.eval_log)
        self.summary_writer = tf.compat.v1.summary.FileWriter(self.eval_log)

    def save_model(self, running_avg_loss, iter, mode):
        state = {
            'iter': iter,
            'encoder_state_dict': self.model.encoder.state_dict(),
            'decoder_state_dict': self.model.decoder.state_dict(),
            'reduce_state_dict': self.model.reduce_state.state_dict(),
            'optimizer': self.optimizer.state_dict(),
            'current_loss': running_avg_loss
        }
        if mode == 'train':
            save_model_dir = self.model_dir
        else:
            best_model_dir = os.path.join(config.log_root, 'best_model')
            if not os.path.exists(best_model_dir):
                os.mkdir(best_model_dir)
            save_model_dir = best_model_dir
            if len(os.listdir(save_model_dir)) > 0:
                shutil.rmtree(save_model_dir)
                time.sleep(2)
                os.mkdir(save_model_dir)
        train_model_path = os.path.join(save_model_dir, 'model_best_%d' % (iter))
        torch.save(state, train_model_path)
        return train_model_path

    def setup_train(self, model_file_path=None):
        self.model = Model(model_file_path)
        params = list(self.model.encoder.parameters()) + list(self.model.decoder.parameters()) + \
            list(self.model.reduce_state.parameters())
        initial_lr = config.lr_coverage if config.is_coverage else config.lr
        if config.mode == 'MLE':
            self.optimizer = Adagrad(params, lr=0.15, initial_accumulator_value=0.1)
        else:
            self.optimizer = Adam(params, lr=initial_lr)
        start_iter, start_loss = 0, 0
        if model_file_path is not None:
            state = torch.load(model_file_path, map_location=lambda storage, location: storage)
            start_iter = state['iter']
            start_loss = state['current_loss']
        return start_iter, start_loss

    def train_one_batch(self, batch, alpha, beta):
        enc_batch, enc_padding_mask, enc_lens, enc_batch_extend_vocab, extra_zeros, c_t_1, coverage = \
            get_input_from_batch(batch, use_cuda)
        dec_batch, dec_padding_mask, max_dec_len, dec_lens_var, target_batch = \
            get_output_from_batch(batch, use_cuda)

        self.optimizer.zero_grad()
        encoder_outputs, encoder_feature, encoder_hidden = self.model.encoder(enc_batch, enc_lens)
        s_t_1 = self.model.reduce_state(encoder_hidden)
        nll_list = []
        # sample_size = number of summaries sampled per example; every sampled
        # summary starts with token id 2 (presumably the decoder start id).
        gen_summary = torch.LongTensor(config.batch_size * [config.sample_size * [[2]]])  # B x S x 1
        if use_cuda:
            gen_summary = gen_summary.cuda()
        preds_y = gen_summary.squeeze(2)  # B x S
        for di in range(min(config.max_dec_steps, dec_batch.size(1))):
            # Select the current input word
            p1 = np.random.uniform()
            if p1 < alpha:  # use the ground-truth word
                y_t_1 = dec_batch[:, di]
            else:  # use the decoded word
                y_t_1 = preds_y[:, 0]
            final_dist, s_t_1, c_t_1, attn_dist, p_gen, next_coverage = self.model.decoder(
                y_t_1, s_t_1, encoder_outputs, encoder_feature, enc_padding_mask,
                c_t_1, extra_zeros, enc_batch_extend_vocab, coverage, di)
            # Select the current output word
            p2 = np.random.uniform()
            if p2 < beta:  # sample the ground-truth word
                target = target_batch[:, di]
                sampled_batch = torch.stack(config.sample_size * [target], 1)  # B x S
            else:  # randomly sample a word with the given probabilities
                sampled_batch = torch.multinomial(final_dist, config.sample_size,
                                                  replacement=True)  # B x S
            # Compute the NLL
            probs = torch.gather(final_dist, 1, sampled_batch).squeeze()
            step_nll = -torch.log(probs + config.eps)
            if config.is_coverage:
                step_coverage_loss = torch.sum(torch.min(attn_dist, coverage), 1)
                step_nll = step_nll + config.cov_loss_wt * step_coverage_loss
                coverage = next_coverage
            nll_list.append(step_nll)
            # Store the decoded words in preds_y
            preds_y = gen_preds(sampled_batch, use_cuda)
            # Append the decoded words to gen_summary (a mix of ground-truth and decoded words)
            gen_summary = torch.cat((gen_summary, preds_y.unsqueeze(2)), 2)  # B x S x L
        # Compute the REINFORCE score
        nll = torch.sum(torch.stack(nll_list, 2), 2)  # B x S
        all_rewards, avg_reward = compute_reward(batch, gen_summary, self.vocab,
                                                 config.mode, use_cuda)  # B x S, 1
        batch_loss = torch.sum(nll * all_rewards, dim=1)  # B
        loss = torch.mean(batch_loss)

        loss.backward()
        self.norm = clip_grad_norm_(self.model.encoder.parameters(), config.max_grad_norm)
        clip_grad_norm_(self.model.decoder.parameters(), config.max_grad_norm)
        clip_grad_norm_(self.model.reduce_state.parameters(), config.max_grad_norm)
        self.optimizer.step()
        return loss.item(), avg_reward.item()

    def trainIters(self, n_iters, model_file_path=None):
        iter, running_avg_loss = self.setup_train(model_file_path)
        min_val_loss = np.inf
        alpha = config.alpha
        beta = config.beta
        k1 = config.k1
        k2 = config.k2
        delay = 0
        while iter < n_iters:
            if config.mode == 'RL':
                alpha = 0.
                beta = 0.
            elif config.mode == 'GTI':
                alpha = 1.
                beta = 0.
            elif config.mode == 'SO':
                alpha = 1.
                beta = k2 / (k2 + np.exp((iter - delay) / k2))
            elif config.mode == 'SIO':
                alpha *= k1
                if alpha < 0.01:
                    beta = k2 / (k2 + np.exp((iter - delay) / k2))
                else:
                    beta = 1.
                    delay += 1
            elif config.mode == 'DAGGER':
                alpha *= k1
                beta = 1.
            elif config.mode == 'DAGGER*':
                alpha = config.alpha
                beta = 1.
            else:  # default (e.g. MLE): full teacher forcing
                alpha = 1.
                beta = 1.
            batch = self.batcher.next_batch()
            loss, avg_reward = self.train_one_batch(batch, alpha, beta)
            running_avg_loss = calc_running_avg_loss(loss, running_avg_loss,
                                                     self.summary_writer, iter)
            iter += 1
            if iter % config.print_interval == 0:
                print('steps %d, current_loss: %f, avg_reward: %f' % (iter, loss, avg_reward))
            if iter % config.save_model_iter == 0:
                model_file_path = self.save_model(running_avg_loss, iter, mode='train')
                evl_model = Evaluate(model_file_path)
                val_avg_loss = evl_model.run_eval()
                if val_avg_loss < min_val_loss:
                    min_val_loss = val_avg_loss
                    best_model_file_path = self.save_model(running_avg_loss, iter, mode='eval')
                    print('Save best model at %s' % best_model_file_path)
                print('steps %d, train_loss: %f, val_loss: %f' % (iter, loss, val_avg_loss))
                # write val_loss to TensorBoard
                loss_sum = tf.compat.v1.Summary()
                loss_sum.value.add(tag='val_avg_loss', simple_value=val_avg_loss)
                self.summary_writer.add_summary(loss_sum, global_step=iter)
                self.summary_writer.flush()
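# For reference, the SO/SIO modes above anneal beta with an inverse-sigmoid
# schedule, beta = k2 / (k2 + exp((iter - delay) / k2)). A standalone sketch
# of that decay (k2 = 500 is an illustrative value; config supplies the real one):
import numpy as np

def beta_schedule(iter_step, delay=0, k2=500.):
    """Inverse-sigmoid decay: ~1.0 early in training, tending toward 0.0."""
    return k2 / (k2 + np.exp((iter_step - delay) / k2))

for it in [0, 1000, 2000, 4000]:
    print(it, round(beta_schedule(it), 3))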
batch = Batcher(n_examples, batch_size)
cost_train_batch = Batcher(n_examples, 10000)
cost_val_batch = Batcher(100000, 10000)

# run the session
model_path = sys.argv[3]
with tf.Session() as sess:
    sess.run(init)
    saver = tf.train.Saver(max_to_keep=100)
    for iteration in range(n_iterations // display_step):
        for i in range(display_step):
            x_batch, y_batch = batch.next_batch([X_train, y_train])
            train_step.run(feed_dict={X: x_batch, Y: y_batch, keep_prob: dropout_keep})
        saver.save(sess, model_path, global_step=iteration * display_step)
        sys.stdout.write('*')
        x_batch_c, y_batch_c = cost_train_batch.next_batch([X_train, y_train])
        x_batch_v, y_batch_v = cost_val_batch.next_batch([X_val, y_val])
        c = sess.run(cost, feed_dict={
            X: x_batch_c,
            Y: y_batch_c,
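# The Batcher class used above is not shown. A minimal hypothetical sketch,
# consistent only with how it is called here (aligned random mini-batches drawn
# from parallel arrays), might look like this:
import numpy as np

class Batcher(object):
    """Hypothetical sketch: yields aligned random mini-batches from arrays."""
    def __init__(self, n_examples, batch_size):
        self.n_examples = n_examples
        self.batch_size = batch_size

    def next_batch(self, arrays):
        # Draw one set of row indices and slice every passed array with it,
        # so the returned batches stay aligned across inputs and labels.
        idx = np.random.choice(self.n_examples, self.batch_size, replace=False)
        return [a[idx] for a in arrays]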
class Train(object):
    def __init__(self):
        self.vocab = Vocab(config.vocab_path, config.vocab_size)
        self.batcher = Batcher(config.train_data_path, self.vocab, mode='train',
                               batch_size=config.batch_size, single_pass=False)
        time.sleep(15)
        stamp = time.strftime("%Y%m%d_%H%M%S", time.localtime())
        train_dir = os.path.join(config.log_root)
        if not os.path.exists(train_dir):
            os.mkdir(train_dir)
        self.model_dir = os.path.join(train_dir, 'model')
        if not os.path.exists(self.model_dir):
            os.mkdir(self.model_dir)
        # self.summary_writer = tf.summary.FileWriter(train_dir)

    def save_model(self, running_avg_loss, iter_step):
        """Save the model."""
        state = {
            'iter': iter_step,
            'encoder_state_dict': self.model.encoder.state_dict(),
            'decoder_state_dict': self.model.decoder.state_dict(),
            'reduce_state_dict': self.model.reduce_state.state_dict(),
            'optimizer': self.optimizer.state_dict(),
            'current_loss': running_avg_loss
        }
        stamp = time.strftime("%Y%m%d_%H%M%S", time.localtime())
        model_save_path = os.path.join(self.model_dir, 'model_{}'.format(iter_step))
        torch.save(state, model_save_path)

    def setup_train(self, model_file_path=None):
        # Initialize the model
        self.model = Model(model_file_path)
        # List of model parameters
        params = list(self.model.encoder.parameters()) + list(self.model.decoder.parameters()) + \
            list(self.model.reduce_state.parameters())
        # Define the optimizer
        self.optimizer = optim.Adam(params, lr=config.adam_lr)
        # Initialize the iteration counter and the loss
        start_iter, start_loss = 0, 0
        # If an existing model path is passed in, load it and continue training
        if model_file_path is not None:
            state = torch.load(model_file_path, map_location=lambda storage, location: storage)
            start_iter = state['iter']
            start_loss = state['current_loss']
            if not config.is_coverage:
                self.optimizer.load_state_dict(state['optimizer'])
                if USE_CUDA:
                    for state in self.optimizer.state.values():
                        for k, v in state.items():
                            if torch.is_tensor(v):
                                state[k] = v.to(DEVICE)
        return start_iter, start_loss

    def train_one_batch(self, batch):
        enc_batch, enc_padding_mask, enc_lens, enc_batch_extend_vocab, extra_zeros, c_t_1, coverage = \
            get_input_from_batch(batch)
        dec_batch, dec_padding_mask, max_dec_len, dec_lens_var, target_batch = \
            get_output_from_batch(batch)

        self.optimizer.zero_grad()
        # [B, max(seq_lens), 2*hid_dim], [B*max(seq_lens), 2*hid_dim], tuple([2, B, hid_dim], [2, B, hid_dim])
        encoder_outputs, encoder_feature, encoder_hidden = self.model.encoder(enc_batch, enc_lens)
        s_t_1 = self.model.reduce_state(encoder_hidden)  # (h, c) = ([1, B, hid_dim], [1, B, hid_dim])

        step_losses = []
        for di in range(min(max_dec_len, config.max_dec_steps)):
            # One summary word: the word id at position di in every sentence of the batch
            y_t_1 = dec_batch[:, di]
            final_dist, s_t_1, c_t_1, attn_dist, p_gen, next_coverage = self.model.decoder(
                y_t_1, s_t_1, encoder_outputs, encoder_feature, enc_padding_mask,
                c_t_1, extra_zeros, enc_batch_extend_vocab, coverage, di)
            target = target_batch[:, di]  # id of the next summary word
            # final_dist covers every word in the extended vocabulary,
            # which is larger than the preset 50,000-word vocabulary
            gold_probs = torch.gather(final_dist, 1,
                                      target.unsqueeze(1)).squeeze()  # probability of the target word
            # Maximizing gold_probs is minimizing step_loss (hence the minus sign)
            step_loss = -torch.log(gold_probs + config.eps)
            if config.is_coverage:
                step_coverage_loss = torch.sum(torch.min(attn_dist, coverage), 1)
                step_loss = step_loss + config.cov_loss_wt * step_coverage_loss
                coverage = next_coverage
            step_mask = dec_padding_mask[:, di]
            step_loss = step_loss * step_mask
            step_losses.append(step_loss)

        sum_losses = torch.sum(torch.stack(step_losses, 1), 1)
        batch_avg_loss = sum_losses / dec_lens_var
        loss = torch.mean(batch_avg_loss)

        loss.backward()
        self.norm = clip_grad_norm_(self.model.encoder.parameters(), config.max_grad_norm)
        clip_grad_norm_(self.model.decoder.parameters(), config.max_grad_norm)
        clip_grad_norm_(self.model.reduce_state.parameters(), config.max_grad_norm)
        self.optimizer.step()
        return loss.item()

    def trainIters(self, n_iters, model_file_path=None):
        # Training setup
        iter_step, running_avg_loss = self.setup_train(model_file_path)
        start = time.time()
        while iter_step < n_iters:
            # Fetch the next batch of data
            batch = self.batcher.next_batch()
            loss = self.train_one_batch(batch)
            running_avg_loss = calc_running_avg_loss(loss, running_avg_loss, iter_step)
            iter_step += 1
            if running_avg_loss < 0.01:
                break
            if iter_step % 100 == 0:
                print('steps %d, seconds for 100 steps: %.2f, loss: %f'
                      % (iter_step, time.time() - start, loss))
                start = time.time()
            if iter_step % 500 == 0 and running_avg_loss > 0.001:
                self.save_model(running_avg_loss, iter_step)
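# A hypothetical entry point for the class above (the n_iters value is
# illustrative, not taken from the snippet):
if __name__ == '__main__':
    trainer = Train()
    trainer.trainIters(n_iters=500000)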
def main(unused_argv):
    print("unused_argv: ", unused_argv)
    if len(unused_argv) != 1:  # prints a message if you've entered flags incorrectly
        raise Exception("Problem with flags: %s" % unused_argv)

    tf.logging.set_verbosity(tf.logging.INFO)  # choose what level of logging you want
    tf.logging.info('Starting seq2seq_attention in %s mode...', (FLAGS.mode))

    # Change log_root to FLAGS.log_root/FLAGS.exp_name and create the dir if necessary
    FLAGS.log_root = os.path.join(FLAGS.log_root, FLAGS.exp_name)
    if not os.path.exists(FLAGS.log_root):
        if FLAGS.mode == "train":
            os.makedirs(FLAGS.log_root)
        else:
            raise Exception("Logdir %s doesn't exist. Run in train mode to create it."
                            % (FLAGS.log_root))

    print("FLAGS.vocab_size: ", FLAGS.vocab_size)
    vocab = Vocab(FLAGS.vocab_path, FLAGS.vocab_size)  # create a vocabulary
    print("vocab size: ", vocab.size())

    # If in decode mode, set batch_size = beam_size
    # Reason: in decode mode, we decode one example at a time.
    # On each step, we have beam_size-many hypotheses in the beam,
    # so we need to make a batch of these hypotheses.
    if FLAGS.mode == 'decode':
        FLAGS.batch_size = FLAGS.beam_size

    # If single_pass=True, check we're in decode mode
    if FLAGS.single_pass and FLAGS.mode != 'decode':
        raise Exception("The single_pass flag should only be True in decode mode")

    # Make a namedtuple hps, containing the values of the hyperparameters that the model needs
    hparam_list = [
        'mode', 'lr', 'adagrad_init_acc', 'rand_unif_init_mag', 'trunc_norm_init_std',
        'max_grad_norm', 'hidden_dim', 'emb_dim', 'batch_size', 'max_dec_steps',
        'max_enc_steps', 'coverage', 'cov_loss_wt', 'pointer_gen', 'fine_tune',
        'train_size', 'subred_size', 'use_doc_vec', 'use_multi_attn', 'use_multi_pgen',
        'use_multi_pvocab', 'create_ckpt'
    ]
    hps_dict = {}
    for key, val in FLAGS.__flags.items():  # for each flag
        if key in hparam_list:  # if it's in the list
            hps_dict[key] = val  # add it to the dict
    hps = namedtuple("HParams", hps_dict.keys())(**hps_dict)

    # Create a batcher object that will create minibatches of data
    batcher = Batcher(FLAGS.data_path, vocab, hps, single_pass=FLAGS.single_pass)

    tf.set_random_seed(111)  # a seed value for randomness

    if hps.mode.value == 'train':
        print("creating model...")
        model = SummarizationModel(hps, vocab)
        # -------------------------------------
        if hps.create_ckpt.value:
            step = 0
            model.build_graph()
            print("get value")
            pretrained_ckpt = '/home/cs224u/pointer/log/pretrained_model_tf1.2.1/train/model-238410'
            reader = pywrap_tensorflow.NewCheckpointReader(pretrained_ckpt)
            var_to_shape_map = reader.get_variable_to_shape_map()
            value = {}
            for key in var_to_shape_map:
                value[key] = reader.get_tensor(key)
            print("assign op")
            assign_op = []
            if hps.use_multi_pvocab.value:
                new_key = [
                    "seq2seq/decoder/attention_decoder/AttnOutputProjection/Linear_0/Bias",
                    "seq2seq/decoder/attention_decoder/AttnOutputProjection/Linear_1/Bias"
                ]
                for v in tf.trainable_variables():
                    key = v.name.split(":")[0]
                    if key in new_key:
                        origin_key = ("seq2seq/decoder/attention_decoder/AttnOutputProjection/Linear/"
                                      + key.split("/")[-1])
                        a_op = v.assign(tf.convert_to_tensor(value[origin_key]))
                    else:
                        a_op = v.assign(tf.convert_to_tensor(value[key]))
                    assign_op.append(a_op)
            else:
                for v in tf.trainable_variables():
                    key = v.name.split(":")[0]
                    if key == "seq2seq/embedding/embedding":
                        a_op = v.assign(tf.convert_to_tensor(value[key]))
                        assign_op.append(a_op)
# k = "seq2seq/decoder/attention_decoder/Linear/Bias" # print (key) # print (value[k].shape) # d0 = value[k].shape[0] # a_op = v.assign(tf.convert_to_tensor(value[k][:d0//ratio])) # if key == "seq2seq/decoder/attention_decoder/calculate_pgen/Linear_0/Bias" or \ # key == "seq2seq/decoder/attention_decoder/calculate_pgen/Linear_1/Bias": # # key == "seq2seq/decoder/attention_decoder/calculate_pgen/Linear_2/Bias": # k = "seq2seq/decoder/attention_decoder/calculate_pgen/Linear/Bias" # print (key) # print (value[k].shape) # a_op = v.assign(tf.convert_to_tensor(value[k])) # elif hps.use_multi_pvocab.value: # if key == "seq2seq/decoder/attention_decoder/AttnOutputProjection/Linear_0/Bias" or \ # key == "seq2seq/decoder/attention_decoder/AttnOutputProjection/Linear_1/Bias": # # key == "seq2seq/decoder/attention_decoder/AttnOutputProjection/Linear_2/Bias": # k = "seq2seq/decoder/attention_decoder/AttnOutputProjection/Linear/Bias" # print (key) # print (value[k].shape) # d0 = value[k].shape[0] # a_op = v.assign(tf.convert_to_tensor(value[k][:d0//ratio])) # assign_op.append(a_op) # Add an op to initialize the variables. init_op = tf.global_variables_initializer() # Add ops to save and restore all the variables. saver = tf.train.Saver() with tf.Session(config=util.get_config()) as sess: sess.run(init_op) # Do some work with the model. for a_op in assign_op: a_op.op.run() for _ in range(0): batch = batcher.next_batch() results = model.run_train_step(sess, batch) # Save the variables to disk. if hps.use_multi_attn.value: ckpt_tag = "multi_attn_2_attn_proj" elif hps.use_multi_pgen.value: ckpt_tag = "multi_attn_2_pgen_proj" elif hps.use_multi_pvocab.value: ckpt_tag = "big_multi_attn_2_pvocab_proj" else: ckpt_tag = "pointer_proj" ckpt_to_save = '/home/cs224u/pointer/log/ckpt/' + ckpt_tag + '/model.ckpt-' + str( step) save_path = saver.save(sess, ckpt_to_save) print("Model saved in path: %s" % save_path) # ------------------------------------- else: setup_training(model, batcher, hps) elif hps.mode.value == 'eval': model = SummarizationModel(hps, vocab) run_eval(model, batcher, vocab) elif hps.mode.value == 'decode': decode_model_hps = hps # This will be the hyperparameters for the decoder model decode_model_hps = hps._replace( max_dec_steps=1 ) # The model is configured with max_dec_steps=1 because we only ever run one step of the decoder at a time (to do beam search). Note that the batcher is initialized with max_dec_steps equal to e.g. 100 because the batches need to contain the full summaries model = SummarizationModel(decode_model_hps, vocab) decoder = BeamSearchDecoder(model, batcher, vocab) decoder.decode( ) # decode indefinitely (unless single_pass=True, in which case deocde the dataset exactly once) else: raise ValueError("The 'mode' flag must be one of train/eval/decode")
    for i in range(len(predictions)):
        result.append(costs[i, predicted_indexes[i]])
    return result

batch = Batcher(n_examples, batch_size)
cost_batch = Batcher(n_examples, 10000)
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    saver = tf.train.Saver(max_to_keep=50)
    for i in range(n_iterations):
        a_batch, h_batch, c_batch = batch.next_batch([A_train, H_train, C_train])
        if i % display_step == 0:
            a_cost, h_cost, c_cost = cost_batch.next_batch([A_train, H_train, C_train])
            c_train_pred = sess.run(cost_pred, feed_dict={
                seq_in: a_cost,
                H: h_cost,
                C: c_cost,
                keep_prob: 1.0
            })
            c_train_reg = sess.run(cost_reg, feed_dict={
                seq_in: a_cost,
                H: h_cost,
                C: c_cost,
def main(unused_argv):
    if len(unused_argv) != 1:  # prints a message if you've entered flags incorrectly
        raise Exception("Problem with flags: %s" % unused_argv)

    tf.logging.set_verbosity(tf.logging.INFO)  # choose what level of logging you want
    tf.logging.info('Starting seq2seq_attention in %s mode...', (FLAGS.mode))

    # Change log_root to FLAGS.log_root/FLAGS.exp_name and create the dir if necessary
    FLAGS.log_root = os.path.join(FLAGS.log_root, FLAGS.exp_name)
    if not os.path.exists(FLAGS.log_root):
        if FLAGS.mode == "train":
            os.makedirs(FLAGS.log_root)
        else:
            raise Exception("Logdir %s doesn't exist. Run in train mode to create it."
                            % (FLAGS.log_root))

    vocab = Vocab(FLAGS.vocab_path, FLAGS.vocab_size)  # create a vocabulary
    link_mat_size = vocab._count
    print(link_mat_size)

    # If in decode mode, set batch_size = beam_size
    # Reason: in decode mode, we decode one example at a time.
    # On each step, we have beam_size-many hypotheses in the beam,
    # so we need to make a batch of these hypotheses.
    if FLAGS.mode == 'decode':
        FLAGS.batch_size = FLAGS.beam_size

    # If single_pass=True, check we're in decode mode
    if FLAGS.single_pass and FLAGS.mode != 'decode':
        raise Exception("The single_pass flag should only be True in decode mode")

    # Make a namedtuple hps, containing the values of the hyperparameters that the model needs
    hparam_list = ['mode', 'lr', 'adagrad_init_acc', 'rand_unif_init_mag', 'trunc_norm_init_std',
                   'max_grad_norm', 'hidden_dim', 'emb_dim', 'batch_size', 'max_dec_steps',
                   'max_enc_steps', 'pointer_gen']
    hps_dict = {}
    for key, val in FLAGS.__flags.items():  # for each flag
        if key in hparam_list:  # if it's in the list
            hps_dict[key] = val  # add it to the dict
    hps = namedtuple("HParams", hps_dict.keys())(**hps_dict)
    print(hps)

    # Create a batcher object that will create minibatches of data
    batcher = Batcher(FLAGS.data_path, vocab, hps, single_pass=FLAGS.single_pass)
    batch = batcher.next_batch()
    print(batch.art_oovs)

    tf.set_random_seed(111)  # a seed value for randomness

    if hps.mode == 'train':
        print("creating model...")
        model = SummarizationModel(hps, vocab)
        setup_training(model, batcher, vocab)
    elif hps.mode == 'eval':
        model = SummarizationModel(hps, vocab)
        run_eval(model, batcher, vocab)
    elif hps.mode == 'decode':
        decode_model_hps = hps  # This will be the hyperparameters for the decoder model
        decode_model_hps = hps._replace(max_dec_steps=1)  # The model is configured with max_dec_steps=1 because we only ever run one step of the decoder at a time (to do beam search). Note that the batcher is initialized with max_dec_steps equal to e.g. 100 because the batches need to contain the full summaries
        model = SummarizationModel(decode_model_hps, vocab)
        decoder = BeamSearchDecoder(model, batcher, vocab)
        decoder.decode()  # decode indefinitely (unless single_pass=True, in which case decode the dataset exactly once)
    else:
        raise ValueError("The 'mode' flag must be one of train/eval/decode")
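# These main(unused_argv) functions follow the TF1 FLAGS convention; the usual
# entry point (assumed here, not shown in the snippet) is:
if __name__ == '__main__':
    tf.app.run()  # parses the flags, then calls main(unused_argv)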
class Train(object):
    def __init__(self):
        self.vocab = Vocab(config.vocab_path, config.vocab_size)
        self.batcher = Batcher(config.train_data_path, self.vocab, mode='train',
                               batch_size=config.batch_size, single_pass=False)
        time.sleep(5)
        # check the existence of the log root dir
        if not os.path.exists(config.log_root):
            os.mkdir(config.log_root)
        # check the existence of the training model dir
        self.model_dir = os.path.join(config.log_root, 'train_model')
        if not os.path.exists(self.model_dir):
            os.mkdir(self.model_dir)
        # check the existence of the training log dir
        self.train_log = os.path.join(config.log_root, 'train_log')
        if not os.path.exists(self.train_log):
            os.mkdir(self.train_log)
        self.summary_writer = tf.summary.FileWriter(self.train_log)

    def save_model(self, running_avg_loss, iter, mode):
        state = {
            'iter': iter,
            'encoder_state_dict': self.model.encoder.state_dict(),
            'decoder_state_dict': self.model.decoder.state_dict(),
            'reduce_state_dict': self.model.reduce_state.state_dict(),
            'optimizer': self.optimizer.state_dict(),
            'current_loss': running_avg_loss
        }
        if mode == 'train':
            save_model_dir = self.model_dir
        else:
            best_model_dir = os.path.join(config.log_root, 'best_model')
            if not os.path.exists(best_model_dir):
                os.mkdir(best_model_dir)
            save_model_dir = best_model_dir
            if len(os.listdir(save_model_dir)) > 0:
                shutil.rmtree(save_model_dir)
                time.sleep(2)
                os.mkdir(save_model_dir)
        model_save_path = os.path.join(save_model_dir, 'model_%d' % (iter))
        torch.save(state, model_save_path)
        return model_save_path

    def setup_train(self, model_file_path=None):
        self.model = Model(model_file_path)
        params = list(self.model.encoder.parameters()) + list(self.model.decoder.parameters()) + \
            list(self.model.reduce_state.parameters())
        initial_lr = config.lr_coverage if config.is_coverage else config.lr
        self.optimizer = Adagrad(params, lr=initial_lr,
                                 initial_accumulator_value=config.adagrad_init_acc)
        start_iter, start_loss = 0, 0
        if model_file_path is not None:
            state = torch.load(model_file_path, map_location=lambda storage, location: storage)
            start_iter = state['iter']
            start_loss = state['current_loss']
            if not config.is_coverage:
                self.optimizer.load_state_dict(state['optimizer'])
                if use_cuda:
                    for state in self.optimizer.state.values():
                        for k, v in state.items():
                            if torch.is_tensor(v):
                                state[k] = v.cuda()
        return start_iter, start_loss

    def train_one_batch(self, batch):
        enc_batch, enc_padding_mask, enc_lens, enc_batch_extend_vocab, extra_zeros, c_t_1, coverage = \
            get_input_from_batch(batch, use_cuda)
        dec_batch, dec_padding_mask, max_dec_len, dec_lens_var, target_batch = \
            get_output_from_batch(batch, use_cuda)

        self.optimizer.zero_grad()
        encoder_outputs, encoder_feature, encoder_hidden = self.model.encoder(enc_batch, enc_lens)
        s_t_1 = self.model.reduce_state(encoder_hidden)

        step_losses = []
        for di in range(min(max_dec_len, config.max_dec_steps)):
            y_t_1 = dec_batch[:, di]  # Teacher forcing
            final_dist, s_t_1, c_t_1, attn_dist, p_gen, next_coverage = self.model.decoder(
                y_t_1, s_t_1, encoder_outputs, encoder_feature, enc_padding_mask,
                c_t_1, extra_zeros, enc_batch_extend_vocab, coverage, di)
            target = target_batch[:, di]
            gold_probs = torch.gather(final_dist, 1, target.unsqueeze(1)).squeeze()
            step_loss = -torch.log(gold_probs + config.eps)
            if config.is_coverage:
                step_coverage_loss = torch.sum(torch.min(attn_dist, coverage), 1)
                step_loss = step_loss + config.cov_loss_wt * step_coverage_loss
                coverage = next_coverage
            step_mask = dec_padding_mask[:, di]
            step_loss = step_loss * step_mask
            step_losses.append(step_loss)

        sum_losses = torch.sum(torch.stack(step_losses, 1), 1)
        batch_avg_loss = sum_losses / dec_lens_var
        loss = torch.mean(batch_avg_loss)

        loss.backward()
        self.norm = clip_grad_norm_(self.model.encoder.parameters(), config.max_grad_norm)
        clip_grad_norm_(self.model.decoder.parameters(), config.max_grad_norm)
        clip_grad_norm_(self.model.reduce_state.parameters(), config.max_grad_norm)
        self.optimizer.step()
        return loss.item()

    def trainIters(self, n_iters, model_file_path=None):
        iter, running_avg_loss = self.setup_train(model_file_path)
        start = time.time()
        min_val_loss = np.inf
        while iter < n_iters:
            batch = self.batcher.next_batch()
            loss = self.train_one_batch(batch)
            running_avg_loss = calc_running_avg_loss(loss, running_avg_loss,
                                                     self.summary_writer, iter)
            iter += 1
            if iter % config.print_interval == 0:
                tf.logging.info(
                    'steps %d, seconds for %d batch: %.2f, loss: %f, min_val_loss: %f'
                    % (iter, config.print_interval, time.time() - start, loss, min_val_loss))
                start = time.time()
            if iter % config.model_save_iters == 0:
                self.summary_writer.flush()
                model_save_path = self.save_model(running_avg_loss, iter, mode='train')
                tf.logging.info('Evaluate the model %s on the validation set...' % model_save_path)
                evl_model = Evaluate(model_save_path)
                val_avg_loss = evl_model.run_eval()
                if val_avg_loss < min_val_loss:
                    min_val_loss = val_avg_loss
                    best_model_save_path = self.save_model(running_avg_loss, iter, mode='eval')
                    tf.logging.info('Save best model at %s' % best_model_save_path)
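# calc_running_avg_loss is called but not defined in these snippets. A sketch
# consistent with the four-argument call in the class above (an exponential
# moving average, logged to TensorBoard) might be:
def calc_running_avg_loss(loss, running_avg_loss, summary_writer, step, decay=0.99):
    # Seed the average with the first loss, then decay toward newer values.
    if running_avg_loss == 0:
        running_avg_loss = loss
    else:
        running_avg_loss = running_avg_loss * decay + (1 - decay) * loss
    running_avg_loss = min(running_avg_loss, 12)  # clip for stable TensorBoard plots
    loss_sum = tf.compat.v1.Summary()
    loss_sum.value.add(tag='running_avg_loss/decay=%f' % decay, simple_value=running_avg_loss)
    summary_writer.add_summary(loss_sum, step)
    return running_avg_loss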
    def step(self):
        rollout = []
        hyperparameters = self.hyperparameters
        env_info = self.environment.reset(train_mode=True)[self.brain_name]
        self.states = env_info.vector_observations
        states = self.states
        for _ in range(hyperparameters['rollout_length']):
            actions, log_probs, _, values = self.network(states)
            env_info = self.environment.step(actions.cpu().detach().numpy())[self.brain_name]
            next_states = env_info.vector_observations
            rewards = env_info.rewards
            terminals = np.array([1 if t else 0 for t in env_info.local_done])
            self.all_rewards += rewards
            for i, terminal in enumerate(terminals):
                if terminal:
                    self.episode_rewards.append(self.all_rewards[i])
                    self.all_rewards[i] = 0
            rollout.append([states, values.detach(), actions.detach(),
                            log_probs.detach(), rewards, 1 - terminals])
            states = next_states
        self.states = states
        pending_value = self.network(states)[-1]
        rollout.append([states, pending_value, None, None, None, None])

        processed_rollout = [None] * (len(rollout) - 1)
        advantages = torch.Tensor(np.zeros((self.config['environment']['number_of_agents'], 1)))
        returns = pending_value.detach()
        for i in reversed(range(len(rollout) - 1)):
            states, value, actions, log_probs, rewards, terminals = rollout[i]
            terminals = torch.Tensor(terminals).unsqueeze(1)
            rewards = torch.Tensor(rewards).unsqueeze(1)
            actions = torch.Tensor(actions.cpu())
            states = torch.Tensor(states)
            next_value = rollout[i + 1][1]
            returns = rewards + hyperparameters['discount_rate'] * terminals * returns.cpu()
            # generalized advantage estimation (GAE)
            td_error = rewards + hyperparameters['discount_rate'] * terminals \
                * next_value.detach().cpu() - value.detach().cpu()
            advantages = advantages * hyperparameters['tau'] \
                * hyperparameters['discount_rate'] * terminals + td_error
            processed_rollout[i] = [states, actions, log_probs, returns, advantages]

        states, actions, log_probs_old, returns, advantages = map(
            lambda x: torch.cat(x, dim=0), zip(*processed_rollout))
        advantages = (advantages - advantages.mean()) / advantages.std()

        batcher = Batcher(states.size(0) // hyperparameters['mini_batch_number'],
                          [np.arange(states.size(0))])
        for _ in range(hyperparameters['optimization_epochs']):
            batcher.shuffle()
            while not batcher.end():
                batch_indices = batcher.next_batch()[0]
                batch_indices = torch.Tensor(batch_indices).long()
                sampled_states = states[batch_indices]
                sampled_actions = actions[batch_indices]
                sampled_log_probs_old = log_probs_old[batch_indices]
                sampled_returns = returns[batch_indices]
                sampled_advantages = advantages[batch_indices]

                _, log_probs, entropy_loss, values = self.network(sampled_states, sampled_actions)
                ratio = (log_probs - sampled_log_probs_old).exp()
                obj = ratio * sampled_advantages
                obj_clipped = ratio.clamp(1.0 - hyperparameters['ppo_clip'],
                                          1.0 + hyperparameters['ppo_clip']) * sampled_advantages
                policy_loss = -torch.min(obj, obj_clipped).mean(0) \
                    - hyperparameters['entropy_coefficent'] * entropy_loss.mean()
                value_loss = 0.5 * (sampled_returns - values.cpu()).pow(2).mean()

                self.optimizer.zero_grad()
                (policy_loss + value_loss).backward()
                nn.utils.clip_grad_norm_(self.network.parameters(), hyperparameters['gradient_clip'])
                self.optimizer.step()

        steps = hyperparameters['rollout_length'] * self.config['environment']['number_of_agents']
        self.total_steps += steps
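# The Batcher used in step() above, constructed as Batcher(batch_size, [indices]),
# is not shown. A minimal hypothetical sketch consistent with its
# shuffle()/end()/next_batch() calls might be:
import numpy as np

class Batcher(object):
    """Hypothetical sketch: serves fixed-size slices of the stored arrays."""
    def __init__(self, batch_size, data):
        self.batch_size = batch_size
        self.data = data  # list of parallel arrays, e.g. [np.arange(n)]
        self.num_entries = len(data[0])
        self.reset()

    def reset(self):
        self.batch_start = 0

    def end(self):
        return self.batch_start >= self.num_entries

    def next_batch(self):
        # Return the next contiguous slice from every stored array.
        end = min(self.batch_start + self.batch_size, self.num_entries)
        batch = [d[self.batch_start:end] for d in self.data]
        self.batch_start = end
        return batch

    def shuffle(self):
        # Apply one shared permutation so parallel arrays stay aligned.
        perm = np.random.permutation(self.num_entries)
        self.data = [d[perm] for d in self.data]
        self.reset()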
class BeamSearch(object):
    def __init__(self, model_file_path):
        model_name = os.path.basename(model_file_path)
        self._decode_dir = os.path.join(config.log_root, 'decode_%s' % (model_name))
        self._rouge_ref_dir = os.path.join(self._decode_dir, 'rouge_ref')
        self._rouge_dec_dir = os.path.join(self._decode_dir, 'rouge_dec_dir')
        for p in [self._decode_dir, self._rouge_ref_dir, self._rouge_dec_dir]:
            if not os.path.exists(p):
                os.mkdir(p)

        self.vocab = Vocab(config.vocab_path, config.vocab_size)
        self.batcher = Batcher(config.decode_data_path, self.vocab, mode='decode',
                               batch_size=config.beam_size, single_pass=True)
        time.sleep(15)
        self.model = Model(model_file_path, is_eval=True)

    def sort_beams(self, beams):
        return sorted(beams, key=lambda h: h.avg_log_prob, reverse=True)

    def decode(self):
        start = time.time()
        counter = 0
        batch = self.batcher.next_batch()
        while batch is not None:
            # Run beam search to get the best hypothesis
            best_summary = self.beam_search(batch)

            # Extract the output ids from the hypothesis and convert back to words
            output_ids = [int(t) for t in best_summary.tokens[1:]]
            decoded_words = data.outputids2words(
                output_ids, self.vocab,
                (batch.art_oovs[0] if config.pointer_gen else None))

            # Remove the [STOP] token from decoded_words, if necessary
            try:
                fst_stop_idx = decoded_words.index(data.STOP_DECODING)
                decoded_words = decoded_words[:fst_stop_idx]
            except ValueError:
                decoded_words = decoded_words

            original_abstract = batch.original_abstracts_sents[0]
            write_for_rouge(original_abstract, decoded_words, counter,
                            self._rouge_ref_dir, self._rouge_dec_dir)
            counter += 1
            if counter % 1000 == 0:
                print('%d examples in %d sec' % (counter, time.time() - start))
                start = time.time()
            batch = self.batcher.next_batch()

        print("Decoder has finished reading dataset for single_pass.")
        print("Now starting ROUGE eval...")
        results_dict = rouge_eval(self._rouge_ref_dir, self._rouge_dec_dir)
        rouge_log(results_dict, self._decode_dir)

    def beam_search(self, batch):
        # batch should have only one example
        enc_batch, enc_padding_mask, enc_lens, enc_batch_extend_vocab, extra_zeros, c_t_0, coverage_t_0 = \
            get_input_from_batch(batch, use_cuda)

        encoder_outputs, encoder_feature, encoder_hidden = self.model.encoder(enc_batch, enc_lens)
        s_t_0 = self.model.reduce_state(encoder_hidden)

        dec_h, dec_c = s_t_0  # 1 x 2*hidden_size
        dec_h = dec_h.squeeze()
        dec_c = dec_c.squeeze()

        # decoder batch preparation: it has beam_size examples, initially all identical
        beams = [
            Beam(tokens=[self.vocab.word2id(data.START_DECODING)],
                 log_probs=[0.0],
                 state=(dec_h[0], dec_c[0]),
                 context=c_t_0[0],
                 coverage=(coverage_t_0[0] if config.is_coverage else None))
            for _ in range(config.beam_size)
        ]
        results = []
        steps = 0
        while steps < config.max_dec_steps and len(results) < config.beam_size:
            latest_tokens = [h.latest_token for h in beams]
            latest_tokens = [t if t < self.vocab.size() else self.vocab.word2id(data.UNKNOWN_TOKEN)
                             for t in latest_tokens]
            y_t_1 = Variable(torch.LongTensor(latest_tokens))
            if use_cuda:
                y_t_1 = y_t_1.cuda()
            all_state_h = []
            all_state_c = []
            all_context = []
            for h in beams:
                state_h, state_c = h.state
                all_state_h.append(state_h)
                all_state_c.append(state_c)
                all_context.append(h.context)
            s_t_1 = (torch.stack(all_state_h, 0).unsqueeze(0),
                     torch.stack(all_state_c, 0).unsqueeze(0))
            c_t_1 = torch.stack(all_context, 0)

            coverage_t_1 = None
            if config.is_coverage:
                all_coverage = []
                for h in beams:
                    all_coverage.append(h.coverage)
                coverage_t_1 = torch.stack(all_coverage, 0)

            final_dist, s_t, c_t, attn_dist, p_gen, coverage_t = self.model.decoder(
                y_t_1, s_t_1, encoder_outputs, encoder_feature, enc_padding_mask,
                c_t_1, extra_zeros, enc_batch_extend_vocab, coverage_t_1, steps)

            # take the log so hypothesis scores accumulate as log-probabilities
            # (the original took topk of the raw distribution, which yields the
            # same ids but stores probabilities in the beams' log_probs)
            topk_log_probs, topk_ids = torch.topk(torch.log(final_dist), config.beam_size * 2)

            dec_h, dec_c = s_t
            dec_h = dec_h.squeeze()
            dec_c = dec_c.squeeze()

            all_beams = []
            num_orig_beams = 1 if steps == 0 else len(beams)
            for i in range(num_orig_beams):
                h = beams[i]
                state_i = (dec_h[i], dec_c[i])
                context_i = c_t[i]
                coverage_i = (coverage_t[i] if config.is_coverage else None)
                for j in range(config.beam_size * 2):  # for each of the top 2*beam_size hyps:
                    new_beam = h.extend(token=topk_ids[i, j].item(),
                                        log_prob=topk_log_probs[i, j].item(),
                                        state=state_i,
                                        context=context_i,
                                        coverage=coverage_i)
                    all_beams.append(new_beam)

            beams = []
            for h in self.sort_beams(all_beams):
                if h.latest_token == self.vocab.word2id(data.STOP_DECODING):
                    if steps >= config.min_dec_steps:
                        results.append(h)
                else:
                    beams.append(h)
                if len(beams) == config.beam_size or len(results) == config.beam_size:
                    break
            steps += 1

        if len(results) == 0:
            results = beams

        beams_sorted = self.sort_beams(results)
        return beams_sorted[0]
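# A hypothetical driver for the class above (the checkpoint path is illustrative,
# not taken from the snippet):
if __name__ == '__main__':
    beam_searcher = BeamSearch('log/train_model/model_50000')
    beam_searcher.decode()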
def main():
    embedding_dict_file = os.path.join(os.path.dirname(hps.word_count_path), 'emb_dict_50000.pkl')
    vocab = Vocab(hps.word_count_path, hps.glove_path, hps.embedding_dim,
                  hps.max_vocab_size, embedding_dict_file)
    train_file = os.path.join(hps.data_path, 'train_raw.json')
    dev_file = os.path.join(hps.data_path, 'dev_raw.json')
    if (not os.path.exists(train_file)) or (not os.path.exists(dev_file)):
        raise Exception('train and dev data do not exist in data_path, please check')
    if hps.save and not hps.exp_dir:
        raise Exception('please specify exp_dir when you want to save experiment info')
    print(vars(hps))
    if hps.save:
        utils.save_hps(hps.exp_dir, hps)

    net = PointerNet(hps, vocab.emb_mat)
    net = net.cuda()
    model_parameters = list(filter(lambda p: p.requires_grad, net.parameters()))
    print('the number of parameters in model:', sum(p.numel() for p in model_parameters))
    optimizer = optim.Adam(model_parameters)

    train_data_batcher = Batcher(train_file, vocab, hps, hps.single_pass)
    dev_data_batcher = Batcher(dev_file, vocab, hps, hps.single_pass)
    if hps.reward_metric == 'bleu':
        reward = get_batch_bleu

    global_step = 0
    dev_loss_track = []
    min_dev_loss = math.inf
    for i in range(hps.num_epoch):
        epoch_loss_track = []
        train_data_batcher.setup()
        while True:
            start = time.time()
            try:
                batch = train_data_batcher.next_batch()
            except StopIteration:
                # evaluate here and, if necessary, save the best model
                dev_data_batcher.setup()
                dev_loss = run_eval(dev_data_batcher, net)
                print("epoch {}: avg train loss: {:>10.4f}, dev_loss: {:>10.4f}".format(
                    i + 1, sum(epoch_loss_track) / len(epoch_loss_track), dev_loss))
                dev_loss_track.append(dev_loss)
                if i > hps.early_stopping_from:
                    last5devloss = sum(dev_loss_track[i - 4:i + 1])
                    last10devloss = sum(dev_loss_track[i - 9:i - 4])
                    if hps.early_stopping_from and last5devloss >= last10devloss:
                        print("early stopping by dev_loss!")
                        sys.exit()
                if dev_loss < min_dev_loss:
                    min_dev_loss = dev_loss
                    if hps.save:
                        utils.save_model(hps.exp_dir, net, min_dev_loss)
                break

            paragraph_tensor = torch.tensor(batch.enc_batch, dtype=torch.int64,
                                            requires_grad=False).cuda()
            question_tensor = torch.tensor(batch.dec_batch, dtype=torch.int64,
                                           requires_grad=False).cuda()
            answer_position_tensor = torch.tensor(batch.ans_indices, dtype=torch.int64,
                                                  requires_grad=False).cuda()
            target_tensor = torch.tensor(batch.target_batch, dtype=torch.int64,
                                         requires_grad=False).cuda()
            paragraph_batch_extend_vocab = None
            max_para_oovs = None
            if hps.pointer_gen:
                paragraph_batch_extend_vocab = torch.tensor(batch.enc_batch_extend_vocab,
                                                            dtype=torch.int64,
                                                            requires_grad=False).cuda()
                max_para_oovs = batch.max_para_oovs

            optimizer.zero_grad()
            net.train()
            vocab_scores, vocab_dists, attn_dists, final_dists = net(
                paragraph_tensor, question_tensor, answer_position_tensor,
                paragraph_batch_extend_vocab, max_para_oovs)
            dec_padding_mask = torch.ne(target_tensor, 0).float().cuda()

            # self-critical sequence training
            if hps.self_critic:
                greedy_seq = [torch.argmax(dist, dim=1, keepdim=True)
                              for dist in final_dists]  # each dist = [batch_size, vsize]
                greedy_seq_tensor = torch.cat(greedy_seq, dim=1)  # [batch_size, seq_len]
                sample_seq = []
                for dist in final_dists:
                    m = torch.distributions.categorical.Categorical(probs=dist)
                    sample_seq.append(m.sample())  # each is [batch_size,]
                sample_seq_tensor = torch.stack(sample_seq, dim=1)

                if hps.pointer_gen:
                    loss_per_step = []
                    for dist, sample_tgt in zip(final_dists, sample_seq):
                        # dist = [batch_size, extended_vsize]
                        probs = torch.gather(dist, 1, sample_tgt.unsqueeze(1)).squeeze()
                        losses = -torch.log(probs)
                        loss_per_step.append(losses)  # a list of [batch_size,]
                    rl_loss = mask_and_avg(loss_per_step, dec_padding_mask,
                                           batch_average=False, step_average=False)
                    # this rl_loss = [batch_size,]
                else:
                    # a list of dec_max_len (vocab_scores)
                    loss_batch_by_step = F.cross_entropy(
                        torch.stack(vocab_scores, dim=1).reshape(-1, vocab.size()),
                        sample_seq_tensor.reshape(-1),
                        size_average=False, reduce=False)  # loss [batch_size*dec_max_len,]
                    mask_loss_batch_by_step = loss_batch_by_step * dec_padding_mask.reshape(-1)
                    batch_size = vocab_scores[0].size(0)
                    rl_loss = torch.sum(mask_loss_batch_by_step.reshape(batch_size, -1), dim=1)

                r1 = reward(target_tensor, greedy_seq_tensor)
                r2 = reward(target_tensor, sample_seq_tensor)
                reward_diff = r1 - r2
                final_rl_loss = reward_diff * rl_loss
                loss = torch.mean(final_rl_loss)
                print('r1: %.3f, r2: %.3f, reward_diff: %.3f, final rl loss: %.3f, loss batch mean: %.3f'
                      % (torch.max(r1).item(), torch.max(r2).item(), torch.max(reward_diff).item(),
                         torch.max(final_rl_loss).item(), loss.item()))

            # maximum likelihood
            if hps.maxium_likelihood:
                if hps.pointer_gen:
                    loss_per_step = []
                    for dec_step, dist in enumerate(final_dists):
                        # dist = [batch_size, extended_vsize]
                        targets = target_tensor[:, dec_step]
                        gold_probs = torch.gather(dist, 1, targets.unsqueeze(1)).squeeze()
                        losses = -torch.log(gold_probs)
                        loss_per_step.append(losses)  # a list of [batch_size,]
                    loss = mask_and_avg(loss_per_step, dec_padding_mask)
                else:
                    # a list of dec_max_len (vocab_scores)
                    loss_batch_by_step = F.cross_entropy(
                        torch.stack(vocab_scores, dim=1).reshape(-1, vocab.size()),
                        target_tensor.reshape(-1),
                        size_average=False, reduce=False)  # loss [batch_size*dec_max_len,]
                    loss = torch.sum(loss_batch_by_step * dec_padding_mask.reshape(-1)) \
                        / torch.sum(dec_padding_mask)

            epoch_loss_track.append(loss.item())
            global_step += 1
            loss.backward()
            nn.utils.clip_grad_norm_(net.parameters(), max_norm=hps.norm_limit)
            optimizer.step()
            if (global_step == 1) or (global_step % hps.print_every == 0):
                print('Step {:>5}: ave loss: {:>10.4f}, speed: {:.1f} case/s'.format(
                    global_step, sum(epoch_loss_track) / len(epoch_loss_track),
                    hps.batch_size / (time.time() - start)))
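# get_batch_bleu is referenced above but not defined in this snippet. A sketch
# of a per-example BLEU reward, under assumed conventions (id tensors of shape
# [batch, seq_len] with 0 as padding), could look like this:
import torch
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

def get_batch_bleu(reference_batch, hypothesis_batch):
    """Sentence-level BLEU between reference and hypothesis id sequences."""
    smooth = SmoothingFunction().method1
    rewards = []
    for ref, hyp in zip(reference_batch.tolist(), hypothesis_batch.tolist()):
        ref = [str(t) for t in ref if t != 0]  # strip padding ids
        hyp = [str(t) for t in hyp if t != 0]
        rewards.append(sentence_bleu([ref], hyp, smoothing_function=smooth))
    return torch.tensor(rewards, dtype=torch.float32, device=reference_batch.device)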