def __init__(self, args, model_name=None):
    self.args = args
    vocab = args.vocab_path if args.vocab_path is not None else config.vocab_path
    self.vocab = Vocab(vocab, config.vocab_size, config.embedding_file)
    self.batcher = Batcher(args.train_data_path, self.vocab, mode='train',
                           batch_size=args.batch_size, single_pass=False, args=args)
    self.eval_batcher = Batcher(args.eval_data_path, self.vocab, mode='eval',
                                batch_size=args.batch_size, single_pass=True, args=args)
    time.sleep(15)

    if model_name is None:
        self.train_dir = os.path.join(config.log_root, 'train_%d' % (int(time.time())))
    else:
        self.train_dir = os.path.join(config.log_root, model_name)

    if not os.path.exists(self.train_dir):
        os.mkdir(self.train_dir)

    self.model_dir = os.path.join(self.train_dir, 'model')
    if not os.path.exists(self.model_dir):
        os.mkdir(self.model_dir)
def __init__(self, train_dir=None, eval_dir=None, vocab=None, vectors=None):
    self.vectors = vectors
    if vocab is None:
        self.vocab = Vocab(config.vocab_path, config.vocab_size)
    else:
        self.vocab = vocab
    print(self.vocab)

    self.batcher_train = Batcher(config.train_data_path, self.vocab, mode='train',
                                 batch_size=config.batch_size, single_pass=False)
    time.sleep(15)
    self.batcher_eval = Batcher(config.eval_data_path, self.vocab, mode='eval',
                                batch_size=config.batch_size, single_pass=True)
    time.sleep(15)

    cur_time = int(time.time())
    if train_dir is None:
        train_dir = os.path.join(config.log_root, 'train_%d' % (cur_time))
    if not os.path.exists(train_dir):
        os.mkdir(train_dir)
    if eval_dir is None:
        eval_dir = os.path.join(config.log_root, 'eval_%s' % (cur_time))
    if not os.path.exists(eval_dir):
        os.mkdir(eval_dir)

    self.model_dir = os.path.join(train_dir, 'model')
    if not os.path.exists(self.model_dir):
        os.mkdir(self.model_dir)

    self.summary_writer_train = writer.FileWriter(train_dir)
    self.summary_writer_eval = writer.FileWriter(eval_dir)
def __init__(self):
    self.vocab = Vocab(args.vocab_path, args.vocab_size)
    self.batcher = Batcher(args.decode_data_path, self.vocab, mode='decode',
                           batch_size=1, single_pass=True)  # support only 1 item at a time
    time.sleep(15)

    vocab_size = self.vocab.size()
    self.beam_size = args.beam_size
    # self.bertClient = BertClient()

    self.encoder = EncoderLSTM(args.hidden_size, self.vocab.size())
    self.decoder = DecoderLSTM(args.hidden_size, self.vocab.size())
    if use_cuda:
        self.encoder = self.encoder.cuda()
        self.decoder = self.decoder.cuda()

    # Prepare the output folder and files
    output_dir = os.path.join(args.logs, "outputs")
    if not os.path.exists(output_dir):
        os.mkdir(output_dir)
    output_file = os.path.join(output_dir, "decoder_{}.txt".format(args.output_name))
    self.file = open(output_file, "w+")
def __init__(self, model_file_path):
    model_name = os.path.basename(model_file_path)
    self._decode_dir = os.path.join(config.log_root, 'decode_%s' % (model_name))
    # self._rouge_ref_dir = os.path.join(self._decode_dir, 'rouge_ref')
    # self._rouge_dec_dir = os.path.join(self._decode_dir, 'rouge_dec_dir')
    # for p in [self._decode_dir, self._rouge_ref_dir, self._rouge_dec_dir]:
    #     if not os.path.exists(p):
    #         os.mkdir(p)

    self.vocab = Vocab(config.vocab_path, config.vocab_size)

    # self.batcher = Batcher(config.decode_data_path, self.vocab, mode='decode',
    #                        batch_size=config.beam_size, single_pass=True)
    decode_data_path = "/Users/rowancassius/Desktop/pointer_summarizer-master/training_ptr_gen/decode_file.txt"
    # decode_data_path = "/Users/rowancassius/Desktop/pointer_summarizer-master/training_ptr_gen/data_file.txt"
    self.batcher = Batcher(data_path=decode_data_path, vocab=self.vocab, mode='decode',
                           batch_size=config.beam_size, single_pass=True)
    # time.sleep(15)
    time.sleep(2)

    self.model = Model(model_file_path, is_eval=True)
def __init__(self):
    self.vocab = Vocab(config.vocab_path, config.vocab_size)
    self.batcher = Batcher(config.train_data_path, self.vocab, mode='train',
                           batch_size=config.batch_size, single_pass=False)
    time.sleep(15)
    self.val_batcher = Batcher(config.eval_data_path, self.vocab, mode='eval',
                               batch_size=config.batch_size, single_pass=False)
    time.sleep(15)

    train_dir = os.path.join(config.log_root, 'train_%d' % (int(time.time())))
    if not os.path.exists(train_dir):
        os.mkdir(train_dir)

    self.model_dir = os.path.join(train_dir, 'model')
    if not os.path.exists(self.model_dir):
        os.mkdir(self.model_dir)

    self.summary_writer = tf.compat.v1.summary.FileWriter(train_dir)
class Evaluate(object):
    def __init__(self, model_file_path):
        self.vocab = Vocab(config.vocab_path, config.vocab_size)
        self.batcher = Batcher(config.decode_data_path, self.vocab, 'eval',
                               config.batch_size, single_pass=True)
        time.sleep(5)

        eval_dir = os.path.join(config.log_root, 'eval_%d' % (int(time.time())))
        if not os.path.exists(eval_dir):
            os.mkdir(eval_dir)
        self.summary_writer = tf.summary.FileWriter(eval_dir)

        self.model = Model(model_file_path, is_eval=True)

    def eval(self, batch):
        enc_batch, enc_padding_mask, enc_lens, enc_batch_extend_vocab, extra_zeros, c_t_1 = \
            get_input_from_batch(batch, use_cuda)
        dec_batch, dec_padding_mask, max_dec_len, dec_lens_var, target_batch = \
            get_output_from_batch(batch, use_cuda)

        encoder_outputs, encoder_hidden = self.model.encoder(enc_batch, enc_lens)
        s_t_1 = self.model.reduce_state(encoder_hidden)

        step_losses = []
        for di in range(min(max_dec_len, config.max_dec_steps)):
            y_t_1 = dec_batch[:, di]  # Teacher forcing
            final_dist, s_t, c_t, _, _ = self.model.decoder(y_t_1, s_t_1, encoder_outputs,
                                                            enc_padding_mask, c_t_1,
                                                            extra_zeros, enc_batch_extend_vocab)
            target = target_batch[:, di]
            gold_probs = torch.gather(final_dist, 1, target.unsqueeze(1)).squeeze()
            step_loss = -torch.log(gold_probs + config.eps)
            step_mask = dec_padding_mask[:, di]
            step_loss = step_loss * step_mask
            step_losses.append(step_loss)

        sum_step_losses = torch.sum(torch.stack(step_losses, 1), 1)
        batch_avg_loss = sum_step_losses / dec_lens_var
        loss = torch.mean(batch_avg_loss)

        return loss.data[0]

    def run_eval(self):
        start = time.time()
        running_avg_loss, iter = 0, 0
        batch = self.batcher.next_batch()
        while batch is not None:
            loss = self.eval(batch)
            running_avg_loss = calc_running_avg_loss(loss, running_avg_loss, self.summary_writer, iter)
            iter += 1

            if iter % 100 == 0:
                self.summary_writer.flush()
            print_interval = 1000
            if iter % print_interval == 0:
                print('steps %d, seconds for %d batch: %.2f , loss: %f' % (
                    iter, print_interval, time.time() - start, loss))
                start = time.time()
            batch = self.batcher.next_batch()
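# The run_eval loop above relies on a calc_running_avg_loss helper that is not shown in this
# snippet. A minimal sketch of what such a helper typically looks like in these codebases,
# assuming an exponentially decayed running average logged to the TF1-style summary writer
# (the decay constant and the clipping value are illustrative assumptions):
def calc_running_avg_loss(loss, running_avg_loss, summary_writer, step, decay=0.99):
    if running_avg_loss == 0:
        # On the first step, initialize the average with the first observed loss.
        running_avg_loss = loss
    else:
        running_avg_loss = running_avg_loss * decay + (1 - decay) * loss
    running_avg_loss = min(running_avg_loss, 12)  # clip so early spikes do not dominate the plot
    summary = tf.Summary()
    summary.value.add(tag='running_avg_loss/decay=%f' % decay, simple_value=running_avg_loss)
    summary_writer.add_summary(summary, step)
    return running_avg_loss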
def __init__(self, data_path, opt, batch_size=config.batch_size):
    self.vocab = Vocab(config.vocab_path, config.vocab_size)
    self.batcher = Batcher(data_path, self.vocab, mode='eval',
                           batch_size=batch_size, single_pass=True)
    self.opt = opt
    time.sleep(5)
def __init__(self):
    self.vocab = Vocab(VOCAB_PATH, VOCAB_SIZE)
    self.batcher = Batcher(TRAIN_DATA_PATH, self.vocab, mode='train',
                           batch_size=BATCH_SIZE, single_pass=False)
    self.start_id = self.vocab.word2id(data.START_DECODING)
    self.end_id = self.vocab.word2id(data.STOP_DECODING)
    self.pad_id = self.vocab.word2id(data.PAD_TOKEN)
    self.unk_id = self.vocab.word2id(data.UNKNOWN_TOKEN)
    self.model = MyModel().to(DEVICE)
    self.optimizer = torch.optim.Adam(self.model.parameters(), lr=LR)
def __init__(self):
    self.vocab = Vocab(config.vocab_path, config.vocab_size)
    self.train_batcher = Batcher(config.train_data_path, self.vocab, hps=config.hps, single_pass=False)
    self.val_batcher = Batcher(config.eval_data_path, self.vocab, hps=config.hps, single_pass=False)
def __init__(self, opt):
    self.vocab = Vocab(config.vocab_path, config.vocab_size)
    self.batcher = Batcher(config.train_data_path, self.vocab, mode='train',
                           batch_size=config.batch_size, single_pass=False)
    self.opt = opt
    self.start_id = self.vocab.word2id(data.START_DECODING)
    self.end_id = self.vocab.word2id(data.STOP_DECODING)
    self.pad_id = self.vocab.word2id(data.PAD_TOKEN)
    self.unk_id = self.vocab.word2id(data.UNKNOWN_TOKEN)
    time.sleep(5)
def __init__(self, opt):
    '''
    opt needs to contain:
        - model_file_path
        - n_best
        - max_token_seq_len
    '''
    self.opt = opt
    self.device = torch.device('cuda' if use_cuda else 'cpu')

    print("Max article len", config.max_article_len)
    model = Model(config.vocab_size, config.vocab_size, config.max_article_len)

    checkpoint = torch.load(opt["model_file_path"],
                            map_location=lambda storage, location: storage)
    # model saved as:
    # state = {
    #     'iter': iter,
    #     'transformer_state_dict': self.model.state_dict(),
    #     'optimizer': self.optimizer.state_dict(),
    #     'current_loss': running_avg_loss
    # }
    model.load_state_dict(checkpoint['transformer_state_dict'])
    print('[Info] Trained model state loaded.')

    # model.word_prob_prj = nn.LogSoftmax(dim=1)
    self.model = model.to(self.device)
    self.model.eval()

    self._decode_dir = os.path.join(
        config.log_root, 'decode_%s' % (opt["model_file_path"].split("/")[-1]))
    self._rouge_ref_dir = os.path.join(self._decode_dir, 'rouge_ref')
    self._rouge_dec_dir = os.path.join(self._decode_dir, 'rouge_dec_dir')
    for p in [self._decode_dir, self._rouge_ref_dir, self._rouge_dec_dir]:
        if not os.path.exists(p):
            os.mkdir(p)

    self.vocab = Vocab(config.vocab_path, config.vocab_size)
    self.batcher = Batcher(config.decode_data_path, self.vocab, mode='decode',
                           batch_size=config.batch_size, single_pass=True)
    time.sleep(15)
    print('[Info] Summarizer object created.')
def __init__(self, model_file_path):
    self.vocab = Vocab(config.vocab_path, config.vocab_size)
    self.batcher = Batcher(config.eval_data_path, self.vocab, mode='eval',
                           batch_size=config.batch_size, single_pass=True)
    time.sleep(15)

    model_name = os.path.basename(model_file_path)
    eval_dir = os.path.join(config.log_root, 'eval_%s' % (model_name))
    if not os.path.exists(eval_dir):
        os.mkdir(eval_dir)
    self.summary_writer = tf.summary.FileWriter(eval_dir)
def __init__(self, model_file_path):
    self.vocab = Vocab(config.vocab_path, config.vocab_size)
    self.batcher = Batcher(config.decode_data_path, self.vocab, 'eval',
                           config.batch_size, single_pass=True)
    time.sleep(5)

    eval_dir = os.path.join(config.log_root, 'eval_%d' % (int(time.time())))
    if not os.path.exists(eval_dir):
        os.mkdir(eval_dir)
    self.summary_writer = tf.summary.FileWriter(eval_dir)

    self.model = Model(model_file_path, is_eval=True)
def __init__(self, model_file_path, destination_dir):
    self.vocab = Vocab(config.vocab_path, config.vocab_size)
    self.batcher = Batcher(config.encode_data_path, self.vocab, mode='encode',
                           batch_size=config.batch_size, single_pass=True)
    time.sleep(5)
    self.output = {}
    self.destination_dir = destination_dir
    self.model = Model(model_file_path, is_eval=True)
def __init__(self, model_file_path, model_type="stem", load_batcher=True):
    self.vocab = Vocab(config.vocab_path, config.vocab_size)
    if load_batcher:
        self.batcher = Batcher(config.decode_data_path, self.vocab, mode='decode',
                               batch_size=config.beam_size, single_pass=True)
        time.sleep(15)
    self.model = Model(model_file_path, is_eval=True)
    self.model_type = model_type
def load_batches_decode():
    vocab = Vocab(config.vocab_path, config.vocab_size)
    batcher = Batcher(config.decode_data_path, vocab, mode='decode',
                      batch_size=config.beam_size, single_pass=True)
    batches = [None for _ in range(TEST_DATA_SIZE)]
    for i in range(TEST_DATA_SIZE):
        batch = batcher.next_batch()
        batches[i] = batch
    with open("lib/data/batches_test.vocab{}.beam{}.pk.bin".format(vocab.size(), config.beam_size), "wb") as f:
        pickle.dump(batches, f)
def __init__(self):
    self.vocab = Vocab(config.vocab_path, config.vocab_size)
    self.batcher = Batcher(config.train_data_path, self.vocab, mode='train',
                           batch_size=config.batch_size, single_pass=False)
    time.sleep(15)

    # train_dir = os.path.join(config.log_root, 'train_%d' % (int(time.time())))
    train_dir = './train_log'
    if not os.path.exists(train_dir):
        os.mkdir(train_dir)

    self.model_dir = os.path.join(train_dir, 'model')
    if not os.path.exists(self.model_dir):
        os.mkdir(self.model_dir)
def __init__(self, model_file_path):
    self._decode_dir = os.path.join(config.log_root, 'decode_%d' % (int(time.time())))
    self._rouge_ref_dir = os.path.join(self._decode_dir, 'rouge_ref')
    self._rouge_dec_dir = os.path.join(self._decode_dir, 'rouge_dec_dir')
    for p in [self._decode_dir, self._rouge_ref_dir, self._rouge_dec_dir]:
        if not os.path.exists(p):
            os.mkdir(p)

    self.vocab = Vocab(config.vocab_path, config.vocab_size)
    self.batcher = Batcher(config.decode_data_path, self.vocab, mode='decode',
                           batch_size=config.beam_size, single_pass=True)
    time.sleep(15)

    self.model = Model(model_file_path, is_eval=True)
def load_batches_train():
    vocab = Vocab(config.vocab_path, config.vocab_size)
    batcher = Batcher(config.decode_data_path, vocab, mode='train',
                      batch_size=config.batch_size, single_pass=False)
    TRAIN_DATA_SIZE = 287226
    num_batches = int(TRAIN_DATA_SIZE / config.batch_size)
    batches = [None for _ in range(num_batches)]
    for i in tqdm(range(num_batches)):
        batch = batcher.next_batch()
        batches[i] = batch
    with open("lib/data/batches_train.vocab{}.batch{}.pk.bin".format(vocab.size(), config.batch_size), "wb") as f:
        pickle.dump(batches, f)
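# Counterpart to the two dump helpers above: a minimal sketch of reading the pickled batches
# back for training or decoding. The file-name pattern mirrors the dump above; the function
# name and the filtering of trailing None entries are illustrative assumptions.
def load_pickled_batches(vocab, batch_size):
    path = "lib/data/batches_train.vocab{}.batch{}.pk.bin".format(vocab.size(), batch_size)
    with open(path, "rb") as f:
        batches = pickle.load(f)
    # Drop any None entries left by an exhausted batcher.
    return [b for b in batches if b is not None]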
def __init__(self, args):
    self.hparams = hp()
    self.model = Model(self.hparams)
    self.vocab = Vocab(config.vocab_path, self.hparams.vocab_size)
    self.batcher = Batcher(config.train_data_path, self.vocab, mode='train',
                           batch_size=self.hparams.batch_size, single_pass=False)
    self.args = args
    self.start_id = self.vocab.word2id(data.START_DECODING)
    self.end_id = self.vocab.word2id(data.STOP_DECODING)
    self.pad_id = self.vocab.word2id(data.PAD_TOKEN)
    self.unk_id = self.vocab.word2id(data.UNKNOWN_TOKEN)
    time.sleep(3)
def __init__(self):
    self.vocab = Vocab(config.vocab_path, config.vocab_size)
    self.batcher = Batcher(config.train_data_path, self.vocab, mode='train',
                           batch_size=config.batch_size, single_pass=False)
    # print("MODE MUST BE train")
    # time.sleep(15)
    self.print_interval = config.print_interval

    train_dir = config.train_dir
    if not os.path.exists(train_dir):
        os.mkdir(train_dir)

    self.model_dir = train_dir
    if not os.path.exists(self.model_dir):
        os.mkdir(self.model_dir)
def __init__(self, use_elmo=False, finetune_glove=False):
    self.vocab = Vocab(config.vocab_path, config.vocab_size)
    self.batcher = Batcher(config.train_data_path, self.vocab, mode='train',
                           batch_size=config.batch_size, single_pass=False)
    self.use_elmo = use_elmo
    self.finetune_glove = finetune_glove
    time.sleep(15)

    # NOTE: train_dir is not defined in this snippet; it is expected to come from
    # the enclosing module (e.g. a module-level constant).
    self.model_dir = os.path.join(train_dir, 'model')
    if not os.path.exists(self.model_dir):
        os.mkdir(self.model_dir)

    self.summary_writer = tf.compat.v1.summary.FileWriter(train_dir)
def __init__(self, model_file_or_model, vocab=None):
    if vocab is None:
        self.vocab = Vocab(config.vocab_path, config.vocab_size)
    else:
        assert isinstance(vocab, Vocab)
        self.vocab = vocab
    self.batcher = Batcher(config.eval_data_path, self.vocab, mode='eval',
                           batch_size=config.batch_size, single_pass=True)
    time.sleep(15)

    if isinstance(model_file_or_model, str):
        self.model = Model(device, model_file_or_model, is_eval=True)
    elif isinstance(model_file_or_model, Model):
        self.model = model_file_or_model
    else:
        raise ValueError("Cannot build model from type %s" % type(model_file_or_model))
class Encode(object):
    def __init__(self, model_file_path, destination_dir):
        self.vocab = Vocab(config.vocab_path, config.vocab_size)
        self.batcher = Batcher(config.encode_data_path, self.vocab, mode='encode',
                               batch_size=config.batch_size, single_pass=True)
        time.sleep(5)
        self.output = {}
        self.destination_dir = destination_dir
        self.model = Model(model_file_path, is_eval=True)

    def save_output(self, output, destination_dir):
        if destination_dir is None:
            torch.save(output, "output")
        else:
            torch.save(output, destination_dir)

    def encode_one_batch(self, batch):
        enc_batch, enc_padding_mask, enc_lens, enc_batch_extend_vocab, extra_zeros, c_t_1, coverage = \
            get_input_from_batch(batch, use_cuda)
        encoder_outputs, encoder_feature, encoder_hidden = self.model.encoder(enc_batch, enc_lens)
        h, c = self.model.reduce_state(encoder_hidden)
        h, c = h.squeeze(0), c.squeeze(0)
        encodes = torch.cat((h, c), 1)
        for id, encode in zip(batch.original_abstracts, encodes):
            print(encode)
            self.output[id] = encode

    def run_encode(self):
        start = time.time()
        batch = self.batcher.next_batch()
        while batch is not None:
            self.encode_one_batch(batch)
            batch = self.batcher.next_batch()
        self.save_output(self.output, self.destination_dir)
def __init__(self, opt, vocab, logger, writer, train_num):
    self.vocab = vocab
    self.train_batcher = Batcher(config.train_data_path, self.vocab, mode='train',
                                 batch_size=config.batch_size, single_pass=False)
    self.test_batcher = Batcher(config.test_data_path, self.vocab, mode='eval',
                                batch_size=config.batch_size, single_pass=True)
    self.opt = opt
    self.start_id = self.vocab.word2id(data.START_DECODING)
    self.end_id = self.vocab.word2id(data.STOP_DECODING)
    self.pad_id = self.vocab.word2id(data.PAD_TOKEN)
    self.unk_id = self.vocab.word2id(data.UNKNOWN_TOKEN)
    self.logger = logger
    self.writer = writer
    self.train_num = train_num
    time.sleep(5)
def __init__(self, args, model_file_path, save_path):
    self.args = args
    model_name = os.path.basename(model_file_path)
    self._decode_dir = os.path.join(config.log_root, save_path, 'decode_%s' % (model_name))
    self._rouge_ref_dir = os.path.join(self._decode_dir, 'rouge_ref')
    self._rouge_dec_dir = os.path.join(self._decode_dir, 'rouge_dec_dir')
    for p in [self._decode_dir, self._rouge_ref_dir, self._rouge_dec_dir]:
        if not os.path.exists(p):
            os.mkdir(p)

    vocab = args.vocab_path if args.vocab_path is not None else config.vocab_path
    self.vocab = Vocab(vocab, config.vocab_size, config.embedding_file)
    self.batcher = Batcher(args.decode_data_path, self.vocab, mode='decode',
                           batch_size=args.beam_size, single_pass=True, args=args)
    time.sleep(15)

    self.model = Model(self.vocab, model_file_path, is_eval=True)
def __init__(self, model_file_path):
    self.vocab = Vocab(config.vocab_path, config.vocab_size)
    self.batcher = Batcher(config.eval_data_path, self.vocab, mode='eval',
                           batch_size=config.batch_size, single_pass=True)
    time.sleep(15)

    model_name = os.path.basename(model_file_path)
    eval_dir = os.path.join(config.log_root, 'eval_%s' % (model_name))
    if not os.path.exists(eval_dir):
        os.mkdir(eval_dir)
    self.summary_writer = tf.summary.FileWriter(eval_dir)

    self.model = Model(model_file_path, is_eval=True)
def __init__(self, model_file_path=None):
    self.vocab = Vocab(config.vocab_path, config.vocab_size)
    self.batcher = Batcher(config.train_data_path, self.vocab, mode='train',
                           batch_size=config.batch_size, single_pass=False)
    time.sleep(15)

    if not model_file_path:
        train_dir = os.path.join(config.log_root, 'train_%d' % (int(time.time())))
        if not os.path.exists(train_dir):
            os.mkdir(train_dir)
    else:
        train_dir = re.sub('/model/model.*', '', model_file_path)

    self.model_dir = os.path.join(train_dir, 'model')
    if not os.path.exists(self.model_dir):
        os.mkdir(self.model_dir)

    self.summary_writer = tf.summary.create_file_writer(train_dir)
def __init__(self):
    self.vocab = Vocab(config.vocab_path, config.vocab_size)
    self.batcher = Batcher(config.train_data_path, self.vocab, mode='train',
                           batch_size=config.batch_size, single_pass=False)
    time.sleep(15)

    train_dir = os.path.join(config.ouput_root, 'train_%d' % (int(time.time())))
    if not os.path.exists(train_dir):
        os.makedirs(train_dir)

    self.checkpoint_dir = os.path.join(train_dir, 'checkpoints')
    if not os.path.exists(self.checkpoint_dir):
        os.makedirs(self.checkpoint_dir)

    self.train_summary_writer = tf.summary.create_file_writer(
        os.path.join(train_dir, 'log', 'train'))
    self.eval_summary_writer = tf.summary.create_file_writer(
        os.path.join(train_dir, 'log', 'eval'))
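# The create_file_writer objects above are TF2-style summary writers. A minimal usage sketch,
# assuming the training loop logs one scalar per step; the helper name, tag, and step variable
# are illustrative assumptions, not part of the original snippet.
def log_scalar(summary_writer, tag, value, step):
    with summary_writer.as_default():
        tf.summary.scalar(tag, value, step=step)
    summary_writer.flush()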
def __init__(self):
    """
    Input:
        vocab_path = "xxx/finished_files/vocab",
        vocab_size = 50000
    Output:
        class object: self.vocab --> (dicts `_word_to_id` and `_id_to_word`)
    """
    self.vocab = Vocab(config.vocab_path, config.vocab_size)

    """
    Input:
        train_data_path = "xxx/finished_files/chunked/train_*",
        self.vocab: class object,
        mode = 'train' for training,
        batch_size = 8,
        single_pass = False
    Output:
        class object: self.batcher --> queues of batched examples built from the chunked training files
    """
    self.batcher = Batcher(config.train_data_path, self.vocab, mode='train',
                           batch_size=config.batch_size, single_pass=False)
    time.sleep(15)

    train_dir = os.path.join(config.log_root, 'train_%d' % (int(time.time())))
    if not os.path.exists(train_dir):
        os.mkdir(train_dir)

    self.model_dir = os.path.join(train_dir, 'model')
    if not os.path.exists(self.model_dir):
        os.mkdir(self.model_dir)

    self.summary_writer = tf.summary.FileWriter(train_dir)
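# Illustration of the Vocab mapping described in the docstring above: word2id falls back to the
# UNK id for out-of-vocabulary tokens, and id2word inverts the mapping. A minimal sketch,
# assuming the same config/data modules imported by the surrounding snippets; the function name
# and the sample tokens are illustrative.
def vocab_round_trip_example():
    vocab = Vocab(config.vocab_path, config.vocab_size)
    unk_id = vocab.word2id(data.UNKNOWN_TOKEN)
    word = 'the'
    assert vocab.id2word(vocab.word2id(word)) == word
    # Any token missing from the vocab file maps to the UNK id.
    assert vocab.word2id('some-token-not-in-the-vocab') == unk_id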
def __init__(self):
    self.vocab = Vocab(args.vocab_path, args.vocab_size)
    sys.stdout.flush()
    self.batcher = Batcher(args.train_data_path, self.vocab, mode='train',
                           batch_size=args.batch_size, single_pass=False)
    time.sleep(15)

    vocab_size = self.vocab.size()
    self.model = BertLSTMModel(args.hidden_size, self.vocab.size(), args.max_dec_steps)
    # self.model = Seq2SeqLSTM(args.hidden_size, self.vocab.size(), args.max_dec_steps)
    if use_cuda:
        self.model = self.model.cuda()
    self.model_optimizer = torch.optim.Adam(self.model.parameters(), lr=args.lr)

    train_logs = os.path.join(args.logs, "train_logs")
    eval_logs = os.path.join(args.logs, "eval_logs")
    self.train_summary_writer = tf.summary.FileWriter(train_logs)
    self.eval_summary_writer = tf.summary.FileWriter(eval_logs)
def __init__(self, model_file_path):
    model_name = os.path.basename(model_file_path)
    self._decode_dir = os.path.join(config.log_root, 'decode_%s' % (model_name))
    self._rouge_ref_dir = os.path.join(self._decode_dir, 'rouge_ref')
    self._rouge_dec_dir = os.path.join(self._decode_dir, 'rouge_dec_dir')
    for p in [self._decode_dir, self._rouge_ref_dir, self._rouge_dec_dir]:
        if not os.path.exists(p):
            os.mkdir(p)

    self.vocab = Vocab(config.vocab_path, config.vocab_size)
    self.batcher = Batcher(config.decode_data_path, self.vocab, mode='decode',
                           batch_size=config.beam_size, single_pass=True)
    time.sleep(15)

    self.model = Model(model_file_path, is_eval=True)
class BeamSearch(object):
    def __init__(self, model_file_path):
        model_name = os.path.basename(model_file_path)
        self._decode_dir = os.path.join(config.log_root, 'decode_%s' % (model_name))
        self._rouge_ref_dir = os.path.join(self._decode_dir, 'rouge_ref')
        self._rouge_dec_dir = os.path.join(self._decode_dir, 'rouge_dec_dir')
        for p in [self._decode_dir, self._rouge_ref_dir, self._rouge_dec_dir]:
            if not os.path.exists(p):
                os.mkdir(p)

        self.vocab = Vocab(config.vocab_path, config.vocab_size)
        self.batcher = Batcher(config.decode_data_path, self.vocab, mode='decode',
                               batch_size=config.beam_size, single_pass=True)
        time.sleep(15)

        self.model = Model(model_file_path, is_eval=True)

    def sort_beams(self, beams):
        return sorted(beams, key=lambda h: h.avg_log_prob, reverse=True)

    def decode(self):
        start = time.time()
        counter = 0
        batch = self.batcher.next_batch()
        while batch is not None:
            # Run beam search to get best Hypothesis
            best_summary = self.beam_search(batch)

            # Extract the output ids from the hypothesis and convert back to words
            output_ids = [int(t) for t in best_summary.tokens[1:]]
            decoded_words = data.outputids2words(output_ids, self.vocab,
                                                 (batch.art_oovs[0] if config.pointer_gen else None))

            # Remove the [STOP] token from decoded_words, if necessary
            try:
                fst_stop_idx = decoded_words.index(data.STOP_DECODING)
                decoded_words = decoded_words[:fst_stop_idx]
            except ValueError:
                decoded_words = decoded_words

            original_abstract_sents = batch.original_abstracts_sents[0]

            write_for_rouge(original_abstract_sents, decoded_words, counter,
                            self._rouge_ref_dir, self._rouge_dec_dir)
            counter += 1
            if counter % 1000 == 0:
                print('%d example in %d sec' % (counter, time.time() - start))
                start = time.time()
            batch = self.batcher.next_batch()

        print("Decoder has finished reading dataset for single_pass.")
        print("Now starting ROUGE eval...")
        results_dict = rouge_eval(self._rouge_ref_dir, self._rouge_dec_dir)
        rouge_log(results_dict, self._decode_dir)

    def beam_search(self, batch):
        # batch should have only one example
        enc_batch, enc_padding_mask, enc_lens, enc_batch_extend_vocab, extra_zeros, c_t_0, coverage_t_0 = \
            get_input_from_batch(batch, use_cuda)

        encoder_outputs, encoder_hidden, max_encoder_output = self.model.encoder(enc_batch, enc_lens)
        s_t_0 = self.model.reduce_state(encoder_hidden)

        if config.use_maxpool_init_ctx:
            c_t_0 = max_encoder_output

        dec_h, dec_c = s_t_0  # 1 x 2*hidden_size
        dec_h = dec_h.squeeze()
        dec_c = dec_c.squeeze()

        # decoder batch preparation, it has beam_size example initially everything is repeated
        beams = [Beam(tokens=[self.vocab.word2id(data.START_DECODING)],
                      log_probs=[0.0],
                      state=(dec_h[0], dec_c[0]),
                      context=c_t_0[0],
                      coverage=(coverage_t_0[0] if config.is_coverage else None))
                 for _ in xrange(config.beam_size)]

        results = []
        steps = 0
        while steps < config.max_dec_steps and len(results) < config.beam_size:
            latest_tokens = [h.latest_token for h in beams]
            latest_tokens = [t if t < self.vocab.size() else self.vocab.word2id(data.UNKNOWN_TOKEN)
                             for t in latest_tokens]
            y_t_1 = Variable(torch.LongTensor(latest_tokens))
            if use_cuda:
                y_t_1 = y_t_1.cuda()

            all_state_h = []
            all_state_c = []
            all_context = []
            for h in beams:
                state_h, state_c = h.state
                all_state_h.append(state_h)
                all_state_c.append(state_c)
                all_context.append(h.context)

            s_t_1 = (torch.stack(all_state_h, 0).unsqueeze(0), torch.stack(all_state_c, 0).unsqueeze(0))
            c_t_1 = torch.stack(all_context, 0)

            coverage_t_1 = None
            if config.is_coverage:
                all_coverage = []
                for h in beams:
                    all_coverage.append(h.coverage)
                coverage_t_1 = torch.stack(all_coverage, 0)

            final_dist, s_t, c_t, attn_dist, p_gen, coverage_t = self.model.decoder(
                y_t_1, s_t_1, encoder_outputs, enc_padding_mask, c_t_1,
                extra_zeros, enc_batch_extend_vocab, coverage_t_1)

            topk_log_probs, topk_ids = torch.topk(final_dist, config.beam_size * 2)

            dec_h, dec_c = s_t
            dec_h = dec_h.squeeze()
            dec_c = dec_c.squeeze()

            all_beams = []
            num_orig_beams = 1 if steps == 0 else len(beams)
            for i in xrange(num_orig_beams):
                h = beams[i]
                state_i = (dec_h[i], dec_c[i])
                context_i = c_t[i]
                coverage_i = (coverage_t[i] if config.is_coverage else None)

                for j in xrange(config.beam_size * 2):  # for each of the top 2*beam_size hyps:
                    new_beam = h.extend(token=topk_ids[i, j].data[0],
                                        log_prob=topk_log_probs[i, j].data[0],
                                        state=state_i,
                                        context=context_i,
                                        coverage=coverage_i)
                    all_beams.append(new_beam)

            beams = []
            for h in self.sort_beams(all_beams):
                if h.latest_token == self.vocab.word2id(data.STOP_DECODING):
                    if steps >= config.min_dec_steps:
                        results.append(h)
                else:
                    beams.append(h)
                if len(beams) == config.beam_size or len(results) == config.beam_size:
                    break

            steps += 1

        if len(results) == 0:
            results = beams

        beams_sorted = self.sort_beams(results)

        return beams_sorted[0]
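# Typical driver for the BeamSearch class above: take a checkpoint path, build the decoder, and
# run single-pass beam-search decoding over config.decode_data_path. A minimal sketch; the
# command-line handling is an assumption, not part of the original snippet.
if __name__ == '__main__':
    model_filename = sys.argv[1]  # path to a trained model checkpoint
    beam_processor = BeamSearch(model_filename)
    beam_processor.decode()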
class Evaluate(object):
    def __init__(self, model_file_path):
        self.vocab = Vocab(config.vocab_path, config.vocab_size)
        self.batcher = Batcher(config.eval_data_path, self.vocab, mode='eval',
                               batch_size=config.batch_size, single_pass=True)
        time.sleep(15)
        model_name = os.path.basename(model_file_path)

        eval_dir = os.path.join(config.log_root, 'eval_%s' % (model_name))
        if not os.path.exists(eval_dir):
            os.mkdir(eval_dir)
        self.summary_writer = tf.summary.FileWriter(eval_dir)

        self.model = Model(model_file_path, is_eval=True)

    def eval_one_batch(self, batch):
        enc_batch, enc_padding_mask, enc_lens, enc_batch_extend_vocab, extra_zeros, c_t_1, coverage = \
            get_input_from_batch(batch, use_cuda)
        dec_batch, dec_padding_mask, max_dec_len, dec_lens_var, target_batch = \
            get_output_from_batch(batch, use_cuda)

        encoder_outputs, encoder_hidden, max_encoder_output = self.model.encoder(enc_batch, enc_lens)
        s_t_1 = self.model.reduce_state(encoder_hidden)

        if config.use_maxpool_init_ctx:
            c_t_1 = max_encoder_output

        step_losses = []
        for di in range(min(max_dec_len, config.max_dec_steps)):
            y_t_1 = dec_batch[:, di]  # Teacher forcing
            final_dist, s_t_1, c_t_1, attn_dist, p_gen, coverage = self.model.decoder(
                y_t_1, s_t_1, encoder_outputs, enc_padding_mask, c_t_1,
                extra_zeros, enc_batch_extend_vocab, coverage)
            target = target_batch[:, di]
            gold_probs = torch.gather(final_dist, 1, target.unsqueeze(1)).squeeze()
            step_loss = -torch.log(gold_probs + config.eps)
            if config.is_coverage:
                step_coverage_loss = torch.sum(torch.min(attn_dist, coverage), 1)
                step_loss = step_loss + config.cov_loss_wt * step_coverage_loss

            step_mask = dec_padding_mask[:, di]
            step_loss = step_loss * step_mask
            step_losses.append(step_loss)

        sum_step_losses = torch.sum(torch.stack(step_losses, 1), 1)
        batch_avg_loss = sum_step_losses / dec_lens_var
        loss = torch.mean(batch_avg_loss)

        return loss.data[0]

    def run_eval(self):
        running_avg_loss, iter = 0, 0
        start = time.time()
        batch = self.batcher.next_batch()
        while batch is not None:
            loss = self.eval_one_batch(batch)

            running_avg_loss = calc_running_avg_loss(loss, running_avg_loss, self.summary_writer, iter)
            iter += 1

            if iter % 100 == 0:
                self.summary_writer.flush()
            print_interval = 1000
            if iter % print_interval == 0:
                print('steps %d, seconds for %d batch: %.2f , loss: %f' % (
                    iter, print_interval, time.time() - start, running_avg_loss))
                start = time.time()
            batch = self.batcher.next_batch()
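# For reference, the coverage term added to step_loss above follows See et al. (2017):
# the coverage vector accumulates past attention, c^t = sum over t' < t of a^{t'}, and the
# extra penalty at step t is covloss_t = sum_i min(a_i^t, c_i^t), weighted by config.cov_loss_wt.
# That is exactly what torch.sum(torch.min(attn_dist, coverage), 1) computes per example.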