class BeamSearch(object):
    def __init__(self, model_file_path):
        model_name = re.findall(r'train_\d+', model_file_path)[0] + '_' + \
                     re.findall(r'model_\d+_\d+\.\d+', model_file_path)[0]
        print('o MODEL NAME: ', model_name)
        self._decode_dir = os.path.join(config.log_root,
                                        'decode_%s' % (model_name))
        self._rouge_ref_dir = os.path.join(self._decode_dir, 'rouge_ref')
        self._rouge_dec_dir = os.path.join(self._decode_dir, 'rouge_dec_dir')
        for p in [self._decode_dir, self._rouge_ref_dir, self._rouge_dec_dir]:
            if not os.path.exists(p):
                os.mkdir(p)

        self.vocab = Vocab(config.vocab_path, config.vocab_size)
        self.batcher = Batcher(
            config.decode_data_path,
            self.vocab,
            mode='decode',
            batch_size=config.beam_size,
            single_pass=True)
        self.model = Model(model_file_path, is_eval=True)

    def sort_beams(self, beams):
        # Rank hypotheses by length-normalised log probability, best first.
        return sorted(beams, key=lambda h: h.avg_log_prob, reverse=True)

    def decode(self):
        start = time.time()
        counter = 0
        batch = self.batcher.next_batch()
        while batch is not None:  # and counter <= 100  # 11490
            # Run beam search to get best Hypothesis
            best_summary = self.beam_search(batch)

            # Extract the output ids from the hypothesis and convert back to words
            output_ids = [int(t) for t in best_summary.tokens[1:]]
            decoded_words = data.outputids2words(
                output_ids, self.vocab,
                (batch.art_oovs[0] if config.pointer_gen else None))

            # Remove the [STOP] token from decoded_words, if necessary
            try:
                fst_stop_idx = decoded_words.index(data.STOP_DECODING)
                decoded_words = decoded_words[:fst_stop_idx]
            except ValueError:
                decoded_words = decoded_words

            original_abstract_sents = batch.original_abstracts_sents[0]

            write_for_rouge(original_abstract_sents, decoded_words, counter,
                            self._rouge_ref_dir, self._rouge_dec_dir)
            counter += 1
            if counter % 10 == 0:
                print('%d examples in %d sec' % (counter, time.time() - start))
                start = time.time()
            batch = self.batcher.next_batch()

        print("Decoder has finished reading dataset for single_pass.")
        print("Now starting ROUGE eval...")
        results_dict = rouge_eval(self._rouge_ref_dir, self._rouge_dec_dir)
        rouge_log(results_dict, self._decode_dir)

    def beam_search(self, batch):
        # The batch should have only one example
        enc_batch, enc_padding_mask, enc_lens, enc_batch_extend_vocab, extra_zeros, c_t_0, coverage_t_0 = \
            get_input_from_batch(batch)

        encoder_outputs, encoder_feature, encoder_hidden = self.model.encoder(
            enc_batch, enc_lens)
        s_t_0 = self.model.reduce_state(encoder_hidden)

        dec_h, dec_c = s_t_0  # 1 x 2*hidden_size
        dec_h = dec_h.squeeze()
        dec_c = dec_c.squeeze()

        # Prepare decoder batch: every beam starts from START_DECODING and the
        # first encoder/reduce_state outputs.
        beams = [
            Beam(tokens=[self.vocab.word2id(data.START_DECODING)],
                 log_probs=[0.0],
                 state=(dec_h[0], dec_c[0]),
                 context=c_t_0[0],
                 coverage=(coverage_t_0[0] if config.is_coverage else None))
            for _ in range(config.beam_size)
        ]
        results = []
        steps = 0
        while steps < config.max_dec_steps and len(results) < config.beam_size:
            latest_tokens = [h.latest_token for h in beams]
            # Map copied OOV ids back to UNK before feeding the decoder embedding.
            latest_tokens = [
                t if t < self.vocab.size() else self.vocab.word2id(data.UNKNOWN_TOKEN)
                for t in latest_tokens
            ]
            y_t_1 = paddle.to_tensor(latest_tokens)
            all_state_h = []
            all_state_c = []
            all_context = []

            for h in beams:
                state_h, state_c = h.state
                all_state_h.append(state_h)
                all_state_c.append(state_c)
                all_context.append(h.context)

            s_t_1 = (paddle.stack(all_state_h, 0).unsqueeze(0),
                     paddle.stack(all_state_c, 0).unsqueeze(0))
            c_t_1 = paddle.stack(all_context, 0)

            coverage_t_1 = None
            if config.is_coverage:
                all_coverage = []
                for h in beams:
                    all_coverage.append(h.coverage)
                coverage_t_1 = paddle.stack(all_coverage, 0)

            final_dist, s_t, c_t, attn_dist, p_gen, coverage_t = self.model.decoder(
                y_t_1, s_t_1, encoder_outputs, encoder_feature,
                enc_padding_mask, c_t_1, extra_zeros, enc_batch_extend_vocab,
                coverage_t_1, steps)
            log_probs = paddle.log(final_dist)
            topk_log_probs, topk_ids = paddle.topk(log_probs,
                                                   config.beam_size * 2)

            dec_h, dec_c = s_t
            dec_h = dec_h.squeeze()
            dec_c = dec_c.squeeze()

            all_beams = []
            num_orig_beams = 1 if steps == 0 else len(beams)
            for i in range(num_orig_beams):
                h = beams[i]
                state_i = (dec_h[i], dec_c[i])
                context_i = c_t[i]
                coverage_i = (coverage_t[i] if config.is_coverage else None)

                for j in range(config.beam_size * 2):  # for each of the top 2*beam_size hyps:
                    new_beam = h.extend(token=topk_ids[i, j].numpy()[0],
                                        log_prob=topk_log_probs[i, j].numpy()[0],
                                        state=state_i,
                                        context=context_i,
                                        coverage=coverage_i)
                    all_beams.append(new_beam)

            beams = []
            for h in self.sort_beams(all_beams):
                if h.latest_token == self.vocab.word2id(data.STOP_DECODING):
                    # Completed hypotheses only count once they reach the minimum length.
                    if steps >= config.min_dec_steps:
                        results.append(h)
                else:
                    beams.append(h)
                if len(beams) == config.beam_size or len(results) == config.beam_size:
                    break

            steps += 1

        if len(results) == 0:
            results = beams

        beams_sorted = self.sort_beams(results)

        return beams_sorted[0]
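
# NOTE: the Beam hypothesis class used by beam_search is not part of this
# excerpt. The sketch below is an assumption about its shape, reconstructed
# from how BeamSearch uses it (field names, extend(), latest_token,
# avg_log_prob); the real implementation in the repo may differ.
class Beam(object):
    def __init__(self, tokens, log_probs, state, context, coverage):
        self.tokens = tokens        # decoded token ids so far, starting with START_DECODING
        self.log_probs = log_probs  # per-token log probabilities
        self.state = state          # decoder (h, c) state for this hypothesis
        self.context = context      # last attention context vector
        self.coverage = coverage    # coverage vector, or None when coverage is disabled

    def extend(self, token, log_prob, state, context, coverage):
        # Return a new hypothesis with one more decoded token appended.
        return Beam(tokens=self.tokens + [token],
                    log_probs=self.log_probs + [log_prob],
                    state=state,
                    context=context,
                    coverage=coverage)

    @property
    def latest_token(self):
        return self.tokens[-1]

    @property
    def avg_log_prob(self):
        # Length-normalised score used by sort_beams.
        return sum(self.log_probs) / len(self.tokens)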
class Trainer(object):
    def __init__(self):
        self.vocab = Vocab(config.vocab_path, config.vocab_size)
        self.batcher = Batcher(
            config.train_data_path,
            self.vocab,
            mode='train',
            batch_size=config.batch_size,
            single_pass=False)

        train_dir = os.path.join(config.log_root,
                                 'train_%d' % (int(time.time())))
        if not os.path.exists(config.log_root):
            os.mkdir(config.log_root)
        if not os.path.exists(train_dir):
            os.mkdir(train_dir)

        self.model_dir = os.path.join(train_dir, 'model')
        if not os.path.exists(self.model_dir):
            os.mkdir(self.model_dir)

    def save_model(self, running_avg_loss, iter):
        state = {
            'encoder': self.model.encoder.state_dict(),
            'decoder': self.model.decoder.state_dict(),
            'reduce_state': self.model.reduce_state.state_dict(),
            'optimizer': self.optimizer.state_dict()
        }
        model_save_dir = os.path.join(
            self.model_dir, 'model_%06d_%.8f' % (iter, running_avg_loss))
        for k in state:
            model_save_path = os.path.join(model_save_dir, '%s.params' % k)
            paddle.save(state[k], model_save_path)
        return model_save_dir

    def setup_train(self, model_file_path=None):
        self.model = Model(model_file_path)
        initial_lr = config.lr_coverage if config.is_coverage else config.lr

        params = list(self.model.encoder.parameters()) + list(self.model.decoder.parameters()) + \
                 list(self.model.reduce_state.parameters())
        assert len(params) == 31

        self.optimizer = Adagrad(
            parameters=params,
            learning_rate=initial_lr,
            initial_accumulator_value=config.adagrad_init_acc,
            epsilon=1.0e-10,
            grad_clip=paddle.nn.ClipGradByGlobalNorm(
                clip_norm=config.max_grad_norm))

        start_iter, start_loss = 0, 0

        if model_file_path is not None:
            # The checkpoint directory name encodes the iteration and running loss.
            start_iter = int(model_file_path.split('_')[-2])
            start_loss = float(
                model_file_path.split('_')[-1].replace(os.sep, ''))

            if not config.is_coverage:
                self.optimizer.set_state_dict(
                    paddle.load(
                        os.path.join(model_file_path, 'optimizer.params')))

        return start_iter, start_loss

    def train_one_batch(self, batch, iter):
        enc_batch, enc_padding_mask, enc_lens, enc_batch_extend_vocab, extra_zeros, c_t_1, coverage = \
            get_input_from_batch(batch)
        dec_batch, dec_padding_mask, max_dec_len, dec_lens_var, target_batch = \
            get_output_from_batch(batch)

        self.optimizer.clear_gradients()

        encoder_outputs, encoder_feature, encoder_hidden = self.model.encoder(
            enc_batch, enc_lens)
        s_t_1 = self.model.reduce_state(encoder_hidden)

        step_losses = []
        for di in range(min(max_dec_len, config.max_dec_steps)):
            y_t_1 = dec_batch[:, di]  # teacher forcing: feed the gold previous token
            final_dist, s_t_1, c_t_1, attn_dist, p_gen, next_coverage = \
                self.model.decoder(y_t_1, s_t_1, encoder_outputs,
                                   encoder_feature, enc_padding_mask, c_t_1,
                                   extra_zeros, enc_batch_extend_vocab,
                                   coverage, di)
            target = target_batch[:, di]
            # Gather the probability each example assigns to its gold token.
            add_index = paddle.arange(0, target.shape[0])
            new_index = paddle.stack([add_index, target], axis=1)
            gold_probs = paddle.gather_nd(final_dist, new_index).squeeze()
            step_loss = -paddle.log(gold_probs + config.eps)
            if config.is_coverage:
                step_coverage_loss = paddle.sum(
                    paddle.minimum(attn_dist, coverage), 1)
                step_loss = step_loss + config.cov_loss_wt * step_coverage_loss
                coverage = next_coverage

            # Mask out padded decoder positions.
            step_mask = dec_padding_mask[:, di]
            step_loss = step_loss * step_mask
            step_losses.append(step_loss)

        sum_losses = paddle.sum(paddle.stack(step_losses, 1), 1)
        batch_avg_loss = sum_losses / dec_lens_var
        loss = paddle.mean(batch_avg_loss)

        loss.backward()
        self.optimizer.minimize(loss)

        return loss.numpy()[0]

    def trainIters(self, n_iters, model_file_path=None):
        iter, running_avg_loss = self.setup_train(model_file_path)
        start = time.time()
        while iter < n_iters:
            batch = self.batcher.next_batch()
            loss = self.train_one_batch(batch, iter)

            running_avg_loss = calc_running_avg_loss(loss, running_avg_loss,
                                                     iter)
            iter += 1

            print(
                'global step %d/%d, step loss: %.8f, running avg loss: %.8f, speed: %.2f step/s'
                % (iter, n_iters, loss, running_avg_loss,
                   1.0 / (time.time() - start)))
            start = time.time()
            if iter % 5000 == 0 or iter == 1000:
                model_save_dir = self.save_model(running_avg_loss, iter)
                print(
                    'Saved model for iter %d with running avg loss %.8f to directory: %s'
                    % (iter, running_avg_loss, model_save_dir))
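
# Standalone sketch (not part of the original training code): it replays the
# gold-probability lookup from train_one_batch on toy numbers, to show what the
# paddle.arange / paddle.stack / paddle.gather_nd combination computes. The
# values below are invented for illustration only.
def _demo_gold_prob_lookup():
    import paddle  # assumed to already be imported at module level in the real file

    # Toy final_dist: batch of 3 examples over an extended vocabulary of 5 ids.
    final_dist = paddle.to_tensor([[0.1, 0.2, 0.3, 0.3, 0.1],
                                   [0.5, 0.1, 0.1, 0.2, 0.1],
                                   [0.2, 0.2, 0.2, 0.2, 0.2]])
    target = paddle.to_tensor([2, 0, 4], dtype='int64')      # gold ids at this decoder step

    add_index = paddle.arange(0, target.shape[0])            # row indices [0, 1, 2]
    new_index = paddle.stack([add_index, target], axis=1)    # [[0, 2], [1, 0], [2, 4]]
    gold_probs = paddle.gather_nd(final_dist, new_index)     # -> [0.3, 0.5, 0.2]
    step_loss = -paddle.log(gold_probs + 1e-12)              # per-example negative log likelihood
    print(gold_probs.numpy(), step_loss.numpy())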