Example #1
File: decode.py  Project: Ginnna/PaddleNLP
class BeamSearch(object):
    def __init__(self, model_file_path):
        model_name = re.findall(r'train_\d+', model_file_path)[0] + '_' + \
                     re.findall(r'model_\d+_\d+\.\d+', model_file_path)[0]
        print('o MODEL NAME: ', model_name)
        self._decode_dir = os.path.join(config.log_root,
                                        'decode_%s' % (model_name))
        self._rouge_ref_dir = os.path.join(self._decode_dir, 'rouge_ref')
        self._rouge_dec_dir = os.path.join(self._decode_dir, 'rouge_dec_dir')
        for p in [self._decode_dir, self._rouge_ref_dir, self._rouge_dec_dir]:
            if not os.path.exists(p):
                os.mkdir(p)

        self.vocab = Vocab(config.vocab_path, config.vocab_size)
        self.batcher = Batcher(config.decode_data_path,
                               self.vocab,
                               mode='decode',
                               batch_size=config.beam_size,
                               single_pass=True)
        self.model = Model(model_file_path, is_eval=True)

    def sort_beams(self, beams):
        return sorted(beams, key=lambda h: h.avg_log_prob, reverse=True)

    def decode(self):
        start = time.time()
        counter = 0
        batch = self.batcher.next_batch()
        while batch is not None:
            # Run beam search to get best Hypothesis
            best_summary = self.beam_search(batch)

            # Extract the output ids from the hypothesis and convert back to words
            output_ids = [int(t) for t in best_summary.tokens[1:]]
            decoded_words = data.outputids2words(
                output_ids, self.vocab,
                (batch.art_oovs[0] if config.pointer_gen else None))

            # Remove the [STOP] token from decoded_words, if necessary
            try:
                fst_stop_idx = decoded_words.index(data.STOP_DECODING)
                decoded_words = decoded_words[:fst_stop_idx]
            except ValueError:
                pass  # no [STOP] token found; keep the full decoded sequence

            original_abstract_sents = batch.original_abstracts_sents[0]

            write_for_rouge(original_abstract_sents, decoded_words, counter,
                            self._rouge_ref_dir, self._rouge_dec_dir)
            counter += 1
            if counter % 10 == 0:
                print('%d examples in %d sec' % (counter, time.time() - start))
                start = time.time()
            batch = self.batcher.next_batch()

        print("Decoder has finished reading dataset for single_pass.")
        print("Now starting ROUGE eval...")
        results_dict = rouge_eval(self._rouge_ref_dir, self._rouge_dec_dir)
        rouge_log(results_dict, self._decode_dir)

    def beam_search(self, batch):
        # The batch should have only one example
        enc_batch, enc_padding_mask, enc_lens, enc_batch_extend_vocab, extra_zeros, c_t_0, coverage_t_0 = \
            get_input_from_batch(batch)

        encoder_outputs, encoder_feature, encoder_hidden = self.model.encoder(
            enc_batch, enc_lens)
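        # Collapse the encoder's final hidden/cell states into the decoder's initial state s_t_0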
        s_t_0 = self.model.reduce_state(encoder_hidden)

        dec_h, dec_c = s_t_0  # 1 x 2*hidden_size
        dec_h = dec_h.squeeze()
        dec_c = dec_c.squeeze()

        # Prepare decoder batch
        beams = [
            Beam(tokens=[self.vocab.word2id(data.START_DECODING)],
                 log_probs=[0.0],
                 state=(dec_h[0], dec_c[0]),
                 context=c_t_0[0],
                 coverage=(coverage_t_0[0] if config.is_coverage else None))
            for _ in range(config.beam_size)
        ]
        results = []
        steps = 0
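        # Expand hypotheses step by step until max_dec_steps is reached or beam_size hypotheses have finished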
        while steps < config.max_dec_steps and len(results) < config.beam_size:
            latest_tokens = [h.latest_token for h in beams]
            latest_tokens = [t if t < self.vocab.size() else self.vocab.word2id(data.UNKNOWN_TOKEN) \
                             for t in latest_tokens]
            y_t_1 = paddle.to_tensor(latest_tokens)
            all_state_h = []
            all_state_c = []

            all_context = []

            for h in beams:
                state_h, state_c = h.state
                all_state_h.append(state_h)
                all_state_c.append(state_c)

                all_context.append(h.context)

            s_t_1 = (paddle.stack(all_state_h, 0).unsqueeze(0),
                     paddle.stack(all_state_c, 0).unsqueeze(0))
            c_t_1 = paddle.stack(all_context, 0)

            coverage_t_1 = None
            if config.is_coverage:
                all_coverage = []
                for h in beams:
                    all_coverage.append(h.coverage)
                coverage_t_1 = paddle.stack(all_coverage, 0)

            final_dist, s_t, c_t, attn_dist, p_gen, coverage_t = self.model.decoder(
                y_t_1, s_t_1, encoder_outputs, encoder_feature,
                enc_padding_mask, c_t_1, extra_zeros, enc_batch_extend_vocab,
                coverage_t_1, steps)
            log_probs = paddle.log(final_dist)
            topk_log_probs, topk_ids = paddle.topk(log_probs,
                                                   config.beam_size * 2)

            dec_h, dec_c = s_t
            dec_h = dec_h.squeeze()
            dec_c = dec_c.squeeze()

            all_beams = []
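            # At the first step all beams hold the same start hypothesis, so only one is extended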
            num_orig_beams = 1 if steps == 0 else len(beams)
            for i in range(num_orig_beams):
                h = beams[i]
                state_i = (dec_h[i], dec_c[i])
                context_i = c_t[i]
                coverage_i = (coverage_t[i] if config.is_coverage else None)

                for j in range(config.beam_size * 2):  # for each of the top 2*beam_size hyps
                    new_beam = h.extend(token=topk_ids[i, j].numpy()[0],
                                        log_prob=topk_log_probs[i, j].numpy()[0],
                                        state=state_i,
                                        context=context_i,
                                        coverage=coverage_i)
                    all_beams.append(new_beam)

            beams = []
            for h in self.sort_beams(all_beams):
                if h.latest_token == self.vocab.word2id(data.STOP_DECODING):
                    if steps >= config.min_dec_steps:
                        results.append(h)
                else:
                    beams.append(h)
                if len(beams) == config.beam_size or len(results) == config.beam_size:
                    break

            steps += 1

        if len(results) == 0:
            results = beams

        beams_sorted = self.sort_beams(results)

        return beams_sorted[0]
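
A minimal usage sketch for the BeamSearch decoder above, not taken from the source: it assumes the config module already provides log_root, vocab_path, vocab_size, decode_data_path, and beam_size, that the sketch runs in the same context as the class, and that a trained checkpoint exists whose path matches the 'train_<id>' and 'model_<iter>_<loss>' patterns parsed in __init__. The checkpoint path below is a made-up placeholder.

# Hypothetical checkpoint directory; substitute a real path saved by the Trainer example below.
model_file_path = 'log/train_1600000000/model/model_005000_2.50000000'

processor = BeamSearch(model_file_path)
processor.decode()  # writes rouge_ref / rouge_dec_dir files, then runs ROUGE evaluation
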
Example #2
class Trainer(object):
    def __init__(self):
        self.vocab = Vocab(config.vocab_path, config.vocab_size)
        self.batcher = Batcher(
            config.train_data_path,
            self.vocab,
            mode='train',
            batch_size=config.batch_size,
            single_pass=False)

        train_dir = os.path.join(config.log_root,
                                 'train_%d' % (int(time.time())))

        if not os.path.exists(config.log_root):
            os.mkdir(config.log_root)

        if not os.path.exists(train_dir):
            os.mkdir(train_dir)

        self.model_dir = os.path.join(train_dir, 'model')
        if not os.path.exists(self.model_dir):
            os.mkdir(self.model_dir)

    def save_model(self, running_avg_loss, iter):
        state = {
            'encoder': self.model.encoder.state_dict(),
            'decoder': self.model.decoder.state_dict(),
            'reduce_state': self.model.reduce_state.state_dict(),
            'optimizer': self.optimizer.state_dict()
        }
        model_save_dir = os.path.join(self.model_dir, 'model_%06d_%.8f' %
                                      (iter, running_avg_loss))
        for k in state:
            model_save_path = os.path.join(model_save_dir, '%s.params' % k)
            paddle.save(state[k], model_save_path)
        return model_save_dir

    def setup_train(self, model_file_path=None):
        self.model = Model(model_file_path)

        initial_lr = config.lr_coverage if config.is_coverage else config.lr
        params = list(self.model.encoder.parameters()) + list(self.model.decoder.parameters()) + \
                 list(self.model.reduce_state.parameters())
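        # Sanity check: this model configuration is expected to expose exactly 31 parameter tensors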
        assert len(params) == 31
        self.optimizer = Adagrad(
            parameters=params,
            learning_rate=initial_lr,
            initial_accumulator_value=config.adagrad_init_acc,
            epsilon=1.0e-10,
            grad_clip=paddle.nn.ClipGradByGlobalNorm(
                clip_norm=config.max_grad_norm))

        start_iter, start_loss = 0, 0

        if model_file_path is not None:
            start_iter = int(model_file_path.split('_')[-2])
            start_loss = float(
                model_file_path.split('_')[-1].replace(os.sep, ''))

            if not config.is_coverage:
                self.optimizer.set_state_dict(
                    paddle.load(
                        os.path.join(model_file_path, 'optimizer.params')))

        return start_iter, start_loss

    def train_one_batch(self, batch, iter):

        enc_batch, enc_padding_mask, enc_lens, enc_batch_extend_vocab, extra_zeros, c_t_1, coverage = \
            get_input_from_batch(batch)
        dec_batch, dec_padding_mask, max_dec_len, dec_lens_var, target_batch = \
            get_output_from_batch(batch)

        self.optimizer.clear_gradients()

        encoder_outputs, encoder_feature, encoder_hidden = self.model.encoder(
            enc_batch, enc_lens)
        s_t_1 = self.model.reduce_state(encoder_hidden)

        step_losses = []
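        # Teacher forcing: feed the gold previous token from dec_batch at every decoder step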
        for di in range(min(max_dec_len, config.max_dec_steps)):
            y_t_1 = dec_batch[:, di]

            final_dist, s_t_1, c_t_1, attn_dist, p_gen, next_coverage = \
                self.model.decoder(y_t_1, s_t_1, encoder_outputs, encoder_feature, enc_padding_mask,
                                   c_t_1, extra_zeros, enc_batch_extend_vocab, coverage, di)

            target = target_batch[:, di]
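            # Gather the probability assigned to each gold token from final_dist;
            # step_loss is its negative log-likelihood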
            add_index = paddle.arange(0, target.shape[0])
            new_index = paddle.stack([add_index, target], axis=1)
            gold_probs = paddle.gather_nd(final_dist, new_index).squeeze()
            step_loss = -paddle.log(gold_probs + config.eps)

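            # Coverage loss: penalize attention mass on source positions already attended in earlier steps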
            if config.is_coverage:
                step_coverage_loss = paddle.sum(
                    paddle.minimum(attn_dist, coverage), 1)
                step_loss = step_loss + config.cov_loss_wt * step_coverage_loss
                coverage = next_coverage

            step_mask = dec_padding_mask[:, di]
            step_loss = step_loss * step_mask
            step_losses.append(step_loss)

        sum_losses = paddle.sum(paddle.stack(step_losses, 1), 1)
        batch_avg_loss = sum_losses / dec_lens_var
        loss = paddle.mean(batch_avg_loss)

        loss.backward()
        self.optimizer.minimize(loss)

        return loss.numpy()[0]

    def trainIters(self, n_iters, model_file_path=None):
        iter, running_avg_loss = self.setup_train(model_file_path)
        start = time.time()
        while iter < n_iters:
            batch = self.batcher.next_batch()
            loss = self.train_one_batch(batch, iter)
            running_avg_loss = calc_running_avg_loss(loss, running_avg_loss,
                                                     iter)
            iter += 1
            print(
                'global step %d/%d, step loss: %.8f, running avg loss: %.8f, speed: %.2f step/s'
                % (iter, n_iters, loss, running_avg_loss,
                   1.0 / (time.time() - start)))
            start = time.time()
            if iter % 5000 == 0 or iter == 1000:
                model_save_dir = self.save_model(running_avg_loss, iter)
                print(
                    'Saved model for iter %d with running avg loss %.8f to directory: %s'
                    % (iter, running_avg_loss, model_save_dir))
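
A minimal usage sketch for the Trainer above, not taken from the source: the iteration count is an arbitrary illustration, and config is assumed to provide train_data_path, vocab_path, vocab_size, batch_size, log_root, lr / lr_coverage, adagrad_init_acc, max_grad_norm, and the other fields referenced in the class.

trainer = Trainer()
# Train from scratch; pass a model directory produced by save_model() as
# model_file_path to resume from that checkpoint instead.
trainer.trainIters(n_iters=50000, model_file_path=None)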