    def train_one_batch(self, batch):
        enc_batch, enc_padding_mask, enc_lens, enc_batch_extend_vocab, extra_zeros, c_t_1, coverage = \
            get_input_from_batch(batch)
        dec_batch, dec_padding_mask, max_dec_len, dec_lens_var, target_batch = \
            get_output_from_batch(batch)
        self.optimizer.zero_grad()
        # [B, max(seq_lens), 2*hid_dim], [B*max(seq_lens), 2*hid_dim], tuple([2, B, hid_dim], [2, B, hid_dim])
        encoder_outputs, encoder_feature, encoder_hidden = self.model.encoder(
            enc_batch, enc_lens)
        s_t_1 = self.model.reduce_state(
            encoder_hidden)  # (h,c) = ([1, B, hid_dim], [1, B, hid_dim])
        step_losses = []
        for di in range(min(max_dec_len, config.max_dec_steps)):
            y_t_1 = dec_batch[:, di]  # one summary word: the token id at position di for every sequence in the batch
            # print("y_t_1:", y_t_1, y_t_1.size())
            final_dist, s_t_1, c_t_1, attn_dist, p_gen, next_coverage = self.model.decoder(
                y_t_1, s_t_1, encoder_outputs, encoder_feature,
                enc_padding_mask, c_t_1, extra_zeros, enc_batch_extend_vocab,
                coverage, di)
            target = target_batch[:, di]  # id of the gold next summary word
            # print("target-iter:", target, target.size())
            # print("final_dist:", final_dist, final_dist.size())
            # input("go on>>")
            # final_dist is the probability of every word in the extended vocabulary, i.e. larger than the preset 50_000-word vocab
            gold_probs = torch.gather(
                final_dist, 1,
                target.unsqueeze(1)).squeeze()  # gather the probability assigned to the gold word (gold_probs)
            step_loss = -torch.log(
                gold_probs + config.eps)  # negative log-likelihood: maximizing gold_probs means minimizing step_loss (hence the minus sign)
            if config.is_coverage:
                step_coverage_loss = torch.sum(torch.min(attn_dist, coverage),
                                               1)
                step_loss = step_loss + config.cov_loss_wt * step_coverage_loss
                coverage = next_coverage

            step_mask = dec_padding_mask[:, di]
            step_loss = step_loss * step_mask
            step_losses.append(step_loss)

        sum_losses = torch.sum(torch.stack(step_losses, 1), 1)
        batch_avg_loss = sum_losses / dec_lens_var
        loss = torch.mean(batch_avg_loss)

        loss.backward()

        self.norm = clip_grad_norm_(self.model.encoder.parameters(),
                                    config.max_grad_norm)
        clip_grad_norm_(self.model.decoder.parameters(), config.max_grad_norm)
        clip_grad_norm_(self.model.reduce_state.parameters(),
                        config.max_grad_norm)

        self.optimizer.step()

        return loss.item()
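The gather/NLL step inside the loop above can be seen in isolation on toy tensors; the values below are illustrative only, and the small constant stands in for config.eps.

import torch

# Toy distributions over an extended vocabulary of size 3, batch size 2.
final_dist = torch.tensor([[0.1, 0.6, 0.3],
                           [0.2, 0.2, 0.6]])      # [B, extended_vocab_size]
target = torch.tensor([1, 2])                     # gold word ids, [B]
# Pick out the probability each example assigns to its gold word ...
gold_probs = torch.gather(final_dist, 1, target.unsqueeze(1)).squeeze()  # tensor([0.6000, 0.6000])
# ... and turn it into a per-example negative log-likelihood.
step_loss = -torch.log(gold_probs + 1e-12)        # the eps term avoids log(0)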
Example #2
    def eval_one_batch(self, batch):
        enc_batch, enc_padding_mask, enc_lens, enc_batch_extend_vocab, extra_zeros, c_t_1, coverage = \
            get_input_from_batch(batch)
        dec_batch, dec_padding_mask, max_dec_len, dec_lens_var, target_batch = \
            get_output_from_batch(batch)

        encoder_outputs, encoder_feature, encoder_hidden = self.model.encoder(
            enc_batch, enc_lens)
        s_t_1 = self.model.reduce_state(encoder_hidden)

        step_losses = []
        for di in range(min(max_dec_len, config.max_dec_steps)):
            y_t_1 = dec_batch[:, di]  # Teacher forcing
            final_dist, s_t_1, c_t_1, attn_dist, p_gen, next_coverage = self.model.decoder(
                y_t_1, s_t_1, encoder_outputs, encoder_feature,
                enc_padding_mask, c_t_1, extra_zeros, enc_batch_extend_vocab,
                coverage, di)
            target = target_batch[:, di]
            gold_probs = torch.gather(final_dist, 1,
                                      target.unsqueeze(1)).squeeze()
            step_loss = -torch.log(gold_probs + config.eps)
            if config.is_coverage:
                step_coverage_loss = torch.sum(torch.min(attn_dist, coverage),
                                               1)
                step_loss = step_loss + config.cov_loss_wt * step_coverage_loss
                coverage = next_coverage

            step_mask = dec_padding_mask[:, di]
            step_loss = step_loss * step_mask
            step_losses.append(step_loss)

        sum_step_losses = torch.sum(torch.stack(step_losses, 1), 1)
        batch_avg_loss = sum_step_losses / dec_lens_var
        loss = torch.mean(batch_avg_loss)

        return loss.item()
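Nothing in eval_one_batch above disables gradient tracking; a minimal sketch of a validation wrapper that runs it under torch.no_grad() follows (run_eval, n_batches, and self.batcher.next_batch are assumptions for illustration, not part of the code above).

    def run_eval(self, n_batches):
        # Hypothetical validation driver; no_grad() skips autograd bookkeeping during evaluation.
        total = 0.0
        with torch.no_grad():
            for _ in range(n_batches):
                batch = self.batcher.next_batch()  # assumed batcher API
                total += self.eval_one_batch(batch)
        return total / n_batches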
Example #3
    def train_one_batch(self, batch):
        # enc_batch: source sequences with OOV words mapped to UNK
        # c_t_1: the initial context vector
        # extra_zeros: placeholder for OOV-word probabilities, [batch_size, batch.max_art_oovs]
        enc_batch, enc_padding_mask, enc_lens, enc_batch_extend_vocab, extra_zeros, c_t_1, coverage = \
            get_input_from_batch(batch)
        # dec_batch: decoder-input summary sequence (OOVs mapped to UNK); target_batch: target word sequence where each OOV word is encoded as len(vocab) + its index in the article's OOV list instead of UNK
        dec_batch, dec_padding_mask, max_dec_len, dec_lens_var, target_batch = \
            get_output_from_batch(batch)
        self.optimizer.zero_grad()

        # [B, max(seq_lens), 2*hid_dim], [B*max(seq_lens), 2*hid_dim], tuple([2, B, hid_dim], [2, B, hid_dim])
        encoder_outputs, encoder_feature, encoder_hidden = self.model.encoder(
            enc_batch, enc_lens)

        # (h,c) = ([1, batch, hid_dim], [1, batch, hid_dim])
        # the encoder hidden state is bidirectional, [2, batch, hid_dim]; reduce it to [1, batch, hid_dim] to use as the decoder's initial hidden state
        s_t_1 = self.model.reduce_state(encoder_hidden)  # h,c

        step_losses = []
        for di in range(min(max_dec_len, config.max_dec_steps)):
            # one summary word: the token id at position di for every sequence in the batch
            y_t_1 = dec_batch[:, di]
            # final_dist is the probability of every word in the extended vocabulary, i.e. larger than the preset vocab size
            final_dist, s_t_1, c_t_1, attn_dist, p_gen, next_coverage = self.model.decoder(
                y_t_1, s_t_1, encoder_outputs, encoder_feature,
                enc_padding_mask, c_t_1, extra_zeros, enc_batch_extend_vocab,
                coverage, di)
            # id of the gold next summary word, [B]
            target = target_batch[:, di]
            # [B,1]
            target_i = target.unsqueeze(1)
            # gather the probability final_dist assigns to each target word (the predicted probability of the gold word)
            gold_probs = torch.gather(final_dist, 1, target_i).squeeze()

            # print(gold_probs)

            # earlier workaround (disabled): clamp gold_probs to 1e-2 when it was <= 0
            # if gold_probs <= 0:
            #     gold_probs = 1e-2

            # per-word prediction loss; abs() and the 1e-8 floor guard against non-positive values
            step_loss = -torch.log(torch.abs(gold_probs) + 1e-8)

            if config.is_coverage:
                # coverage loss: sum the elementwise min of the step-t attention and the accumulated
                # attention from steps < t; this penalizes attending again to positions that are
                # already well covered (a large value means that position was likely used before),
                # which suppresses repeated generation
                step_coverage_loss = torch.sum(
                    torch.min(torch.abs(attn_dist), torch.abs(coverage)), 1)
                # weight the coverage term by the lambda coefficient cov_loss_wt
                step_loss = step_loss + config.cov_loss_wt * torch.abs(
                    step_coverage_loss)
                # from now on the coverage vector is the accumulated attention
                coverage = next_coverage
            # padded positions do not contribute to the loss
            step_mask = dec_padding_mask[:, di]
            step_loss = torch.abs(step_loss) * torch.abs(step_mask)
            step_losses.append(step_loss)

        # total loss per sequence
        sum_losses = torch.abs(torch.sum(torch.stack(step_losses, 1), 1))
        # per-token average loss for each sequence (this variant divides by length + 1)
        batch_avg_loss = sum_losses / (torch.abs(dec_lens_var) + 1)

        # mean loss over the whole batch
        loss = torch.mean(batch_avg_loss)

        loss.backward()

        #         self.norm = clip_grad_norm_(self.model.encoder.parameters(), config.max_grad_norm)
        #         clip_grad_norm_(self.model.decoder.parameters(), config.max_grad_norm)
        #         clip_grad_norm_(self.model.reduce_state.parameters(), config.max_grad_norm)

        self.optimizer.step()

        return loss.item()
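The coverage comments in the loop above can be made concrete with toy tensors; the values are illustrative only, and the coverage update shown at the end is the standard one from See et al. (2017), which in this code happens inside the decoder.

import torch

attn_dist = torch.tensor([[0.7, 0.2, 0.1],
                          [0.5, 0.4, 0.1]])       # attention at step t, [B, src_len]
coverage = torch.tensor([[0.9, 0.05, 0.05],
                         [0.2, 0.6, 0.2]])        # attention accumulated over steps < t
# Overlap between the new attention and what is already covered is penalized,
# which discourages attending to (and generating from) the same positions again.
step_coverage_loss = torch.sum(torch.min(attn_dist, coverage), 1)   # tensor([0.8000, 0.7000])
next_coverage = coverage + attn_dist              # coverage accumulates the step attention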
Example #4
    def beam_search(self, batch):
        # batch should have only one example
        enc_batch, enc_padding_mask, enc_lens, enc_batch_extend_vocab, extra_zeros, c_t_0, coverage_t_0 = \
            get_input_from_batch(batch)

        encoder_outputs, encoder_feature, encoder_hidden = self.model.encoder(enc_batch, enc_lens)
        s_t_0 = self.model.reduce_state(encoder_hidden)

        dec_h, dec_c = s_t_0  # each [1, B, hid_dim] after reduce_state
        dec_h = dec_h.squeeze()
        dec_c = dec_c.squeeze()

        # decoder batch preparation: it holds beam_size hypotheses; initially they are all identical
        beams = [Beam(tokens=[self.vocab.word2id(data.START_DECODING)],
                      log_probs=[0.0],
                      state=(dec_h[0], dec_c[0]),
                      context = c_t_0[0],
                      coverage=(coverage_t_0[0] if config.is_coverage else None))
                 for _ in range(config.beam_size)]
        results = []
        steps = 0
        while steps < config.max_dec_steps and len(results) < config.beam_size:
            latest_tokens = [h.latest_token for h in beams]
            latest_tokens = [t if t < self.vocab.size() else self.vocab.word2id(data.UNKNOWN_TOKEN) \
                             for t in latest_tokens]
            y_t_1 = Variable(torch.LongTensor(latest_tokens))
            if USE_CUDA:
                y_t_1 = y_t_1.to(DEVICE)
            all_state_h =[]
            all_state_c = []

            all_context = []

            for h in beams:
                state_h, state_c = h.state
                all_state_h.append(state_h)
                all_state_c.append(state_c)

                all_context.append(h.context)

            s_t_1 = (torch.stack(all_state_h, 0).unsqueeze(0), torch.stack(all_state_c, 0).unsqueeze(0))
            c_t_1 = torch.stack(all_context, 0)

            coverage_t_1 = None
            if config.is_coverage:
                all_coverage = []
                for h in beams:
                    all_coverage.append(h.coverage)
                coverage_t_1 = torch.stack(all_coverage, 0)

            final_dist, s_t, c_t, attn_dist, p_gen, coverage_t = self.model.decoder(y_t_1, s_t_1,
                                                        encoder_outputs, encoder_feature, enc_padding_mask, c_t_1,
                                                        extra_zeros, enc_batch_extend_vocab, coverage_t_1, steps)
            log_probs = torch.log(final_dist)
            topk_log_probs, topk_ids = torch.topk(log_probs, config.beam_size * 2)

            dec_h, dec_c = s_t
            dec_h = dec_h.squeeze()
            dec_c = dec_c.squeeze()

            all_beams = []
            num_orig_beams = 1 if steps == 0 else len(beams)
            for i in range(num_orig_beams):
                h = beams[i]
                state_i = (dec_h[i], dec_c[i])
                context_i = c_t[i]
                coverage_i = (coverage_t[i] if config.is_coverage else None)

                for j in range(config.beam_size * 2):  # for each of the top 2*beam_size hyps:
                    new_beam = h.extend(token=topk_ids[i, j].item(),
                                   log_prob=topk_log_probs[i, j].item(),
                                   state=state_i,
                                   context=context_i,
                                   coverage=coverage_i)
                    all_beams.append(new_beam)

            beams = []
            for h in self.sort_beams(all_beams):
                if h.latest_token == self.vocab.word2id(data.STOP_DECODING):
                    if steps >= config.min_dec_steps:
                        results.append(h)
                else:
                    beams.append(h)
                if len(beams) == config.beam_size or len(results) == config.beam_size:
                    break

            steps += 1

        if len(results) == 0:
            results = beams

        beams_sorted = self.sort_beams(results)

        return beams_sorted[0]
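beam_search above relies on a Beam helper and a sort_beams method that are not shown in this excerpt. The sketch below is a minimal version consistent with the attributes used above (tokens, log_probs, state, context, coverage, latest_token, extend) and with length-normalized sorting; the actual classes in the repository may differ.

class Beam(object):
    # Minimal hypothesis container; immutable, extended one token at a time.
    def __init__(self, tokens, log_probs, state, context, coverage):
        self.tokens = tokens        # token ids decoded so far
        self.log_probs = log_probs  # per-step log-probabilities
        self.state = state          # decoder (h, c) for this hypothesis
        self.context = context      # latest context vector c_t
        self.coverage = coverage    # accumulated coverage vector, or None

    def extend(self, token, log_prob, state, context, coverage):
        return Beam(tokens=self.tokens + [token],
                    log_probs=self.log_probs + [log_prob],
                    state=state, context=context, coverage=coverage)

    @property
    def latest_token(self):
        return self.tokens[-1]

    @property
    def avg_log_prob(self):
        return sum(self.log_probs) / len(self.tokens)

# sort_beams, called as self.sort_beams(...) above, might simply be a method like:
    def sort_beams(self, beams):
        # Best (highest length-normalized log-probability) hypothesis first.
        return sorted(beams, key=lambda h: h.avg_log_prob, reverse=True)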
Example #5
    def train_one_batch(self, batch):
        """
        Train on one batch and return the batch loss.
        enc_batch:             torch.Size([16, 400]); encoded articles, padded with PAD ids up to 400 words; OOV words are encoded as 0;
        enc_padding_mask:      torch.Size([16, 400]); 0 at PAD positions, 1 elsewhere;
        enc_lens:              numpy.ndarray; number of words in each article;
        enc_batch_extend_vocab:torch.Size([16, 400]); encoded articles where OOV words receive ids beyond the vocabulary;
        extra_zeros:           torch.Size([16, number of article OOV words]); zero tensor;
        c_t_1:                 torch.Size([16, 512]); zero tensor (initial context vector);
        coverage:              Variable(torch.zeros(batch_size, max_enc_seq_len)) if is_coverage==True else None; filled in later in coverage mode
        ----------------------------------------
        dec_batch:             torch.Size([16, 100]); summary ids including the START token and PAD;
        dec_padding_mask:      torch.Size([16, 100]); 0 at PAD positions, 1 elsewhere;
        max_dec_len:           scalar; number of summary words, excluding PAD;
        dec_lens_var:          torch.Size([16]); summary lengths;
        target_batch:          torch.Size([16, 100]); target summary ids including the STOP token and PAD
        """
        enc_batch, enc_padding_mask, enc_lens, enc_batch_extend_vocab, extra_zeros, c_t_1, coverage = \
            get_input_from_batch(batch)
        dec_batch, dec_padding_mask, max_dec_len, dec_lens_var, target_batch = \
            get_output_from_batch(batch)
        self.optimizer.zero_grad()
        """
        # remember to modify the Batch class to add a vocab attribute
 
        print("模型输入文章编码:", "*"*100)
        print("enc_batch:", enc_batch, enc_batch.size())
        print("enc_batch[-1]:", enc_batch[-1])
        # print("batch._id_to_word:", batch.vocab._id_to_word)
        print("enc_batch[-1]原文:", [batch.vocab.id2word(idx) for idx in enc_batch[-1].cpu().numpy()])
        print("-"*50)
        print("enc_padding_mask:", enc_padding_mask, enc_padding_mask.size())
        print("-"*50)
        print("enc_lens:", enc_lens, enc_lens.shape)
        print("-"*50)
        print("enc_batch_extend_vocab", enc_batch_extend_vocab, enc_batch_extend_vocab.size())
        print("enc_batch_extend_vocab[-1]:", enc_batch_extend_vocab[-1])
        print("enc_batch_extend_vocab[-1]的原文:", [batch.vocab.id2word(idx) if idx<50000 else '[UNK]+{}'.format(idx-50000) for idx in enc_batch_extend_vocab[-1].cpu().numpy()])
        print("-"*50)
        print("extra_zeros:", extra_zeros, extra_zeros.size())
        print("-"*50)
        print("c_t_1:", c_t_1, c_t_1.size())
        print("-"*50)
        print("coverage:", coverage)
        print("*"*100)
        
        print("模型输入摘要编码,包括源和目标:", "*"*100)
        print("dec_batch:", dec_batch, dec_batch.size())
        print("dec_batch[0]:", dec_batch[0])
        # print("batch._id_to_word:", batch.vocab._id_to_word)
        print("dec_batch[0]原文:", [batch.vocab.id2word(idx) for idx in dec_batch[0].cpu().numpy()])
        print("-"*50)
        print("dec_padding_mask:", dec_padding_mask, dec_padding_mask.size())
        print("-"*50)
        print("max_dec_len:", max_dec_len)
        print("-"*50)
        print("dec_lens_var", dec_lens_var, dec_lens_var.size())
        print("-"*50)
        print("target_batch:", target_batch, target_batch.size())
        print("-"*50)
        print("target_batch[0]:", target_batch[0], target_batch[0].size())
        print("target_batch[0]的原文:", [batch.vocab.id2word(idx) if idx<50000 else '[UNK]+{}'.format(idx-50000) for idx in target_batch[0].cpu().numpy()])
        print("*"*100)
        input("任意键继续>>>")
        """
        # [B, max(seq_lens), 2*hid_dim], [B*max(seq_lens), 2*hid_dim], tuple([2, B, hid_dim], [2, B, hid_dim])
        encoder_outputs, encoder_feature, encoder_hidden = self.model.encoder(enc_batch, enc_lens)
        s_t_1 = self.model.reduce_state(encoder_hidden)   # (h,c) = ([1, B, hid_dim], [1, B, hid_dim])
        step_losses = []
        for di in range(min(max_dec_len, config.max_dec_steps)):
            y_t_1 = dec_batch[:, di]      # one summary word: the token id at position di for every sequence in the batch
            # print("y_t_1:", y_t_1, y_t_1.size())
            final_dist, s_t_1,  c_t_1, attn_dist, p_gen, next_coverage = self.model.decoder(y_t_1, s_t_1,
                                                        encoder_outputs, encoder_feature, enc_padding_mask, c_t_1,
                                                        extra_zeros, enc_batch_extend_vocab, coverage, di)
            target = target_batch[:, di]  # id of the gold next summary word
            # print("target-iter:", target, target.size())
            # print("final_dist:", final_dist, final_dist.size())
            # input("go on>>")
            # final_dist is the probability of every word in the extended vocabulary, i.e. larger than the preset 50_000-word vocab
            gold_probs = torch.gather(final_dist, 1, target.unsqueeze(1)).squeeze()   # gather the probability assigned to the gold word (gold_probs)
            step_loss = -torch.log(gold_probs + config.eps)  # negative log-likelihood: maximizing gold_probs means minimizing step_loss (hence the minus sign)
            if config.is_coverage:
                step_coverage_loss = torch.sum(torch.min(attn_dist, coverage), 1)
                step_loss = step_loss + config.cov_loss_wt * step_coverage_loss
                coverage = next_coverage
                
            step_mask = dec_padding_mask[:, di]
            step_loss = step_loss * step_mask
            step_losses.append(step_loss)

        sum_losses = torch.sum(torch.stack(step_losses, 1), 1)
        batch_avg_loss = sum_losses/dec_lens_var
        loss = torch.mean(batch_avg_loss)

        loss.backward()

        self.norm = clip_grad_norm_(self.model.encoder.parameters(), config.max_grad_norm)
        clip_grad_norm_(self.model.decoder.parameters(), config.max_grad_norm)
        clip_grad_norm_(self.model.reduce_state.parameters(), config.max_grad_norm)

        self.optimizer.step()

        return loss.item()
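The comments above describe reduce_state as mapping the bidirectional encoder state, (h, c) each of shape [2, B, hid_dim], to a decoder-ready [1, B, hid_dim]. One way to implement that is sketched below; the linear-plus-ReLU reduction and the layer names are assumptions for illustration, not necessarily what this repository uses.

import torch.nn as nn
import torch.nn.functional as F

class ReduceState(nn.Module):
    def __init__(self, hidden_dim):
        super(ReduceState, self).__init__()
        self.reduce_h = nn.Linear(hidden_dim * 2, hidden_dim)
        self.reduce_c = nn.Linear(hidden_dim * 2, hidden_dim)

    def forward(self, hidden):
        h, c = hidden                                         # each [2, B, hidden_dim]
        b = h.size(1)
        # Concatenate the forward and backward directions, then project back down.
        h_in = h.transpose(0, 1).contiguous().view(b, -1)     # [B, 2 * hidden_dim]
        c_in = c.transpose(0, 1).contiguous().view(b, -1)
        reduced_h = F.relu(self.reduce_h(h_in)).unsqueeze(0)  # [1, B, hidden_dim]
        reduced_c = F.relu(self.reduce_c(c_in)).unsqueeze(0)
        return (reduced_h, reduced_c)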