Exemplo n.º 1
0
    def predict(self, text, tokenize=True, beam_search=True):
        """Produce a summary string for a single source text.

        Args:
            text (str or list): Source text. A string is split into tokens
                with jieba when ``tokenize`` is true; anything else (or a
                string with ``tokenize`` false) is handed to ``source2ids``
                as given.
            tokenize (bool, optional):
                Whether to tokenize a string input. Defaults to True.
            beam_search (bool, optional):
                True decodes with ``self.beam_search``, False decodes with
                ``self.greedy_search``. Defaults to True.

        Returns:
            str: The decoded summary with the '<SOS>' and '<EOS>' markers
            removed and surrounding whitespace stripped.
        """
        if isinstance(text, str) and tokenize:
            text = [*jieba.cut(text)]

        token_ids, oov = source2ids(text, self.vocab)
        x = torch.tensor(token_ids).to(self.DEVICE)
        len_oovs = torch.tensor([len(oov)]).to(self.DEVICE)

        # 1.0 where the id is non-zero, 0.0 where it is zero. The mask is
        # built from the un-batched ids, before the batch axis is added.
        nonzero = torch.ne(x, 0)
        x_padding_masks = nonzero.byte().float()

        batch = x.unsqueeze(0)
        shared_kwargs = {
            'max_sum_len': config.max_dec_steps,
            'len_oovs': len_oovs,
            'x_padding_masks': x_padding_masks,
        }
        if beam_search:
            summary = self.beam_search(batch,
                                       beam_width=config.beam_size,
                                       **shared_kwargs)
        else:
            summary = self.greedy_search(batch, **shared_kwargs)

        summary = outputids2words(summary, oov, self.vocab)
        for marker in ('<SOS>', '<EOS>'):
            summary = summary.replace(marker, '')
        return summary.strip()
Exemplo n.º 2
0
 def __getitem__(self, index):
     """Assemble the example stored at position ``index``.

     The source sentence is converted with ``source2ids``; every target
     token is looked up directly in ``self.vocab``. Both id sequences are
     wrapped in the vocabulary's SOS and EOS ids.

     Args:
         index: Position into ``self.src_sents`` / ``self.trg_sents``.

     Returns:
         dict: Keys 'x', 'OOV', 'len_OOV', 'y', 'x_len' and 'y_len'.
         'x_len' and 'y_len' are the lengths of the stored sentences and
         do not count the added SOS/EOS ids.
     """
     src_tokens = self.src_sents[index]
     src_ids, oov = source2ids(src_tokens, self.vocab)
     wrapped_src = [self.vocab.SOS] + src_ids + [self.vocab.EOS]

     tgt_tokens = self.trg_sents[index]
     tgt_ids = [self.vocab[token] for token in tgt_tokens]
     wrapped_tgt = [self.vocab.SOS] + tgt_ids + [self.vocab.EOS]

     return {
         'x': wrapped_src,
         'OOV': oov,
         'len_OOV': len(oov),
         'y': wrapped_tgt,
         'x_len': len(src_tokens),
         'y_len': len(tgt_tokens),
     }
    def get_sample(self, text):
        """Turn one raw tab-separated line into a model-ready sample.

        Args:
            text (str): source + '\t' + target + '\t' + cate + '\t' + imgid.
                Must split into exactly four tab-separated fields,
                otherwise the unpacking raises ValueError.

        Returns:
            dict: Keys 'source' and 'tgt' (the raw source/target strings),
            'x' and 'y' (id sequences wrapped in SOS/EOS), 'OOV',
            'len_OOV', 'x_len' and 'y_len' (token counts after optional
            truncation, plus 2 for SOS/EOS) and 'img_vec'.
        """
        source, target, cate, imgid = text.split("\t")

        # Tokenize the source and clip it only when a limit is configured,
        # the limit is exceeded, and truncation is switched on.
        src_tokens = self.tokenizer(source)
        if (config.max_src_len
                and len(src_tokens) > config.max_src_len
                and config.truncate_src):
            src_tokens = src_tokens[:config.max_src_len]

        # Same treatment for the target side.
        tgt_tokens = self.tokenizer(target)
        if (config.max_tgt_len
                and len(tgt_tokens) > config.max_tgt_len
                and config.truncate_tgt):
            tgt_tokens = tgt_tokens[:config.max_tgt_len]

        # The zero-filled placeholder string is only kept when it is empty;
        # otherwise the vector is looked up by image id, retrying with the
        # stripped id plus a '.jpg' suffix if the first lookup fails.
        placeholder = ' '.join(['0'] * config.img_vec_dim)
        if placeholder == '':
            img_vec = placeholder
        else:
            try:
                img_vec = self.img_vecs[imgid]
            except Exception:
                img_vec = self.img_vecs[imgid.strip() + '.jpg']

        x, oov = source2ids(src_tokens, self.vocab)
        y = abstract2ids(tgt_tokens, self.vocab, oov)

        sos, eos = [self.vocab.SOS], [self.vocab.EOS]
        return {
            'source': source,
            'tgt': target,
            'x': sos + x + eos,
            'OOV': oov,
            'len_OOV': len(oov),
            'y': sos + y + eos,
            'x_len': len(src_tokens) + 2,
            'y_len': len(tgt_tokens) + 2,
            'img_vec': img_vec,
        }
 def __getitem__(self, item):
     """Fetch one encoded sample, or one field across all samples.

     Args:
         item: When its type is exactly ``int``, the position of the
             sample to encode. Any other value is used as a key into
             every sample.

     Returns:
         dict: For an ``int`` item, a dict with keys 'source', 'x', 'OOV',
         'len_OOV', 'y', 'x_len', 'y_len' and 'img_vec'; the two lengths
         include the added SOS/EOS ids.
         list: Otherwise, ``sample[item]`` collected from each entry of
         ``self.samples``.
     """
     # Non-int keys select a field from every stored sample.
     if type(item) is not int:
         return [sample[item] for sample in self.samples]

     record = self.samples[item]
     src_tokens = record['src']
     x, oov = source2ids(src_tokens, self.vocab)
     tgt_tokens = record['tgt']
     y = abstract2ids(tgt_tokens, self.vocab, oov)

     sos, eos = [self.vocab.SOS], [self.vocab.EOS]
     return {
         'source': src_tokens,
         'x': sos + x + eos,
         'OOV': oov,
         'len_OOV': len(oov),
         'y': sos + y + eos,
         'x_len': len(src_tokens) + 2,
         'y_len': len(tgt_tokens) + 2,
         'img_vec': record['img_vec'],
     }