def predict(self, text, tokenize=True, beam_search=True):
    """Generate a summary for a single source text.

    Args:
        text (str or list): Source text, either a raw string or a list
            of tokens.
        tokenize (bool, optional): When True and ``text`` is a string,
            segment it with ``jieba.cut`` first. Defaults to True.
        beam_search (bool, optional): When True decode with
            ``self.beam_search``; when False decode with
            ``self.greedy_search``. Defaults to True.

    Returns:
        str: The decoded summary with the '<SOS>' and '<EOS>' markers
        removed and surrounding whitespace stripped.
    """
    # Only raw strings are segmented; token lists are used as given.
    if tokenize and isinstance(text, str):
        text = list(jieba.cut(text))

    source_ids, oov = source2ids(text, self.vocab)
    x = torch.tensor(source_ids).to(self.DEVICE)
    len_oovs = torch.tensor([len(oov)]).to(self.DEVICE)
    # 1.0 where the id is non-zero, 0.0 elsewhere. Computed on the
    # un-batched tensor (presumably id 0 is padding -- confirm).
    x_padding_masks = torch.ne(x, 0).byte().float()

    batch = x.unsqueeze(0)  # add a leading batch dimension of size 1
    shared_kwargs = {
        'max_sum_len': config.max_dec_steps,
        'len_oovs': len_oovs,
        'x_padding_masks': x_padding_masks,
    }
    if beam_search:
        decoded = self.beam_search(
            batch, beam_width=config.beam_size, **shared_kwargs)
    else:
        decoded = self.greedy_search(batch, **shared_kwargs)

    summary = outputids2words(decoded, oov, self.vocab)
    for marker in ('<SOS>', '<EOS>'):
        summary = summary.replace(marker, '')
    return summary.strip()
def __getitem__(self, index):
    """Return the encoded source/target pair stored at ``index``.

    Args:
        index: Position into ``self.src_sents`` / ``self.trg_sents``.

    Returns:
        dict: With keys
            'x'       -- source ids wrapped in SOS ... EOS,
            'OOV'     -- the OOV collection returned by ``source2ids``,
            'len_OOV' -- ``len`` of that collection,
            'y'       -- target ids looked up via ``self.vocab[token]``,
                         wrapped in SOS ... EOS,
            'x_len'   -- length of the raw source sentence,
            'y_len'   -- length of the raw target sentence.
        Note that 'x_len' and 'y_len' do not count the SOS/EOS ids.
    """
    vocab = self.vocab
    src_tokens = self.src_sents[index]
    src_ids, oov = source2ids(src_tokens, vocab)

    tgt_tokens = self.trg_sents[index]
    tgt_ids = [vocab[token] for token in tgt_tokens]

    sos, eos = [vocab.SOS], [vocab.EOS]
    return {
        'x': sos + src_ids + eos,
        'OOV': oov,
        'len_OOV': len(oov),
        'y': sos + tgt_ids + eos,
        'x_len': len(src_tokens),
        'y_len': len(tgt_tokens),
    }
def get_sample(self, text):
    """Build a single encoded sample from one raw record.

    Args:
        text (str): A raw sample made of exactly four tab-separated
            fields, in order: source, target, cate, imgid.

    Returns:
        dict: With keys 'source' and 'tgt' (the raw source/target
        strings), 'x' and 'y' (id sequences wrapped in SOS ... EOS),
        'OOV', 'len_OOV', 'x_len' and 'y_len' (token counts plus 2 for
        SOS/EOS) and 'img_vec'.

    Raises:
        ValueError: If ``text`` does not split into exactly four fields.
    """
    # The category field is parsed but is not part of the returned sample.
    source, target, _cate, imgid = text.split("\t")

    src_tokens = self.tokenizer(source)
    # Over-long inputs are clipped only when truncation is enabled;
    # otherwise they are kept at full length.
    if (config.max_src_len
            and len(src_tokens) > config.max_src_len
            and config.truncate_src):
        src_tokens = src_tokens[:config.max_src_len]

    tgt_tokens = self.tokenizer(target)
    if (config.max_tgt_len
            and len(tgt_tokens) > config.max_tgt_len
            and config.truncate_tgt):
        tgt_tokens = tgt_tokens[:config.max_tgt_len]

    # Placeholder of space-joined zeros. It is an empty string, and the
    # lookup below is skipped, only when config.img_vec_dim yields no
    # elements.
    img_vec = ' '.join(['0'] * config.img_vec_dim)
    if img_vec:
        try:
            img_vec = self.img_vecs[imgid]
        except Exception:
            # Retry with the id stripped of whitespace plus a '.jpg'
            # suffix; a failure here propagates to the caller.
            img_vec = self.img_vecs[imgid.strip() + '.jpg']

    src_ids, oov = source2ids(src_tokens, self.vocab)
    tgt_ids = abstract2ids(tgt_tokens, self.vocab, oov)

    sos, eos = [self.vocab.SOS], [self.vocab.EOS]
    return {
        'source': source,
        'tgt': target,
        'x': sos + src_ids + eos,
        'OOV': oov,
        'len_OOV': len(oov),
        'y': sos + tgt_ids + eos,
        'x_len': len(src_tokens) + 2,
        'y_len': len(tgt_tokens) + 2,
        'img_vec': img_vec,
    }
def __getitem__(self, item):
    """Return one encoded sample, or one field across all samples.

    Args:
        item: Either an integer position into ``self.samples`` or a key
            present in every sample dict (for example 'src').

    Returns:
        dict: For an integer ``item``, a dict with keys 'source' (the
        stored 'src' entry), 'x' and 'y' (id sequences wrapped in
        SOS ... EOS), 'OOV', 'len_OOV', 'x_len' and 'y_len' (stored
        lengths plus 2 for SOS/EOS) and 'img_vec'.
        list: For any other ``item``, ``sample[item]`` collected from
        every sample.
    """
    # Function-local import keeps this block self-contained.
    import numbers

    # Accept any integral index (int subclasses, NumPy integer scalars),
    # not only exact ``int``; an exact-type check sends such indices
    # into the column-access branch below, where they fail.
    if isinstance(item, numbers.Integral):
        record = self.samples[item]
        src_tokens = record['src']
        tgt_tokens = record['tgt']

        src_ids, oov = source2ids(src_tokens, self.vocab)
        tgt_ids = abstract2ids(tgt_tokens, self.vocab, oov)

        sos, eos = [self.vocab.SOS], [self.vocab.EOS]
        return {
            'source': src_tokens,
            'x': sos + src_ids + eos,
            'OOV': oov,
            'len_OOV': len(oov),
            'y': sos + tgt_ids + eos,
            'x_len': len(src_tokens) + 2,
            'y_len': len(tgt_tokens) + 2,
            'img_vec': record['img_vec'],
        }

    # Non-integer key: return that field from every stored sample.
    return [sample[item] for sample in self.samples]