def _paraphrase(self, strings, decoder='greedy', top_p=0.7): if not 0 < top_p < 1: raise ValueError('top_p must be bigger than 0 and less than 1') decoder = decoder.lower() if decoder not in ['greedy', 'beam', 'nucleus']: raise ValueError( 'mode only supports [`greedy`, `beam`, `nucleus`]') strings = [ f'parafrasa: {summarization_textcleaning(string)}' for string in strings ] batch_x = [self._tokenizer.encode(string) + [1] for string in strings] batch_x = padding_sequence(batch_x) r = self._execute( inputs=[batch_x, top_p], input_labels=['Placeholder', 'Placeholder_2'], output_labels=[decoder], ) p = r[decoder].tolist() results = [self._tokenizer.decode(r) for r in p] return results
def _predict(self, strings):
    """Run sequence tagging over the input strings.

    Fix: the original computed ``sequences`` via ``encode_sentencepiece``
    for every input and then never used the result — pure dead work per
    call — so that computation is removed. Everything else is unchanged.

    Parameters
    ----------
    strings : list of str
        Raw input sentences.

    Returns
    -------
    list of list of tuple
        For each input, a list of ``(token, tag)`` pairs produced by
        merging sentencepiece pieces with their predicted tags.
    """
    # Encode each string, append EOS id (1), pad to a rectangular batch.
    batch_x = [self._tokenizer.encode(string) + [1] for string in strings]
    batch_x = padding_sequence(batch_x)
    outputs = self._execute(
        inputs=[batch_x],
        input_labels=['x_placeholder'],
        output_labels=['greedy', 'tag_greedy'],
    )
    p, tag = outputs['greedy'], outputs['tag_greedy']
    results = []
    # Number of non-padding (non-zero) ids per row; used to trim tags.
    nonzero = (p != 0).sum(axis=-1)
    for i in range(len(p)):
        decoded = self._tokenizer.decode(p[i].tolist())
        t = tag[i, :nonzero[i]]
        # Re-tokenize the decoded text so pieces line up with tags.
        s = encode_sentencepiece(
            self._tokenizer.sp,
            decoded,
            return_unicode=False,
            sample=False,
        )
        merged = merge_sentencepiece_tokens_tagging(
            s + ['<cls>'], t, model='xlnet'
        )
        results.append(list(zip(merged[0], merged[1])))
    return results
def _summarize(self, strings, mode, decoder = 'greedy', top_p = 0.7): mode = mode.lower() if mode not in ['ringkasan', 'tajuk']: raise ValueError('mode only supports [`ringkasan`, `tajuk`]') if not 0 < top_p < 1: raise ValueError('top_p must be bigger than 0 and less than 1') decoder = decoder.lower() output = self._mapping.get(decoder) if not decoder: raise ValueError('mode only supports [`greedy`, `beam`, `nucleus`]') strings = [f'{mode}: {cleaning(string)}' for string in strings] batch_x = [self._tokenizer.encode(string) + [1] for string in strings] batch_x = padding_sequence(batch_x) p = self._sess.run( output, feed_dict = {self._X: batch_x, self._top_p: top_p} ).tolist() results = [ postprocessing_summarization(self._tokenizer.decode(r)) for r in p ] return results
def _attention(self, strings):
    """Fetch attention weights for the input strings.

    Parameters
    ----------
    strings : list of str
        Raw input sentences.

    Returns
    -------
    tuple
        ``(attentions, s_tokens)`` — the session's attention outputs and
        the per-string token lists padded to equal length.
    """
    batch_x, _, _, s_tokens = bert_tokenization(
        self._tokenizer, strings, cls=self._cls, sep=self._sep
    )
    # Pad every token list to the longest one, using the SEP token.
    longest = max(len(tokens) for tokens in s_tokens)
    s_tokens = padding_sequence(s_tokens, longest, pad_int=self._sep)
    attentions = self._sess.run(self.attns, feed_dict={self.X: batch_x})
    return attentions, s_tokens
def _attention(self, strings):
    """Fetch attention weights (mask-aware variant) for the inputs.

    Parameters
    ----------
    strings : list of str
        Raw input sentences.

    Returns
    -------
    tuple
        ``(attentions, s_tokens, batch_masks)`` — attention outputs,
        padded token lists, and the attention masks fed to the graph.
    """
    batch_x, batch_masks, _, s_tokens = bert_tokenization(
        self._tokenizer, strings
    )
    # Pad every token list to the longest one with the literal SEP token.
    longest = max(len(tokens) for tokens in s_tokens)
    s_tokens = padding_sequence(s_tokens, longest, pad_int='[SEP]')
    feed = {self.X: batch_x, self.MASK: batch_masks}
    attentions = self._sess.run(self.attns, feed_dict=feed)
    return attentions, s_tokens, batch_masks
def _attention(self, strings):
    """Fetch attention weights for the inputs (XLNet tokenization).

    Parameters
    ----------
    strings : list of str
        Raw input sentences.

    Returns
    -------
    tuple
        ``(attentions, s_tokens, input_masks)`` — attention outputs,
        padded token lists, and the input masks fed to the graph.
    """
    input_ids, input_masks, segment_ids, s_tokens = xlnet_tokenization(
        self._tokenizer, strings
    )
    # Pad token lists to the longest one using the XLNet CLS marker.
    longest = max(len(tokens) for tokens in s_tokens)
    s_tokens = padding_sequence(s_tokens, longest, pad_int='<cls>')
    feed = {
        self.X: input_ids,
        self.segment_ids: segment_ids,
        self.input_masks: input_masks,
    }
    attentions = self._sess.run(self.attention_nodes, feed_dict=feed)
    return attentions, s_tokens, input_masks
def _summarize( self, strings, mode, decoder='greedy', top_p=0.7, postprocess=True, **kwargs, ): mode = mode.lower() if mode not in ['ringkasan', 'tajuk']: raise ValueError('mode only supports [`ringkasan`, `tajuk`]') if not 0 < top_p < 1: raise ValueError('top_p must be bigger than 0 and less than 1') decoder = decoder.lower() if decoder not in ['greedy', 'beam', 'nucleus']: raise ValueError( 'mode only supports [`greedy`, `beam`, `nucleus`]') strings_ = [ f'{mode}: {summarization_textcleaning(string)}' for string in strings ] batch_x = [self._tokenizer.encode(string) + [1] for string in strings_] batch_x = padding_sequence(batch_x) r = self._execute( inputs=[batch_x, top_p], input_labels=['Placeholder', 'Placeholder_2'], output_labels=[decoder], ) p = r[decoder].tolist() results = [] for no, r in enumerate(p): summary = self._tokenizer.decode(r) if postprocess and mode != 'tajuk': summary = postprocess_summary(strings[no], summary, **kwargs) results.append(summary) return results
def _summarize( self, strings, mode, decoder='greedy', top_p=0.7, postprocess=True, **kwargs, ): mode = mode.lower() if mode not in ['ringkasan', 'tajuk']: raise ValueError('mode only supports [`ringkasan`, `tajuk`]') if not 0 < top_p < 1: raise ValueError('top_p must be bigger than 0 and less than 1') decoder = decoder.lower() output = self._mapping.get(decoder) if not decoder: raise ValueError( 'mode only supports [`greedy`, `beam`, `nucleus`]') strings_ = [f'{mode}: {cleaning(string)}' for string in strings] batch_x = [self._tokenizer.encode(string) + [1] for string in strings_] batch_x = padding_sequence(batch_x) p = self._sess.run(output, feed_dict={ self._X: batch_x, self._top_p: top_p }).tolist() results = [] for no, r in enumerate(p): summary = self._tokenizer.decode(r) if postprocess: summary = filter_rouge(strings[no], summary, **kwargs) summary = postprocessing_summarization(summary) summary = find_lapor_and_remove(strings[no], summary) results.append(summary) return results
def _paraphrase(self, strings, decoder = 'greedy', top_p = 0.7): if not 0 < top_p < 1: raise ValueError('top_p must be bigger than 0 and less than 1') decoder = decoder.lower() output = self._mapping.get(decoder) if not decoder: raise ValueError('mode only supports [`greedy`, `beam`, `nucleus`]') strings = [f'parafrasa: {cleaning(string)}' for string in strings] batch_x = [self._tokenizer.encode(string) + [1] for string in strings] batch_x = padding_sequence(batch_x) p = self._sess.run( output, feed_dict = {self._X: batch_x, self._top_p: top_p} ).tolist() results = [self._tokenizer.decode(r) for r in p] return results