示例#1
0
    def _paraphrase(self, strings, decoder='greedy', top_p=0.7):

        if not 0 < top_p < 1:
            raise ValueError('top_p must be bigger than 0 and less than 1')

        decoder = decoder.lower()
        if decoder not in ['greedy', 'beam', 'nucleus']:
            raise ValueError(
                'mode only supports [`greedy`, `beam`, `nucleus`]')

        strings = [
            f'parafrasa: {summarization_textcleaning(string)}'
            for string in strings
        ]

        batch_x = [self._tokenizer.encode(string) + [1] for string in strings]
        batch_x = padding_sequence(batch_x)

        r = self._execute(
            inputs=[batch_x, top_p],
            input_labels=['Placeholder', 'Placeholder_2'],
            output_labels=[decoder],
        )
        p = r[decoder].tolist()

        results = [self._tokenizer.decode(r) for r in p]
        return results
示例#2
0
 def _predict(self, strings):
     """Tag a batch of strings, returning (token, tag) pairs per string.

     Parameters
     ----------
     strings : list of str
         Input sentences to tag.

     Returns
     -------
     list of list of tuple
         For each input string, a list of (token, tag) pairs produced by
         `merge_sentencepiece_tokens_tagging`.
     """
     # NOTE(review): the original also built an unused `sequences` list via
     # `encode_sentencepiece` before this point; that dead computation has
     # been removed.
     batch_x = [self._tokenizer.encode(string) + [1] for string in strings]
     batch_x = padding_sequence(batch_x)
     outputs = self._execute(
         inputs=[batch_x],
         input_labels=['x_placeholder'],
         output_labels=['greedy', 'tag_greedy'],
     )
     p, tag = outputs['greedy'], outputs['tag_greedy']
     results = []
     # Count non-zero (non-padding) positions per row; tags beyond that
     # index belong to padding and are dropped.
     nonzero = (p != 0).sum(axis=-1)
     for i in range(len(p)):
         decoded = self._tokenizer.decode(p[i].tolist())
         t = tag[i, :nonzero[i]]
         # Re-tokenize the decoded text so subword pieces align with tags.
         pieces = encode_sentencepiece(
             self._tokenizer.sp,
             decoded,
             return_unicode=False,
             sample=False,
         )
         merged = merge_sentencepiece_tokens_tagging(
             pieces + ['<cls>'], t, model='xlnet'
         )
         results.append(list(zip(merged[0], merged[1])))
     return results
示例#3
0
    def _summarize(self, strings, mode, decoder = 'greedy', top_p = 0.7):
        mode = mode.lower()
        if mode not in ['ringkasan', 'tajuk']:
            raise ValueError('mode only supports [`ringkasan`, `tajuk`]')

        if not 0 < top_p < 1:
            raise ValueError('top_p must be bigger than 0 and less than 1')

        decoder = decoder.lower()
        output = self._mapping.get(decoder)
        if not decoder:
            raise ValueError('mode only supports [`greedy`, `beam`, `nucleus`]')

        strings = [f'{mode}: {cleaning(string)}' for string in strings]

        batch_x = [self._tokenizer.encode(string) + [1] for string in strings]
        batch_x = padding_sequence(batch_x)

        p = self._sess.run(
            output, feed_dict = {self._X: batch_x, self._top_p: top_p}
        ).tolist()

        results = [
            postprocessing_summarization(self._tokenizer.decode(r)) for r in p
        ]
        return results
示例#4
0
 def _attention(self, strings):
     """Compute attention matrices for a batch of strings.

     Returns a tuple of (attention outputs, padded subword token lists).
     """
     batch_x, _, _, s_tokens = bert_tokenization(
         self._tokenizer, strings, cls = self._cls, sep = self._sep
     )
     longest = max(len(tokens) for tokens in s_tokens)
     # Pad every token list to the longest sequence using the sep token.
     s_tokens = padding_sequence(s_tokens, longest, pad_int = self._sep)
     attentions = self._sess.run(self.attns, feed_dict = {self.X: batch_x})
     return attentions, s_tokens
示例#5
0
 def _attention(self, strings):
     """Compute attention matrices and input masks for a batch of strings.

     Returns a tuple of (attention outputs, padded subword token lists,
     batch masks).
     """
     batch_x, batch_masks, _, s_tokens = bert_tokenization(
         self._tokenizer, strings
     )
     longest = max(len(tokens) for tokens in s_tokens)
     # Pad every token list to the longest sequence with the '[SEP]' token.
     s_tokens = padding_sequence(s_tokens, longest, pad_int = '[SEP]')
     feed = {self.X: batch_x, self.MASK: batch_masks}
     attentions = self._sess.run(self.attns, feed_dict = feed)
     return attentions, s_tokens, batch_masks
示例#6
0
文件: __init__.py 项目: lkngin/Malaya
 def _attention(self, strings):
     """Compute attention matrices for a batch of strings (XLNet variant).

     Returns a tuple of (attention outputs, padded subword token lists,
     input masks).
     """
     input_ids, input_masks, segment_ids, s_tokens = xlnet_tokenization(
         self._tokenizer, strings)
     longest = max(len(tokens) for tokens in s_tokens)
     # Pad every token list to the longest sequence with the '<cls>' token.
     s_tokens = padding_sequence(s_tokens, longest, pad_int='<cls>')
     feed = {
         self.X: input_ids,
         self.segment_ids: segment_ids,
         self.input_masks: input_masks,
     }
     attentions = self._sess.run(self.attention_nodes, feed_dict=feed)
     return attentions, s_tokens, input_masks
示例#7
0
    def _summarize(
        self,
        strings,
        mode,
        decoder='greedy',
        top_p=0.7,
        postprocess=True,
        **kwargs,
    ):
        mode = mode.lower()
        if mode not in ['ringkasan', 'tajuk']:
            raise ValueError('mode only supports [`ringkasan`, `tajuk`]')

        if not 0 < top_p < 1:
            raise ValueError('top_p must be bigger than 0 and less than 1')

        decoder = decoder.lower()
        if decoder not in ['greedy', 'beam', 'nucleus']:
            raise ValueError(
                'mode only supports [`greedy`, `beam`, `nucleus`]')

        strings_ = [
            f'{mode}: {summarization_textcleaning(string)}'
            for string in strings
        ]

        batch_x = [self._tokenizer.encode(string) + [1] for string in strings_]
        batch_x = padding_sequence(batch_x)

        r = self._execute(
            inputs=[batch_x, top_p],
            input_labels=['Placeholder', 'Placeholder_2'],
            output_labels=[decoder],
        )
        p = r[decoder].tolist()

        results = []
        for no, r in enumerate(p):
            summary = self._tokenizer.decode(r)
            if postprocess and mode != 'tajuk':
                summary = postprocess_summary(strings[no], summary, **kwargs)

            results.append(summary)

        return results
示例#8
0
文件: tf.py 项目: illaiza115/malaya
    def _summarize(
        self,
        strings,
        mode,
        decoder='greedy',
        top_p=0.7,
        postprocess=True,
        **kwargs,
    ):
        mode = mode.lower()
        if mode not in ['ringkasan', 'tajuk']:
            raise ValueError('mode only supports [`ringkasan`, `tajuk`]')

        if not 0 < top_p < 1:
            raise ValueError('top_p must be bigger than 0 and less than 1')

        decoder = decoder.lower()
        output = self._mapping.get(decoder)
        if not decoder:
            raise ValueError(
                'mode only supports [`greedy`, `beam`, `nucleus`]')

        strings_ = [f'{mode}: {cleaning(string)}' for string in strings]

        batch_x = [self._tokenizer.encode(string) + [1] for string in strings_]
        batch_x = padding_sequence(batch_x)

        p = self._sess.run(output,
                           feed_dict={
                               self._X: batch_x,
                               self._top_p: top_p
                           }).tolist()

        results = []
        for no, r in enumerate(p):
            summary = self._tokenizer.decode(r)
            if postprocess:
                summary = filter_rouge(strings[no], summary, **kwargs)
                summary = postprocessing_summarization(summary)
                summary = find_lapor_and_remove(strings[no], summary)

            results.append(summary)

        return results
示例#9
0
文件: tf.py 项目: samsonleegh/Malaya
    def _paraphrase(self, strings, decoder = 'greedy', top_p = 0.7):

        if not 0 < top_p < 1:
            raise ValueError('top_p must be bigger than 0 and less than 1')

        decoder = decoder.lower()
        output = self._mapping.get(decoder)
        if not decoder:
            raise ValueError('mode only supports [`greedy`, `beam`, `nucleus`]')

        strings = [f'parafrasa: {cleaning(string)}' for string in strings]

        batch_x = [self._tokenizer.encode(string) + [1] for string in strings]
        batch_x = padding_sequence(batch_x)

        p = self._sess.run(
            output, feed_dict = {self._X: batch_x, self._top_p: top_p}
        ).tolist()

        results = [self._tokenizer.decode(r) for r in p]
        return results