Example #1
    def _summarize(self, string, mode, postprocess, **kwargs):
        summary = upperfirst(
            self._predict(f'{mode}: {summarization_textcleaning(string)}')
        )
        if postprocess and mode != 'tajuk':
            summary = postprocess_summary(string, summary, **kwargs)
        return summary
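
This variant handles a single string: the cleaned input is prefixed with the task tag ('ringkasan:' for a summary, 'tajuk:' for a headline), the prediction gets its first letter capitalised, and only summaries go through postprocess_summary. Below is a minimal self-contained sketch of that flow; clean_text, summarize_one and the echo predictor are hypothetical stand-ins for Malaya's summarization_textcleaning, _predict and postprocess_summary, which are not reproduced here.

import re

def clean_text(string):
    # simplified stand-in for summarization_textcleaning: collapse whitespace
    return re.sub(r'\s+', ' ', string).strip()

def upperfirst(string):
    # capitalise only the first character, leave the rest untouched
    return string[:1].upper() + string[1:]

def summarize_one(predict_fn, string, mode='ringkasan', postprocess_fn=None):
    # text-to-text convention: prefix the cleaned input with the task tag,
    # e.g. 'ringkasan: ...' for a summary or 'tajuk: ...' for a headline
    summary = upperfirst(predict_fn(f'{mode}: {clean_text(string)}'))
    # headline mode ('tajuk') skips the cleanup pass, mirroring the guard above
    if postprocess_fn is not None and mode != 'tajuk':
        summary = postprocess_fn(string, summary)
    return summary

# dummy "model" that just echoes the first 60 characters after the task tag
echo = lambda prompt: prompt.split(': ', 1)[1][:60]
print(summarize_one(echo, 'contoh teks  yang panjang untuk diringkaskan.'))
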
Example #2
File: pegasus.py Project: lantip/Malaya
    def _summarize(
        self,
        strings,
        top_p=0.7,
        temperature=1.0,
        postprocess=True,
        **kwargs,
    ):

        # clean every input, encode it, and append token id 1 to mark the end
        # of each sequence
        strings_ = [summarization_textcleaning(string) for string in strings]
        batch_x = [self._tokenizer.encode(string) + [1] for string in strings_]
        batch_x = pad_sequences(batch_x, padding='post')

        # feed the right-padded batch plus the sampling parameters to the graph
        # and fetch the generated token ids from the 'logits' output
        r = self._execute(
            inputs=[batch_x, top_p, temperature],
            input_labels=['Placeholder', 'top_p', 'temperature'],
            output_labels=['logits'],
        )
        p = r['logits'].tolist()

        results = []
        for no, r in enumerate(p):
            summary = self._tokenizer.decode(r)
            if postprocess:
                summary = postprocess_summary(strings[no], summary, **kwargs)

            results.append(summary)

        return results
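
Example #2 (the Pegasus wrapper) encodes each cleaned string, appends token id 1, and right-pads the batch with the Keras pad_sequences(batch_x, padding='post') call before feeding it to the frozen graph together with top_p and temperature. As a quick illustration of what post-padding does to a ragged batch, here is a pure-Python stand-in (pad_post is a name introduced here, not a library function):

def pad_post(sequences, pad_id=0):
    # right-pad every sequence with pad_id up to the length of the longest one,
    # which is what padding='post' asks pad_sequences to do
    maxlen = max(len(seq) for seq in sequences)
    return [seq + [pad_id] * (maxlen - len(seq)) for seq in sequences]

batch = [[12, 7, 99, 1], [5, 1], [44, 23, 8, 16, 2, 1]]
print(pad_post(batch))
# [[12, 7, 99, 1, 0, 0], [5, 1, 0, 0, 0, 0], [44, 23, 8, 16, 2, 1]]
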
Example #3
    def _summarize(self, strings, mode, postprocess, **kwargs):
        summaries = self._predict([
            f'ringkasan: {summarization_textcleaning(string)}'
            for string in strings
        ])
        if postprocess and mode != 'tajuk':
            summaries = [
                postprocess_summary(strings[no], summary, **kwargs)
                for no, summary in enumerate(summaries)
            ]
        return summaries
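
Example #3 is the batched counterpart of example #1: every prompt gets the 'ringkasan:' prefix (this variant hardcodes the prefix while still consulting mode for the postprocess guard) and the whole list goes through self._predict in one call, so summaries come back in input order and can be paired with their source strings by position. A minimal sketch of that pairing, with summarize_batch and the dummy predictor as hypothetical stand-ins:

def summarize_batch(predict_fn, strings, mode='ringkasan', postprocess_fn=None):
    # one 'ringkasan:'-prefixed prompt per input; ' '.join(s.split()) is a crude
    # stand-in for summarization_textcleaning
    summaries = predict_fn([f"ringkasan: {' '.join(s.split())}" for s in strings])
    # the model returns one summary per input, in order, so zip pairs each
    # summary with its source string for the cleanup hook
    if postprocess_fn is not None and mode != 'tajuk':
        summaries = [postprocess_fn(src, out) for src, out in zip(strings, summaries)]
    return summaries

# dummy batch "model": echo the first 40 characters after the task tag
dummy = lambda prompts: [p.split(': ', 1)[1][:40] for p in prompts]
print(summarize_batch(dummy, ['teks pertama  yang panjang.', 'teks kedua.']))
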
Example #4
    def _summarize(
        self,
        strings,
        mode,
        decoder='greedy',
        top_p=0.7,
        postprocess=True,
        **kwargs,
    ):
        mode = mode.lower()
        if mode not in ['ringkasan', 'tajuk']:
            raise ValueError('mode only supports [`ringkasan`, `tajuk`]')

        if not 0 < top_p < 1:
            raise ValueError('top_p must be bigger than 0 and less than 1')

        decoder = decoder.lower()
        if decoder not in ['greedy', 'beam', 'nucleus']:
            raise ValueError(
                'decoder only supports [`greedy`, `beam`, `nucleus`]')

        strings_ = [
            f'{mode}: {summarization_textcleaning(string)}'
            for string in strings
        ]

        # encode each prefixed string, append token id 1, and pad to equal length
        batch_x = [self._tokenizer.encode(string) + [1] for string in strings_]
        batch_x = padding_sequence(batch_x)

        # fetch the graph output tensor named after the chosen decoding strategy
        # ('greedy', 'beam' or 'nucleus')
        r = self._execute(
            inputs=[batch_x, top_p],
            input_labels=['Placeholder', 'Placeholder_2'],
            output_labels=[decoder],
        )
        p = r[decoder].tolist()

        results = []
        for no, r in enumerate(p):
            summary = self._tokenizer.decode(r)
            if postprocess and mode != 'tajuk':
                summary = postprocess_summary(strings[no], summary, **kwargs)

            results.append(summary)

        return results
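
Example #4 validates its arguments, prefixes and pads the batch, and fetches whichever output tensor matches the chosen decoding strategy; top_p is checked to lie strictly between 0 and 1 and is passed to the graph alongside the batch. The decoding itself happens inside the exported graph, so purely as a reference, here is a small NumPy sketch (nucleus_filter is a name introduced here, not a Malaya function) of what top-p filtering means for one decoding step's token distribution:

import numpy as np

def nucleus_filter(probs, top_p=0.7):
    # keep the smallest set of highest-probability tokens whose cumulative
    # probability reaches top_p, zero out the rest, and renormalise;
    # sampling from the result is nucleus (top-p) decoding
    order = np.argsort(probs)[::-1]
    cumulative = np.cumsum(probs[order])
    cutoff = np.searchsorted(cumulative, top_p) + 1  # include the token crossing top_p
    kept = order[:cutoff]
    filtered = np.zeros_like(probs)
    filtered[kept] = probs[kept]
    return filtered / filtered.sum()

probs = np.array([0.5, 0.2, 0.15, 0.1, 0.05])   # toy next-token distribution
p = nucleus_filter(probs, top_p=0.7)
rng = np.random.default_rng(0)
print(p, rng.choice(len(p), p=p))
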