예제 #1
0
    def _predict(self, strings, beam_search=True):
        """
        Encode a batch of strings, run the graph and decode its output.

        Parameters
        ----------
        strings : sequence of str
        beam_search : bool, (optional=True)
            If True, fetch the 'beam' output, else fetch the 'greedy' output.

        Returns
        -------
        result: whatever ``self._encoder.decode`` produces; when
            ``self._translation_model`` is truthy this is a list with one
            decoded entry per output row.
        """
        if self._translation_model:
            # Translation path: clean every string, encode it on its own and
            # append id 1 to the end of each sequence.
            encoded = []
            for text in strings:
                token_ids = self._encoder.encode(translation_textcleaning(text))
                encoded.append(token_ids + [1])
        else:
            # Otherwise the encoder receives the whole batch in one call.
            encoded = self._encoder.encode(strings)

        # Sequences are padded with id 0.
        batch_x = pad_sentence_batch(encoded, 0)[0]
        output = 'beam' if beam_search else 'greedy'

        fetched = self._execute(
            inputs=[batch_x],
            input_labels=['Placeholder'],
            output_labels=[output],
        )
        rows = fetched[output].tolist()

        if not self._translation_model:
            return self._encoder.decode(rows)

        # Drop ids 0 (padding value) and 1 (appended above) before decoding.
        return [
            self._encoder.decode([i for i in row if i not in (0, 1)])
            for row in rows
        ]
예제 #2
0
파일: tf.py 프로젝트: huseinzol05/malaya
    def predict(self, strings: List[str], beam_search: bool = False):
        """
        Convert to target strings.

        Each input string is cleaned with ``self._cleaning``, mapped to ids
        through ``self._left_dict``, terminated with id 1, padded with id 0
        and sent through ``self._execute``. Output ids greater than 3 are
        mapped back to text through ``self._rev_right_dict``.

        Parameters
        ----------
        strings : List[str]
        beam_search : bool, (optional=False)
            If True, use beam search decoder, else use greedy decoder.

        Returns
        -------
        result: List[str]
        """
        output = 'beam' if beam_search else 'greedy'

        batch = []
        for string in strings:
            cleaned = self._cleaning(string, self._left_dict)
            ids = [self._left_dict[c] for c in cleaned]
            batch.append(ids + [1])
        padded = pad_sentence_batch(batch, 0)[0]

        fetched = self._execute(
            inputs=[padded],
            input_labels=['Placeholder'],
            output_labels=[output],
        )

        results = []
        for row in fetched[output]:
            # Ids 0..3 are skipped when rebuilding the string.
            pieces = [self._rev_right_dict[i] for i in row if i > 3]
            results.append(''.join(pieces))
        return results
예제 #3
0
    def stem(self, string: str, beam_search: bool = False):
        """
        Stem a string; this also includes lemmatization.

        Tokens that are punctuation, or that match the money / date / time /
        hashtag / url / user patterns, are passed through unchanged. Every
        other token is lower-cased, BPE-encoded, run through the model and
        written back with the casing of the original token re-applied.

        Parameters
        ----------
        string : str
        beam_search : bool, (optional=False)
            If True, use beam search decoder, else use greedy decoder.

        Returns
        -------
        result: str
        """
        punctuation = '~@#$%^&*()_+{}|[:"\'];<>,.?/-'

        # `mapping` links a position in `batch` to its position in `result`.
        result, batch, actual, mapping = [], [], [], {}
        for no, word in enumerate(self._tokenizer(string)):
            if word in punctuation:
                result.append(word)
                continue

            lowered = word.lower()
            is_special = (
                re.findall(_money, lowered)
                or re.findall(_date, lowered)
                or re.findall(_expressions['time'], lowered)
                or re.findall(_expressions['hashtag'], lowered)
                or re.findall(_expressions['url'], lowered)
                or re.findall(_expressions['user'], lowered)
            )
            if is_special:
                result.append(word)
                continue

            # Reserve a slot now; it is overwritten with the prediction below.
            mapping[len(batch)] = no
            result.append('REPLACE-ME')
            actual.append(word)
            batch.append(lowered)

        if batch:
            encoded = self._bpe.bpe.encode(batch, output_type=self._bpe.mode)
            # Append id 1 to every sequence, then pad with id 0.
            padded = pad_sentence_batch([ids + [1] for ids in encoded], 0)[0]

            output = 'beam' if beam_search else 'greedy'
            fetched = self._execute(
                inputs=[padded],
                input_labels=['Placeholder'],
                output_labels=[output],
            )

            for no, row in enumerate(fetched[output].tolist()):
                # Order-preserving de-duplication of the predicted ids.
                unique_ids = list(dict.fromkeys(row))
                decoded = self._bpe.bpe.decode(unique_ids)[0]
                decoded = decoded.replace('<EOS>', '').replace('<PAD>', '')
                result[mapping[no]] = case_of(actual[no])(decoded)

        return ' '.join(result)
예제 #4
0
    def stem(self, string: str, beam_search: bool = True):
        """
        Stem a string.

        Tokens that are punctuation, or that match the money / date / time /
        hashtag / url / user patterns, are passed through unchanged. Every
        other token is lower-cased, BPE-encoded, run through the session and
        written back with the casing of the original token re-applied.

        Parameters
        ----------
        string : str
        beam_search : bool, (optional=True)
            If True, use beam search decoder, else use greedy decoder.

        Returns
        -------
        result: str
        """
        punctuation = '~@#$%^&*()_+{}|[:"\'];<>,.?/-'

        # `mapping` links a position in `batch` to its position in `result`.
        result, batch, actual, mapping = [], [], [], {}
        for no, word in enumerate(self._tokenizer(string)):
            if word in punctuation:
                result.append(word)
                continue

            lowered = word.lower()
            is_special = (
                re.findall(_money, lowered)
                or re.findall(_date, lowered)
                or re.findall(_expressions['time'], lowered)
                or re.findall(_expressions['hashtag'], lowered)
                or re.findall(_expressions['url'], lowered)
                or re.findall(_expressions['user'], lowered)
            )
            if is_special:
                result.append(word)
                continue

            # Reserve a slot now; it is overwritten with the prediction below.
            mapping[len(batch)] = no
            result.append('REPLACE-ME')
            actual.append(word)
            batch.append(lowered)

        if batch:
            encoded = self._bpe.encode(batch, output_type=self._subword_mode)
            # Append id 1 to every sequence, then pad with id 0.
            padded = pad_sentence_batch([ids + [1] for ids in encoded], 0)[0]

            fetch = self._beam if beam_search else self._greedy
            rows = self._sess.run(fetch, feed_dict={self._X: padded}).tolist()

            for no, row in enumerate(rows):
                # Order-preserving de-duplication of the predicted ids.
                unique_ids = list(dict.fromkeys(row))
                decoded = self._bpe.decode(unique_ids)[0].replace('<EOS>', '')
                result[mapping[no]] = case_of(actual[no])(decoded)

        return ' '.join(result)
예제 #5
0
파일: tf.py 프로젝트: samsonleegh/Malaya
 def _true_case(self, strings, beam_search = True):
     """
     Encode `strings`, run the selected decoder and decode the result.

     Parameters
     ----------
     strings : batch of strings accepted by ``self._encoder.encode``
     beam_search : bool, (optional=True)
         If True, run ``self._beam``, else run ``self._greedy``.

     Returns
     -------
     result: output of ``self._encoder.decode`` on the predicted ids.
     """
     token_ids = self._encoder.encode(strings)
     fetch = self._beam if beam_search else self._greedy
     # Sequences are padded with id 0 before being fed to the session.
     padded = pad_sentence_batch(token_ids, 0)[0]
     predictions = self._sess.run(fetch, feed_dict={self._X: padded})
     return self._encoder.decode(predictions.tolist())
예제 #6
0
 def _paraphrase(self, strings, beam_search = True):
     """
     Tokenize `strings`, run the selected decoder and decode each row.

     Parameters
     ----------
     strings : iterable of str
     beam_search : bool, (optional=True)
         If True, run ``self._beam``, else run ``self._greedy``.

     Returns
     -------
     result: list with one ``self._tokenizer.decode`` output per row.
     """
     # Each sequence gets id 1 appended after tokenization.
     encoded = []
     for text in strings:
         encoded.append(self._tokenizer.encode(text) + [1])

     fetch = self._beam if beam_search else self._greedy
     padded = pad_sentence_batch(encoded, 0)[0]
     rows = self._sess.run(fetch, feed_dict={self._X: padded}).tolist()

     # Ids 0 (padding value) and 1 (appended above) are removed before
     # decoding.
     return [
         self._tokenizer.decode([i for i in row if i not in (0, 1)])
         for row in rows
     ]
예제 #7
0
파일: tf.py 프로젝트: huseinzol05/malaya
    def _predict(self, strings, beam_search=True):
        """
        Map strings to ids, run the graph and rebuild one string per row.

        Each input string is cleaned with ``self._cleaning``, mapped to ids
        through ``self._left_dict``, terminated with id 1, padded with id 0
        and sent through ``self._execute``. Output ids greater than 3 are
        mapped back to text through ``self._rev_left_dict``.

        Parameters
        ----------
        strings : iterable of str
        beam_search : bool, (optional=True)
            If True, fetch the 'beam' output, else fetch the 'greedy' output.

        Returns
        -------
        results: list of str
        """
        encoded = []
        for string in strings:
            cleaned = self._cleaning(string, self._left_dict)
            ids = [self._left_dict[c] for c in cleaned]
            encoded.append(ids + [1])
        batch_x = pad_sentence_batch(encoded, 0)[0]

        output = 'beam' if beam_search else 'greedy'
        fetched = self._execute(
            inputs=[batch_x],
            input_labels=['Placeholder'],
            output_labels=[output],
        )

        results = []
        for row in fetched[output]:
            # Ids 0..3 are skipped when rebuilding the string.
            pieces = [self._rev_left_dict[i] for i in row if i > 3]
            results.append(''.join(pieces))
        return results
예제 #8
0
    def stem(self, string: str):
        """
        Stem a string.

        The string is cleaned with ``classification_textcleaning`` and split
        on whitespace; the resulting tokens are converted to index sequences,
        padded with ``PAD`` and run through ``self._logits``. Each predicted
        sequence is mapped back to characters, skipping the ``GO``, ``PAD``,
        ``EOS`` and ``UNK`` ids.

        Parameters
        ----------
        string : str

        Returns
        -------
        string: stemmed string, words joined by a single space. If cleaning
            leaves no tokens, an empty string is returned without running
            the model.
        """
        token_strings = classification_textcleaning(string, True).split()
        if not token_strings:
            # Nothing to stem: skip index conversion, padding and the session
            # call instead of handing them an empty batch.
            return ''

        idx = stemmer_str_idx(token_strings, self._dicts['dictionary_from'])
        padded = pad_sentence_batch(idx, PAD)[0]
        predicted = self._sess.run(self._logits, feed_dict={self._x: padded})

        special_ids = (GO, PAD, EOS, UNK)
        rev_dictionary = self._dicts['rev_dictionary_to']
        return ' '.join(
            ''.join(rev_dictionary[c] for c in word if c not in special_ids)
            for word in predicted
        )