Example #1
    def tokenize(self, text):
        """Tokenizes a piece of text by splitting it on `self.split_char`.

        :param text: The text to tokenize.
        :return: A list of tokens.
        """
        text = convert_to_unicode(text)
        split_tokens = text.split(self.split_char)
        return split_tokens
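For reference, a standalone sketch of the same behavior, assuming a whitespace split_char and a trivial stand-in for convert_to_unicode (both are assumptions; the real helpers live elsewhere in this codebase):

# Hypothetical standalone version of Example #1, for illustration only.
def split_tokenize(text, split_char=" "):   # split_char " " is an assumption
    if isinstance(text, bytes):             # stand-in for convert_to_unicode
        text = text.decode("utf-8")
    return text.split(split_char)

print(split_tokenize("hello world"))        # ['hello', 'world']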
Example #2
    def tokenize(self, text):
        """Tokenizes a piece of text into its word pieces.

        This uses a greedy longest-match-first algorithm to perform tokenization
        using the given vocabulary.

        For example:
            input = "unaffable"
            output = ["un", "##aff", "##able"]

        Args:
            text: A single token or whitespace separated tokens. This should have
                already been passed through `BasicTokenizer`.

        Returns:
            A list of wordpiece tokens.
        """

        text = convert_to_unicode(text)

        output_tokens = []
        for token in whitespace_tokenize(text):
            chars = list(token)
            if len(chars) > self.max_input_chars_per_word:
                output_tokens.append(self.unk_token)
                continue

            is_bad = False
            start = 0
            sub_tokens = []
            while start < len(chars):
                end = len(chars)
                cur_substr = None
                while start < end:
                    substr = "".join(chars[start:end])
                    if start > 0:
                        substr = "##" + substr
                    if substr in self.vocabulary.vocab_dict:
                        cur_substr = substr
                        break
                    end -= 1
                if cur_substr is None:
                    is_bad = True
                    break
                sub_tokens.append(cur_substr)
                start = end

            if is_bad:
                output_tokens.append(self.unk_token)
            else:
                output_tokens.extend(sub_tokens)
        return output_tokens
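The loop above is the standard WordPiece greedy longest-match-first pass. Below is a self-contained sketch of the same loop, with a toy vocabulary standing in for self.vocabulary.vocab_dict (the function name, toy vocabulary, and default arguments are illustration-only assumptions):

# Illustration-only sketch of the greedy longest-match-first WordPiece loop.
def wordpiece_sketch(token, vocab, unk_token="[UNK]", max_chars=100):
    chars = list(token)
    if len(chars) > max_chars:
        return [unk_token]
    sub_tokens, start = [], 0
    while start < len(chars):
        end = len(chars)
        cur_substr = None
        while start < end:                       # shrink from the right until a piece matches
            substr = "".join(chars[start:end])
            if start > 0:
                substr = "##" + substr           # continuation pieces carry the "##" prefix
            if substr in vocab:
                cur_substr = substr
                break
            end -= 1
        if cur_substr is None:                   # no piece matched: the whole token is unknown
            return [unk_token]
        sub_tokens.append(cur_substr)
        start = end
    return sub_tokens

toy_vocab = {"un", "##aff", "##able"}            # toy vocabulary, an assumption
print(wordpiece_sketch("unaffable", toy_vocab))  # ['un', '##aff', '##able']

With this toy vocabulary the sketch reproduces the docstring's example: "unaffable" becomes ["un", "##aff", "##able"].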
Example #3
    def tokenize(self, text):
        """Tokenizes a piece of text into its word pieces.

        Returns:
            A list of wordpiece tokens.
        """
        text = text.lower() if self.do_lower_case else text
        text = convert_to_unicode(text)

        output_tokens = []
        for token in text.split(self.split_token):
            if token in self.vocab:
                output_tokens.append(token)
            else:
                sp_tokens = self.tokenizer.EncodeAsPieces(token)
                for sp_token in sp_tokens:
                    if sp_token in self.vocab:
                        output_tokens.append(sp_token)
        return output_tokens
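A sketch of the same control flow with the SentencePiece model stubbed out, since running EncodeAsPieces for real requires a trained model file (the stub, the toy vocabulary, and the function name below are assumptions):

# Illustration-only sketch of Example #3's flow. `encode_as_pieces` stands in
# for a trained sentencepiece model's EncodeAsPieces; the lambda below is a
# dummy so the sketch runs without a model file.
def tokenize_with_fallback(text, vocab, encode_as_pieces, split_token=" "):
    output_tokens = []
    for token in text.split(split_token):
        if token in vocab:                    # keep whole-token vocabulary hits as-is
            output_tokens.append(token)
        else:                                 # otherwise fall back to subword pieces
            for sp_token in encode_as_pieces(token):
                if sp_token in vocab:         # pieces missing from the vocab are dropped
                    output_tokens.append(sp_token)
    return output_tokens

toy_vocab = {"hello", "\u2581wor", "ld"}
dummy_sp = lambda t: ["\u2581wor", "ld"]      # made-up pieces, not real sentencepiece output
print(tokenize_with_fallback("hello world", toy_vocab, dummy_sp))
# ['hello', '▁wor', 'ld']

Note that, unlike Example #4, pieces that are not in the vocabulary are silently dropped rather than replaced with an unk token.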
Example #4
    def tokenize(self, text):
        """Tokenizes a piece of text into its word pieces.

        Returns:
            A list of wordpiece tokens.
        """
        text = text.lower() if self.do_lower_case else text
        text = convert_to_unicode(text.replace("\1", " "))
        tokens = self.tokenizer.EncodeAsPieces(text)

        output_tokens = []
        for token in tokens:
            if token == self.sp_unk_token:
                token = self.unk_token

            if token in self.vocab:
                output_tokens.append(token)
            else:
                output_tokens.append(self.unk_token)

        return output_tokens
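A sketch of just the post-processing loop above, with a made-up piece list in place of the EncodeAsPieces output (the piece list, toy vocabulary, and token names are assumptions):

# Illustration-only sketch of Example #4's post-processing: every piece that is
# not in the vocabulary (including sentencepiece's own unk piece) is mapped to
# the tokenizer's unk_token.
def map_pieces_to_vocab(pieces, vocab, sp_unk_token="<unk>", unk_token="[UNK]"):
    output_tokens = []
    for token in pieces:
        if token == sp_unk_token:             # rename sentencepiece's unk symbol
            token = unk_token
        output_tokens.append(token if token in vocab else unk_token)
    return output_tokens

toy_vocab = {"\u2581hello", "\u2581wor", "ld", "[UNK]"}
print(map_pieces_to_vocab(["\u2581hello", "\u2581wor", "ld", "<unk>"], toy_vocab))
# ['▁hello', '▁wor', 'ld', '[UNK]']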
Example #5
    def load_vocab(self):
        """Loads the vocabulary file into two lookup tables.

        :return: A (vocab_dict, id_dict) pair mapping token -> id and
            id -> token.
        """
        vocab_dict = collections.OrderedDict()
        id_dict = collections.OrderedDict()
        with open(self.vocab_path) as file_vocab:
            for num, line in enumerate(file_vocab):
                items = convert_to_unicode(line.strip()).split("\t")
                if len(items) > 2:
                    break
                token = items[0]
                if len(items) == 2:
                    index = items[1]
                else:
                    index = num
                token = token.strip()

                # Key both tables by the integer id so lookups stay consistent.
                vocab_dict[token] = int(index)
                id_dict[int(index)] = token

        return vocab_dict, id_dict
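The loader expects one entry per line, either "token\tid" or a bare token whose line number becomes its id. A runnable sketch of that parsing with a made-up two-line vocabulary:

# Illustration only: the file content below is a made-up two-line vocabulary.
import collections, io

sample = "[PAD]\t0\n[UNK]\t1\n"
vocab_dict = collections.OrderedDict()
id_dict = collections.OrderedDict()
for num, line in enumerate(io.StringIO(sample)):
    items = line.strip().split("\t")
    token = items[0].strip()
    index = items[1] if len(items) == 2 else num
    vocab_dict[token] = int(index)
    id_dict[int(index)] = token

print(vocab_dict)  # OrderedDict([('[PAD]', 0), ('[UNK]', 1)])
print(id_dict)     # OrderedDict([(0, '[PAD]'), (1, '[UNK]')])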
Example #6
    def tokenize(self, text):
        """Tokenizes a piece of text."""
        text = convert_to_unicode(text)
        text = self._clean_text(text)

        # This was added on November 1st, 2018 for the multilingual and Chinese
        # models. This is also applied to the English models now, but it doesn't
        # matter since the English models were not trained on any Chinese data
        # and generally don't have any Chinese data in them (there are Chinese
        # characters in the vocabulary because Wikipedia does have some Chinese
        # words in the English Wikipedia.).
        text = self._tokenize_chinese_chars(text)

        orig_tokens = whitespace_tokenize(text)
        split_tokens = []
        for token in orig_tokens:
            if self.do_lower_case:
                token = token.lower()
                token = self._run_strip_accents(token)
            split_tokens.extend(self._run_split_on_punc(token))

        output_tokens = whitespace_tokenize(" ".join(split_tokens))
        return output_tokens
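The helpers (_clean_text, _tokenize_chinese_chars, _run_strip_accents, _run_split_on_punc) are not shown in this example. For the accent-stripping step specifically, BERT-style basic tokenizers typically NFD-normalize and drop combining marks; a minimal sketch under that assumption (the function name is made up):

# Sketch of the lowercase + strip-accents step, assuming _run_strip_accents
# follows the usual BERT recipe: NFD-normalize, drop combining marks (Mn).
import unicodedata

def strip_accents_sketch(token):
    token = unicodedata.normalize("NFD", token)
    return "".join(ch for ch in token if unicodedata.category(ch) != "Mn")

print(strip_accents_sketch("héllo".lower()))  # hello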
Example #7
    def tokenize(self, text):
        """
        :param text:
        :return:
        """
        text = convert_to_unicode(text)
        output_tokens = []
        for token in whitespace_tokenize(text):
            chars = list(token)
            if len(chars) > self.max_input_chars_per_word:
                output_tokens.append(self.unk_token)
                continue
            is_bad = False
            start = 0
            sub_tokens = []
            while start < len(chars):
                end = len(chars)
                cur_substr = None
                while start < end:
                    substr = "".join(chars[start:end])
                    if start == 0:
                        substr = u'\u2581' + substr
                    if substr in self.vocabulary.vocab_dict:
                        cur_substr = substr
                        break
                    end -= 1
                if cur_substr is None:
                    is_bad = True
                    break
                sub_tokens.append(cur_substr)
                start = end

            if is_bad:
                output_tokens.append(self.unk_token)
            else:
                output_tokens.extend(sub_tokens)
        return output_tokens
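The only difference from Example #2 is the prefix convention: the first piece of each word gets u'\u2581' ("▁") prepended instead of continuation pieces getting "##". A self-contained sketch of that variant (function name and toy vocabulary are illustration-only assumptions):

# Illustration-only: same greedy longest-match-first loop as Example #2, but
# word-initial pieces are prefixed with "▁", SentencePiece-style.
def sp_style_wordpiece_sketch(token, vocab, unk_token="[UNK]"):
    chars, sub_tokens, start = list(token), [], 0
    while start < len(chars):
        end, cur_substr = len(chars), None
        while start < end:
            substr = "".join(chars[start:end])
            if start == 0:
                substr = "\u2581" + substr       # mark the word-initial piece
            if substr in vocab:
                cur_substr = substr
                break
            end -= 1
        if cur_substr is None:                   # no piece matched: unknown token
            return [unk_token]
        sub_tokens.append(cur_substr)
        start = end
    return sub_tokens

toy_vocab = {"\u2581un", "aff", "able"}          # toy vocabulary, an assumption
print(sp_style_wordpiece_sketch("unaffable", toy_vocab))
# ['▁un', 'aff', 'able']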