Example #1
    #overrides
    def batch_tokenize(self, texts):
        return [self.tokenize(text) for text in texts]

    #overrides
    def tokenize(self, text):
        if self._lowercase_characters:
            text = text.lower()
        if self._byte_encoding is not None:
            # We add 1 here so that we can still use 0 for masking, no matter what bytes we get out
            # of this.
            tokens = [
                Token(text_id=c + 1) for c in text.encode(self._byte_encoding)
            ]
        else:
            tokens = [Token(t) for t in list(text)]
        for start_token in self._start_tokens:
            if isinstance(start_token, int):
                token = Token(text_id=start_token, idx=0)
            else:
                token = Token(text=start_token, idx=0)
            tokens.insert(0, token)
        for end_token in self._end_tokens:
            if isinstance(end_token, int):
                token = Token(text_id=end_token, idx=0)
            else:
                token = Token(text=end_token, idx=0)
            tokens.append(token)
        return tokens


CharacterTokenizer = Tokenizer.register(u"character")(CharacterTokenizer)
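
The last line registers the class under the name "character" in the ``Tokenizer`` registry; it is the decorator-free equivalent of applying ``Tokenizer.register(u"character")`` as a class decorator. Below is a minimal usage sketch. The constructor keywords are assumptions inferred from the attributes that ``tokenize`` reads (``_lowercase_characters``, ``_byte_encoding``, ``_start_tokens``, ``_end_tokens``) and may not match the actual signature.

# Usage sketch; the keyword arguments are assumptions (see note above).
tokenizer = CharacterTokenizer(lowercase_characters=True,
                               start_tokens=[u"<s>"],
                               end_tokens=[u"</s>"])
print([t.text for t in tokenizer.tokenize(u"Hi!")])
# expected: ['<s>', 'h', 'i', '!', '</s>']

# With byte_encoding set, each byte value is shifted by +1 so that id 0 stays free for masking.
byte_tokenizer = CharacterTokenizer(byte_encoding=u"utf-8")
print([t.text_id for t in byte_tokenizer.tokenize(u"hi")])
# expected: [105, 106]  (ord('h') + 1, ord('i') + 1)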
Example #2
    #overrides
    def tokenize(self, text):
        u"""
        Does whatever processing is required to convert a string of text into a sequence of tokens.

        At a minimum, this uses a ``WordSplitter`` to split the text into words.  It may also do
        stemming or stopword removal, depending on the parameters given to the constructor.
        """
        words = self._word_splitter.split_words(text)
        return self._filter_and_stem(words)

    #overrides
    def batch_tokenize(self, texts):
        batched_words = self._word_splitter.batch_split_words(texts)
        return [self._filter_and_stem(words) for words in batched_words]

    def _filter_and_stem(self, words):
        filtered_words = self._word_filter.filter_words(words)
        stemmed_words = [
            self._word_stemmer.stem_word(word) for word in filtered_words
        ]
        for start_token in self._start_tokens:
            stemmed_words.insert(0, Token(start_token, 0))
        for end_token in self._end_tokens:
            stemmed_words.append(Token(end_token, -1))
        return stemmed_words


WordTokenizer = Tokenizer.register(u"word")(WordTokenizer)
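
As above, the final line registers the class under the name "word". The sketch below shows the intended call pattern: ``tokenize`` handles a single string, while ``batch_tokenize`` splits every text in one call (so a splitter that supports batching can exploit it) and then applies the same filter-and-stem pipeline to each result. It assumes the constructor supplies default ``WordSplitter``/``WordFilter``/``WordStemmer`` components and accepts ``start_tokens``/``end_tokens``; those details are inferred from the attributes used above, not shown in the snippet.

# Usage sketch; constructor defaults and keyword names are assumptions (see note above).
tokenizer = WordTokenizer(start_tokens=[u"@start@"], end_tokens=[u"@end@"])

# Single string: split -> filter -> stem -> add boundary tokens.
tokens = tokenizer.tokenize(u"The quick brown fox.")

# Batch of strings: one batched split, then the same per-text pipeline.
batched = tokenizer.batch_tokenize([u"First sentence.", u"Second sentence."])
# Every tokenized text is wrapped in the configured boundary tokens.
assert batched[0][0].text == u"@start@"
assert batched[0][-1].text == u"@end@"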