Example #1
 def test_stems_and_filters_correctly(self):
     word_processor = WordTokenizer.from_params(
         Params({
             'word_stemmer': {
                 'type': 'porter'
             },
             'word_filter': {
                 'type': 'stopwords'
             }
         }))
     sentence = "this (sentence) has 'crazy' \"punctuation\"."
     expected_tokens = ["sentenc", "ha", "crazi", "punctuat"]
     # ``tokenize`` returns ``Token`` objects, so compare their text fields.
     tokens = [t.text for t in word_processor.tokenize(sentence)]
     assert tokens == expected_tokens
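
The configuration above is equivalent to constructing the tokenizer directly. The sketch below assumes AllenNLP's ``PorterStemmer`` and ``StopwordFilter`` classes, which are the implementations registered under the ``'porter'`` and ``'stopwords'`` names used in the ``Params`` dict.

 from allennlp.data.tokenizers import WordTokenizer
 from allennlp.data.tokenizers.word_filter import StopwordFilter
 from allennlp.data.tokenizers.word_stemmer import PorterStemmer

 # Assumed equivalent of the Params-based construction in Example #1.
 tokenizer = WordTokenizer(word_stemmer=PorterStemmer(), word_filter=StopwordFilter())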
Example #2
 @classmethod
 def from_params(cls, params: Params) -> 'NgramWordsIndexer':
     """
     Parameters
     ----------
     namespace : ``str``, optional (default=``shared_words_vocab``)
         We will use this namespace in the :class:`Vocabulary` to map the words in each token
         to indices.
     word_tokenizer : ``WordTokenizer``, optional (default=``WordTokenizer(word_splitter=JustSpacesWordSplitter())``)
         Defines how we split an ngram into words. The default is to split on whitespace.
     """
     namespace = params.pop('namespace', 'shared_words_vocab')
     word_tokenizer_params = params.pop('word_tokenizer', {})
     word_tokenizer = WordTokenizer.from_params(word_tokenizer_params)
     # Raise an error if any unrecognised keys remain in ``params``.
     params.assert_empty(cls.__name__)
     return cls(namespace=namespace, word_tokenizer=word_tokenizer)
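
A hypothetical call site for this method might look like the following; ``'just_spaces'`` is the name AllenNLP registers for ``JustSpacesWordSplitter``, and ``NgramWordsIndexer`` itself is assumed to come from the surrounding project.

 from allennlp.common import Params

 # Build the indexer from a config dict, mirroring the docstring's defaults.
 indexer = NgramWordsIndexer.from_params(Params({
     'namespace': 'shared_words_vocab',
     'word_tokenizer': {'word_splitter': {'type': 'just_spaces'}},
 }))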