Example #1
    @classmethod
    def from_params(cls, params: Params) -> 'WordTokenizer':
        # Build each sub-component from its own sub-dictionary of parameters.
        word_splitter = WordSplitter.from_params(params.pop('word_splitter', {}))
        word_filter = WordFilter.from_params(params.pop('word_filter', {}))
        word_stemmer = WordStemmer.from_params(params.pop('word_stemmer', {}))
        # Optional tokens to prepend/append to every tokenized sequence.
        start_tokens = params.pop('start_tokens', None)
        end_tokens = params.pop('end_tokens', None)
        # Raise if any unrecognized keys remain in the parameter dictionary.
        params.assert_empty(cls.__name__)
        return cls(word_splitter=word_splitter,
                   word_filter=word_filter,
                   word_stemmer=word_stemmer,
                   start_tokens=start_tokens,
                   end_tokens=end_tokens)
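The params.pop / assert_empty pattern is what catches configuration typos. A minimal sketch of the failure mode, assuming AllenNLP's Params and ConfigurationError classes (the 'typo_key' entry is purely illustrative):

    from allennlp.common import Params
    from allennlp.common.checks import ConfigurationError
    from allennlp.data.tokenizers import WordTokenizer

    params = Params({"word_splitter": {"type": "just_spaces"}, "typo_key": 1})
    try:
        WordTokenizer.from_params(params)  # pops the known keys, then asserts empty
    except ConfigurationError as err:
        print(err)  # reports the leftover 'typo_key'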
Example #2
    @classmethod
    def from_params(cls, params: Params) -> 'BPETokenizer':
        # BPE-specific resources: the merge table and subword vocabulary.
        merges = params.pop('merges', None)
        bpe_vocab = params.pop('bpe_vocab', None)
        # Whether to pre-split on words before applying BPE, and with what splitter.
        word_split = params.pop('word_split', False)
        word_splitter = WordSplitter.from_params(params.pop('word_splitter', {}))
        lowercase_tokens = params.pop('lowercase_tokens', False)
        start_tokens = params.pop('start_tokens', None)
        end_tokens = params.pop('end_tokens', None)
        params.assert_empty(cls.__name__)
        return cls(merges=merges,
                   bpe_vocab=bpe_vocab,
                   word_split=word_split,
                   word_splitter=word_splitter,
                   lowercase_tokens=lowercase_tokens,
                   start_tokens=start_tokens,
                   end_tokens=end_tokens)
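A configuration for this tokenizer would mirror the keys popped above. A sketch only: BPETokenizer is not part of stock AllenNLP, so the values here are purely illustrative.

    from allennlp.common import Params

    params = Params({
        "merges": "data/bpe.merges",            # illustrative path
        "word_split": True,
        "word_splitter": {"type": "just_spaces"},
        "lowercase_tokens": True,
    })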
Example #3
    @classmethod
    def from_params(cls, params: Params) -> 'WordTokenizer':
        """
        Parameters
        ----------
        word_splitter : ``str``, default=``"simple"``
            The string name of the ``WordSplitter`` of choice (see the options at the bottom of
            ``word_splitter.py``).
        word_filter : ``str``, default=``"pass_through"``
            The name of the ``WordFilter`` to use (see the options at the bottom of
            ``word_filter.py``).
        word_stemmer : ``str``, default=``"pass_through"``
            The name of the ``WordStemmer`` to use (see the options at the bottom of
            ``word_stemmer.py``).
        """
        word_splitter = WordSplitter.from_params(params.pop('word_splitter', {}))
        word_filter = WordFilter.from_params(params.pop('word_filter', {}))
        word_stemmer = WordStemmer.from_params(params.pop('word_stemmer', {}))
        params.assert_empty(cls.__name__)
        return cls(word_splitter=word_splitter,
                   word_filter=word_filter,
                   word_stemmer=word_stemmer)
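For context, this is how such a tokenizer gets built from configuration. A minimal usage sketch, assuming an older AllenNLP where WordTokenizer lives in allennlp.data.tokenizers and "just_spaces" is a registered splitter name:

    from allennlp.common import Params
    from allennlp.data.tokenizers import WordTokenizer

    tokenizer = WordTokenizer.from_params(Params({"word_splitter": {"type": "just_spaces"}}))
    print([t.text for t in tokenizer.tokenize("Deep learning for NLP")])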
Example #4
    def test_no_constructor(self):
        # "just_spaces" names a registered WordSplitter that defines no custom
        # constructor; from_params should still be able to instantiate it.
        params = Params({"type": "just_spaces"})
        WordSplitter.from_params(params)
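Spelled out, the test relies on the Registrable machinery resolving the "type" key to a registered subclass. A sketch of the expected behavior, assuming old-AllenNLP module paths (the isinstance check is illustrative):

    from allennlp.common import Params
    from allennlp.data.tokenizers.word_splitter import WordSplitter, JustSpacesWordSplitter

    splitter = WordSplitter.from_params(Params({"type": "just_spaces"}))
    assert isinstance(splitter, JustSpacesWordSplitter)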
Example #5
    from typing import List

    def tokenizer(x: str, splitter: WordSplitter) -> List[str]:
        return [token.text for token in splitter.split_words(x)]
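Note that the splitter is now a required argument: the original default of splitter=None would crash on splitter.split_words. A quick usage sketch, assuming JustSpacesWordSplitter from old AllenNLP:

    from allennlp.data.tokenizers.word_splitter import JustSpacesWordSplitter

    print(tokenizer("the quick brown fox", splitter=JustSpacesWordSplitter()))
    # ['the', 'quick', 'brown', 'fox']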