def from_params(cls, params: Params) -> 'WordTokenizer':
    """Build a ``WordTokenizer`` from a ``Params`` configuration object.

    Pops the sub-configurations for the splitter, filter, and stemmer,
    plus optional start/end token lists, then verifies nothing unknown
    remains in ``params`` before constructing the tokenizer.
    """
    splitter = WordSplitter.from_params(params.pop('word_splitter', {}))
    token_filter = WordFilter.from_params(params.pop('word_filter', {}))
    stemmer = WordStemmer.from_params(params.pop('word_stemmer', {}))
    start = params.pop('start_tokens', None)
    end = params.pop('end_tokens', None)
    # Fail loudly on unconsumed (likely misspelled) configuration keys.
    params.assert_empty(cls.__name__)
    return cls(word_splitter=splitter,
               word_filter=token_filter,
               word_stemmer=stemmer,
               start_tokens=start,
               end_tokens=end)
def from_params(cls, params: Params) -> 'WordTokenizer':
    """Construct a ``WordTokenizer`` from configuration ``params``.

    All recognized keys are popped from ``params``; any leftover key is
    reported as an error by ``assert_empty``.
    """
    constructor_args = {
        'word_splitter': WordSplitter.from_params(params.pop('word_splitter', {})),
        'word_filter': WordFilter.from_params(params.pop('word_filter', {})),
        'word_stemmer': WordStemmer.from_params(params.pop('word_stemmer', {})),
        'start_tokens': params.pop('start_tokens', None),
        'end_tokens': params.pop('end_tokens', None),
    }
    # Everything should have been consumed by now.
    params.assert_empty(cls.__name__)
    return cls(**constructor_args)
def from_params(cls, params: Params) -> 'BPETokenizer':
    """Create a ``BPETokenizer`` from a ``Params`` configuration.

    Reads the BPE merge table and vocabulary settings, an optional
    pre-splitting ``WordSplitter`` configuration, lowercasing behavior,
    and optional start/end token lists.
    """
    merges = params.pop('merges', None)
    bpe_vocab = params.pop('bpe_vocab', None)
    do_word_split = params.pop('word_split', False)
    splitter = WordSplitter.from_params(params.pop('word_splitter', {}))
    lowercase = params.pop('lowercase_tokens', False)
    start = params.pop('start_tokens', None)
    end = params.pop('end_tokens', None)
    # Surface any unrecognized configuration keys before construction.
    params.assert_empty(cls.__name__)
    return cls(merges=merges,
               bpe_vocab=bpe_vocab,
               word_split=do_word_split,
               word_splitter=splitter,
               lowercase_tokens=lowercase,
               start_tokens=start,
               end_tokens=end)
def from_params(cls, params: Params) -> 'WordTokenizer':
    """
    Parameters
    ----------
    word_splitter : ``str``, default=``"simple"``
        The string name of the ``WordSplitter`` of choice (see the options
        at the bottom of ``word_splitter.py``).
    word_filter : ``str``, default=``"pass_through"``
        The name of the ``WordFilter`` to use (see the options at the
        bottom of ``word_filter.py``).
    word_stemmer : ``str``, default=``"pass_through"``
        The name of the ``WordStemmer`` to use (see the options at the
        bottom of ``word_stemmer.py``).
    """
    # Construct each component from its popped sub-configuration, in the
    # same order the keys appear in the docstring above.
    splitter = WordSplitter.from_params(params.pop('word_splitter', {}))
    token_filter = WordFilter.from_params(params.pop('word_filter', {}))
    stemmer = WordStemmer.from_params(params.pop('word_stemmer', {}))
    params.assert_empty(cls.__name__)
    return cls(word_splitter=splitter,
               word_filter=token_filter,
               word_stemmer=stemmer)
def test_no_constructor(self):
    """A ``WordSplitter`` with no constructor args builds from a bare type key."""
    WordSplitter.from_params(Params({"type": "just_spaces"}))
def test_no_constructor(self):
    """``from_params`` should succeed when the config has only a ``type`` key."""
    config = Params({"type": "just_spaces"})
    # Must not raise — "just_spaces" takes no constructor arguments.
    WordSplitter.from_params(config)
def tokenizer(x: str, splitter: 'WordSplitter' = None) -> List[str]:
    """Split ``x`` into a list of token strings using ``splitter``.

    Parameters
    ----------
    x : str
        The text to tokenize.
    splitter : WordSplitter, optional
        Performs the actual splitting; each returned token must expose a
        ``.text`` attribute. Despite the ``None`` default, a splitter is
        required at call time.

    Returns
    -------
    List[str]
        The ``.text`` of each token produced by ``splitter``.

    Raises
    ------
    ValueError
        If ``splitter`` is None. (Previously this surfaced as an opaque
        ``AttributeError: 'NoneType' object has no attribute 'split_words'``.)
    """
    if splitter is None:
        raise ValueError("tokenizer() requires a WordSplitter instance; got None")
    return [token.text for token in splitter.split_words(x)]