class JapaneseTokenizer(BaseEstimator, TransformerMixin):
    """Stateless transformer that splits Japanese text into surface tokens.

    Tokenization is delegated to one of two backends, chosen with
    ``parser_type``:

    * ``'MeCab'``  -- a ``MeCab.Tagger`` created with the ``-Owakati`` option.
    * ``'janome'`` -- a ``janome.tokenizer.Tokenizer`` used with
      ``wakati=True``.

    The backend object is created in ``__init__`` and stored on
    ``self.tokenizer``.
    """

    def __init__(self, parser_type='MeCab'):
        """Create the tokenizer backend.

        Args:
            parser_type: Either ``'MeCab'`` or ``'janome'``.

        Raises:
            ValueError: If ``parser_type`` is not one of the two supported
                values.
        """
        self.parser_type = parser_type
        # Backends are imported locally so that only the selected one has to
        # be installed.
        if self.parser_type == 'MeCab':
            import MeCab
            self.tokenizer = MeCab.Tagger('-Owakati')
        elif self.parser_type == 'janome':
            from janome.tokenizer import Tokenizer
            self.tokenizer = Tokenizer()
        else:
            raise ValueError("parser_type should be 'MeCab' or 'janome'")

    def fit(self, X, y=None):
        """Do nothing and return ``self``; there is nothing to learn."""
        return self

    def transform(self, X):
        """Tokenize one document or a collection of documents.

        Args:
            X: A single string, or an iterable of strings.

        Returns:
            A list of token strings when ``X`` is a ``str``; otherwise a list
            containing one token list per element of ``X``.

        Raises:
            ValueError: If ``self.parser_type`` is no longer a supported
                value (for example after being reassigned).
        """
        # A lone string keeps the original behaviour (flat token list); any
        # other iterable is treated as a batch of documents.
        if isinstance(X, str):
            return self._tokenize(X)
        return [self._tokenize(text) for text in X]

    def _tokenize(self, text):
        """Return the tokens of a single string as a list."""
        if self.parser_type == 'MeCab':
            wakati = self.tokenizer.parse(text)
            # The wakati output is expected to be space-separated tokens with
            # a trailing space and newline. Strip the newline and skip empty
            # pieces rather than blindly dropping the last element, so a real
            # token is never lost if the trailing separator is absent.
            return [tok for tok in wakati.rstrip('\n').split(' ') if tok]
        if self.parser_type == 'janome':
            # Newer janome releases return a generator here; materialize it so
            # both backends hand back a reusable list.
            return list(self.tokenizer.tokenize(text, wakati=True))
        # Fail loudly instead of implicitly returning None.
        raise ValueError("parser_type should be 'MeCab' or 'janome'")