Пример #1
0
class JapaneseTokenizer(BaseEstimator, TransformerMixin):
    """Split Japanese text into tokens using MeCab or janome.

    Parameters
    ----------
    parser_type : str, default='MeCab'
        Tokenizer backend to use: ``'MeCab'`` or ``'janome'``.

    Attributes
    ----------
    tokenizer : object
        The backend instance: ``MeCab.Tagger('-Owakati')`` or
        ``janome.tokenizer.Tokenizer()``.

    Raises
    ------
    ValueError
        If ``parser_type`` is neither ``'MeCab'`` nor ``'janome'``.
    """

    def __init__(self, parser_type='MeCab'):
        self.parser_type = parser_type

        # The backend is imported here, not at module level, so only the
        # selected library has to be installed.
        if self.parser_type == 'MeCab':
            import MeCab
            self.tokenizer = MeCab.Tagger('-Owakati')
        elif self.parser_type == 'janome':
            from janome.tokenizer import Tokenizer
            self.tokenizer = Tokenizer()
        else:
            raise ValueError("parser_type should be 'MeCab' or 'janome'")

    def fit(self, X, y=None):
        """Do nothing and return ``self``; there is nothing to learn."""
        return self

    def transform(self, X):
        """Tokenize one document or a collection of documents.

        Parameters
        ----------
        X : str or iterable of str
            A single text, or an iterable of texts.

        Returns
        -------
        list
            A list of tokens when ``X`` is a ``str``; otherwise a list
            holding one token list per document in ``X``.

        Raises
        ------
        ValueError
            If ``parser_type`` is not a supported backend at call time.
        """
        if isinstance(X, str):
            return self._tokenize(X)
        return [self._tokenize(document) for document in X]

    def _tokenize(self, text):
        """Return the tokens of a single string as a list."""
        if self.parser_type == 'MeCab':
            # Whitespace-agnostic split: does not depend on the wakati
            # output ending in exactly " \n", so the last token is never
            # dropped by mistake.
            return self.tokenizer.parse(text).split()
        if self.parser_type == 'janome':
            # list() gives the same return type as the MeCab branch even
            # when tokenize() hands back a one-shot generator.
            return list(self.tokenizer.tokenize(text, wakati=True))
        # Reached only if parser_type was changed after construction;
        # fail loudly instead of returning None.
        raise ValueError("parser_type should be 'MeCab' or 'janome'")