Exemplo n.º 1
0
def basic_example_kytea_2x():
    """Print a short Kytea tokenization demo for a fixed Japanese sentence.

    The sample sentence is tokenized through ``KyteaWrapper`` with
    normalization enabled and ``return_list=False``; the result is checked
    to be a ``TokenizedSenetence`` and one line is printed per token.
    """
    # The sample text is a `unicode` literal, as needed under Python 2.x.
    sample_text = u'テヘランは、西アジア、イランの首都でありかつテヘラン州の州都。人口12,223,598人。都市圏人口は13,413,348人に達する。'

    sentence_result = KyteaWrapper().tokenize(
        sentence=sample_text, normalize=True, return_list=False)
    assert isinstance(sentence_result, TokenizedSenetence)

    line_template = u'word_surafce:{}, pos:{}, yomi:{}, yomi_score:{}'
    print('-' * 30)
    print(u'Kytea Demo')
    for token in sentence_result.tokenized_objects:
        assert isinstance(token, TokenizedResult)
        # Kytea reports no word stem (word_stem is always an empty string);
        # what it provides instead is an inferred yomi (pronunciation).
        surface = token.word_surface
        pos_tags = token.tuple_pos
        yomi = token.misc_info['yomi']
        yomi_score = token.misc_info['yomi_score']
        print(line_template.format(surface, pos_tags, yomi, yomi_score))
Exemplo n.º 2
0
class KyteaTokenizer:
    """Adapter that exposes Kytea tokenization results as plain dicts."""

    def __init__(self):
        # One wrapper instance is created up front and reused by every call.
        self.tokenizer = KyteaWrapper()

    def tokenize(self, text):
        """Tokenize ``text`` and describe every token as a dictionary.

        Returns a list with one dict per token, carrying the keys
        ``analyzed_line``, ``word_surface``, ``word_stem``, ``pos`` (the
        token's ``tuple_pos`` converted to a list) and ``misc_info``.
        """
        analysis = self.tokenizer.tokenize(text)
        records = []
        for token in analysis.tokenized_objects:
            records.append({
                'analyzed_line': token.analyzed_line,
                'word_surface': token.word_surface,
                'word_stem': token.word_stem,
                'pos': list(token.tuple_pos),
                'misc_info': token.misc_info,
            })
        return records
def basic_example_kytea_2x():
    """Run the Kytea demo on a sentence mixing several scripts.

    The sample text is tokenized through ``KyteaWrapper`` with
    normalization enabled and ``return_list=False``; the result is checked
    to be a ``TokenizedSenetence`` and one line is printed per token.
    """
    # Under Python 2.x the input has to be a `unicode` object.
    demo_sentence = u'テヘラン(ペルシア語: تهران  ; Tehrān Tehran.ogg 発音[ヘルプ/ファイル]/teɦˈrɔːn/、英語:Tehran)は、西アジア、イランの首都でありかつテヘラン州の州都。人口12,223,598人。都市圏人口は13,413,348人に達する。'

    wrapper = KyteaWrapper()
    tokenize_options = dict(sentence=demo_sentence,
                            normalize=True,
                            return_list=False)
    parsed = wrapper.tokenize(**tokenize_options)
    assert isinstance(parsed, TokenizedSenetence)

    print('-'*30)
    print(u'Kytea Demo')
    report_format = u'word_surafce:{}, pos:{}, yomi:{}, yomi_score:{}'
    for tok in parsed.tokenized_objects:
        assert isinstance(tok, TokenizedResult)
        # Kytea gives no word stem, so word_stem is always an empty string;
        # it supplies an inferred yomi (pronunciation) instead.
        fields = (
            tok.word_surface,
            tok.tuple_pos,
            tok.misc_info['yomi'],
            tok.misc_info['yomi_score'],
        )
        print(report_format.format(*fields))
Exemplo n.º 4
0
 def __init__(self):
     # Create a KyteaWrapper and keep it on the instance as `tokenizer`.
     self.tokenizer = KyteaWrapper()