예제 #1
0
def test_add_pre_analyzed_word():
    """add_pre_analyzed_word must reject morpheme lists without full
    positions, and accept fully-positioned ones, which then take effect
    in subsequent tokenization.
    """
    kiwi = Kiwi()
    kiwi.tokenize("팅겼어")  # warm-up call; result intentionally unused

    # Morphemes given without (start, end) positions are invalid input.
    # Use try/except/else instead of a bare `except:` clause, which would
    # otherwise swallow unrelated exceptions.
    try:
        kiwi.add_pre_analyzed_word("팅겼어", [("팅기", "VV"), "었/EP", "어/EF"])
    except ValueError:
        pass  # expected
    else:
        raise AssertionError("expected to raise `ValueError`")

    kiwi.add_user_word("팅기", "VV", orig_word="튕기")
    kiwi.add_pre_analyzed_word("팅겼어", [("팅기", "VV", 0, 2), ("었", "EP", 1, 2),
                                       ("어", "EF", 2, 3)])

    res = kiwi.tokenize("팅겼어...")

    # Each expected token: (form, tag, start, end).
    expected = [
        ("팅기", "VV", 0, 2),
        ("었", "EP", 1, 2),
        ("어", "EF", 2, 3),
        ("...", "SF", 3, 6),
    ]
    for token, (form, tag, start, end) in zip(res, expected):
        assert token.form == form
        assert token.tag == tag
        assert token.start == start
        assert token.end == end
예제 #2
0
def test_tokenize():
    """Smoke-test tokenization of noisy Korean text, both as one flat
    token list and split into per-sentence token lists.
    """
    kiwi = Kiwi()
    text = "다녀온 후기\n\n<강남 토끼정에 다녀왔습니다.> 음식도 맛있었어요 다만 역시 토끼정 본점 답죠?ㅎㅅㅎ 그 맛이 크으.. 아주 맛있었음...! ^^"
    print(kiwi.tokenize(text, normalize_coda=True))

    # With split_sents=True the result is one token list per sentence.
    for sent_tokens in kiwi.tokenize(text, normalize_coda=True, split_sents=True):
        print(sent_tokens)
예제 #3
0
def test_tokenize_with_stopwords():
    """With a Stopwords filter, only content morphemes should remain."""
    kiwi = Kiwi()
    filtered = kiwi.tokenize("[^^ 우리는 강아지를 좋아한다.]", stopwords=Stopwords())

    first, second = filtered[0], filtered[1]
    assert first.form == '강아지'
    assert second.form == '좋아하'
예제 #4
0
class KiwiModel(Model):
    """Adapter exposing the kiwipiepy analyzer through the Model interface."""

    def __init__(self):
        import kiwipiepy
        from kiwipiepy import Kiwi
        print("Initialize kiwipiepy ({})".format(kiwipiepy.__version__),
              file=sys.stderr)
        self._mdl = Kiwi()

    def _convert(self, morph):
        # Collapse fine-grained tags: verb-family tags ('V...') keep two
        # characters, everything else keeps only the first character.
        keep = 2 if morph.tag.startswith('V') else 1
        return morph.form, morph.tag[:keep]

    def _tokenize(self, text):
        # Delegate straight to the underlying Kiwi instance.
        return self._mdl.tokenize(text)
예제 #5
0
def baseline_splitter(text):
    """Naively split *text* into sentences at any whitespace character
    that immediately follows '.', '!' or '?'.
    """
    import re
    boundary = re.compile(r'(?<=[.!?])\s')
    return boundary.split(text)


if __name__ == '__main__':
    import argparse

    # CLI: one or more dataset paths, plus optional output paths for
    # results and per-example errors.
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument('datasets', nargs='+')
    arg_parser.add_argument('--write_result')
    arg_parser.add_argument('--write_err')
    args = arg_parser.parse_args()

    print('======== Baseline Splitter ========')
    for ds in args.datasets:
        run_evaluate(ds, baseline_splitter)

    print('======== Kiwi.split_into_sents ========')
    from kiwipiepy import Kiwi
    kiwi = Kiwi()
    kiwi.tokenize("foo-bar")  # warm-up so model loading isn't timed

    def _kiwi_splitter(text):
        # Return only the sentence texts produced by Kiwi's splitter.
        return [s.text
                for s in kiwi.split_into_sents(text, normalize_coda=True)]

    for ds in args.datasets:
        run_evaluate(ds, _kiwi_splitter, args.write_result, args.write_err)