# Exemplo n.º 1
# 0
def test_find_with_rules():
    """Regression test for ``HarvestText.find_entity_with_rule``.

    Captures everything printed by the demo scenarios into
    ``<testname>_current`` and asserts it matches the stored
    ``<testname>_expected`` golden file.
    """
    # Local import keeps the fix self-contained; contextlib is stdlib.
    from contextlib import redirect_stdout
    from harvesttext.match_patterns import UpperFirst, AllEnglish, Contains, StartsWith, EndsWith
    # some more patterns is provided
    fname = get_current_function_name()
    with open(fname + "_expected") as f:
        expected = f.read()
    text0 = "我喜欢Python,因为requests库很适合爬虫"
    ht0 = HarvestText()

    # Redirect prints into the "_current" file. Unlike assigning
    # ``sys.stdout`` directly (as the original did), this restores the real
    # stdout even if an assertion inside the block raises, and closes the file.
    with open(fname + "_current", "w") as out, redirect_stdout(out):
        found_entities = ht0.find_entity_with_rule(text0, rulesets=[AllEnglish()], type0="英文名")
        print(found_entities)
        print(ht0.posseg(text0))
        print(ht0.mention2entity("Python"))

        # Satisfying one of the rules
        ht0.clear()
        found_entities = ht0.find_entity_with_rule(text0, rulesets=[AllEnglish(), Contains("爬")], type0="技术")
        print(found_entities)
        print(ht0.posseg(text0))

        # Satisfying a couple of rules [using tuple]
        ht0.clear()
        found_entities = ht0.find_entity_with_rule(text0, rulesets=[(AllEnglish(), UpperFirst())], type0="专有英文词")
        print(found_entities)
        print(ht0.posseg(text0))

    with open(fname + "_current") as f:
        assert f.read() == expected
# Exemplo n.º 2
# 0
def find_with_rules():
    """Demo: extract entities from one sentence via rule sets.

    Shows three rule combinations: a single rule, an OR of several rules
    (list elements), and an AND of rules (a tuple inside the list).
    """
    from harvesttext.match_patterns import UpperFirst, AllEnglish, Contains, StartsWith, EndsWith
    # some more patterns is provided
    sample = "我喜欢Python,因为requests库很适合爬虫"
    ht = HarvestText()

    # Single rule: every all-English token becomes an "英文名" entity.
    entities = ht.find_entity_with_rule(sample, rulesets=[AllEnglish()], type0="英文名")
    print(entities)
    print(ht.posseg(sample))
    print(ht.mention2entity("Python"))

    # Satisfying one of the rules
    ht.clear()
    entities = ht.find_entity_with_rule(
        sample,
        rulesets=[AllEnglish(), Contains("爬")],
        type0="技术",
    )
    print(entities)
    print(ht.posseg(sample))

    # Satisfying a couple of rules [using tuple]
    ht.clear()
    entities = ht.find_entity_with_rule(
        sample,
        rulesets=[(AllEnglish(), UpperFirst())],
        type0="专有英文词",
    )
    print(entities)
    print(ht.posseg(sample))
# Exemplo n.º 3
# 0
def using_typed_words():
    """Demo: load the Tsinghua (THUOCL) typed-word dictionary and Baidu
    stopwords, then POS-tag a sample sentence with both applied."""
    from harvesttext.resources import get_qh_typed_words, get_baidu_stopwords
    ht = HarvestText()
    typed_words = get_qh_typed_words()
    stopwords = get_baidu_stopwords()
    # Register the domain dictionary so its words carry their typed tags.
    ht.add_typed_words(typed_words)
    print("加载清华领域词典,并使用停用词")
    print("全部类型", typed_words.keys())
    sentence = "THUOCL是自然语言处理的一套中文词库,词表来自主流网站的社会标签、搜索热词、输入法词库等。"
    print(sentence)
    print(ht.posseg(sentence, stopwords=stopwords))
    print("一些词语被赋予特殊类型IT,而“是”等词语被筛出。")
# Exemplo n.º 4
# 0
def test_using_typed_words():
    """Regression test for typed-word tagging with the THUOCL dictionary.

    Captures the demo's printed output into ``<testname>_current`` and
    asserts it matches the stored ``<testname>_expected`` golden file.
    """
    # Local import keeps the fix self-contained; contextlib is stdlib.
    from contextlib import redirect_stdout
    from harvesttext.resources import get_qh_typed_words, get_baidu_stopwords
    fname = get_current_function_name()
    with open(fname + "_expected") as f:
        expected = f.read()
    ht0 = HarvestText()
    typed_words, stopwords = get_qh_typed_words(), get_baidu_stopwords()
    ht0.add_typed_words(typed_words)

    # Redirect prints into the "_current" file. Unlike assigning
    # ``sys.stdout`` directly (as the original did), this restores the real
    # stdout even on failure, and closes the file deterministically.
    with open(fname + "_current", "w") as out, redirect_stdout(out):
        print("加载清华领域词典,并使用停用词")
        print("全部类型", typed_words.keys())
        sentence = "THUOCL是自然语言处理的一套中文词库,词表来自主流网站的社会标签、搜索热词、输入法词库等。"
        print(sentence)
        print(ht0.posseg(sentence, stopwords=stopwords))
        print("一些词语被赋予特殊类型IT,而“是”等词语被筛出。")

    with open(fname + "_current") as f:
        assert f.read() == expected
# Exemplo n.º 5
# 0
def test_english():
    """Smoke-run HarvestText's English pipeline: sentence cutting,
    segmentation, POS tagging, sentiment-dictionary building, sentiment
    scoring, and paragraph cutting. Prints results; no assertions."""
    # ♪ "Until the Day" by JJ Lin
    test_text = """
    In the middle of the night. 
    Lonely souls travel in time.
    Familiar hearts start to entwine.
    We imagine what we'll find, in another life.  
    """.lower()
    analyzer = HarvestText(language="en")
    sents = analyzer.cut_sentences(test_text)
    print("\n".join(sents))
    last_sentence = sents[-1]
    print(analyzer.seg(last_sentence))
    print(analyzer.posseg(sents[0], stopwords={"in"}))
    # Building the dictionary also primes the analyzer's internal sentiment state.
    sent_dict = analyzer.build_sent_dict(
        sents,
        pos_seeds=["familiar"],
        neg_seeds=["lonely"],
        min_times=1,
        stopwords={'in', 'to'},
    )
    print("Sentiment analysis")
    for line in sents:
        score = analyzer.analyse_sent(line)
        print(line, "%.3f" % score)
    print("Segmentation")
    print("\n".join(analyzer.cut_paragraphs(test_text, num_paras=2)))