예제 #1
0
def establish_qh_dict():
    """Build a word -> entity-type lookup from the typed-word lexicon.

    Every word listed under the selected categories of the mapping
    returned by ``get_qh_typed_words()`` is assigned the same label,
    "其他专名". A completion message is printed once the lookup is built.

    Returns:
        dict: each word from the selected categories mapped to "其他专名".

    Raises:
        KeyError: if one of the selected categories is absent from the
            lexicon mapping.
    """
    selected_categories = (
        'IT', '动物', '医药', '历史人名', '地名', '成语', '法律', '财经', '食物',
    )
    lexicon = get_qh_typed_words()
    # Words appearing in several categories collapse into a single key;
    # the label is identical for all of them, so nothing is lost.
    word_to_label = {
        term: "其他专名"
        for category in selected_categories
        for term in lexicon[category]
    }
    print("清华词典构建完毕")
    return word_to_label
예제 #2
0
def using_typed_words():
    """Demonstrate typed-word dictionaries combined with stopword filtering.

    Loads the typed-word lexicon and the Baidu stopword list from
    ``harvesttext.resources``, registers the typed words on a fresh
    ``HarvestText`` instance, and prints the result of ``posseg`` on a
    sample sentence with the stopwords applied.
    """
    from harvesttext.resources import get_qh_typed_words, get_baidu_stopwords

    harvester = HarvestText()
    lexicon = get_qh_typed_words()
    stop_list = get_baidu_stopwords()
    harvester.add_typed_words(lexicon)

    print("加载清华领域词典,并使用停用词")
    print("全部类型", lexicon.keys())

    sample = "THUOCL是自然语言处理的一套中文词库,词表来自主流网站的社会标签、搜索热词、输入法词库等。"
    print(sample)

    tagged = harvester.posseg(sample, stopwords=stop_list)
    print(tagged)
    print("一些词语被赋予特殊类型IT,而“是”等词语被筛出。")
예제 #3
0
def test_using_typed_words():
    """Golden-output test for the typed-words / stopwords demo.

    Everything printed while running the demo is captured into the file
    ``<function name>_current`` and compared with the reference file
    ``<function name>_expected``.

    Unlike a bare ``sys.stdout = open(...)``, the previous ``sys.stdout``
    is always restored and every file handle is closed, even when the
    demo body raises, so a failure here cannot break the output of
    whatever runs next.

    Raises:
        AssertionError: if the captured output differs from the reference.
        OSError: if the reference file cannot be opened.
    """
    # Resolve the name directly in this function's own body.
    # NOTE(review): the helper presumably inspects the caller's frame, so
    # it must not be moved into a nested function or comprehension.
    base_name = get_current_function_name()
    current_path = base_name + "_current"

    with open(base_name + "_expected") as expected_file:
        expected = expected_file.read()

    previous_stdout = sys.stdout
    with open(current_path, "w") as captured:
        sys.stdout = captured
        try:
            # The import stays inside the redirected region, as in the
            # original, so anything it prints is captured as before.
            from harvesttext.resources import get_qh_typed_words, get_baidu_stopwords
            ht0 = HarvestText()
            typed_words, stopwords = get_qh_typed_words(), get_baidu_stopwords()
            ht0.add_typed_words(typed_words)
            print("加载清华领域词典,并使用停用词")
            print("全部类型", typed_words.keys())
            sentence = "THUOCL是自然语言处理的一套中文词库,词表来自主流网站的社会标签、搜索热词、输入法词库等。"
            print(sentence)
            print(ht0.posseg(sentence, stopwords=stopwords))
            print("一些词语被赋予特殊类型IT,而“是”等词语被筛出。")
        finally:
            # Restore stdout before the capture file is closed so later
            # prints never hit a closed file object.
            sys.stdout = previous_stdout

    with open(current_path) as current_file:
        assert current_file.read() == expected