from entity_verb.nlp import NLP
import json

# Read the full entity list produced by the earlier extraction step.
with open('entity_verb_result\\' + "all_entity.json", 'r', encoding='utf-8') as f:
    all_entity = json.loads(f.read())['all_entity']

nlp = NLP()

# Group the entities by the part-of-speech tag that LTP assigns to each word.
postag_dict = dict()
for word in all_entity:
    postag = nlp.get_postag(word)
    if postag not in postag_dict:
        postag_dict[postag] = [word]
    else:
        postag_dict[postag].append(word)
print(postag_dict)

# Write one JSON object per POS tag; the key encodes the tag and its entity count.
for tag in postag_dict:
    postag_dict_new = {tag + '_' + str(len(postag_dict[tag])): list(postag_dict[tag])}
    with open("entity_verb_result\\entity_classification_LTP.json", 'a', encoding='utf-8') as f_out:
        f_out.write(json.dumps(postag_dict_new, ensure_ascii=False))
        f_out.write("\n")
# print(all_entity)
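
# A minimal sketch (not part of the original script) of reading back the
# JSON-lines file written above, where each line holds one
# {"<postag>_<count>": [entities]} object. The helper name
# load_postag_classification is illustrative.
def load_postag_classification(path="entity_verb_result\\entity_classification_LTP.json"):
    merged = dict()
    with open(path, 'r', encoding='utf-8') as f_in:
        for line in f_in:
            line = line.strip()
            if line:
                merged.update(json.loads(line))  # merge one tag's entry into the result
    return merged
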
import os
import thulac
from pyltp import Segmentor, Postagger
# Assumption: entity_verb_new is the project-specific extractor class; the
# import path below is illustrative and may differ in the actual repo.
from entity_verb import entity_verb_new

if __name__ == "__main__":
    # Read the input files.
    entity_verb_new = entity_verb_new()
    """
    Load the LTP segmenter and part-of-speech tagger.
    """
    default_model_dir = r'D:\python-file\knowledge_extraction-master-tyz\ltp_data_v3.4.0'  # directory containing the LTP model files
    segmentor = Segmentor()
    user_dict = "source\\user.txt"
    segmentor_flag = segmentor.load_with_lexicon(
        os.path.join(default_model_dir, 'cws.model'), user_dict)
    postagger = Postagger()
    postag_flag = postagger.load(os.path.join(default_model_dir, 'pos.model'))
    nlp = NLP()
    thu1 = thulac.thulac()  # default mode
    path = r"D:\python-file\北京市旅游知识图谱\verb-entity\bj_travel"
    file_list = os.listdir(path)
    with open('entity_verb_result\\' + "all_entity.json", 'r', encoding='utf-8') as f:
        all_entity = json.loads(f.read())['all_entity']
    print(all_entity)
    for file_name in file_list:
        print(file_name)
        f = open(os.path.join(path, file_name), 'r', encoding='utf-8')
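        # A minimal sketch (assuming the standard pyltp 0.2.x API) of how the
        # segmentor and postagger loaded above are typically combined on raw
        # text; the sentence and variable names are illustrative:
        #
        #     words = segmentor.segment(sentence)   # tokenize one sentence
        #     postags = postagger.postag(words)     # one LTP tag per token
        #     for w, t in zip(words, postags):
        #         print(w, t)
        #
        # Both models should be released when processing is finished:
        #     segmentor.release()
        #     postagger.release()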