def eventExtraction(self, request, file_id, lEventCategoryId):
    """
    Extract events from the articles of one file by template matching.

    The method works in two phases:

    1. For every trigger word of the current repository whose event rule
       belongs to one of ``lEventCategoryId``, build a label template
       ``[event category name, subject label, trigger label(, object label)]``
       and register the subject names, the trigger word and (when the rule
       has an object) the object names in ``self.hanlp_tool``'s dictionary.
       A rule whose ``event_object_id`` is -1 has no object, so its template
       has three entries (subject + trigger) instead of four.
    2. For every stored article, split the text into sentences, match the
       templates, write each event and its subject/object relationships to
       Neo4j, and store the de-duplicated event list on the MongoDB record
       under ``event_extract_result``.

    :param request: request whose session supplies ``repo_id`` and ``user_id``
    :param file_id: id of the file whose MongoDB records are analysed
    :param lEventCategoryId: container of event category ids to extract;
        trigger words whose rule has another category are ignored
    :return: True once all records are processed; None when the file has no
        record containing a "内容" (content) field
    """
    repoId = request.session['repo_id']
    createId = request.session['user_id']

    news_col = Mongodb(db='knowledge', collection='text').get_collection()
    query = {'file_id': file_id, 'user_id': createId, 'repo_id': repoId}
    # Only records that actually carry article content can be analysed.
    ret_entity_map = [
        item for item in news_col.find(query) if "内容" in item["value"]
    ]
    if not ret_entity_map:
        return
    print("--------------------事件抽取")

    def register_names(names, mask):
        # Register every non-null entity name in the segmenter under `mask`.
        self.hanlp_tool.add_word_list(
            [{'word': name, 'mask': mask} for name in names
             if name is not None])

    # ---- Phase 1: build one label template per relevant trigger word ----
    eventLabelList = []
    for trigger in TTriggerWord.objects.filter(repo_id=repoId):
        triggerDict = model_to_dict(trigger)
        # Fetch the rule once (scoped to the repository) and filter on its
        # category before doing any further lookups for this trigger word.
        eventRule = TEventRule.objects.get(
            id=triggerDict['event_rule_id'], repo_id=repoId)
        eventRuleDict = model_to_dict(eventRule)
        eventCategoryId = eventRuleDict['category_id']
        if eventCategoryId not in lEventCategoryId:
            continue

        eventCategory = TCategory.objects.get(
            id=eventCategoryId, repo_id=repoId, create_id=createId)
        eventCategoryName = model_to_dict(eventCategory)['category_name']
        # The trigger word is masked with the label of its event category.
        triggerLabel = str(
            BaseController.get_category_name(request, eventCategoryName))
        labels = [eventCategoryName]

        # Subject: label goes into the template, names into the dictionary.
        subjectCategoryId = eventRuleDict['event_subject_id']
        subjectCategory = TCategory.objects.get(
            id=subjectCategoryId, repo_id=repoId, create_id=createId)
        subjectLabel = BaseController.get_category_name(
            request, model_to_dict(subjectCategory)['category_name'])
        labels.append(subjectLabel)
        _, subjectNames = some_data_deal_func().inputCategoryIdReturnName(
            subjectCategoryId, repoId, createId)
        register_names(subjectNames, subjectLabel)

        # Trigger word itself.
        self.hanlp_tool.add_word_list(
            [{'word': triggerDict['trigger_word'], 'mask': triggerLabel}])
        labels.append(triggerLabel)

        # Rule 1 = subject-trigger-object triple, rule 2 = subject-trigger.
        objectCategoryId = eventRuleDict['event_object_id']
        ruleId = 2 if objectCategoryId == -1 else 1
        print(ruleId)
        if ruleId == 1:
            objectCategory = TCategory.objects.get(
                id=objectCategoryId, repo_id=repoId, create_id=createId)
            objectLabel = BaseController.get_category_name(
                request, model_to_dict(objectCategory)['category_name'])
            _, objectNames = some_data_deal_func().inputCategoryIdReturnName(
                objectCategoryId, repoId, createId)
            labels.append(objectLabel)
            register_names(objectNames, str(objectLabel))

        eventLabelList.append(labels)

    # Attribute used to look entities up in Neo4j by their name.
    # NOTE(review): category_id=1 is hard-coded; presumably the shared
    # "name" attribute - confirm against the attribute table.
    attributeName = model_to_dict(
        TAttribute.objects.get(category_id=1))['attribute_name']

    # ---- Phase 2: match the templates against every article ----
    for record in ret_entity_map:
        value = record['value']
        basetime = str(value['时间'])
        text = HanlpUnit().get_text_from_html(value['内容'])
        sentenceList = self.hanlp_tool.split_paragraph(text)

        event_extract_result = []
        # Number of tokens in the sentences already processed; added to the
        # per-sentence time/location indexes reported by Time_deal.
        countIndex = 0
        seenEventNames = set()
        for sent in sentenceList:
            sent = sent.strip()
            tokens = self.hanlp_tool.cut(sent)
            events = self.eventExtractionByTemplateMatching(
                sent, eventLabelList)

            timeIndex, timeWord, dateTime = Time_deal().dealTime(
                sent, basetime)
            if timeIndex != -1:
                timeIndex += countIndex

            # Keep the longest place name found in the sentence.
            location = ''
            locationindex = -1
            for area in Time_deal().dealArea(sent):
                if len(area['place']) > len(location):
                    location = area['place']
                    locationindex = area['index'] + countIndex
            countIndex += len(tokens)

            for eve in events:
                # eve = [template index, subject, trigger(, object)]
                hasObject = len(eve) != 3
                labels = eventLabelList[eve[0]]
                subjectLabel = labels[1]
                eveString = ''.join(str(part) for part in eve[1:])
                eventLabel = BaseController.get_category_name(
                    request, labels[0])
                attribute = {
                    '发生时间': dateTime,
                    '地点': location,
                    '名字': eveString,
                }

                # One wrapper per event instead of one per call.
                # NOTE(review): names are wrapped in single quotes by plain
                # concatenation; a name containing a quote will presumably
                # break the generated query - confirm how the Neo4j wrapper
                # escapes values.
                graph = Neo4j()
                graph.create_node_mjy_edition(eventLabel, attribute)
                neo4jSubjectId = graph.quesIdByLabelAttribute(
                    subjectLabel, attributeName, '\'' + eve[1] + '\'')
                neo4jEventId = graph.quesIdByLabelAttribute(
                    eventLabel, '名字', '\'' + eveString + '\'')
                graph.createRelationship(subjectLabel, eventLabel, "主谓关系",
                                         {'id': neo4jSubjectId},
                                         {'id': neo4jEventId})
                if hasObject:
                    objectLabel = labels[3]
                    neo4jObjectId = graph.quesIdByLabelAttribute(
                        objectLabel, attributeName, '\'' + eve[3] + '\'')
                    graph.createRelationship(eventLabel, objectLabel,
                                             "动宾关系",
                                             {'id': neo4jEventId},
                                             {'id': neo4jObjectId})

                tmpEventDict = {
                    'actual_event_time': dateTime,
                    'time': timeWord,
                    'timeIndex': timeIndex,
                    'location': location,
                    'locationIndex': locationindex,
                    'eventSubject': eve[1],
                    'eventSubjectLabel': subjectLabel,
                    'triggerLabel': eventLabel,
                    'triggerWord': eve[2],
                    'eventName': eveString,
                }
                if hasObject:
                    tmpEventDict['eventObject'] = eve[3]
                    tmpEventDict['eventObjectLabel'] = labels[3]

                # The same event name is stored only once per article.
                if eveString not in seenEventNames:
                    seenEventNames.add(eveString)
                    event_extract_result.append(tmpEventDict)
                print(tmpEventDict)

        # Persist the extraction result on the article's MongoDB record.
        news_col.update_one(
            {'_id': record['_id']},
            {"$set": {'event_extract_result': event_extract_result}})
    return True
def extract_relationship_from_unstructured_data(
        self, request, file_id, relationship_attribute_list=None):
    """
    Extract relationships from the unstructured (free-text) data of a file.

    For every attribute id in ``relationship_attribute_list`` a template
    ``[attribute name, source category label, relationship label,
    target category label]`` is built, and the attribute's aliases plus the
    entity names of both categories are registered in ``self.hanlp_tool``.
    Each article of the file is then split into sentences and matched
    against the templates. Every match is recorded, and when exactly one
    Neo4j node is found for each end of the relationship the relationship is
    also created in Neo4j. The result (merged with any previously stored
    result) is written to the MongoDB record under
    ``relationship_extract_result``.

    :param request: request whose session supplies ``user_id`` and ``repo_id``
    :param file_id: file id selecting the MongoDB records to analyse
    :param relationship_attribute_list: ids of the relationship attributes
        that use this algorithm; ``None`` means there is nothing to extract
    :return: None
    """
    print("------------------------非结构关系抽取")
    if relationship_attribute_list is None:
        # Without attributes there is nothing to match, so skip the
        # MongoDB query altogether.
        print("无可抽取内容")
        return

    repo_id = request.session["repo_id"]
    user_id = request.session["user_id"]
    collection = Mongodb(db='knowledge', collection='text').get_collection()
    query = {'file_id': file_id, 'user_id': user_id, 'repo_id': repo_id}
    # Only records that actually carry article content can be analysed.
    ret_entity_map = [
        item for item in collection.find(query) if "内容" in item["value"]
    ]
    if not ret_entity_map:
        print("无可抽取内容")
        return

    added_category_id = set()

    def register_category(category_id, mask):
        # Register the entity names of a category once, skipping null names.
        if category_id in added_category_id:
            return
        _, names = some_data_deal_func().inputCategoryIdReturnName(
            categoryId=category_id, repoId=repo_id, createId=user_id)
        self.hanlp_tool.add_word_list(
            [{"word": name, "mask": mask} for name in names
             if name is not None])
        added_category_id.add(category_id)

    relationship_list = []
    for attribute_id in relationship_attribute_list:
        cur_attribute = TAttribute.objects.get(id=attribute_id)
        category_from = TCategory.objects.get(id=cur_attribute.category_id)
        # The attribute's data type points at the target category. Look it
        # up by the attribute's data_type_id, not by the attribute instance.
        data_type = TDataType.objects.get(id=cur_attribute.data_type_id)
        category_to = TCategory.objects.get(id=data_type.category_id)

        category_from_name = BaseController.get_category_name(
            request, category_from.category_name)
        category_to_name = BaseController.get_category_name(
            request, category_to.category_name)
        relationship_mask = BaseController.get_category_name(
            request, cur_attribute.attribute_name)

        relationship_list.append([
            cur_attribute.attribute_name,
            category_from_name,
            relationship_mask,
            category_to_name,
        ])

        # Every alias of the attribute acts as a trigger for the relationship.
        self.hanlp_tool.add_word_list([
            {"word": alias.attribute_alias, "mask": relationship_mask}
            for alias in TAttrbuteAlias.objects.filter(
                attribute_id=cur_attribute.id)
        ])
        register_category(category_from.id, category_from_name)
        register_category(category_to.id, category_to_name)

    neo4j = Neo4j()
    cout = 0
    for record in ret_entity_map:
        text = HanlpUnit().get_text_from_html(record['value']['内容'])
        extract_relationship = []
        for sent in self.hanlp_tool.split_paragraph(text):
            relationships = self.eventExtractionByTemplateMatching(
                sent.strip(), relationship_list)
            for item in relationships:
                # item = [template index, source name, relationship, target]
                cur_relationship = relationship_list[item[0]]
                from_label = cur_relationship[1]
                to_label = cur_relationship[3]
                extract_relationship.append({
                    "object_from_category": from_label,
                    "object_to_category": to_label,
                    "object_from_name": item[1],
                    "object_relationship_name": item[2],
                    "object_to_name": item[3]
                })
                object1 = neo4j.match(object_from={
                    "label_name": from_label,
                    "content": {"名字": item[1]}
                })
                object2 = neo4j.match(object_from={
                    "label_name": to_label,
                    "content": {"名字": item[3]}
                })
                # Only link the nodes when both ends resolve unambiguously.
                if (object1 is not None and len(object1) == 1
                        and object2 is not None and len(object2) == 1):
                    neo4j.createRelationship(
                        labelOne=from_label,
                        labelTwo=to_label,
                        relationShipName=item[2],
                        propertyOne={"名字": item[1]},
                        propertyTwo={"名字": item[3]})

        # Keep results stored by an earlier run on the same record.
        if "relationship_extract_result" in record:
            extract_relationship = self.merge_list(
                extract_relationship, record["relationship_extract_result"])
        cout += 1
        print(
            str(cout) + "个文章" + ",抽取数量:" + str(len(extract_relationship)))
        collection.update_one({"_id": ObjectId(record['_id'])}, {
            "$set": {
                "relationship_extract_result": extract_relationship
            }
        })