Пример #1
0
    def eventExtraction(self, request, file_id, lEventCategoryId):
        """
        Extract events from the documents of one file by template matching.

        The method first registers, for every trigger word of the current
        repository, the trigger word itself plus the names belonging to the
        rule's subject (and, for triple rules, object) category in
        ``self.hanlp_tool``. It then walks every sentence of every document,
        matches it against the collected templates, writes each match to Neo4j
        and stores the de-duplicated matches of a document under the
        ``event_extract_result`` field of that document in MongoDB.

        :param request: request object; ``request.session`` must provide
            ``repo_id`` and ``user_id``
        :param file_id: (str) id of the file whose documents are processed
        :param lEventCategoryId: (list) event category ids; trigger words whose
            rule belongs to a category outside this list are skipped
        :return: True once all documents were processed; None (bare return)
            when no document of the file carries a "内容" entry
        """
        # Rule type 1 is a subject-trigger-object triple, rule type 2 is
        # subject-trigger only (no object category).
        repoId = request.session['repo_id']
        createId = request.session['user_id']

        # Documents are looked up by file, user and repository; results are
        # written back to these same documents at the end.
        tmp_info = {'file_id': file_id, 'user_id': createId, 'repo_id': repoId}
        news_col = Mongodb(db='knowledge', collection='text').get_collection()
        # NOTE(review): cnt is incremented below but never read in this method.
        cnt = 1
        ret_entity = news_col.find(tmp_info)
        ret_entity_map = list()
        # Keep only documents whose value contains a "内容" entry.
        for item in ret_entity:
            if "内容" in item["value"]:
                ret_entity_map.append(item)

        if len(ret_entity_map) == 0:
            return

        print("--------------------事件抽取")
        # Phase 1: for every trigger word of this repository, register the
        # trigger word and the subject/object names with the tokenizer and
        # build one label template per trigger word.
        retTriggerWordList = TTriggerWord.objects.filter(repo_id=repoId)
        eventLabelList = []
        # TODO (from original author): this should start the lookup from the
        # event categories rather than from the trigger words.
        for i in retTriggerWordList:
            # Template for this trigger word, filled in the order:
            # [event category name, subject label, trigger label, object label]
            # (the object label is only present for rule type 1).
            tmpLableList = []
            ruleId = 1
            retTriggerWordDict = model_to_dict(i)
            # NOTE(review): triggerId is not read afterwards in this method.
            triggerId = retTriggerWordDict['id']
            eventId = retTriggerWordDict['event_rule_id']
            # The trigger word is masked with the label derived from the name
            # of the category its rule belongs to.
            retEventRule = TEventRule.objects.get(id=eventId)
            retCategoryName = TCategory.objects.get(
                id=retEventRule.category_id).category_name
            # TODO (from original author): the trigger word label should
            # become the event label; to be changed later.
            triggerWord = retTriggerWordDict['trigger_word']
            triggerWordId = BaseController.get_category_name(
                request, retCategoryName)

            # The rule is fetched a second time, now restricted to the current
            # repository (the lookup above filters by id only).
            eventRule = TEventRule.objects.get(id=eventId, repo_id=repoId)
            eventRuleDict = model_to_dict(eventRule)
            eventCategoryId = eventRuleDict['category_id']
            # Skip rules whose event category was not requested. Nothing is
            # appended to eventLabelList for skipped trigger words.
            if (eventCategoryId not in lEventCategoryId):
                continue
            eventCategory = TCategory.objects.get(id=eventCategoryId,
                                                  repo_id=repoId,
                                                  create_id=createId)
            eventCategoryDict = model_to_dict(eventCategory)
            eventCategoryName = eventCategoryDict['category_name']
            # Slot 0: raw event category name (converted to a label later).
            tmpLableList.append(eventCategoryName)

            # Slot 1: label of the subject category.
            subjectCategoryId = eventRuleDict['event_subject_id']
            subjectCategory = TCategory.objects.get(id=subjectCategoryId,
                                                    repo_id=repoId,
                                                    create_id=createId)
            subjectCategoryDict = model_to_dict(subjectCategory)
            subjectCategoryName = subjectCategoryDict['category_name']
            subjectId = BaseController.get_category_name(
                request, subjectCategoryName)
            tmpLableList.append(subjectId)
            retListId, retListVal = some_data_deal_func(
            ).inputCategoryIdReturnName(subjectCategoryId, repoId, createId)
            # Register every value of retListVal with the tokenizer as a
            # {'word': ..., 'mask': ...} entry masked with the subject label.
            constructWordList = []
            # NOTE(review): tmpSet is assigned here and below but never read.
            tmpSet = self.hanlp_tool.added_word_list
            for word in retListVal:
                # None entries are skipped.
                if (word == None):
                    continue
                tmpDict = {}
                tmpDict['word'] = word
                tmpDict['mask'] = subjectId
                constructWordList.append(tmpDict)

            # add_word_list receives a list shaped like
            # [{'word': ..., 'mask': ...}, ...].
            self.hanlp_tool.add_word_list(constructWordList)
            objectCategoryId = eventRuleDict['event_object_id']
            # An object category id of -1 marks a rule without object, which
            # switches this rule to type 2.
            negativeOne = -1
            if (objectCategoryId == negativeOne):
                ruleId = 2

            # Slot 2: register the trigger word itself, masked with the
            # stringified trigger label.
            constructWordList = []
            tmpDict = {}
            tmpDict['word'] = triggerWord
            tmpDict['mask'] = str(triggerWordId)
            tmpSet = self.hanlp_tool.added_word_list
            constructWordList.append(tmpDict)
            self.hanlp_tool.add_word_list(constructWordList)
            tmpLableList.append(str(triggerWordId))
            print(ruleId)
            if (ruleId == 1):
                # Slot 3 (type 1 only): label of the object category, plus
                # registration of the object names with the tokenizer.
                objectCategoryId = eventRuleDict['event_object_id']
                objectCategory = TCategory.objects.get(id=objectCategoryId,
                                                       repo_id=repoId,
                                                       create_id=createId)
                objectCategoryDict = model_to_dict(objectCategory)
                objectCategoryName = objectCategoryDict['category_name']
                objectId = BaseController.get_category_name(
                    request, objectCategoryName)
                retListId, retListVal = some_data_deal_func(
                ).inputCategoryIdReturnName(objectCategoryId, repoId, createId)
                tmpLableList.append(objectId)
                constructWordList = []
                tmpSet = self.hanlp_tool.added_word_list
                # TODO (from original author): this code changed and needs to
                # be revised.
                for word in retListVal:
                    if (word == None):
                        continue
                    tmpDict = {}
                    tmpDict['word'] = word
                    # Unlike the subject branch, the mask is stringified here.
                    tmpDict['mask'] = str(objectId)
                    constructWordList.append(tmpDict)
                self.hanlp_tool.add_word_list(constructWordList)

            eventLabelList.append(tmpLableList)

        # eventLabelList now holds one entry per accepted trigger word:
        # event category, event subject, event trigger, event object.
        # NOTE(review): category id 1 is hard-coded; presumably it is the
        # category holding the generic "name" attribute - confirm.
        attribute = TAttribute.objects.get(category_id=1)
        attributeDict = model_to_dict(attribute)
        attributeName = attributeDict['attribute_name']
        cnt = 1
        # Phase 2: run the extraction over every selected document.
        for i in ret_entity_map:
            # _id is used to write the result back to the same document.
            _id = i['_id']
            value = i['value']
            # NOTE(review): only "内容" was checked when selecting documents;
            # if value is a plain dict, a document without "时间" raises
            # KeyError here - confirm the schema.
            basetime = str(value['时间'])
            content = value['内容']
            text = HanlpUnit().get_text_from_html(content)
            sentenceList = self.hanlp_tool.split_paragraph(text)
            # Collected events of this document.
            event_extract_result = []
            # NOTE(review): count is incremented per event but never read.
            count = 0
            # Running offset added to the per-sentence time/location indexes;
            # it grows by the length of each sentence's cut() result.
            countIndex = 0
            # Event names already stored for this document (de-duplication).
            tmpEventSet = set()
            for sent in sentenceList:
                sent = sent.strip()
                # Tokenize the sentence and match it against the templates.
                sentenceDealResult = self.hanlp_tool.cut(sent)
                event = self.eventExtractionByTemplateMatching(
                    sent, eventLabelList)
                # TODO (from original author): dateTime still needs tuning,
                # basetime can cause problems.

                # These two defaults are overwritten by dealTime right below.
                dateTime = basetime
                timeIndex = -1
                timeIndex, timeWord, dateTime = Time_deal().dealTime(
                    sent, basetime)
                # -1 is kept as-is; any other index is shifted by the offset
                # of the current sentence.
                if (timeIndex != -1):
                    timeIndex = timeIndex + countIndex

                # Of all places found in the sentence keep the longest one
                # (the first one wins on equal length).
                locationList = Time_deal().dealArea(sent)
                location = ''
                locationindex = -1
                for val in locationList:
                    if (len(val['place']) > len(location)):
                        location = val['place']
                        locationindex = val['index'] + countIndex
                countIndex += len(sentenceDealResult)

                # Time, location and their indexes are returned together with
                # every event of this sentence.
                for eve in event:
                    # eve is read as [template index, subject, trigger(,
                    # object)]; three elements mean a type 2 (no object) match.
                    ruleId = 1
                    if (len(eve) == 3):
                        ruleId = 2
                    eveId = eve[0]
                    subjectLabel = eventLabelList[eveId][1]

                    # Properties of the event node created in Neo4j.
                    attribute = {}
                    attribute['发生时间'] = dateTime
                    attribute['地点'] = location
                    eveString = ''

                    # The event name is the concatenation of all matched
                    # parts after the template index.
                    for j in range(1, len(eve), 1):
                        eveString = eveString + str(eve[j])
                    attribute['名字'] = eveString
                    # The event label is derived from the raw category name
                    # stored in slot 0 of the template.
                    eventLabel = BaseController.get_category_name(
                        request, eventLabelList[eveId][0])
                    subjectLabel = eventLabelList[eveId][1]

                    # Neo4j writes happen for every match, including event
                    # names already seen in this document.
                    Neo4j().create_node_mjy_edition(eventLabel, attribute)
                    subjectNameVal = eve[1]
                    # NOTE(review): values are wrapped in single quotes by
                    # plain concatenation; a name containing a quote is not
                    # escaped here - check how quesIdByLabelAttribute builds
                    # its query.
                    neo4jSubjectId = Neo4j().quesIdByLabelAttribute(
                        subjectLabel, attributeName,
                        '\'' + subjectNameVal + '\'')
                    neo4jEventId = Neo4j().quesIdByLabelAttribute(
                        eventLabel, '名字', '\'' + eveString + '\'')
                    # Link subject -> event.
                    Neo4j().createRelationship(subjectLabel, eventLabel,
                                               "主谓关系", {'id': neo4jSubjectId},
                                               {'id': neo4jEventId})
                    if (ruleId == 1):
                        # Link event -> object for triple matches.
                        objectNameVal = eve[3]
                        objectLabel = eventLabelList[eveId][3]
                        neo4jObjectId = Neo4j().quesIdByLabelAttribute(
                            objectLabel, attributeName,
                            '\'' + objectNameVal + '\'')
                        Neo4j().createRelationship(eventLabel, objectLabel,
                                                   "动宾关系",
                                                   {'id': neo4jEventId},
                                                   {'id': neo4jObjectId})
                    # Record stored in MongoDB for this event.
                    tmpEventDict = {}
                    tmpEventDict['actual_event_time'] = dateTime
                    tmpEventDict['time'] = timeWord
                    tmpEventDict['timeIndex'] = timeIndex
                    tmpEventDict['location'] = location
                    tmpEventDict['locationIndex'] = locationindex
                    tmpEventDict['eventSubject'] = eve[1]
                    tmpEventDict['eventSubjectLabel'] = subjectLabel
                    tmpEventDict['triggerLabel'] = eventLabel
                    tmpEventDict['triggerWord'] = eve[2]
                    tmpEventDict['eventName'] = eveString
                    if (ruleId == 1):
                        tmpEventDict['eventObject'] = eve[3]
                        objectLabel = eventLabelList[eveId][3]
                        tmpEventDict['eventObjectLabel'] = objectLabel
                    # Only the first occurrence of an event name is stored
                    # per document.
                    if (eveString not in tmpEventSet):
                        tmpEventSet.add(eveString)
                        event_extract_result.append(tmpEventDict)
                    print(tmpEventDict)
                    count += 1
            # Write the events of this document back to MongoDB, replacing
            # any previous 'event_extract_result' value.
            news_col.update_one(
                {'_id': _id},
                {"$set": {
                    'event_extract_result': event_extract_result
                }})
            cnt += 1
        return True
Пример #2
0
    def extract_relationship_from_unstructured_data(
            self, request, file_id, relationship_attribute_list=None):
        """
        Extract relationships from the unstructured documents of one file.

        For every relationship attribute the method builds a template
        ``[attribute name, source label, relationship label, target label]``
        and registers the attribute aliases and the names of the source and
        target categories in ``self.hanlp_tool``. Every sentence of every
        document is then matched against these templates. Each match is
        recorded, and when both end points resolve to exactly one Neo4j node
        the relationship is created there. The matches of a document are
        merged with an existing ``relationship_extract_result`` (if any) and
        written back to the document in MongoDB.

        :param request: request object; ``request.session`` must provide
            ``user_id`` and ``repo_id``
        :param file_id: id of the file whose MongoDB documents are analysed
        :param relationship_attribute_list: ids of the relationship
            attributes handled by this algorithm; None means nothing to do
        :return: None
        """
        print("------------------------非结构关系抽取")
        user_id = request.session["user_id"]
        repo_id = request.session["repo_id"]
        collection = Mongodb(db='knowledge',
                             collection='text').get_collection()
        # Only documents whose value carries a "内容" entry can be analysed.
        documents = [
            doc for doc in collection.find({
                'file_id': file_id,
                'user_id': user_id,
                'repo_id': repo_id
            }) if "内容" in doc["value"]
        ]

        if not documents or relationship_attribute_list is None:
            print("无可抽取内容")
            return

        relationship_list = []
        # Categories whose names were already handed to the tokenizer.
        added_category_id = set()
        for attribute_id in relationship_attribute_list:
            cur_attribute = TAttribute.objects.get(id=attribute_id)
            category_from = TCategory.objects.get(id=cur_attribute.category_id)
            # Look the data type up by the attribute's data_type_id. The
            # previous code passed the attribute instance itself as the
            # primary-key value, which is not a valid id for TDataType.
            data_type = TDataType.objects.get(id=cur_attribute.data_type_id)
            category_to = TCategory.objects.get(id=data_type.category_id)

            category_from_name = BaseController.get_category_name(
                request, category_from.category_name)
            category_to_name = BaseController.get_category_name(
                request, category_to.category_name)
            # Label used both in the template and as mask of the aliases.
            relationship_label = BaseController.get_category_name(
                request, cur_attribute.attribute_name)

            relationship_list.append([
                cur_attribute.attribute_name,
                category_from_name,
                relationship_label,
                category_to_name,
            ])

            # Every alias of the attribute acts as a relationship trigger.
            self.hanlp_tool.add_word_list([{
                "word": alias.attribute_alias,
                "mask": relationship_label
            } for alias in TAttrbuteAlias.objects.filter(
                attribute_id=cur_attribute.id)])

            # Register the names of the source and then the target category,
            # each category at most once across all attributes.
            for category, category_name in ((category_from,
                                             category_from_name),
                                            (category_to, category_to_name)):
                if category.id in added_category_id:
                    continue
                _, entity_names = some_data_deal_func(
                ).inputCategoryIdReturnName(categoryId=category.id,
                                            repoId=repo_id,
                                            createId=user_id)
                # Missing names (None) are skipped, as eventExtraction does.
                self.hanlp_tool.add_word_list([{
                    "word": name,
                    "mask": category_name
                } for name in entity_names if name is not None])
                added_category_id.add(category.id)

        neo4j = Neo4j()
        for article_no, doc in enumerate(documents, start=1):
            text = HanlpUnit().get_text_from_html(doc['value']['内容'])

            extract_relationship = []
            for sent in self.hanlp_tool.split_paragraph(text):
                relationships = self.eventExtractionByTemplateMatching(
                    sent.strip(), relationship_list)
                for item in relationships:
                    # item is read as [template index, source name,
                    # relationship name, target name].
                    cur_relationship = relationship_list[item[0]]
                    from_label = cur_relationship[1]
                    to_label = cur_relationship[3]

                    extract_relationship.append({
                        "object_from_category": from_label,
                        "object_to_category": to_label,
                        "object_from_name": item[1],
                        "object_relationship_name": item[2],
                        "object_to_name": item[3]
                    })
                    object1 = neo4j.match(
                        object_from={
                            "label_name": from_label,
                            "content": {
                                "名字": item[1]
                            }
                        })
                    object2 = neo4j.match(
                        object_from={
                            "label_name": to_label,
                            "content": {
                                "名字": item[3]
                            }
                        })
                    # Only link when both ends resolve to exactly one node.
                    if (object1 is not None and len(object1) == 1
                            and object2 is not None and len(object2) == 1):
                        neo4j.createRelationship(labelOne=from_label,
                                                 labelTwo=to_label,
                                                 relationShipName=item[2],
                                                 propertyOne={"名字": item[1]},
                                                 propertyTwo={"名字": item[3]})
            # Keep results stored by earlier runs on the same document.
            if "relationship_extract_result" in doc:
                extract_relationship = self.merge_list(
                    extract_relationship, doc["relationship_extract_result"])
            print(
                str(article_no) + "个文章" + ",抽取数量:" +
                str(len(extract_relationship)))
            collection.update_one({"_id": ObjectId(doc['_id'])}, {
                "$set": {
                    "relationship_extract_result": extract_relationship
                }
            })