Example #1
def depend_analysi(a, words_all, words, postags_a, sentences, X):
    par_model_path = os.path.join(
        LTP_DATA_DIR, 'parser.model')  # path of the dependency parser model, file name `parser.model`

    parser = Parser()  # initialize the parser
    parser.load(par_model_path)  # load the model
    arcs = parser.parse(words, postags_a)  # dependency parsing

    # rely_id = [arc.head for arc in arcs]  # extract the head (parent) id of each arc
    #print(rely_id)
    relation = [arc.relation for arc in arcs]  # extract the dependency relation of each arc
    #print(relation)
    # heads = ['Root' if id == 0 else words[id-1] for id in rely_id]  # map head ids back to their words

    mixed = [word for word in words if word in say]
    # ne = get_name_entity(sentences[a],postags_a)
    # print(ne)
    name = ''
    stack = []
    c = a + 1
    d = a - 1
    # saying = ""
    for k, v in enumerate(arcs):
        # save the most recent Noun
        if postags_a[k] in ['nh', 'ni', 'ns']:
            stack.append(words[k])
        if v.relation == 'SBV' and (words[v.head - 1] in mixed):  # locate the first subject-verb (SBV) pair whose verb is a "say" word
            name = get_name(words[k], words[v.head - 1], words,
                            relation)  #, ne)
            saying = get_saying(words, relation, [i.head for i in arcs],
                                v.head)
            print(name)
            if not saying:
                if "“" and "”" in words_all[a - 1]:
                    saying = sentences[a - 1].strip()

                if "“" and "”" in words_all[a + 1]:
                    saying += sentences[a + 1].strip()

                if not saying:
                    #与上一句对比
                    p = text_similarity(a - 1, X)
                    #与下一句对比
                    z = text_similarity(a, X)
                    if p < z:
                        saying = sentences[a - 1].strip()
                        return "在第{}句话中  {}  {}".format(
                            a, name, words[v.head - 1]) + ":{}".format(saying)
                    if p >= z:
                        # saying = sentences[a+1].strip()
                        saying = re.sub(r'[^\w]', '', sentences[a + 1].strip())

                # quotations = re.findall(r'“(.+?)”', sentences[a])  # ??? (unclear)
                # print(quotations)
                # if quotations:
                #     says = quotations[-1]
                #     print(says)

                for i in range(min(len(sentences) - c - 1, 3)):
                    k = text_similarity(c, X)
                    print(k)
                    if (k <= 0.9):
                        # print(saying)
                        saying += sentences[c + 1]
                        # sentences_all[a] = sentences_all[a+1]
                        c += 1
                    else:
                        break
                for i in range(min(d, 3)):
                    z = text_similarity_up(d, X)
                    print("up{}".format(z))
                    if (z <= 0.9):
                        # print(saying)
                        saying = sentences[d] + saying
                        # sentences_all[a] = sentences_all[a+1]
                        d -= 1
                    else:
                        break
            return "在第{}句话中  {}  {}".format(
                a, name, words[v.head - 1]) + ":{}".format(saying)
        # if a full-width colon ':' is found, everything after it is treated as the speech.
        if words[k] == ':':
            name = stack.pop()
            saying = ''.join(words[k + 1:])
            return name, saying

    parser.release()
    return False

    # for i in range(len(words)):
    #     #print(relation[i] + '(' + words[i] + ', ' + heads[i] + ')')
    #     if relation[i] == "SBV":  # find the subject under an "SBV" arc
    #         if heads[i] in say:  # if its head verb is in the say[] list
    #             print("在第{}句话中 {} {}".format(a,words[i],heads[i])+":{}".format(sentences[a+1]))
    #             article_result = "在第{}句话中 {} {}".format(a,words[i],heads[i])+":{}".format(sentences[a+1])
    #             return article_result
    # parser.release()  # release the model
    return
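A minimal, self-contained sketch of the core rule used above: find an SBV arc whose head verb is a "say"-like word and treat the dependent as the speaker. The model path and the small say list here are assumptions; the original function additionally relies on external helpers (say, get_name, get_saying, text_similarity, X) that are not shown in this snippet.

# hypothetical sketch (not part of the original file)
import os
from pyltp import Segmentor, Postagger, Parser

LTP_DATA_DIR = '/path/to/ltp_data_v3.4.0'   # assumed model directory
say = ['说', '表示', '认为', '指出']            # tiny stand-in for the original say list

segmentor = Segmentor()
segmentor.load(os.path.join(LTP_DATA_DIR, 'cws.model'))
postagger = Postagger()
postagger.load(os.path.join(LTP_DATA_DIR, 'pos.model'))
parser = Parser()
parser.load(os.path.join(LTP_DATA_DIR, 'parser.model'))

words = list(segmentor.segment('李克强总理表示,中国经济长期向好。'))
postags = list(postagger.postag(words))
arcs = parser.parse(words, postags)
for k, arc in enumerate(arcs):
    # arc.head is 1-based; arc.head - 1 indexes the governing word
    if arc.relation == 'SBV' and words[arc.head - 1] in say:
        print('speaker:', words[k], ' say-verb:', words[arc.head - 1])

parser.release()
postagger.release()
segmentor.release()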
Example #2
class ltp_api(object):
    def __init__(self, MODELDIR, exword_path=None):
        self.MODELDIR = MODELDIR
        self.output = {}  # get_result() fills this dict
        self.words = None
        self.postags = None
        self.netags = None
        self.arcs = None
        self.exword_path = exword_path  # e.g.: 'E:\LTP\ltp_data_v3.4.0\exwords.txt'
        # word segmentation
        self.segmentor = Segmentor()
        if not self.exword_path:
            # no extra user lexicon provided
            self.segmentor.load(os.path.join(self.MODELDIR, "cws.model"))
        else:
            self.segmentor.load_with_lexicon(
                os.path.join(self.MODELDIR, "cws.model"), self.exword_path)
        # part-of-speech tagging
        self.postagger = Postagger()
        self.postagger.load(os.path.join(self.MODELDIR, "pos.model"))
        # dependency parsing
        self.parser = Parser()
        self.parser.load(os.path.join(self.MODELDIR, "parser.model"))
        # named entity recognition (required by get_result)
        self.recognizer = NamedEntityRecognizer()
        self.recognizer.load(os.path.join(self.MODELDIR, "ner.model"))
        # semantic role labelling (required by get_result)
        self.labeller = SementicRoleLabeller()
        self.labeller.load(os.path.join(MODELDIR, "pisrl.model"))

    # sentence splitting
    def ltp_sentence_splitter(self, paragraph):
        sentence = SentenceSplitter.split(paragraph)  # list of sentences
        # print ('\n'.join(sentence))
        return sentence

    # word segmentation
    def ltp_segmentor(self, sentence):
        words = self.segmentor.segment(sentence)
        # self.segmentor.release()
        return words  # list of words

    # part-of-speech tagging
    def ltp_postagger(self, words):
        postags = self.postagger.postag(words)
        # self.postagger.release()
        return postags  # list of POS tags

    # dependency parsing
    def ltp_parser(self, words, postags):
        arcs = self.parser.parse(words, postags)
        # self.parser.release()
        return arcs

    # named entity recognition
    def ltp_recognizer(self, words, postags):
        netags = self.recognizer.recognize(words, postags)
        # self.recognizer.release()
        return netags

    # semantic role labelling
    def ltp_labeller(self, words, postags, arcs):
        output = []
        roles = self.labeller.label(words, postags, arcs)
        # self.labeller.release()
        for role in roles:
            output.append([(role.index, arg.name, arg.range.start,
                            arg.range.end) for arg in role.arguments])
        return output

    # gather all results
    def get_result(self, sentence):
        self.words = self.ltp_segmentor(sentence)  # segmentation
        self.postags = self.ltp_postagger(self.words)  # POS tagging
        self.arcs = self.ltp_parser(self.words, self.postags)  # dependency parsing
        self.netags = self.ltp_recognizer(self.words, self.postags)  # named entities
        # fill self.output and return every result as a dict
        self.output['role'] = self.ltp_labeller(self.words, self.postags,
                                                self.arcs)  # semantic roles
        self.output['words'] = list(self.words)
        self.output['postags'] = list(self.postags)
        self.output['arcs'] = [(arc.head, arc.relation) for arc in self.arcs]
        self.output['netags'] = list(self.netags)
        return self.output

    def release(self):
        self.segmentor.release()
        self.postagger.release()
        self.parser.release()
        self.recognizer.release()
        self.labeller.release()
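A minimal usage sketch for the class above, assuming the LTP 3.4.0 model files (cws.model, pos.model, parser.model, ner.model, pisrl.model) are unpacked under a local directory; the path and sentence below are illustrative assumptions.

# hypothetical usage sketch (not part of the original file)
ltp = ltp_api('/path/to/ltp_data_v3.4.0')   # assumed model directory
result = ltp.get_result('李克强总理今天考察上海。')
print(result['words'])    # segmented words
print(result['postags'])  # POS tags
print(result['arcs'])     # (head, relation) pairs; head is 1-based, 0 means root
print(result['netags'])   # named-entity tags
print(result['role'])     # semantic roles: (predicate index, arg name, start, end)
ltp.release()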
Example #3
class Model:
    def __init__(self):
        self.name_says = defaultdict(
            list)  # may be written by sentence_process() or by single_sentence()
        self.model = Word2Vec.load(path)
        self.word_total_count = self.model.corpus_total_words
        self.word_dict = self.model.wv.vocab
        self.dim = 256

        self.postagger = Postagger()  # initialize the POS tagger
        self.postagger.load(pos_model_path)  # load the model

        self.say_sim = [
            '诊断', '交代', '说', '说道', '指出', '报道', '报道说', '称', '警告', '所说', '告诉',
            '声称', '表示', '时说', '地说', '却说', '问道', '写道', '答道', '感叹', '谈到', '说出',
            '认为', '提到', '强调', '宣称', '表明', '明确指出', '所言', '所述', '所称', '所指', '常说',
            '断言', '名言', '告知', '询问', '知道', '得知', '质问', '问', '告诫', '坚称', '辩称',
            '否认', '还称', '指责', '透露', '坦言', '表达', '中说', '中称', '他称', '地问', '地称',
            '地用', '地指', '脱口而出', '一脸', '直说', '说好', '反问', '责怪', '放过', '慨叹', '问起',
            '喊道', '写到', '如是说', '何况', '答', '叹道', '岂能', '感慨', '叹', '赞叹', '叹息',
            '自叹', '自言', '谈及', '谈起', '谈论', '特别强调', '提及', '坦白', '相信', '看来', '觉得',
            '并不认为', '确信', '提过', '引用', '详细描述', '详述', '重申', '阐述', '阐释', '承认',
            '说明', '证实', '揭示', '自述', '直言', '深信', '断定', '获知', '知悉', '得悉', '透漏',
            '追问', '明白', '知晓', '发觉', '察觉到', '察觉', '怒斥', '斥责', '痛斥', '指摘', '回答',
            '请问', '坚信', '一再强调', '矢口否认', '反指', '坦承', '指证', '供称', '驳斥', '反驳',
            '指控', '澄清', '谴责', '批评', '抨击', '严厉批评', '诋毁', '责难', '忍不住', '大骂',
            '痛骂', '问及', '阐明'
        ]
        self.valid_sentence = []

        self.parser = Parser()
        self.parser.load(par_model_path)

        self.segmentor = Segmentor()
        self.segmentor.load(cws_model_path)

        self.recognizer = NamedEntityRecognizer()
        self.recognizer.load(ner_model_path)

    # @functools.lru_cache()
    # @fn_timer
    def get_count(self, word):
        """
        O(1)
        """
        # word_count = 0  # default value
        vector = np.zeros(1)  # default value (unused)

        if word in self.word_dict:
            wf = self.word_dict[word].count
            wv = self.model.wv[word]
        else:
            wf = 1
            wv = np.zeros(self.dim)
        return wf / self.word_total_count, wv

    # get the sentence embedding vector
    # TODO: the computation of P(w) could be optimized
    def sentence_embedding(self, sentence):
        # per the paper's algorithm: Vs = 1/|s| * Σ a/(a+p(w)) * Vw
        sentences = self.process_content(sentence).replace(' ', '')
        a = 1e-3  # smoothing constant (0.001)

        words = self.pyltp_cut(sentences)
        sum_vector = np.zeros(self.dim)
        for i, w in enumerate(words):
            wf, wv = self.get_count(w)
            sum_vector += a / (a + wf) * wv
        return sum_vector / (i + 1)

    # Euclidean-distance-based similarity
    def euclidSimilar(self, inA, inB):
        return 1.0 / (1.0 + la.norm(inA - inB))

    # Pearson correlation coefficient (rescaled to [0, 1])
    def pearsonSimilar(self, inA, inB):
        if len(inA) != len(inB):
            return 0.0
        if len(inA) < 3:
            return 1.0
        return 0.5 + 0.5 * np.corrcoef(inA, inB, rowvar=0)[0][1]

    # cosine similarity (rescaled to [0, 1])
    def cosSimilar(self, inA, inB):
        inA = np.mat(inA)
        inB = np.mat(inB)
        num = float(inA * inB.T)
        denom = la.norm(inA) * la.norm(inB)
        return 0.5 + 0.5 * (num / denom)

    # dependency parsing for one sentence
    def parsing(self, sentence):
        words = self.pyltp_cut(sentence)  # pyltp segmentation
        postags = self.postagger.postag(words)  # POS tagging
        arcs = self.parser.parse(words, postags)  # dependency parsing
        return arcs

    # named entity recognition
    # @functools.lru_cache()
    def get_name_entity(self, strs):
        sentence = ''.join(strs)
        words = self.pyltp_cut(sentence)  # pyltp segmentation
        postags = self.postagger.postag(words)  # POS tagging
        netags = self.recognizer.recognize(words, postags)  # NER
        return netags

    # input: the sentence list of one paragraph
    def valid_sentences_(self, sentences, res):
        expect = 0.76

        tmp = ""  # stores the previous piece of speech
        while sentences:
            curr = sentences.pop(0)
            if curr[0] == '“':  # the sentence may open with a quotation placed before the speaker
                print(curr)
                people = re.search('”(.+)“|”(.+)', curr)  # extract the segment containing the speaker
                if people:
                    people = [i for i in people.groups() if i][0]
                elif res:
                    res[-1][1] += '。' + curr
                    continue
                else:
                    continue

                saying = curr.replace(people, '')  # the remainder is assumed to be the speech
                if res and self.judge_pronoun(people):
                    res[-1][1] += '。' + saying
                else:
                    comb = self.single_sentence(people)
                    if comb:
                        saying += comb[1] if comb[1] else ''
                        res.append([comb[0], saying])
                continue

            # try to extract speaker and speech from the current sentence
            combi = self.single_sentence(curr)

            # no speaker: the sentence either continues the previous speech or is not speech at all
            if not combi:
                if res and tmp and self.compare_sentence(
                        tmp, curr) > expect:  # decided by sentence similarity
                    print('{} - {} : {}'.format(
                        tmp, curr, self.compare_sentence(tmp, curr)))
                    res[-1][1] += '。' + curr
                    tmp = curr
                continue

            # speaker found: extract speaker and speech.
            name, saying = combi
            if res and self.judge_pronoun(curr) and saying:
                res[-1][1] += '。' + saying
            elif saying:
                res.append([name, saying])
            tmp = saying
        return res

    @functools.lru_cache()
    def single_sentence(self, sentence, just_name=False, ws=False):
        sentence = ','.join([x for x in sentence.split(',') if x])
        cuts = list(self.pyltp_cut(sentence))  # pyltp segmentation
        # check whether any "say"-like word is present:
        mixed = [word for word in cuts if word in self.say_sim]
        if not mixed: return False

        ne = self.get_name_entity(tuple(sentence))  # named entities
        wp = self.parsing(sentence)  # dependency parsing
        wp_relation = [w.relation for w in wp]
        postags = list(self.postagger.postag(cuts))
        name = ''

        stack = []
        for k, v in enumerate(wp):
            # save the most recent Noun
            if postags[k] in ['nh', 'ni', 'ns']:
                stack.append(cuts[k])

            if v.relation == 'SBV' and (cuts[v.head - 1] in mixed):  # locate the first SBV pair whose verb is a "say" word
                name = self.get_name(cuts[k], cuts[v.head - 1], cuts,
                                     wp_relation, ne)

                if just_name == True: return name  # return only the name
                says = self.get_says(cuts, wp_relation, [i.head for i in wp],
                                     v.head)
                if not says:
                    quotations = re.findall(r'“(.+?)”', sentence)
                    if quotations: says = quotations[-1]
                return name, says
            # if a full-width colon ':' is found, everything after it is the speech.
            if cuts[k] == ':':
                name = stack.pop()
                says = ''.join(cuts[k + 1:])
                return name, says
        return False

    # given the first word of the subject, the predicate, the word list and the relation list, build the complete subject
    def get_name(self, name, predic, words, property, ne):
        index = words.index(name)
        cut_property = property[index + 1:]  # relations of the words after name
        pre = words[:index]  # words before name
        pos = words[index + 1:]  # words after name
        # prepend attributive modifiers of the subject
        while pre:
            w = pre.pop(-1)
            w_index = words.index(w)

            if property[w_index] == 'ADV': continue
            if property[w_index] in ['WP', 'ATT', 'SBV'] and (w not in [
                    ',', '。', '、', ')', '('
            ]):
                name = w + name
            else:
                pre = False

        while pos:
            w = pos.pop(0)
            p = cut_property.pop(0)
            if p in ['WP', 'LAD', 'COO', 'RAD'] and w != predic and (w not in [
                    ',', '。', '、', ')', '('
            ]):
                name = name + w  # append to the right
            else:  # stop concatenating and return
                return name
        return name

    # extract the speech that follows the predicate
    def get_says(self, sentence, property, heads, pos):
        # word = sentence.pop(0)  # the predicate
        if ':' in sentence:
            return ''.join(sentence[sentence.index(':') + 1:])
        while pos < len(sentence):
            w = sentence[pos]
            p = property[pos]
            h = heads[pos]
            # the predicate phrase has not ended yet
            if p in ['DBL', 'CMP', 'RAD']:
                pos += 1
                continue
            # attributive modifier
            if p == 'ATT' and property[h - 1] != 'SBV':
                pos = h
                continue
            # object
            if p == 'VOB':
                pos += 1
                continue
            # if p in ['ATT', 'VOB', 'DBL', 'CMP']:  # these relations mean the predicate has not ended; continue
            #    continue
            else:
                if w == ',':
                    return ''.join(sentence[pos + 1:])
                else:
                    return ''.join(sentence[pos:])

    # parse the text and return the result to the caller
    def sentence_process(self, sentence):
        # article  --> remove empty lines
        # article  --> split on full stops: for a split A.B, if B contains a "say" word, parse B on its own;
        #              otherwise compare the similarity of A and B to decide whether to drop B.
        # sentence --> find subject / predicate / object with dependency parsing and NER: locate the predicate,
        #              check whether it is "say"-like (if several match, the first is the statement), look for a
        #              named entity before it, and treat everything after it up to the end of the sentence as the object.
        # named entity --> from the NER tags: an S-NE tag is a complete entity on its own; B-NE / I-NE / E-NE tags
        #              are concatenated into one entity.

        self.name_says = defaultdict(list)
        sentence = sentence.replace('\r\n', '\n')
        sections = sentence.split('\n')  # split into paragraphs first
        sections = [s for s in sections if s.strip()]
        valids = ''

        res = []
        for sec in sections:  # per paragraph
            sentence_list = split(sec)
            sentence_list = [s.strip() for s in sentence_list if s.strip()]
            self.cut_sententce_for_name = [s for s in sentence_list if s]
            # valids = self.valid_sentences(sentence_list)
            res += self.valid_sentences_(sentence_list, [])
        if res:
            self.name_says = defaultdict()
            for name, saying in res:
                if name and saying:
                    self.name_says[name] = self.name_says.get(
                        name, '') + saying + ' | '
        return self.name_says

    # check whether the sentence has a pronoun subject, e.g. "他认为..., 他表示..."
    #@fn_timer
    def judge_pronoun(self, sentence):
        subsentence = re.search('(.+)“|”(.+)', sentence)
        if subsentence:
            sentence = subsentence.group(1)
        cuts = list(self.pyltp_cut(sentence))  # segmentation
        wp = self.parsing(sentence)  # dependency parsing
        postags = list(self.postagger.postag(cuts))
        for k, v in enumerate(wp):
            if v.relation == 'SBV' and postags[k] == 'r':  # first SBV pair whose subject is a pronoun
                return True
        return False

    # compare two sentences with the Pearson coefficient
    def compare_sentence(self, inA, inB):
        inC = self.sentence_embedding(inA)
        inD = self.sentence_embedding(inB)
        return self.pearsonSimilar(inC, inD)  # Pearson
        # print(self.euclidSimilar(inC,inD))
        # print(self.pearsonSimilar(inC,inD))
        # print(self.cosSimilar(inC,inD))
        # print('------------------------')

    # pyltp Chinese word segmentation
    def pyltp_cut(self, sentence):
        # segmentor = Segmentor()  # initialize
        # segmentor.load(cws_model_path)  # load model
        words = self.segmentor.segment(sentence)  # segment
        # segmentor.release()  # release model
        return words

    # jieba POS tagging
    def jieba_pseg(self, sentence):
        return pseg.cut(sentence)

    def document_frequency(self, word, document):
        if sum(1 for n in document if word in n) == 0:
            print(word)
            print(type(document))
            print(len(document))
            print(document[0])
        return sum(1 for n in document if word in n)

    def idf(self, word, content, document):
        """Gets the inversed document frequency"""
        return math.log10(
            len(content) / self.document_frequency(word, document))

    def tf(self, word, document):
        """
        Gets the term frequency of a @word in a @document.
        """
        words = document.split()

        return sum(1 for w in words if w == word)

    def process_content(self, content):
        content = re.sub('[+——() ? 【】“”!,:。?、~@#¥%……&*()《 》]+', '', content)
        content = ' '.join(jieba.cut(content))
        return content

    def release_all(self):
        self.segmentor.release()
        self.recognizer.release()
        self.parser.release()
        self.postagger.release()
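A rough usage sketch for the Model class, assuming the module-level names it relies on (the Word2Vec path, pos_model_path, cws_model_path, ner_model_path, par_model_path and the split() sentence splitter) are defined as in the original project; the input text is an illustrative assumption.

# hypothetical usage sketch (not part of the original file)
m = Model()
text = "新华社北京6月5日电。李克强总理表示,中国经济长期向好。他还强调,改革将继续深化。"
name_says = m.sentence_process(text)   # {speaker: 'speech | speech | ...'}
for name, saying in name_says.items():
    print(name, '->', saying)
m.release_all()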
Example #4
postagger.load(os.path.join(MODELDIR, "pos.model"))
postags = postagger.postag(words)
# list-of-string parameter is supported since 0.1.5
# postags = postagger.postag(["中国","进出口","银行","与","中国银行","加强","合作"])
print("\t".join(postags))

parser = Parser()
parser.load(os.path.join(MODELDIR, "parser.model"))
arcs = parser.parse(words, postags)

print "\t".join("%d:%s" % (arc.head, arc.relation) for arc in arcs)

recognizer = NamedEntityRecognizer()
recognizer.load(os.path.join(MODELDIR, "ner.model"))
netags = recognizer.recognize(words, postags)
print "\t".join(netags)

labeller = SementicRoleLabeller()
labeller.load(os.path.join(MODELDIR, "srl/"))
roles = labeller.label(words, postags, netags, arcs)

for role in roles:
    print(role.index, "".join(
        ["%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end) for arg in role.arguments]))

segmentor.release()
postagger.release()
parser.release()
recognizer.release()
labeller.release()
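The fragment above starts in the middle of the pipeline: words, segmentor and MODELDIR are assumed to have been defined earlier in the original script. A sketch of the missing setup, under the assumption that the LTP models live in MODELDIR, might look like:

# hypothetical setup presumed by the fragment above (path is an assumption)
import os
from pyltp import Segmentor, Postagger, Parser, NamedEntityRecognizer, SementicRoleLabeller

MODELDIR = "/path/to/ltp_data"
segmentor = Segmentor()
segmentor.load(os.path.join(MODELDIR, "cws.model"))
words = segmentor.segment("中国进出口银行与中国银行加强合作")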
Example #5
File: DSFN9.0.2.py    Project: 89935/OpenRE
class DSFN:
    """进行自然语言处理,包括分词,词性标注,命名实体识别,依存句法分析
    Attributes:
        default_user_dict_dir:str,用户自定义词典目录
        default_model_dir:str,ltp模型文件目录
    """

    entity_verb_new = entity_verb_new()
    all_entity = entity_verb_new.readAllEntity(
        "../../entity_verb//entity_verb_result\\all_entity.json")
    default_model_dir = 'D:\python-file\knowledge_extraction-master-tyz\\ltp_data_v3.4.0\\'  # directory of the LTP model files
    location_entity = [
        "中和殿", "太庙", "人文地理", "亚运村", "九龙壁", "圆明园", "古典建筑", "庑殿顶", "天井", "无量殿",
        "慈宁宫", "三希堂", "居庸关", "延寿寺", "排云殿", "东桥", "圜丘", "南天门", "垂花门", "西六宫",
        "配楼", "柳荫街", "中国四大名园", "午门", "乾东五所", "建筑管理", "世界博物馆", "西什库教堂", "晚清",
        "万泉河", "东暖阁", "储秀宫", "西华门", "院落", "地安门东大街", "御路", "知鱼桥", "清宁宫", "金水河",
        "景山前街", "司马台长城", "景山公园", "乐寿堂", "东六宫", "延陵", "宜芸馆", "芍药居", "承乾宫",
        "琉璃瓦", "湘江", "敌楼", "安定门外大街", "三音石", "崇文门", "天坛路", "台基", "东城区", "外朝",
        "武备", "全国重点文物保护单位", "房山石", "静园", "香山", "中东", "坤宁宫", "彩画", "江南园林",
        "北河沿大街", "岳阳楼", "丽景轩", "巴黎圣母院", "钟表馆", "戏楼", "白银", "红海", "中原", "明长城",
        "神乐署", "瀛洲", "码头", "百度地图", "旋子彩画", "乾西五所", "天圆地方", "琉璃厂文化街", "广岛",
        "御沟", "井亭", "古柏林", "石坊", "北京故宫", "宝云阁", "甬道", "熙和门", "乾清门", "北京城",
        "暖温带", "沥粉贴金", "安定路", "北齐长城", "减柱造", "宅园", "清华园", "天坛东门站", "西苑", "土山",
        "温带季风气候", "宫古", "东直门", "美国国务卿", "北海", "中华梦石城", "东门站", "天坛公园", "江山",
        "谐趣园", "修宅", "苏堤", "玉泉", "牌坊", "蓟镇", "高速公路", "钟粹宫", "无梁殿", "政治家", "牌楼",
        "波斯", "西内", "老龙头", "阴阳石", "三神山", "丹陛桥", "中国第一历史档案馆", "建筑艺术", "四川",
        "护城河", "文华殿", "静宜园", "乐峰", "永和宫", "金砖", "清漪园", "安定门", "宫殿", "梵华楼",
        "龙井", "水街", "东华门", "歇山式顶", "斋宫", "渤海镇", "仁和", "白浮村", "建筑风格", "买卖街",
        "藻鉴堂", "寿安宫", "奉先殿", "后海", "宋", "承德避暑山庄", "前门站", "寿安山", "八达岭", "棂星门",
        "经幢", "泰山", "后三宫", "天桥商场", "维新派", "拙政园", "北京十六景", "南湖岛", "山寨", "东海",
        "寺庙", "图书馆", "西山", "延禧宫", "九土", "十七孔桥", "鹊桥", "石鼓", "样式雷", "礼乐", "圆石",
        "动物园", "西湖", "齐长城遗址", "京畿", "正脊", "神武门", "洛神赋图", "绿地面积", "暖阁", "多宝塔",
        "磨砖对缝", "湖心亭", "崇楼", "五谷丰登", "养性殿", "关山", "砖雕", "北境", "凤凰墩", "金殿",
        "永定路", "世界遗产", "古柏", "郡王府", "慕田峪", "皇舆全览图", "古典园林", "坐北朝南", "皇极殿",
        "皇家园林", "东四十条", "京西", "黄花镇", "通惠河", "宁寿宫", "旅游局", "大角楼", "昆明湖", "后溪",
        "东堤", "汉白玉石", "皇史宬", "湖心岛", "长春宫", "玉澜堂", "紫檀", "玉泉山", "玉山", "茶楼",
        "敌台", "乾清宫", "巴县", "藕香榭", "斗拱", "苏州街", "紫禁城", "颐和轩", "皇穹宇", "南方",
        "智慧海", "八小部洲", "拱券", "门楣", "太和殿", "銮仪卫", "法门寺地宫", "清音阁", "龙王庙", "城岛",
        "皇陵", "筒瓦", "天地坛", "张古", "建筑史", "武英殿", "北长街", "天坛", "云山", "大石桥", "北平",
        "宫殿建筑", "山东", "博物馆", "昆明池", "交道口南大街", "平流村", "聊城", "三大殿", "清晏舫", "墀头",
        "养心殿", "御道", "百花园", "翊坤宫", "神道", "落地罩", "渔村", "丹陛", "歇山顶", "畅音阁",
        "漱芳斋", "黄鹤楼", "柱础", "嘉乐堂", "庆长", "档案", "保定", "上海", "佛香阁", "望柱", "德和园",
        "天桥", "北京旅游网", "祈年殿", "颐和园", "攒尖顶", "香岩宗印之阁", "分界线", "大杂院", "交泰殿",
        "太和门", "南郊", "健翔桥", "瓮山", "勤政殿", "云南", "景仁宫", "小山村", "金水桥", "保和殿",
        "寄畅园", "珍妃井", "德和园大戏楼", "正房", "第一批全国重点文物保护单位", "三合院", "万寿山", "厉家菜",
        "玉峰塔", "藻井", "恭王府花园", "文昌阁", "景山", "前门东大街", "端门", "代王府", "万寿亭", "景阳宫",
        "东四环", "景明楼", "祈谷坛", "大戏楼", "安佑宫", "石舫", "流杯亭", "行宫", "法华寺", "圜丘坛",
        "正义路", "居庸关长城", "箭扣长城", "石牌坊", "回音壁", "和玺彩画", "二龙戏珠", "北四环", "玉龙",
        "广州", "盛京", "四合院", "曲尺", "谷仓", "永定门", "宝顶", "苏式彩画", "皇宫", "寿康宫"
    ]

    def __init__(self, model_dir=default_model_dir, all_entity=all_entity):
        self.default_model_dir = model_dir
        # load the LTP models
        #
        default_model_dir = 'D:\python-file\knowledge_extraction-master-tyz\\ltp_data_v3.4.0\\'  # LTP model directory
        self.segmentor_user = Segmentor()
        user_dict = "..\\source\\user.txt"
        segmentor_flag_user = self.segmentor_user.load_with_lexicon(
            os.path.join(default_model_dir, 'cws.model'), user_dict)
        self.segmentor = Segmentor()
        segmentor_flag = self.segmentor.load(
            os.path.join(default_model_dir, 'cws.model'))
        # POS tagging model
        self.postagger = Postagger()
        postag_flag = self.postagger.load(
            os.path.join(self.default_model_dir, 'pos.model'))
        # NER model
        self.recognizer = NamedEntityRecognizer()
        ner_flag = self.recognizer.load(
            os.path.join(self.default_model_dir, 'ner.model'))
        # dependency parsing model
        self.parser = Parser()
        parser_flag = self.parser.load(
            os.path.join(self.default_model_dir, 'parser.model'))

        if segmentor_flag or postag_flag or ner_flag or parser_flag or segmentor_flag_user:  # note: this check may be wrong
            print('load model failed')

    def segment(self, sentence, segmentor, entity_postag=dict()):
        words = segmentor.segment(sentence)
        lemmas = []
        for lemma in words:
            lemmas.append(lemma)
        return lemmas

    def getPostag(self):
        return self.postagger

    def postag(self, lemmas):
        """
        Parameters
        ----------
        lemmas : List,分词后的结果
        entity_dict:Set,实体词典,处理具体的一则判决书的结构化文本时产生
        Returns
        -------
        words:WordUnit List,包括分词与词性标注的结果
        """
        words = []
        # 词性标注
        postags = self.postagger.postag(lemmas)
        for i in range(len(lemmas)):
            # 存储分词与词性标记后的词单元WordUnit,编号从1开始
            word = WordUnit(i + 1, lemmas[i], postags[i])
            words.append(word)
        # self.postagger.release() #释放
        return words

    def get_postag(self, word):
        """获得单个词的词性标注
        Args:
            word:str,单词
        Returns:
            pos_tag:str,该单词的词性标注
        """
        pos_tag = self.postagger.postag([word])
        return pos_tag[0]

    def netag(self, words):
        """
        命名实体识别,并对分词与词性标注后的结果进行命名实体识别与合并
        Parameters
            words : WordUnit list,包括分词与词性标注结果
        Returns
            words_netag:WordUnit list,包含分词,词性标注与命名实体识别的结果
        """
        lemmas = []  # 存储分词后的结果
        postags = []  # 存储词性标注结果
        for word in words:
            lemmas.append(word.lemma)
            postags.append(word.postag)
        # 命名实体识别
        netags = self.recognizer.recognize(lemmas, postags)
        words_netag = EntityCombine().combine(words, netags)
        return words_netag

    def parse(self, words):
        """
        对分词,词性标注与命名实体识别后的结果进行依存句法分析(命名实体识别可选)
        Args:
            words_netag:WordUnit list,包含分词,词性标注与命名实体识别结果
        Returns
            *:sentenceUnit 句子单元
        """
        lemmas = []  # 分词结果
        postags = []  # 词性标注结果
        for word in words:
            lemmas.append(word.lemma)
            postags.append(word.postag)
        # 依存句法分析
        arcs = self.parser.parse(lemmas, postags)
        for i in range(len(arcs)):
            words[i].head = arcs[i].head
            words[i].dependency = arcs[i].relation
        return SentenceUnit(words)

    def close(self):
        """
        关闭与释放
        """
        # pynlpir.close()
        self.postagger.release()
        self.recognizer.release()
        self.parser.release()

    def splitSentence(self, text):
        pattern = r'。|!|?|;|='
        result_list = re.split(pattern, text)
        result_list = list(filter(self.not_empty, result_list))
        #    print(result_list)
        return result_list

    def splitSentenceByComma(self, text):
        pattern = r','
        result_list = re.split(pattern, text)
        result_list = list(filter(self.not_empty, result_list))
        final_list = []
        for sentence in result_list:
            if len(sentence) <= 40:
                final_list.append(sentence)
        return final_list

    def not_empty(self, s):
        return s and "".join(s.split())

    def dsfn1_2_3_4COO(self, sentence, item1, item2):
        allTripes = []
        """
        判断两个实体是否属于DSFN1的情况,并输出三元组
        """
        # location_position_list = ['主席','总统','总理','主任','内','东门','西门','南门','北门','大门','外','国家主席','尚书'
        #                           ]
        """
        去除约束2
        """
        if (item1.dependency == "ATT"):
            AttWord = item1.head_word
            AttWordDict = dict()
            AttWordStr = ""
            while AttWord.ID < item2.ID:
                AttWordDict[AttWord.ID] = AttWord.lemma
                # AttWordStr += AttWord.lemma
                if (AttWord.dependency == "ATT"):
                    AttWord = AttWord.head_word
                else:
                    break

            if (AttWord.ID == item2.ID):
                flag = True
                while flag:
                    len1 = len(AttWordDict)
                    AttList = AttWordDict.keys()
                    for id in range(item1.ID + 1, item2.ID):
                        item = sentence.get_word_by_id(id)
                        if item.head_word != None and item.head_word.ID in AttList and (
                                item.dependency == "ATT"):
                            AttWordDict[item.ID] = item.lemma
                    if len1 == len(AttWordDict):
                        flag = False
                    else:
                        flag = True
                AttWordDict = sorted(AttWordDict.items(),
                                     key=lambda item: item[0])
                AttWordStr = ""
                for i in AttWordDict:
                    AttWordStr += i[1]
                # print("三元组:(" + item1.lemma + "," + AttWordStr + "," + item2.lemma + ")")
                # if AttWordStr in location_position_list:
                allTripes.append([item1.lemma, AttWordStr, item2.lemma])

        return allTripes

    def get_entity_num_between(self, verb1, verb2, sentence):
        """
        获得两个动词之间的实体数量
        Parameters
        ----------
        entity1 : WordUnit,动词1
        entity2 : WordUnit,动词2
        Returns:
            num:int,两动词间的实体数量
        """
        if verb1.ID > verb2.ID:
            c = verb1
            verb1 = verb2
            verb2 = c
        num = 0
        i = verb1.ID
        while i < verb2.ID - 1:
            if self.is_entity(sentence.words[i]):
                num += 1
            i += 1
        return num

    def is_entity(self, entry):
        """判断词单元是否是实体
        Args:
            entry:WordUnit,词单元
        Returns:
            *:bool,实体(True),非实体(False)
        """
        #候选实体词性列表
        entity_postags = ['nh', 'ni', 'ns', 'nz', 'j', 'n', 'v', 'm']
        # print(entry.lemma+" : "+entry.postag)
        if entry.postag in entity_postags:
            return True
        else:
            return False

    def dsfnAttCOO(self, sentence, item1, item2):
        item1Att = item1
        item2Att = item2
        while item1Att.dependency == "ATT":
            item1Att = item1Att.head_word

        allTripe = self.dsfn1_2_3_4COO(sentence, item1Att, item2)
        if allTripe == None or len(allTripe) == 0:
            while item2Att.dependency == "ATT":
                item2Att = item2Att.head_word
            allTripe = self.dsfn1_2_3_4COO(sentence, item1, item2Att)
        if allTripe == None or len(allTripe) == 0:
            allTripe = self.dsfn1_2_3_4COO(sentence, item1Att, item2Att)
        for tripe in allTripe:
            if tripe[0] == item1Att.lemma:
                tripe[0] = item1.lemma
            if tripe[2] == item2Att.lemma:
                tripe[2] = item2.lemma
        return allTripe

    def dsfn5COO(self, sentence, item1, item2):
        if item1.dependency == "COO":
            item1COO = item1.head_word
            allTripes1 = self.dsfn1_2_3_4COO(sentence, item1COO, item2)
            # print(allTripes1)
            for tripe in allTripes1:
                if tripe[0] == item1COO.lemma:
                    tripe[0] = item1.lemma
                elif tripe[2] == item1COO.lemma:
                    tripe[2] = item1.lemma
            return allTripes1
            # print("allTripes1"+str(allTripes1))
    def dsfn6COO(self, sentence, item1, item2):
        if item2.dependency == "COO":
            item2COO = item2.head_word
            allTripes2 = self.dsfn1_2_3_4COO(sentence, item1, item2COO)
            for tripe in allTripes2:
                if tripe[2] == item2COO.lemma:
                    tripe[2] = item2.lemma
                elif tripe[0] == item2COO.lemma:
                    tripe[0] = item2.lemma
            return allTripes2

    def dsfn5and6COO(self, sentence, item1, item2):
        if item1.dependency == "COO":
            item1COO = item1.head_word
            if item2.dependency == "COO":
                item2COO = item2.head_word
                allTripe = self.dsfn1_2_3_4COO(sentence, item1COO, item2COO)
                for tripe in allTripe:
                    if tripe[0] == item1COO.lemma and tripe[
                            2] == item2COO.lemma:
                        tripe[0] = item1.lemma
                        tripe[2] = item2.lemma
                    if tripe[2] == item1COO.lemma and tripe[
                            0] == item2COO.lemma:
                        tripe[2] = item1.lemma
                        tripe[0] = item2.lemma
                return allTripe

    def dsfnStart(self, rawSentence, segmentor, entity1, entity2, all_entity):
        nounRelatedWithPosition = ['主席', '总理', '教授', '校长']
        resultList = []
        lemmas = dsfn.segment(rawSentence, segmentor)
        words = dsfn.postag(lemmas)
        words_netag = dsfn.netag(words)
        sentence = dsfn.parse(words_netag)
        # print(sentence.to_string())
        Rawitem1 = None
        Rawitem2 = None
        item1 = None
        item2 = None
        Rawitem1Index = -1
        Rawitem2Index = -1
        indexList = [-1, -1]
        for item in sentence.words:
            if (item.lemma == entity1):
                Rawitem1 = item
            if (item.lemma == entity2):
                Rawitem2 = item
            if Rawitem1 != None and Rawitem2 != None and (
                    Rawitem1.ID != Rawitem1Index
                    or Rawitem2.ID != Rawitem2Index):
                Rawitem1Index = Rawitem1.ID
                Rawitem2Index = Rawitem2.ID
                # if item1 == None or item2 == None:
                #     return None
                item1 = Rawitem1
                item2 = Rawitem2
                if item1.ID > item2.ID:
                    c = item1
                    item1 = item2
                    item2 = c
                # print(str(item1.ID) + "   " + str(item2.ID))
                itemCopy1 = item1
                itemCopy2 = item2
                # print(item1.lemma)
                # print(item2.lemma)
                # print(self.dsfnConstraints2(sentence,item1,item2,all_entity))
                if self.dsfnConstraints2(sentence, item1, item2,
                                         all_entity) == False:

                    continue
                allTripes = self.dsfnStartCOO2(sentence, item1, item2)
                # print("111"+item2.lemma)
                # print(allTripes)
                if allTripes == None or (allTripes != None
                                         and len(allTripes) == 0):
                    # print("我要走ATT的部分了")
                    while item1.dependency == "ATT":
                        item1 = item1.head_word
                    while item2.dependency == "ATT":
                        item2 = item2.head_word
                    allTripes = self.dsfnStartCOO2(sentence, item1, item2)
                    if len(allTripes) != 0:
                        for tripe in allTripes:
                            if tripe[1] != "":
                                if tripe[0] == item1.lemma:
                                    if item1.ID < itemCopy1.ID:
                                        tripe[
                                            0] = item1.lemma + "" + itemCopy1.lemma
                                    elif item1.ID > itemCopy1.ID:
                                        tripe[
                                            0] = itemCopy1.lemma + "" + item1.lemma
                                    else:
                                        tripe[0] = itemCopy1.lemma

                                elif tripe[2] == item1.lemma:
                                    if item1.ID < itemCopy1.ID:
                                        tripe[
                                            2] = item1.lemma + "" + itemCopy1.lemma
                                    elif item1.ID > itemCopy1.ID:
                                        tripe[
                                            2] = itemCopy1.lemma + "" + item1.lemma
                                    else:
                                        tripe[2] = itemCopy1.lemma
                                    # tripe[2] = itemCopy1.lemma

                                if tripe[0] == item2.lemma:
                                    if item2.ID < itemCopy2.ID:
                                        tripe[
                                            0] = item2.lemma + "" + itemCopy2.lemma
                                    elif item2.ID > itemCopy2.ID:
                                        tripe[
                                            0] = itemCopy2.lemma + "" + item2.lemma
                                    else:
                                        tripe[0] = itemCopy2.lemma
                                elif tripe[2] == item2.lemma:
                                    # print(item2.lemma)
                                    if item2.ID < itemCopy2.ID:
                                        tripe[
                                            2] = item2.lemma + "" + itemCopy2.lemma
                                    elif item2.ID > itemCopy2.ID:
                                        tripe[
                                            2] = itemCopy2.lemma + "" + item2.lemma
                                    else:
                                        tripe[2] = itemCopy2.lemma
                                # print("12345")
                                resultList.append(tripe)
                else:
                    for tripe in allTripes:
                        if tripe[1] != "":
                            resultList.append(tripe)
                    # if len(resultList) > 0:
                    #     return np.array(set([tuple(t) for t in resultList]))
        if item1 == None or item2 == None:
            return None
        if len(resultList) > 0:
            # return np.array(set([tuple(t) for t in resultList]))
            # print("输出结果1"+str(resultList))
            return resultList

    def dsfnStartCOO2(self, sentence, item1, item2):
        nounRelatedWithPosition = ['主席', '总理', '教授', '校长']
        resultList = []
        itemCopy1 = item1
        itemCopy2 = item2
        """
        来解决ATT依赖的名词,如 李克强[ATT] <----- 总理[SBV]
        """
        # print(item1.lemma)
        # print(item2.lemma)
        allTripes = self.dsfn1_2_3_4COO(sentence, item1, item2)
        if len(allTripes) == 0:
            # print("11111111")
            allTripes = self.dsfn5COO(sentence, item1, item2)
            if allTripes == None or len(allTripes) == 0:
                # print("2222222")
                allTripes = self.dsfn6COO(sentence, item1, item2)
                if allTripes == None or len(allTripes) == 0:
                    # print("3333333")
                    allTripes = self.dsfn5and6COO(sentence, item1, item2)
                    # if allTripes == None or len(allTripes) == 0:
                    #     print("44444444444")
                    #     allTripes = self.dsfnAttCOO(sentence,item1,item2)
        # print("第一次"+str(allTripes))
        if allTripes != None and len(allTripes) != 0:
            for tripe in allTripes:
                resultList.append(tripe)
        # print("第二次")
        pred1 = None
        subForCoo = None
        for item in sentence.words:
            if item.postag == "v" and item.dependency == "COO":
                pred1 = item.head_word

                for word in sentence.words:
                    if word.dependency == "SBV" and word.head_word.ID == pred1.ID:
                        for phrase in sentence.words:
                            if phrase.dependency == "SBV" and phrase.head_word.ID == item.ID:
                                subForCoo = phrase
                        if subForCoo == None or (
                                subForCoo != None and subForCoo.ID
                                == word.ID):  # handle coordinated (COO) verbs: only when the coordinated verb has no subject of its own,
                            # e.g. 习近平主席视察厦门,李克强总理访问香港
                            word.head_word = item
                            # print(sentence.to_string())
                            # print(item1.lemma)
                            # print(item2.lemma)
                            allTripes = self.dsfn1_2_3_4COO(
                                sentence, item1, item2)
                            if len(allTripes) == 0:
                                # print("11111111")
                                allTripes = self.dsfn5COO(
                                    sentence, item1, item2)
                                if allTripes == None or len(allTripes) == 0:
                                    # print("2222222")
                                    allTripes = self.dsfn6COO(
                                        sentence, item1, item2)
                                    if allTripes == None or len(
                                            allTripes) == 0:
                                        # print("3333333")
                                        allTripes = self.dsfn5and6COO(
                                            sentence, item1, item2)
                                        # if allTripes == None or len(allTripes) == 0:
                                        #     allTripes = self.dsfnAttCOO(sentence,item1,item2)
                            # print("第二次"+str(allTripes))
                            if allTripes != None and len(allTripes) != 0:
                                for tripe in allTripes:
                                    resultList.append(tripe)
        # print(np.array(set([tuple(t) for t in resultList])))
        return resultList

    def dsfnConstraints1(self, rawSentence, maxLength):
        """
        :param rawSentence: 原句子
        :param maxLength: 句子的最大长度
        :return: 小于maxLength的长度
        """
        newSentence = []

        if len(rawSentence) <= maxLength:
            newSentence.append(rawSentence)
            return newSentence
        else:
            newSentence = self.splitSentenceByComma(rawSentence)
            return newSentence

    def dsfnConstraints2(self, sentence, item1, item2, allEntities):
        countEntity = 0
        countChar = 0
        for index in range(item1.ID + 1, item2.ID):
            word = sentence.get_word_by_id(index)
            countChar += len(word.lemma)
            if word.lemma in allEntities:
                countEntity += 1
        # print(countEntity)
        # print(countChar)
        if countEntity > 3:
            return False
        elif countChar > 12:
            # print(countChar)
            return False
        else:
            return True

    def dsfnConstraints3(self, sentence, item1, item2):
        countChar = 0
        for index in range(item1.ID + 1, item2.ID):
            word = sentence.get_word_by_id(index)
            countChar += len(word.lemma)
        if countChar > 5:
            return False
        else:
            return True

    def getSPO(self, sentence, segmentor):
        all_result = []
        raw_sentence = []
        RawSentence = sentence
        lemmas = self.segment(sentence, segmentor)
        words = self.postag(lemmas)
        words_netag = self.netag(words)
        sentence = self.parse(words_netag)
        # print(sentence.to_string())
        for itemWord in sentence.words:
            # find a verb that is either the HED of the sentence or linked to the HED by a COO relation
            if (itemWord.head_word == None and itemWord.postag == "v") \
                    or (itemWord.postag == "v" and itemWord.dependency == "COO"
                        and itemWord.head_word.head_word == None) \
                    or (itemWord.postag == "v"):
                relation_verb = itemWord  # take this verb as relation_verb
                relationString = relation_verb.lemma
                # print(relationString)
                if itemWord.head_word == None:
                    # print("1")
                    verbId = itemWord.ID  # ID of the relation verb
                    verbId2 = None
                elif itemWord.head_word.head_word == None:
                    # print("2")

                    verbId = itemWord.ID  # ID of the relation verb
                    if itemWord.dependency == "COO" or self.get_entity_num_between(
                            itemWord, itemWord.head_word, sentence) == 0:
                        verbId2 = itemWord.head_word.ID  # the sentence HED, used to find the subject
                    else:
                        verbId2 = None
                else:
                    # print("3")
                    verbId = itemWord.ID  # ID of the relation verb
                    verbId2 = None
                O_dict = dict()  # all objects
                S_dict = dict()  # all subjects
                verb_dict = dict()  # all verbs, mainly for cases like: 习近平主席在北京大学发表演讲
                OBJ = None
                SUB = None
                DSFN3 = dict()
                for item in sentence.words:
                    if item.dependency == "SBV" and item.head_word.ID == verbId:  # find the subject of this verb
                        # if SUB == None or SUB.lemma != entity:
                        SUB = item  # subject found
                        S_dict[SUB.ID] = SUB.lemma  # add the subject to the dict

                    if (item.dependency == "VOB"
                            and item.head_word.ID == verbId
                            and item.postag != "v"):
                        # object of the verb: a direct object, or a prepositional object (POB ----> preposition (postag p) --ADV or CMP--> verb)
                        OBJ = item
                        O_dict[OBJ.ID] = OBJ.lemma
                        relationString = relation_verb.lemma
                        verb_dict[OBJ.ID] = relationString
                    if (item.dependency == "POB"
                            and item.head_word.postag == "p"
                            and item.head_word.dependency == "CMP"
                            and item.head_word.head_word.ID == verbId):
                        # object of the verb: a direct object, or a prepositional object (POB ----> preposition (postag p) --ADV or CMP--> verb)
                        OBJ = item
                        O_dict[OBJ.ID] = OBJ.lemma
                        relationString = relation_verb.lemma + "" + item.head_word.lemma
                        verb_dict[OBJ.ID] = relationString

                    if (item.dependency == "POB" and (item.head_word.postag == "p" or item.head_word.postag == 'd')\
                            and item.head_word.dependency == "ADV" and item.head_word.head_word.ID == verbId \
                            and item.postag!='v'):
                        # object of the verb: a direct object, or a prepositional object (POB ----> preposition (postag p) --ADV or CMP--> verb)
                        OBJ = item
                        O_dict[OBJ.ID] = OBJ.lemma
                        verbObj = None
                        DSFN3[OBJ.ID] = True
                        objectDict = dict()
                        relationString = relation_verb.lemma
                        for eachWord in sentence.words:
                            if eachWord.dependency == "VOB" and eachWord.head_word.ID == relation_verb.ID:
                                # relationString = relation_verb.lemma + "" + eachWord.lemma
                                verbObj = eachWord
                                objectDict[verbObj.ID] = verbObj
                        if verbObj != None:
                            for word in sentence.words:
                                if word.head_word != None and word.dependency == "ATT" and word.head_word.ID == verbObj.ID:
                                    objectDict[word.ID] = word
                            objectDict = sorted(objectDict.items(),
                                                key=lambda item: item[0])
                            objectStr = ""
                            for objectItem in objectDict:
                                objectStr += objectItem[1].lemma
                            relationString = relation_verb.lemma + "" + objectStr

                        else:
                            for eachWord in sentence.words:
                                if eachWord.dependency == "POB" and eachWord.head_word.dependency == "CMP" and\
                                    eachWord.head_word.head_word.ID == relation_verb.ID:
                                    relationString = relation_verb.lemma + "" + eachWord.head_word.lemma + "" + eachWord.lemma

                        verb_dict[OBJ.ID] = relationString

                if SUB == None:  # no subject found for this verb: look for the subject of the coordinated verb verbId2 instead
                    for item in sentence.words:
                        if item.dependency == "SBV" and item.head_word.ID == verbId2:
                            # if SUB == None or SUB.lemma != entity:
                            SUB = item
                            S_dict[SUB.ID] = SUB.lemma
                # print(verbId2)
                if OBJ == None:
                    verb_coo = None
                    for item in sentence.words:
                        if item.dependency == "COO" and item.head_word.ID == verbId and item.ID > verbId:
                            verb_coo = item
                            break
                    flag = True
                    if verb_coo != None and self.get_entity_num_between(
                            relation_verb, verb_coo, sentence) == 0:

                        for item in sentence.words:
                            if item.dependency == "SBV" and item.head_word.ID == verb_coo.ID:
                                flag = False
                        if flag != False:
                            for item in sentence.words:
                                if (item.dependency == "VOB" and item.head_word.ID == verb_coo.ID)\
                                        or (item.dependency == "POB" and item.head_word.postag == "p" and item.head_word.dependency == "CMP"
                                and item.head_word.head_word.ID== verb_coo.ID) or (item.dependency == "POB" and item.head_word.postag == "p"\
                        and item.head_word.dependency == "ADV" and item.head_word.head_word.ID== verb_coo.ID):

                                    OBJ = item
                                    O_dict[OBJ.ID] = OBJ.lemma
                # print(S_dict)
                # print(verb_dict)
                # print(O_dict)
                SUB_COO = None
                OBJ_COO = None
                for item in sentence.words:
                    if item.head_word != None:
                        if SUB != None and item.dependency == "COO" and item.head_word.ID in S_dict:  # COO of the subject
                            SUB_COO = item
                            S_dict[SUB_COO.ID] = SUB_COO.lemma
                    if item.head_word != None and OBJ != None:
                        if item.dependency == "COO" and item.head_word.ID in O_dict:  # COO of the object
                            OBJ_COO = item
                            O_dict[OBJ_COO.ID] = OBJ_COO.lemma
                S_new = []

                for sub in S_dict:
                    # if sentence.get_word_by_id(sub).postag == 'r':
                    #     continue
                    S_dict2 = dict()  # collects the subject's ATT modifiers
                    S_dict2[sub] = S_dict[sub]
                    flag = True
                    while flag == True:
                        len1 = len(S_dict2)
                        for item in sentence.words:
                            if item.head_word != None:
                                SUBList = S_dict2.keys()
                                if item.head_word.ID in SUBList and (
                                        item.dependency == "ATT"
                                        or item.dependency == "ADV"):
                                    SUBATT = item
                                    S_dict2[SUBATT.ID] = SUBATT.lemma

                            if len(S_dict2) != len1:
                                flag = True
                            else:
                                flag = False
                    S_dict2 = sorted(S_dict2.items(), key=lambda item: item[0])
                    Subject = ""
                    for i in S_dict2:
                        Subject += i[1]
                    S_new.append(Subject)

                O_new = []
                V_new = []
                for obj in O_dict:
                    # if sentence.get_word_by_id(obj).postag == 'r':
                    #     continue
                    O_dict2 = dict()  # collects the object's ATT modifiers
                    O_dict2[obj] = O_dict[obj]
                    if verb_dict != None:
                        if obj in verb_dict:
                            relationString2 = verb_dict[obj]
                        else:
                            relationString2 = relation_verb.lemma
                    else:
                        relationString2 = relation_verb.lemma
                    V_new.append(relationString2)
                    flag = True
                    while flag == True:
                        len2 = len(O_dict2)
                        for item in sentence.words:
                            if item.head_word != None:
                                OBJList = O_dict2.keys()
                                if item.head_word.ID in OBJList and (
                                        item.dependency == "ADV"
                                        or item.dependency == "ATT"
                                        or item.dependency == "VOB" or
                                    (item.dependency == "COO"
                                     and item.head_word.ID != obj)):
                                    if item.dependency == "ATT" and item.postag == "v":
                                        if self.get_entity_num_between(
                                                item,
                                                sentence.get_word_by_id(obj),
                                                sentence) > 0:
                                            continue
                                        else:
                                            OBJATT = item
                                            O_dict2[OBJATT.ID] = OBJATT.lemma
                                    else:
                                        OBJATT = item
                                        O_dict2[OBJATT.ID] = OBJATT.lemma
                                        # print(OBJATT.lemma)

                            if len(O_dict2) != len2:
                                flag = True
                            else:
                                flag = False  # keep looping until no new modifier is found
                    O_dict2 = sorted(O_dict2.items(), key=lambda item: item[0])
                    Object = ""
                    for i in O_dict2:
                        Object += i[1]
                    flag = False
                    # if obj in DSFN3:
                    #     for location in self.location_entity:
                    #         if location in Object :
                    #             flag = True
                    #     if flag == True:
                    #         O_new.append(Object)
                    #     if flag == False:
                    #         O_new.append("")
                    # else:
                    O_new.append(Object)
                # print(O_dict)
                # print(O_new)

                for sub in S_new:
                    for i in range(0, len(O_new)):
                        obj = O_new[i]
                        relationWord = V_new[i]
                        if obj != "":
                            # print(RawSentence)
                            # print((sub, relationWord, obj))
                            all_result.append([sub, relationWord, obj])
                            raw_sentence.append(RawSentence)

        return all_result, raw_sentence

    def hasEntity(self, word, allEntity):
        for entity in allEntity:
            if entity in word:
                # print(entity)
                return True
        return False

    def PostProcessSPO(self, rawSentence, allTripes, allEntity):
        output_list = []
        for i in range(0, len(allTripes)):
            tripe = allTripes[i]
            sub = tripe[0]
            obj = tripe[2]
            # print(sub)
            # print(obj)
            if self.hasEntity(sub, allEntity) and self.hasEntity(
                    obj, allEntity):
                output_list.append(tripe)
        return output_list
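
hasEntity and PostProcessSPO together act as a post-filter: an extracted (subject, relation, object) triple is kept only when both the subject and the object contain a known entity string. A minimal, self-contained sketch of that containment test, using made-up entities and triples rather than this class's real pipeline:

# Standalone sketch of the PostProcessSPO filtering idea; the data below is illustrative only.
all_entity = ["阿里巴巴", "腾讯", "京东"]
tripes = [["阿里巴巴", "收购", "饿了么"], ["腾讯", "投资", "京东"]]

def has_entity(word, all_entity):
    # same containment test as hasEntity() above
    return any(entity in word for entity in all_entity)

kept = [t for t in tripes if has_entity(t[0], all_entity) and has_entity(t[2], all_entity)]
print(kept)  # only the second triple survives, because "饿了么" is not a known entity
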
class ltp_api(object):
    def __init__(self, MODELDIR, exword_path=None):
        self.MODELDIR = MODELDIR
        self.output = {}
        self.words = None
        self.postags = None
        self.netags = None
        self.arcs = None
        self.exword_path = exword_path  # e.g. '/data1/research/matt/ltp/exwords.txt'
        # word segmentation
        self.segmentor = Segmentor()
        if not self.exword_path:
            # no extra user lexicon; load the default segmentation model
            self.segmentor.load(os.path.join(self.MODELDIR, "cws.model"))
        else:
            self.segmentor.load_with_lexicon(
                os.path.join(self.MODELDIR, "cws.model"), self.exword_path)

        # POS tagging
        self.postagger = Postagger()
        self.postagger.load(os.path.join(self.MODELDIR, "pos.model"))
        # dependency parsing
        self.parser = Parser()
        self.parser.load(os.path.join(self.MODELDIR, "parser.model"))
        # named entity recognition
        self.recognizer = NamedEntityRecognizer()
        self.recognizer.load(os.path.join(self.MODELDIR, "ner.model"))
        # semantic role labeling
        self.labeller = SementicRoleLabeller()
        self.labeller.load(os.path.join(MODELDIR, "pisrl.model"))

    # word segmentation
    def ltp_segmentor(self, sentence):
        words = self.segmentor.segment(sentence)
        return words

    # POS tagging
    def ltp_postagger(self, words):
        postags = self.postagger.postag(words)
        return postags

    # dependency parsing
    def ltp_parser(self, words, postags):
        arcs = self.parser.parse(words, postags)
        return arcs

    # named entity recognition
    def ltp_recognizer(self, words, postags):
        netags = self.recognizer.recognize(words, postags)
        return netags

    # semantic role labeling
    def ltp_labeller(self, words, postags, arcs):
        output = []
        roles = self.labeller.label(words, postags, arcs)
        for role in roles:
            output.append([(role.index, arg.name, arg.range.start,
                            arg.range.end) for arg in role.arguments])
        return output

    def release(self):
        self.segmentor.release()
        self.postagger.release()
        self.parser.release()
        self.recognizer.release()
        self.labeller.release()

    def get_result(self, sentence):
        self.words = self.ltp_segmentor(sentence)
        self.postags = self.ltp_postagger(self.words)
        self.arcs = self.ltp_parser(self.words, self.postags)
        self.netags = self.ltp_recognizer(self.words, self.postags)
        self.output['role'] = self.ltp_labeller(self.words, self.postags,
                                                self.arcs)

        # fill the output dict
        self.output['words'] = list(self.words)
        self.output['postags'] = list(self.postags)
        self.output['arcs'] = [(arc.head, arc.relation) for arc in self.arcs]
        self.output['netags'] = list(self.netags)
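
The ltp_api wrapper above bundles the five pyltp components behind a single get_result call. A hedged usage sketch, assuming the LTP 3.x models live in a local directory (the path below is a placeholder):

# Usage sketch for the ltp_api class above; "/path/to/ltp_data" is an assumed model directory.
if __name__ == "__main__":
    ltp = ltp_api("/path/to/ltp_data")
    ltp.get_result("他叫汤姆去拿外衣。")
    print(ltp.output['words'])    # segmented tokens
    print(ltp.output['postags'])  # POS tags
    print(ltp.output['arcs'])     # (head, relation) pairs
    print(ltp.output['role'])     # semantic role spans
    ltp.release()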
Example #7
class Extractor():

    def __init__(self):
        self.__clause_list = []
        self.__subclause_dict = {}
        self.__triple_list = []
        self.__segmentor = Segmentor()
        self.__postagger = Postagger()
        self.__recognizer = NamedEntityRecognizer()
        self.__parser = Parser()
        self.__labeller = SementicRoleLabeller()
        self.__words_full_list = []
        self.__netags_full_list = []

    @property
    def clause_list(self):
        return self.__clause_list

    @property
    def triple_list(self):
        return self.__triple_list


    def split(self, words, postags):
        start = 0
        for j, w in enumerate(words):
            if w == ',' or w == ',' or w == '。':
                clause = Clause(start, j-1 )
                self.__clause_list.append(clause)
                start = j + 1

        for clause in self.__clause_list:
            clause.split(postags)
            for subclause in clause.sub_clause_list:
                self.add_inverted_idx(subclause)

    def add_inverted_idx(self, subclause):
        for i in range(subclause.start_idx, subclause.end_idx):
            self.__subclause_dict[i] = subclause

    def load(self):
        self.__segmentor.load('ltp_data/cws.model')
        self.__postagger.load('ltp_data/pos.model')
        self.__recognizer.load('ltp_data/ner.model')
        self.__parser.load('ltp_data/parser.model')
        self.__labeller.load('ltp_data/srl')

    def release(self):
        self.__segmentor.release()
        self.__postagger.release()
        self.__recognizer.release()
        self.__parser.release()
        self.__labeller.release()

    def clear(self):
        self.__triple_list = []
        self.__words_full_list = []
        self.__netags_full_list = []
    
    def resolve_conference(self, entity):
        try:
            e_str = entity.get_content_as_str()
        except Exception:
            return '?'
        ref = e_str
        if e_str == '他' or e_str == '她':
            for i in range(entity.loc, -1, -1):
                if self.__netags_full_list[i].lower().endswith('nh'):
                    ref = self.__words_full_list[i]
                    break
        return ref
    
    def resolve_all_conference(self):
        for t in self.triple_list:
            e_str = self.resolve_conference(t.entity_1)
            try:
                t.entity_1.content = e_str.split()
            except Exception:
                pass

    def chunk_str(self, data):
        sents = SentenceSplitter.split(data)
        offset = 0
        for sent in sents:
            try:
                words = self.__segmentor.segment(sent)
                postags = self.__postagger.postag(words)
                netags = self.__recognizer.recognize(words, postags)
                arcs = self.__parser.parse(words, postags)
                roles = self.__labeller.label(words, postags, netags, arcs)
                self.chunk_sent(list(words), list(postags), list(arcs), offset)
                offset += len(list(words))
                self.__words_full_list.extend(list(words))
                self.__netags_full_list.extend(list(netags))
            except Exception as e:
                print(str(e))
                pass

    def chunk_sent(self, words, postags, arcs, offset):
        root = [i for i,x in enumerate(arcs) if x.relation == 'HED']
        if len(root) > 1:
            raise Exception('More than 1 HEAD arc is detected!')
        root = root[0]
        relations = [i for i, x in enumerate(arcs) if x.head == root and x.relation == 'COO']
        relations.insert(0,root)

        prev_e1 = None
        e1      = None
        for rel in relations:

            left_arc = [i for i, x in enumerate(arcs) if x.head == rel and x.relation == 'SBV']

            if len(left_arc) > 1:
                pass
                #raise Exception('More than 1 left arc is detected!')
            elif len(left_arc) == 0:
                e1 = prev_e1
            elif len(left_arc) == 1:
                left_arc = left_arc[0]
                leftmost = find_farthest_att(arcs, left_arc)
                e1 = Entity(1, [words[i] for i in range(leftmost, left_arc + 1)], offset + leftmost)


            prev_e1 = e1

            right_arc = [i for i, x in enumerate(arcs) if x.head == rel and x.relation == 'VOB']

            e2_list = []
            if not right_arc:
                e2 = Entity(2, None)
                e2_list.append(e2)
            else:
                right_ext = find_farthest_vob(arcs, right_arc[0])

                items = [i for i, x in enumerate(arcs) if x.head == right_ext and x.relation == 'COO']
                items = right_arc + items

                count = 0
                for item in items:
                    leftmost = find_farthest_att(arcs, item)


                    e2 = None

                    if count == 0:
                        e2 = Entity(2, [words[i] for i in range(leftmost, right_ext + 1)], offset+leftmost)
                    else:
                        p1 = range(leftmost, right_arc[0])
                        p2 = range(item, find_farthest_vob(arcs, item) + 1)
                        e2 = Entity(2, [words[i] for i in itertools.chain(p1, p2)])

                    e2_list.append(e2)
                    r = Relation(words[rel])
                    t = Triple(e1, e2, r)
                    self.__triple_list.append(t)
                    count += 1
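
Example #7's Extractor depends on helper types (Clause, Entity, Relation, Triple) and the find_farthest_att / find_farthest_vob functions defined elsewhere in the same project. Assuming those are available and the ltp_data/ paths hard-coded in load() exist, a driver would look roughly like this:

# Hedged driver sketch for the Extractor above; the input text is illustrative.
extractor = Extractor()
extractor.load()                      # loads the ltp_data/* models hard-coded in load()
extractor.chunk_str("李克强总理今天来我家了,我感到非常荣幸。")
extractor.resolve_all_conference()    # naive pronoun resolution for entity_1
for triple in extractor.triple_list:
    print(triple)                     # how a Triple prints depends on its (unshown) __str__
extractor.release()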
Example #8
def parse(s, isGraph=False):
    """
    对语句进行句法分析,并返回句法结果
    """
    tmp_ner_dict = {}
    num_lst = ['一', '二', '三', '四', '五', '六', '七', '八', '九', '十']

    # print(s)

    # replace company codes with special aliases so that segmentation and POS tagging stay correct
    for i, ner in enumerate(list(set(re.findall(r'(ner\_\d\d\d\d\_)', s)))):
        try:
            tmp_ner_dict[num_lst[i] + '号企业'] = ner
        except IndexError:
            # TODO: define the output for the error case
            # TODO ...
            num_lst.append(str(i))
            tmp_ner_dict[num_lst[i] + '号企业'] = ner

        s = s.replace(ner, num_lst[i] + '号企业')

    # print(tmp_ner_dict)

    words = segmentor.segment(s)
    tags = postagger.postag(words)
    parser = Parser()  # initialize the parser

    parse_model_path = os.path.join(LTP_DATA_DIR, 'parser.model')
    parser.load(parse_model_path)

    arcs = parser.parse(words, tags)  # dependency parsing
    arcs_lst = list(map(list,
                        zip(*[[arc.head, arc.relation] for arc in arcs])))

    # tabulate the dependency parsing result
    parse_result = pd.DataFrame([[
        a, b, c, d
    ] for a, b, c, d in zip(list(words), list(tags), arcs_lst[0], arcs_lst[1])
                                 ],
                                index=range(1,
                                            len(words) + 1))
    parser.release()  # release the model

    result = []

    # dependency relation features for the entities
    rely_id = [arc.head for arc in arcs]  # head id of each word
    relation = [arc.relation for arc in arcs]  # dependency relation of each word
    heads = ['Root' if id == 0 else words[id - 1]
             for id in rely_id]  # head word of each dependency arc
    # for i in range(len(words)):
    #     print(relation[i] + '(' + words[i] + ', ' + heads[i] + ')')

    company_list = list(tmp_ner_dict.keys())

    str_enti_1 = "一号企业"
    str_enti_2 = "二号企业"
    l_w = list(words)
    is_two_company = str_enti_1 in l_w and str_enti_2 in l_w
    if is_two_company:
        second_entity_index = l_w.index(str_enti_2)
        entity_sentence_type = parse_result.iloc[second_entity_index, -1]
        if entity_sentence_type in SEN_TAGS:
            result.append(SEN_TAGS.index(entity_sentence_type))
        else:
            result.append(-1)
    else:
        result.append(-1)

    if isGraph:
        g = Digraph('测试图片')
        g.node(name='Root')
        for word in words:
            g.node(name=word, fontname="SimHei")

        for i in range(len(words)):
            if relation[i] not in ['HED']:
                g.edge(words[i],
                       heads[i],
                       label=relation[i],
                       fontname="SimHei")
            else:
                if heads[i] == 'Root':
                    g.edge(words[i],
                           'Root',
                           label=relation[i],
                           fontname="SimHei")
                else:
                    g.edge(heads[i],
                           'Root',
                           label=relation[i],
                           fontname="SimHei")
        g.view()

    # syntactic (dependency) distance between the two company entities
    distance_e_jufa = 0
    if is_two_company:
        distance_e_jufa = shortest_path(parse_result,
                                        list(words),
                                        str_enti_1,
                                        str_enti_2,
                                        isGraph=False)
    result.append(distance_e_jufa)

    # token distance between the two company entities
    distance_entity = 0
    if is_two_company:
        distance_entity = np.abs(l_w.index(str_enti_1) - l_w.index(str_enti_2))
    result.append(distance_entity)

    # distance between each company entity and the key trigger word
    key_words = [
        "收购", "竞拍", "转让", "扩张", "并购", "注资", "整合", "并入", "竞购", "竞买", "支付",
        "收购价", "收购价格", "承购", "购得", "购进", "购入", "买进", "买入", "赎买", "购销", "议购",
        "函购", "函售", "抛售", "售卖", "销售", "转售"
    ]
    # TODO: extract features from the key words and their dependency relations (optional if no idea)
    # TODO ...

    k_w = None
    for w in words:
        if w in key_words:
            k_w = w
            break

    dis_key_e_1 = -1
    dis_key_e_2 = -1

    if k_w != None and is_two_company:
        k_w = str(k_w)
        # print("k_w", k_w)

        l_w = list(words)
        # dis_key_e_1  = shortest_path(parse_result, l_w, str_enti_1, k_w)
        # dis_key_e_2 = shortest_path(parse_result, l_w, str_enti_2, k_w)

        dis_key_e_1 = np.abs(l_w.index(str_enti_1) - l_w.index(k_w))
        dis_key_e_2 = np.abs(l_w.index(str_enti_2) - l_w.index(k_w))

    result.append(dis_key_e_1)
    result.append(dis_key_e_2)

    return result
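
parse() above assumes module-level segmentor/postagger instances, LTP_DATA_DIR, SEN_TAGS and shortest_path are already set up, and that company mentions have been replaced by ner_XXXX_ codes upstream. A hedged call might look like this:

# Illustrative call of parse(); the sentence and ner codes are made up.
sentence = "ner_1001_拟收购ner_1002_的全部股权。"
features = parse(sentence, isGraph=False)
# features = [dependency tag index of entity 2, syntactic distance between the two entities,
#             token distance between them, distance of each entity to the trigger word]
print(features)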
Example #9
class OpinionExtractor(object):
    def __init__(self):
        self.__segmentor = Segmentor()
        self.__postagger = Postagger()
        self.__parser = Parser()  # 初始化实例
        self.__labeller = SementicRoleLabeller()  # 初始化实例

        self.__segmentor.load_with_lexicon(
            os.path.join(LTP_MODEL_DIR, "cws.model"),
            os.path.join(DICTIONARY_DIR, "custom_lexicon.model"))
        self.__postagger.load(os.path.join(LTP_MODEL_DIR, "pos.model"))
        self.__parser.load(os.path.join(LTP_MODEL_DIR, "parser.model"))  # 加载模型
        self.__labeller.load(os.path.join(LTP_MODEL_DIR,
                                          "pisrl.model"))  # 加载模型

        self.__adv_dict_list = self.__load_adverb_dictionary()
        self.__adv_list = self.__adv_dict_list.get("范围副词") + self.__adv_dict_list.get("频率副词") \
                          + self.__adv_dict_list.get("程度副词") + self.__adv_dict_list.get("时间副词") \
                          + self.__adv_dict_list.get("肯否副词") + self.__adv_dict_list.get("语气副词") \
                          + self.__adv_dict_list.get("情态副词")

        self.__pronoun_list = self.__load_pronoun_words()
        self.__vi_list = self.__load_intransitive_verb()
        self.__auxiliary_dict_list = self.__load_auxiliary_dictionary()
        self.__auxiliary_list = self.__auxiliary_dict_list.get(
            "语气助词") + self.__auxiliary_dict_list.get(
                "结构助词") + self.__auxiliary_dict_list.get("时态助词")

        self.__special_prefix_list = self.__load_special_prefix_words()
        self.__stopwords_list = self.__load_stopwords("之前", "是因为", "已经")

    def release(self):
        self.__labeller.release()
        self.__parser.release()
        self.__postagger.release()
        self.__segmentor.release()

    @classmethod
    def __load_stopwords(cls, *self_define_stopwords):
        """
        get stopwords list
        :param self_define_stopwords: self-defined stop words to add to the stopwords list
        :return: stopwords_list
        """
        stopwords_list = [
            word.strip()
            for word in open(os.path.join(DICTIONARY_DIR, "stopwords.txt"),
                             "r").readlines()
        ]
        for stopword in self_define_stopwords:
            stopwords_list.append(stopword)
        return stopwords_list

    @classmethod
    def __load_special_prefix_words(cls):
        """
        Load special prefix words
        :return:
        """
        special_prefix_words = []
        with open(os.path.join(DICTIONARY_DIR, "special_prefix.txt"),
                  "r") as sp_file:
            for word in sp_file.readlines():
                special_prefix_words.append(word.strip())
        return special_prefix_words

    @classmethod
    def __load_intransitive_verb(cls):
        """
        Load intransitive verbs
        :return:
        """
        intransitive_verb = []
        with open(os.path.join(DICTIONARY_DIR, "intransitive_verb.txt"),
                  "r") as vi_file:
            for word in vi_file.readlines():
                intransitive_verb.append(word.strip())
        return intransitive_verb

    @classmethod
    def __load_pronoun_words(cls):
        """
        Load pronouns
        :return:
        """
        pronoun_words = []
        with open(os.path.join(DICTIONARY_DIR, "pronoun.txt"),
                  "r") as pronoun_file:
            for word in pronoun_file.readlines():
                pronoun_words.append(word.strip())
        return pronoun_words

    @classmethod
    def __load_adverb_dictionary(cls):
        """
        Load the adverb dictionary
        :return:
        """
        dictionary = {}
        with open(os.path.join(DICTIONARY_DIR, "adv.txt"), "r") as adv_file:
            for line in adv_file.readlines():
                index = line.index(":")
                key = line[0:index].strip()
                value = line[index + 1:].strip()
                dictionary.update({key: value.split(" ")})
        return dictionary

    @classmethod
    def __load_auxiliary_dictionary(cls):
        """
        Load the auxiliary-word dictionary
        :return:
        """
        dictionary = {}
        with open(os.path.join(DICTIONARY_DIR, "auxiliary.txt"),
                  "r") as adv_file:
            for line in adv_file.readlines():
                index = line.index(":")
                key = line[0:index].strip()
                value = line[index + 1:].strip()
                dictionary.update({key: value.split(" ")})
        return dictionary

    @classmethod
    def __smart_split_sentence(cls, comment):
        """
        Split a comment into sub-sentences
        :param comment:
        :return:
        """
        # 替换空格为","
        comment = re.sub(re.compile(r"(\s+)", re.S), ",", comment.strip())
        # 句子按分隔[。|!|,|、|?|.|!|,|?]符分出多个子句
        subcomments = re.split(r'[。|!|,|、|?|\.|!|,|\?]', comment)
        return subcomments

    def sentence_segment_add_space(self, comment, stopwords_list={}):
        """
        Segment a comment and join the tokens with spaces, e.g.:
        我们 喜欢 吃 冰激凌
        :param comment: one piece of text
        :param stopwords_list: stopword list
        :return:
        """
        segment = self.__segmentor.segment(self.__remove_special_word(comment))
        return segment, " ".join(segment)

    def __word_self_attention(self, parent_pos, parent_word,
                              current_arc_relation, current_arc_pos,
                              current_word):
        """
        判断词性与依存关系组合的有效性

        词注意力机制
        :param parent_pos: 父节点的词性
        :param parent_word: 父节点的词
        :param current_arc_relation: 当前节点的依存关系
        :param current_arc_pos: 当前节点的词词性
        :param current_word: 当前节点的词
        :return:
        """
        if parent_pos == Pos.v.value:
            if current_arc_relation == Dependency.SBV.value:
                return True
            if current_arc_relation == Dependency.VOB.value:
                return True
            if current_arc_relation == Dependency.FOB.value:
                return True
            if current_arc_relation == Dependency.ADV.value:
                if current_arc_pos == Pos.d.value:
                    if current_word in self.__adv_dict_list.get("肯否副词"):
                        return True
                if current_arc_pos == Pos.p.value and current_word in [
                        "由", "用"
                ]:  # 由关晓彤代言
                    return True
                if current_arc_pos == Pos.v.value:
                    return True
            if current_arc_relation == Dependency.ATT.value:
                return True
            if current_arc_relation == Dependency.CMP.value:
                return True
            # if current_arc_pos == Pos.u.value and current_word not in self.__auxiliary_dict_list.get("语气助词") + self.__auxiliary_dict_list.get("时态助词"):
            if current_arc_pos == Pos.u.value and current_word not in self.__auxiliary_list:
                return True
        elif parent_pos == Pos.a.value:
            if current_arc_relation == Dependency.SBV.value and current_word not in self.__pronoun_list:  # e.g.:材料新鲜  它很方便
                return True
            if current_arc_relation == Dependency.ADV.value and (
                    current_word not in self.__adv_dict_list.get("程度副词") +
                    self.__adv_dict_list.get("范围副词") or
                (current_arc_pos == Pos.p.value
                 and current_word in ["比"])):  # 比别家好
                return True
            if current_arc_relation == Dependency.ATT.value:
                return True
            if current_arc_pos == Pos.u.value and current_word not in self.__auxiliary_dict_list.get(
                    "语气助词") + self.__auxiliary_dict_list.get("结构助词"):
                return True
        elif parent_pos in [
                Pos.n.value, Pos.nd.value, Pos.nh.value, Pos.ni.value,
                Pos.nl.value, Pos.ns.value, Pos.nt.value, Pos.nz.value
        ]:
            if current_arc_relation == Dependency.ADV.value:
                return True
            if current_arc_relation == Dependency.ATT.value:  # 属性语义修饰名词
                return True
            if current_arc_pos == Pos.u.value and current_word not in self.__auxiliary_dict_list.get(
                    "语气助词") + self.__auxiliary_dict_list.get("结构助词"):  # 美丽的
                return True
        elif parent_pos == Pos.p.value:
            if current_arc_relation == Dependency.SBV.value:  # 他给我感觉
                return True
            if current_arc_relation == Dependency.VOB.value:  # 给我感觉
                return True
            if current_arc_relation == Dependency.POB.value:  # 比别家好
                return True
        elif parent_pos == Pos.d.value:
            if current_arc_relation == Dependency.SBV.value:
                return True
            if current_arc_relation == Dependency.VOB.value:  # 没有|d  4|过于|d  5|甜腻
                return True
        elif parent_pos in [Pos.i.value, Pos.r.value, Pos.q.value
                            ] or current_arc_relation == Dependency.CMP.value:
            return True
        return False

    def __parse_opinion(self, core_word_index, arcs, words, postags):
        """

        :param core_word_index:
        :param arcs:
        :param words:
        :param postags:
        :return: opinion_word_list
        """
        has_vob = False
        sbv_word = ()
        sbv_att_word_list = []
        available_word_idx_list = [core_word_index]
        opinion_word_list = []

        def word_root_index(core_word_idx, index):
            """
            Find the root index of a word
            :return:
            """
            arc = arcs[index]
            idx = index if arc.relation == Dependency.HED.value else arc.head - 1
            if idx == core_word_idx or idx == index:
                return idx
            else:
                return word_root_index(core_word_idx, idx)

        def do_parse_opinion(core_word_idx):
            """
            Extract the opinion centered on a verb; the main structures used are subject-verb (SBV), verb-object (VOB), adverbial (ADV), complement (CMP) and preposition-object (POB)
            :return:
            """
            nonlocal has_vob
            nonlocal sbv_word
            nonlocal sbv_att_word_list
            nonlocal available_word_idx_list

            for m, arc in enumerate(arcs):
                # tuple format: (index, dependency relation, POS tag, word)
                current_word_tuple = (m, arc.relation, postags[m], words[m])

                parent_word_index = arc.head - 1
                parent_word_tuple = (parent_word_index,
                                     arcs[parent_word_index].relation,
                                     postags[parent_word_index],
                                     words[parent_word_index])

                if arc.head == core_word_idx + 1 \
                        and (current_word_tuple[2] not in [Pos.wp.value, Pos.o.value, Pos.c.value, Pos.r.value, Pos.e.value] or (current_word_tuple[2] == Pos.r.value and current_word_tuple[3] not in self.__pronoun_list)) \
                        and self.__word_self_attention(parent_word_tuple[2], parent_word_tuple[3], current_word_tuple[1], current_word_tuple[2], current_word_tuple[3]):

                    # 计算词的root词是否等于关键词
                    root_core_index = word_root_index(core_word_index, m)
                    if root_core_index == core_word_index:
                        if arc.relation == Dependency.VOB.value or (
                                arc.relation == Dependency.CMP.value and
                                postags[current_word_tuple[0]] == Pos.a.value):
                            has_vob = True
                            available_word_idx_list.append(m)
                            opinion_word_list.append(current_word_tuple)
                        else:
                            if arc.head - 1 in available_word_idx_list:
                                available_word_idx_list.append(m)
                                # 若是主谓结构先暂存,不加入观点词list
                                if arc.relation == Dependency.SBV.value:
                                    if len(sbv_word) == 0:
                                        sbv_word = current_word_tuple
                                else:
                                    # 计算词的root词是否等于sbv关键词
                                    sbv_index = sbv_word[0] if len(
                                        sbv_word) > 0 else -1
                                    root_sbv_index = word_root_index(
                                        sbv_index, current_word_tuple[0])
                                    if root_sbv_index == sbv_index:
                                        # 若是主谓结构的其他属性词,暂存在主谓属性词列表
                                        sbv_att_word_list.append(
                                            current_word_tuple)
                                    else:
                                        opinion_word_list.append(
                                            current_word_tuple)
                    do_parse_opinion(m)

        do_parse_opinion(core_word_index)

        def need_sbv():
            """
            Decide whether the SBV (subject) part should be attached
            :return:
            """
            # for a full triple, attach the subject only when a verb-object (VOB) part was found
            if has_vob:
                return True
            # adjectives can take a subject directly
            if postags[core_word_index] == Pos.a.value:
                return True
            # intransitive verbs can take a subject directly
            if words[core_word_index] in self.__vi_list:
                return True
            return False

        if need_sbv() and len(sbv_word) > 0:
            opinion_word_list.append(sbv_word)
            opinion_word_list += sbv_att_word_list

        return opinion_word_list

    def extract_opinion(self,
                        comment,
                        distinct_opinion=True,
                        show_core_word=False,
                        show_detail=False):
        """
        Extract opinions from a comment
        :param comment:
        :param distinct_opinion: whether to de-duplicate opinions
        :param show_core_word: whether to include the opinion core word in the output
        :param show_detail: whether to log segmentation and parsing details
        :return:
        """
        subcomments = self.__smart_split_sentence(comment)
        opinion_list = []
        for subcomment in subcomments:
            words, sentence_with_space = self.sentence_segment_add_space(
                subcomment)
            opinions = self.__parse_segment(words, show_detail)
            if len(opinions) > 0:
                opinion_list += opinions
        if distinct_opinion:
            opinion_list = self.__distinct_opinion(opinion_list)
        if not show_core_word:
            opinion_list = [opinion[2] for opinion in opinion_list]
        return opinion_list

    @classmethod
    def __distinct_opinion(cls, opinions):
        """
        De-duplicate opinions (drop opinions that are substrings of a longer one)
        :param opinions:
        :return:
        """
        index = 2
        distinct_opinion_list = []
        for n in range(1, len(opinions)):
            for m in range(n, 0, -1):
                opi_1 = opinions[m][index]
                opi_2 = opinions[m - 1][index]
                if len(opi_1) > len(opi_2):
                    tmp = opinions[m - 1]
                    opinions[m - 1] = opinions[m]
                    opinions[m] = tmp

        for opinion in opinions:
            opi = opinion[index]
            if len(distinct_opinion_list) == 0:
                distinct_opinion_list.append(opinion)
            else:
                include = False
                for idx in range(0, len(distinct_opinion_list)):
                    try:
                        include |= distinct_opinion_list[idx][index].index(
                            opi) > -1
                    except ValueError:
                        pass
                if not include:
                    distinct_opinion_list.append(opinion)

        return distinct_opinion_list

    def __parse_segment(self, words, show_detail=False):
        postags = self.__postagger.postag(words)

        word_tag_tuple_list = []
        for i in range(len(words)):
            word_tag_tuple_list.append((str(i), words[i], postags[i]))
        arcs = self.__parser.parse(words, postags)

        # arcs: the dependency parsing result
        labels = self.__labeller.label(words, postags, arcs)  # semantic role labeling

        if show_detail:
            logger.info("|".join(words))
            logger.info("  ".join('|'.join(tpl)
                                  for tpl in word_tag_tuple_list))
            logger.info("  ".join("%d|%d:%s" % (n, arc.head, arc.relation)
                                  for n, arc in enumerate(arcs)))
            for label in labels:
                logger.info(
                    str(label.index) + ":" + ",".join([
                        "%s:(%d,%d)" %
                        (arg.name, arg.range.start, arg.range.end)
                        for arg in label.arguments
                    ]))

        # opinions = self.__parse_main_opinion(arcs, words, postags)
        opinions = self.__parse_opinions(arcs, words, postags)
        return opinions

    def __parse_opinions(self, arcs, words, postags):
        """
        Given the set of core POS tags, extract a phrase-level opinion for every word carrying such a tag
        :param arcs:
        :param words:
        :param postags:
        :return:
        """
        opinions = []
        for n, arc in enumerate(arcs):
            postag = postags[n]
            word = words[n]
            if postag in [Pos.v.value, Pos.a.value, Pos.i.value] or \
                    (postag == Pos.a.value and word not in self.__adv_list) or \
                    (arc.relation in [Dependency.HED.value, Dependency.COO.value] and postag not in [Pos.v.value, Pos.a.value, Pos.i.value, Pos.m.value, Pos.c.value]):
                opinion_word_list = self.__parse_opinion(
                    n, arcs, words, postags)
                if self.__check_opinion(postag, word, opinion_word_list):
                    opinion_str = self.__opinion_to_str(
                        n, words, opinion_word_list)
                    opinions.append((postag, words[n], opinion_str))

        return opinions

    def __parse_main_opinion(self, arcs, words, postags):
        """

        :param arcs:
        :param words:
        :param postags:
        :return:
        """
        for n, arc in enumerate(arcs):
            if arc.relation == Dependency.HED.value:
                core_index = n
        core_pos = postags[core_index]
        opinion_word_list = self.__parse_opinion(core_index, arcs, words,
                                                 postags)
        return core_pos, words[core_index], self.__opinion_to_str(
            core_index, words, opinion_word_list)

    @classmethod
    def __check_opinion(cls, core_word_pos, core_word, opinion_word_list):
        """
        Check the validity of an opinion
        :param core_word_pos:
        :param core_word:
        :param opinion_word_list:
        :return:
        """
        if len(opinion_word_list) > 0:
            return True
        if len(opinion_word_list) == 0 and core_word_pos not in [
                Pos.v.value, Pos.d.value
        ]:
            return True
        if len(opinion_word_list
               ) == 0 and core_word_pos == Pos.v.value and len(
                   core_word) > 1:  # 入口即化|v
            return True
        return False

    def __opinion_to_str(self, core_word_index, words, opinion_word_list):
        """
        Render the opinion as a string
        :param core_word_index:
        :param words:
        :param opinion_word_list:
        :return:
        """
        index_list = [core_word_index]
        if self.__remove_core_word(words[core_word_index]):
            index_list = []

        for opinion_word in opinion_word_list:
            index = opinion_word[0]
            index_list.append(index)
        index_list.sort()

        opinion = ""
        for index in index_list:
            opinion += words[index]

        return self.__remove_special_word(opinion)

    @classmethod
    def __remove_core_word(cls, word):
        if word == "是":
            return True
        return False

    def __remove_special_word(self, opinion):
        new_opinion = opinion
        for sp_word in self.__special_prefix_list:
            if opinion.rfind(sp_word) == 0:
                new_opinion = opinion[len(sp_word):]
                return self.__remove_special_word(new_opinion)
        return new_opinion
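
A hedged usage sketch for OpinionExtractor; it assumes LTP_MODEL_DIR, DICTIONARY_DIR and the dictionary files loaded in __init__ are configured as in the original project:

# Illustrative driver; the review text is made up and the exact output depends on the dictionaries.
extractor = OpinionExtractor()
opinions = extractor.extract_opinion("材料很新鲜,服务也特别好,就是价格有点贵")
print(opinions)  # e.g. a list of short opinion phrases, one per matched clause
extractor.release()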
Example #10
def getRelation(paragraph):
    """
	paragraph: a list of string, each string is a sentence
	return: a list of relations and a dict which records the number of occurrence of differents DSNF
	"""
    relations = []
    dict_DSNF = {
        'num_DSNF1': 0,
        'num_DSNF2': 0,
        'num_DSNF3': 0,
        'num_DSNF7': 0,
    }

    segmentor = Segmentor()
    segmentor.load(os.path.join(MODELDIR, "cws.model"))
    postagger = Postagger()
    postagger.load(os.path.join(MODELDIR, "pos.model"))
    parser = Parser()
    parser.load(os.path.join(MODELDIR, "parser.model"))
    recognizer = NamedEntityRecognizer()
    recognizer.load(os.path.join(MODELDIR, "ner.model"))

    for iteration, sentence in enumerate(paragraph):
        print("evaluate the " + str(iteration + 1) + "-th sentences")

        sentence = SentenceSplitter.split(sentence)[0]

        words = segmentor.segment(sentence)
        # print("\t".join(words))

        postags = postagger.postag(words)
        # list-of-string parameter is supported in 0.1.5
        # postags = postagger.postag(["中国","进出口","银行","与","中国银行","加强","合作"])
        # print("\t".join(postags))

        arcs = parser.parse(words, postags)

        # print("\t".join("%d:%s" % (arc.head, arc.relation) for arc in arcs))

        netags = recognizer.recognize(words, postags)
        # print("\t".join(netags))

        # labeller = SementicRoleLabeller()
        # labeller.load(os.path.join(MODELDIR, "pisrl.model"))
        # roles = labeller.label(words, postags, arcs)
        # for role in roles:
        #     print(role.index, "".join(
        #             ["%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end) for arg in role.arguments]))

        entityList = findEntities(netags)
        # print(entityList)
        entities = []
        for i in entityList:
            l = ''
            for j in i:
                l += words[j]
            entities.append(l)
        print("entities in " + str(iteration + 1) + "-th sentence : ",
              entities)

        DSNF1_ret = DSNF1(arcs, entityList, words, netags)
        DSNF2_ret = DSNF2(arcs, entityList, words)
        DSNF3_ret = DSNF3(arcs, entityList, words, postags)
        DSNF7_ret = DSNF7(arcs, entityList, words)
        # print("DSNF1 result: ", DSNF1_ret)
        # print("DSNF2 result: ", DSNF2_ret)
        # print("DSNF3 result: ", DSNF3_ret)
        # print("DSNF7 result: ", DSNF7_ret)
        relation = []
        for r in DSNF1_ret:
            dict_DSNF['num_DSNF1'] += 1
            relation.append(r)
            relations.append(r)
        for r in DSNF2_ret:
            dict_DSNF['num_DSNF2'] += 1
            relation.append(r)
            relations.append(r)
        for r in DSNF3_ret:
            dict_DSNF['num_DSNF3'] += 1
            relation.append(r)
            relations.append(r)
        for r in DSNF7_ret:
            dict_DSNF['num_DSNF7'] += 1
            relation.append(r)
            relations.append(r)
        print("with entities relation: ", relation)
        print("--" * 30)

    segmentor.release()
    postagger.release()
    parser.release()
    recognizer.release()
    # labeller.release()

    return relations, dict_DSNF
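
getRelation() needs the DSNF1/DSNF2/DSNF3/DSNF7 pattern functions, findEntities() and MODELDIR from the surrounding project. Assuming those exist, a hedged driver would be:

# Hedged driver for getRelation(); the sentences are illustrative.
paragraph = [
    "中国进出口银行与中国银行加强合作。",
    "李克强总理今天考察上海自贸区。",
]
relations, dict_DSNF = getRelation(paragraph)
print(relations)   # extracted (entity, relation, entity) tuples
print(dict_DSNF)   # how many times each DSNF pattern fired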
Example #11
class NLP:
    """Natural language processing: word segmentation, POS tagging, named entity recognition and dependency parsing
    Attributes:
        default_user_dict_dir: str, directory of user-defined dictionaries
        default_model_dir: str, directory of the ltp model files
    """
    default_user_dict_dir = '../../resource/'  # default user dictionary directory (Tsinghua legal lexicon)
    default_model_dir = '../../model/'  # ltp model file directory
    
    def __init__(self, user_dict_dir=default_user_dict_dir, model_dir=default_model_dir):
        self.default_user_dict_dir = user_dict_dir
        self.default_model_dir = model_dir
        # initialize the segmenter
        # pynlpir.open()  # initialize the NLPIR segmenter
        # add user dictionaries (legal-document dictionary and the Tsinghua legal lexicon) in memory, which is faster
        files = os.listdir(user_dict_dir)
        for file in files:
            file_path = os.path.join(user_dict_dir, file)
            # skip directories
            if os.path.isdir(file_path):
                continue
            with open(file_path, 'r', encoding='utf-8') as f:
                line = f.readline()
                while line:
                    word = line.strip('\n').strip()
                    jieba.add_word(word)
                    # print(c_char_p(word.encode()))
                    # pynlpir.nlpir.AddUserWord(c_char_p(word.encode()))
                    line = f.readline()

        # load the ltp models
        # POS tagging model
        self.postagger = Postagger()
        postag_flag = self.postagger.load(os.path.join(self.default_model_dir, 'pos.model'))
        # named entity recognition model
        self.recognizer = NamedEntityRecognizer()
        ner_flag = self.recognizer.load(os.path.join(self.default_model_dir, 'ner.model'))
        # dependency parsing model
        self.parser = Parser()
        parse_flag = self.parser.load(os.path.join(self.default_model_dir, 'parser.model'))

        if postag_flag or ner_flag or parse_flag:
            print('load model failed!')

    def segment(self, sentence, entity_postag=dict()):
        """Word segmentation (jieba is used here; the original NLPIR calls are kept commented out)
        Args:
            sentence: string, the sentence
            entity_postag: dict, entity POS dictionary, empty by default; produced when analysing each case's structured text
        Returns:
            lemmas: list, segmentation result
        """
        # add the entity dictionary
        if entity_postag:
            for entity in entity_postag:
                # pynlpir.nlpir.AddUserWord(c_char_p(entity.encode()))
                jieba.add_word(entity)
        # pynlpir.nlpir.AddUserWord(c_char_p('前任'.encode()))  # example: add a single user word
        # pynlpir.nlpir.AddUserWord(c_char_p('习近平'.encode()))  # example: add a single user word
        # segmentation without POS tagging
        # lemmas = pynlpir.segment(sentence, pos_tagging=False)
        lemmas = jieba.lcut(sentence)
        # pynlpir.close()  # release
        return lemmas

    def postag(self, lemmas):
        """POS-tag the segmentation result
        Args:
            lemmas: list, segmentation result
            entity_dict: set, entity dictionary produced when processing a specific judgment's structured text
        Returns:
            words: WordUnit list, result with segmentation and POS tags
        """
        words = []  # word units of the processed sentence
        # POS tagging
        postags = self.postagger.postag(lemmas)
        for i in range(len(lemmas)):
            # store each WordUnit with its segmentation and POS tag; numbering starts from 1
            word = WordUnit(i+1, lemmas[i], postags[i])
            words.append(word)
        # self.postagger.release()  # release
        return words

    def get_postag(self, word):
        """Get the POS tag of a single word
        Args:
            word: str, the word
        Returns:
            post_tag: str, the POS tag of the word
        """
        post_tag = self.postagger.postag([word, ])
        return post_tag[0]

    def netag(self, words):
        """Named entity recognition; merge the NER result into the segmented and POS-tagged words
        Args:
            words: WordUnit list, result with segmentation and POS tags
        Returns:
            words_netag: WordUnit list, result with segmentation, POS tags and named entities
        """
        lemmas = []  # segmentation result
        postags = []  # POS-tagging result
        for word in words:
            lemmas.append(word.lemma)
            postags.append(word.postag)
        # named entity recognition
        netags = self.recognizer.recognize(lemmas, postags)
        # print('\t'.join(netags))  # just for test
        words_netag = EntityCombine().combine(words, netags)
        # self.recognizer.release()  # release
        return words_netag

    def parse(self, words):
        """Dependency-parse the result of segmentation, POS tagging and (optionally) NER
        Args:
            words: WordUnit list, result with segmentation, POS tags and named entities
        Returns:
            *: SentenceUnit, the sentence unit
        """
        lemmas = []  # segmentation result
        postags = []  # POS-tagging result
        for word in words:
            lemmas.append(word.lemma)
            postags.append(word.postag)
        # dependency parsing
        arcs = self.parser.parse(lemmas, postags)
        for i in range(len(arcs)):
            words[i].head = arcs[i].head
            words[i].dependency = arcs[i].relation
        # self.parser.release()
        return SentenceUnit(words)

    def close(self):
        """Shut down and release the NLP models"""
        # pynlpir.close()
        self.postagger.release()
        self.recognizer.release()
        self.parser.release()
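
A hedged end-to-end sketch of the NLP class above; it assumes the resource/ and model/ directories plus the WordUnit, SentenceUnit and EntityCombine helpers from the original project are available:

# Illustrative pipeline run for the NLP class above.
nlp = NLP()
lemmas = nlp.segment("李克强总理今天来我家了。")
words = nlp.postag(lemmas)          # WordUnit list
words_netag = nlp.netag(words)      # merge named entities into the word units
sentence = nlp.parse(words_netag)   # SentenceUnit with head and dependency filled in
for word in sentence.words:         # assumes SentenceUnit exposes its words, as used in the first extraction example above
    print(word.lemma, word.postag, word.head, word.dependency)
nlp.close()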
Example #12
class RequestHandler():
    def __init__(self):
        self.intents = [
            'translation', 'app', 'calc', 'match', 'radio', 'health', 'novel',
            'video', 'cinemas', 'music', 'stock', 'train', 'news', 'message',
            'map', 'weather', 'cookbook', 'tvchannel', 'flight', 'schedule',
            'riddle', 'email', 'contacts', 'bus', 'website', 'datetime',
            'poetry', 'lottery', 'chat', 'epg', 'telephone'
        ]

        self.segmentor = Segmentor()  # initialize CWS instance
        self.segmentor.load(configs.cws_path)  # load model
        self.postagger = Postagger()  # initialize POS tagger instance
        self.postagger.load(configs.pos_path)  # load model
        self.labeller = SementicRoleLabeller()  # initialize SRL instance
        self.labeller.load(configs.srl_path)  # load model
        self.parser = Parser()  # initialize parser instance
        self.parser.load(configs.parser_path)  # load model

        self.ac = ACAutomatons()

        self.clf_31 = NBSVM()

        self.char_vectorizer_31 = joblib.load(configs.models_path +
                                              '/nbsvm-vocab-ch.pkl')
        self.word_vectorizer_31 = joblib.load(configs.models_path +
                                              '/nbsvm-vocab-wd.pkl')
        self.clf_31 = joblib.load(configs.models_path + '/nbsvm_31.pkl')
        self.ch2_ = joblib.load(configs.models_path +
                                '/nbsvm-feature_selector.pkl')
        self.word_vectorizer_tv = joblib.load(configs.models_path +
                                              '/vocab-wd_epg-tvchannel.pkl')
        self.char_vectorizer_tv = joblib.load(configs.models_path +
                                              '/vocab-ch_epg-tvchannel.pkl')
        self.clf_tv = joblib.load(configs.models_path +
                                  '/svm_epg-tvchannel.pkl')
        self.word_vectorizer_movie = joblib.load(configs.models_path +
                                                 '/vocab-wd_video-cinemas.pkl')

        self.char_vectorizer_movie = joblib.load(configs.models_path +
                                                 '/vocab-ch_video-cinemas.pkl')
        self.clf_movie = joblib.load(configs.models_path +
                                     '/svm_video-cinemas.pkl')
        self.char_vectorizer_internet = joblib.load(
            configs.models_path + '/vocab-ch_website-app.pkl')
        self.word_vectorizer_internet = joblib.load(
            configs.models_path + '/vocab-wd_website-app.pkl')
        self.clf_internet = joblib.load(configs.models_path +
                                        '/svm_website-app.pkl')
        self.char_vectorizer_star = joblib.load(configs.models_path +
                                                '/vocab-ch_video-music.pkl')
        self.clf_star = joblib.load(configs.models_path +
                                    '/svm_video-music.pkl')

        self.word_vectorizer_star = joblib.load(configs.models_path +
                                                '/vocab-wd_video-music.pkl')
        self.char_vectorizer_video = joblib.load(configs.models_path +
                                                 '/vocab-ch_video-epg.pkl')
        self.word_vectorizer_video = joblib.load(configs.models_path +
                                                 '/vocab-wd_video-epg.pkl')
        self.clf_video = joblib.load(configs.models_path +
                                     '/svm_video-epg.pkl')

    def getResult(self, sentence):
        """1. Complete the classification in this function.

        Args:
            sentence: A string of sentence.

        Returns:
            classification: A string of the result of classification.
        """
        processed = self.preprocess(sentence)

        return self.pipeline(processed)

    def getBatchResults(self, sentencesList):
        """2. You can also complete the classification in this function,
                if you want to classify the sentences in batch.

        Args:
            sentencesList: A List of Dictionaries of ids and sentences,
                like:
                [{'id':331, 'content':'帮我打电话给张三' }, 
                 {'id':332, 'content':'帮我订一张机票!' },
                 ... ]

        Returns:
            resultsList: A List of Dictionaries of ids and results.
                The order of the list must be the same as the input list,
                like:
                [{'id':331, 'result':'telephone' }, 
                 {'id':332, 'result':'flight' },
                 ... ]
        """
        resultsList = []
        for sentence in sentencesList:
            resultDict = {}
            resultDict['id'] = sentence['id']
            resultDict['result'] = self.getResult(sentence['content'])
            resultsList.append(resultDict)

        return resultsList

    def pattern_match(self, sample):
        srl_res = self.sRLMatch(sample)
        if srl_res != None:

            return srl_res
        else:
            rul_res = self.ruleMatch(sample)
            if rul_res != None:

                return rul_res
            else:
                return None

    def ruleMatch(self, sample):
        domains = get_rule(sample['query'], self.ac)

        if len(domains) < 1:
            return None
        else:
            sorted_domains = aggregate_domains(domains)

            for each in sorted_domains:
                if each[0] == 'datetime':
                    nouns = get_nouns(sample['query'], 'festival', self.ac)

                    if len(nouns) > 0:
                        return 'datetime'
                    else:
                        continue

                elif each[0] == 'email':
                    if len(
                            set(sample['word'])
                            & set(['写', '回复', '转发', '打开', '查收', '查看', '答复'])
                    ) > 0:
                        return 'email'
                    else:
                        continue

            else:
                return None

    def sRLMatch(self, sample):
        srl_res = getSRL(sample['query'], self.segmentor, self.postagger,
                         self.parser, self.labeller)
        if len(srl_res) == 0:  # no predicate in the query, or only a single entity
            return None
        else:
            for res in srl_res:
                predicate_domains = get_predicate(res[0], self.ac)
                if len(predicate_domains) < 1:
                    continue  # no such predicate in the database
                else:
                    sorted_domains = aggregate_domains(predicate_domains)
                    for each in sorted_domains:
                        if each[0] == 'app':
                            nouns = get_nouns(res[1], 'app', self.ac)
                            if len(nouns) > 0:

                                return 'app'
                            else:
                                continue

                        elif each[0] == 'cinemas':
                            nouns = get_nouns(res[1], 'film', self.ac)
                            if len(nouns) > 0:
                                return 'Movie_stuff'
                            else:
                                continue
                        elif each[0] == 'contacts':
                            # 'nr' by POS-tagger indicates a person's name
                            if 'nr' in sample['tag']:
                                return 'contacts'
                            else:
                                continue

                        elif each[0] == 'cookbook':
                            nouns = get_nouns(res[1], 'food', self.ac)
                            if len(nouns) > 0:  # if any food noun is matched, classify as cookbook

                                return 'cookbook'
                            else:
                                continue

                        elif each[0] == 'tvchannel':
                            nouns = get_nouns(res[1], 'tvchannel', self.ac)
                            if len(nouns) > 0:
                                return 'TV_stuff'
                            else:
                                continue

                        elif each[0] == 'video':
                            nouns = get_nouns(res[1], 'video', self.ac)
                            if len(nouns) > 0:
                                return 'Video_stuff'
                            else:
                                continue

                        elif each[0] == 'health':
                            nouns = get_nouns(res[1], 'disease', self.ac)
                            nouns.extend(get_nouns(res[1], 'drug', self.ac))
                            if len(nouns) > 0:
                                return 'health'
                            else:
                                continue

                        elif each[0] == 'music':
                            nouns_song = get_nouns(res[1], 'song', self.ac)
                            nouns_singer = get_nouns(res[1], 'singer', self.ac)
                            if len(nouns_song) > 0:

                                return 'music'
                            elif len(nouns_singer) > 0:
                                return 'Star_stuff'
                            else:
                                continue

                        elif each[0] == 'novel':
                            nouns = get_nouns(res[1], 'novel', self.ac)
                            if '小说' in res[1] or len(nouns) > 0:

                                return 'novel'
                            else:
                                continue

                        elif each[0] == 'poetry':
                            nouns = get_nouns(res[1], 'poet', self.ac)
                            if len(nouns) > 0:

                                return 'poetry'
                            else:
                                continue

                        elif each[0] == 'radio':
                            if len(get_nouns(res[1], 'radio', self.ac)) > 0:

                                return 'radio'
                            else:
                                continue

                        elif each[0] == 'stock':
                            nouns = get_nouns(res[1], 'stock', self.ac)
                            if len(nouns) > 0:

                                return 'stock'
                            else:
                                continue

                        elif each[0] == 'website':
                            nouns = get_nouns(res[1], 'website', self.ac)
                            if len(nouns) > 0:

                                return 'Internet_stuff'
                            else:
                                continue

    def retrieval(self, sample):
        """
        Find proper nouns to handle a single entity in a query
        :param sample: a dict indicates a query and its POS tag
        :return:a string indicates one certain intent
        """
        pn_res = doRetrieval(sample['query'],
                             self.ac)  #look up single instance
        sorted_domains = aggregate_domains(pn_res)
        if len(sorted_domains) == 1:  #one instance
            domain = sorted_domains[0][0]
            if len(max(sorted_domains[0][1],
                       key=len)) > len(sample['query']) / 2:
                if domain == 'airline': return 'flight'
                if domain in ['railwaystation', 'airport']: return 'map'
                if domain == 'app': return 'app'
                if domain == 'contacts': return 'contacts'
                if domain in ['drug', 'disease']: return 'health'
                if domain == 'festival': return 'datetime'
                if domain in ['moviestar', 'film', 'video']: return 'video'
                if domain == 'food': return 'cookbook'
                if domain == 'novel': return 'novel'
                if domain == 'place': return 'map'
                if domain == 'poet': return 'poetry'
                if domain == 'radio': return 'radio'
                if domain in ['singer', 'song']: return 'music'
                if domain == 'sports': return 'match'
                if domain == 'stock': return 'stock'
                if domain == 'tvchannel': return 'tvchannel'
                if domain == 'website': return 'website'
            return None
        else:
            return None

    def classifyAllIntents(self, sample):
        """
        A classifier for 31 intents including chitchat
        :param sample: a dict indicates a query and its POS tag
        :return:a string indicates one certain intent
        """
        raw_query = sample['query']
        text = [''.join([w for w in jieba.cut(raw_query)])]
        test_ch = self.char_vectorizer_31.transform(text)
        test_wd = self.word_vectorizer_31.transform(text)
        test_vec = hstack([test_ch, test_wd])
        test_vec = csr_matrix(test_vec)
        test_vec = self.ch2_.transform(test_vec)
        pred = self.clf_31.predict(test_vec)
        return pred.tolist()[0]

    def epgOrTvchannel(self, sample):
        """
        A classifier to label an instance with 'epg' or 'tvchannel'
        :param sample: a dict indicates a query and its POS tag
        :return: a string indicates one certain intent
        """
        raw_query = sample['query']
        text = [''.join([w for w in jieba.cut(raw_query)])]
        test_ch = self.char_vectorizer_tv.transform(text)
        test_wd = self.word_vectorizer_tv.transform(text)
        test_vec = hstack([test_ch, test_wd])
        test_vec = csr_matrix(test_vec)
        pred = self.clf_tv.predict(test_vec)
        return pred.tolist()[0]

    def videoOrCinemas(self, sample):
        """
        A classifier to label an instance with 'video' or 'cinemas'
        :param sample: a dict indicates a query and its POS tag
        :return: a string indicates one certain intent
        """
        raw_query = sample['query']
        text = [''.join([w for w in jieba.cut(raw_query)])]
        test_ch = self.char_vectorizer_movie.transform(text)
        test_wd = self.word_vectorizer_movie.transform(text)
        test_vec = hstack([test_ch, test_wd])
        test_vec = csr_matrix(test_vec)
        pred = self.clf_movie.predict(test_vec)
        return pred.tolist()[0]

    def websiteOrApp(self, sample):
        """
        A classifier to label an instance with 'website' or 'app'
        :param sample: a dict indicates a query and its POS tag
        :return: a string indicates one certain intent
        """
        raw_query = sample['query']
        text = [''.join([w for w in jieba.cut(raw_query)])]
        test_ch = self.char_vectorizer_internet.transform(text)
        test_wd = self.word_vectorizer_internet.transform(text)
        test_vec = hstack([test_ch, test_wd])
        test_vec = csr_matrix(test_vec)
        pred = self.clf_internet.predict(test_vec)
        return pred.tolist()[0]

    def videoOrMusic(self, sample):
        """
        A classifier to label an instance with 'video' or 'music'
        :param sample: a dict indicates a query and its POS tag
        :return: a string indicates one certain intent
        """
        raw_query = sample['query']
        text = [''.join([w for w in jieba.cut(raw_query)])]
        test_ch = self.char_vectorizer_star.transform(text)
        test_wd = self.word_vectorizer_star.transform(text)
        test_vec = hstack([test_ch, test_wd])
        test_vec = csr_matrix(test_vec)
        pred = self.clf_star.predict(test_vec)
        return pred.tolist()[0]

    def videoOrEpg(self, sample):
        """
        A classifier to label an instance with 'epg' or 'video'
        :param sample: a dict containing a query and its POS tags
        :return: a string indicating one intent label
        """
        raw_query = sample['query']
        text = [''.join([w for w in jieba.cut(raw_query)])]
        test_ch = self.char_vectorizer_video.transform(text)
        test_wd = self.word_vectorizer_video.transform(text)
        test_vec = hstack([test_ch, test_wd])
        test_vec = csr_matrix(test_vec)
        pred = self.clf_video.predict(test_vec)
        return pred.tolist()[0]

    def pipeline(self, sample, use_pse=True, use_retrieval=False):
        """
        A pipeline to label an instance with one of 31 possible intents
        :param sample: a dict containing a query and its POS tags
        :return: a string indicating one intent label
        """
        if use_pse:
            ps_res = prettySureExpression(sample['query'], self.ac)

            if len(list(set([_[1][0] for _ in ps_res]))) == 1:
                return ps_res[0][1][0]
        pm_res = self.pattern_match(sample)

        if pm_res == 'TV_stuff':
            clf_res = self.classifyAllIntents(
                sample)  # an ML classifier over all 31 intents
            if clf_res in ['epg', 'tvchannel']:
                return clf_res
            else:
                return self.epgOrTvchannel(
                    sample)  # an ML classifier to label epg or tvchannel

        elif pm_res == 'Movie_stuff':
            clf_res = self.classifyAllIntents(
                sample)  # an ML classifier over all 31 intents
            if clf_res in ['video', 'cinemas']:
                return clf_res
            else:
                return self.videoOrCinemas(sample)

        elif pm_res == 'Internet_stuff':
            clf_res = self.classifyAllIntents(
                sample)  # an ML classifier over all 31 intents
            if clf_res in ['website', 'app']:
                return clf_res
            else:
                return self.websiteOrApp(sample)

        elif pm_res == 'Star_stuff':
            clf_res = self.classifyAllIntents(
                sample)  # an ML classifier over all 31 intents
            if clf_res in ['video', 'music']:
                return clf_res
            else:
                return self.videoOrMusic(sample)

        elif pm_res == 'Video_stuff':
            clf_res = self.classifyAllIntents(
                sample)  # an ML classifier over all 31 intents
            if clf_res in ['video', 'epg']:
                return clf_res
            else:
                return self.videoOrEpg(sample)

        elif pm_res is None:

            if use_retrieval:
                ret_res = self.retrieval(sample, self.ac)
                if ret_res is None:
                    return self.classifyAllIntents(
                        sample
                    )  # no pattern matched, so fall back to the ML classifier
                else:
                    return ret_res
            else:
                return self.classifyAllIntents(sample)
        else:
            return pm_res

    def preprocess(self, raw_query):
        """
        To segment a raw user query into words and POS-tag it
        :param raw_query: a string generated by a user
        :return: a dict indicating the segmented query, raw query and POS tags
        """
        tmp = pseg.cut(raw_query)
        words = []
        pos = []
        for word, flag in tmp:
            words.append(word)
            pos.append(flag)
        inst = {}
        inst['tag'] = pos
        inst['word'] = words
        del words
        del pos
        inst['query'] = raw_query
        return inst
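
    # Example of the structure returned by preprocess (hypothetical segmentation
    # and tags; the actual output depends on the jieba dictionary in use):
    #     preprocess('我想看新闻联播')
    #     -> {'word': ['我', '想', '看', '新闻联播'],
    #         'tag': ['r', 'v', 'v', 'nz'],
    #         'query': '我想看新闻联播'}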

    def close(self):
        """
        To release relevant models
        """
        self.postagger.release()  # release the model
        self.segmentor.release()  # release the model
        self.labeller.release()  # release the model
        self.parser.release()  # release the model
        del self.ac
        gc.collect()
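
# Usage sketch for the intent pipeline above. `IntentPipeline` is a placeholder
# name; the real class definition appears earlier in this example and loads the
# vectorizers, classifiers and LTP models it needs.
if __name__ == '__main__':
    ip = IntentPipeline()                            # hypothetical constructor
    sample = ip.preprocess('今晚有什么好看的电视剧')   # segment and POS-tag the raw query
    print(ip.pipeline(sample, use_pse=True))         # one of the 31 intent labels
    ip.close()                                       # release the LTP models
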
示例#13
    def split_sentence(self,
                       sentence=None,
                       say_word_list: List[str] = None,
                       cycle: bool = True,
                       ratio: float = None) -> str:
        """
        Segment the sentence and extract the speech following a "say"-like verb
        :param sentence: the raw sentence to process
        :param say_word_list: words similar to "say"
        :param cycle: whether to run the extraction at all
        :param ratio: similarity threshold for keeping the following sentence
        :return: the extracted speech string
        """
        LTP_DATA_PATH = r'D:\pyltp-master\ltp_data_v3.4.0'

        cws_model_path = os.path.join(LTP_DATA_PATH, 'cws.model')
        pos_model_path = os.path.join(LTP_DATA_PATH, 'pos.model')
        ner_model_path = os.path.join(LTP_DATA_PATH, 'ner.model')
        par_model_path = os.path.join(LTP_DATA_PATH, 'parser.model')

        postagger = Postagger()
        postagger.load(pos_model_path)
        print('Postagger loaded!')
        parser = Parser()
        parser.load(par_model_path)
        print('Parser loaded!')
        segment = Segmentor()
        segment.load(cws_model_path)
        print('CWS loaded!')
        if cycle == True:

            try:
                result = ''  # initialise so the return below cannot hit an unbound name
                lines = sentence
                sentence = list(segment.segment(lines))
                # print('sen ok')
                # find words similar to "say"
                find_say_word = [
                    word for word in sentence if word in say_word_list
                ]
                if len(find_say_word) == 0:
                    print('没有发现类似“说”的单词!')
                else:
                    post_word = postagger.postag(sentence)
                    post_word = list(post_word)
                    # print('post ok')
                    parse_word = parser.parse(sentence, post_word)
                    parse_word = [(arc.head, arc.relation)
                                  for arc in parse_word]

                    # print('parse ok')
                    counter_index = 0
                    for index, word in enumerate(parse_word):
                        location_part1 = ''
                        location_part2 = ''
                        location_part3 = ''
                        # find the first SBV arc and record its subject and head verb
                        if word[-1] == 'SBV':
                            counter_index = word[0]
                            location_part1 += sentence[index]
                            location_part1 += sentence[word[0] - 1]
                            break
                    # Collect the text after the SBV head, whether or not it is quoted,
                    # for the later text-vector similarity computation. For now, only
                    # quoted content and the spans ending at the next two full stops
                    # are extracted as data.
                    if sentence[counter_index] == '"':
                        for index_2, word_2 in enumerate(
                                sentence[counter_index + 1:]):
                            if word_2 == '"':
                                break
                            location_part2 += word_2
                    else:
                        for index_2, word_2 in enumerate(
                                sentence[counter_index:]):
                            if word_2 == '。':
                                for word_4 in sentence[index_2 + 1:]:
                                    if word_4 == '。':
                                        break
                                    location_part3 += word_4
                                break
                            location_part2 += word_2
                    # compare the similarity of the two full-stop-delimited spans around the say-verb
                    cal_ratio = difflib.SequenceMatcher(
                        None, location_part2, location_part3).ratio()
                    if cal_ratio > ratio:
                        result = location_part1 + location_part2 + location_part3
                    else:
                        result = location_part1 + location_part2
                segment.release()
                postagger.release()
                parser.release()
                return result.strip('\n')
            except Exception as e:
                print(e)

        elif cycle == False:
            print('不处理')
        else:
            raise TypeError('错误的输入类型')
        print('词标注和上下文定义结束')
        print('-' * 20, '华丽的分割线', '-' * 20)
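
# A small, self-contained illustration of the similarity check used in
# split_sentence above: difflib.SequenceMatcher(None, a, b).ratio() returns a
# float in [0, 1], and the span after the following full stop is appended to
# the result only when that ratio exceeds the caller-supplied `ratio` threshold.
import difflib

part2 = '我们会继续推进改革'
part3 = '改革将持续推进下去'
print(difflib.SequenceMatcher(None, part2, part3).ratio())  # the exact value depends on the two strings
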
示例#14
    def extract_comment(self, article, say_words):
        """
        Extract quoted speech from an article
        :param article: the raw news text
        :param say_words: words similar to "say"
        :return: result: list[[person, say, comment], ...]
        """
        # path to the LTP models
        LTP_DATA_PATH = '../ltp_data_v3.4.0'

        cws_model_path = os.path.join(LTP_DATA_PATH, 'cws.model')
        pos_model_path = os.path.join(LTP_DATA_PATH, 'pos.model')
        ner_model_path = os.path.join(LTP_DATA_PATH, 'ner.model')
        par_model_path = os.path.join(LTP_DATA_PATH, 'parser.model')

        postagger = Postagger()
        postagger.load(pos_model_path)
        print('Postagger loaded!')
        recognizer = NamedEntityRecognizer()
        recognizer.load(ner_model_path)
        print('NER loaded!')
        parser = Parser()
        parser.load(par_model_path)
        print('Parser loaded!')

        result = []
        sentences = self.cut_sentence(self.token(article))
        for s_index, sentence in enumerate(sentences):
            words = self.cut_word(sentence)
            pos = self.word_pos(sentence, postagger)
            ner_list = self.ner(words, pos, recognizer)
            parse_list = self.dependency_parse(words, pos, parser)
            if any(tag in ner_list for tag in ('S-Nh', 'S-Ni', 'S-Ns')):
                comment = ''
                for p_index, p in enumerate(parse_list):
                    # p[0]-1: index of the say-verb (in both words and parse_list)
                    # p_index: position of the subject

                    if (p[1] == 'SBV') and words[p[0] - 1] in say_words:
                        say = words[p[0] - 1]
                        person = words[p_index]
                        p_i = 1
                        while p_i <= p_index and parse_list[p_index -
                                                            p_i][1] == 'ATT':
                            person = words[p_index - p_i] + person
                            p_i = p_i + 1
                        # the say-verb is followed by '。', so look for “…” in the previous sentence
                        if words[p[0]] == '。':
                            # print('说。')
                            i = 1
                            last_sentence = sentences[s_index - i]
                            last_words = self.cut_word(last_sentence)
                            begin = self.find_str_index(last_words, 0, ['“'])
                            end = self.find_str_index(last_words, 0, ['”'])
                            if begin != -1 and end != -1 and begin < end:
                                comment = ''.join(last_words[begin + 1:end])
                            else:
                                while begin == -1 and end != -1:
                                    i = i + 1
                                    last_sentence = sentences[s_index - i]
                                    last_words = self.cut_word(last_sentence)
                                    begin = self.find_str_index(
                                        last_words, 0, ['“'])
                                while i > 0:
                                    comment = comment + sentences[s_index - i]
                                    i = i - 1
                        else:
                            begin = self.find_str_index(words, p[0], ['“'])
                            end = self.find_str_index(words, p[0], ['”'])
                            if begin != -1 and end != -1 and parse_list[
                                    end - 1][1] == 'WP':
                                comment = ''.join(words[begin:end])
                            elif begin != -1 and end == -1:
                                comment = ''.join(words[begin:])
                                i = 1
                                next_sentence = sentences[s_index + i]
                                while end == -1:
                                    end = self.find_str_index(
                                        self.cut_word(next_sentence), 0, ['”'])
                                    i = i + 1
                                    if len(sentences) > s_index + i:
                                        next_sentence = sentences[s_index + i]
                                    else:
                                        break
                                comments = ''
                                while i > 1 and len(sentences) > s_index + i:
                                    comments = sentences[s_index +
                                                         i] + comments
                                    i = i - 1
                                comment = comment + comments

                            else:
                                # the say-verb is followed by ',' or ':'
                                if words[p[0]] == ',' or words[
                                        p[0]] == ',' or words[p[0]] == ':':
                                    # print('说,')
                                    comment = ''.join(words[p[0] + 1:])
                                    # end = self.find_str_index(words, p[0] + 1, ['。', '!'])
                                    # if end != -1:
                                    #     comment = ''.join(words[p[0] + 1:end])
                                    # the say-verb takes an object (VOB/IOB)
                                elif parse_list[
                                        p[0]][1] == 'VOB' or parse_list[
                                            p[0]][1] == 'IOB':
                                    print('告诉谁')
                                    i = 0
                                    comment = ''.join(words[p[0] + 1:])
                                    # while len(comment) == 0:
                                    #     end = self.find_str_index(words, p[0] + i, [ '。', '!'])
                                    #     if end != -1:
                                    #         comment = ''.join(words[p[0] + i:end])
                                    #     i = i + 1
                                    # the say-verb is directly followed by the content
                                else:
                                    comment = ''.join(words[p[0]:])
                                    # print('说内容')
                                    # end = self.find_str_index(words, p_index, [ '。', '!'])
                                    # if end != -1:
                                    #     comment = ''.join(words[p[0]:end])

                        print(parse_list)
                        # print(words[p[0]])
                        print(sentence)
                        print('[{}] [{}] [{}]'.format(person, say, comment))
                        print('-' * 50)
                        item = []
                        # item.append(person)
                        # item.append(say)
                        # item.append(comment)
                        result.append([person, say, comment])
                        # result.append(item)

        postagger.release()
        recognizer.release()
        parser.release()

        return result
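
# Usage sketch (hypothetical): `CommentExtractor` stands in for the enclosing
# class, and say_words would normally come from a curated list of "say"-like
# verbs or a word-vector expansion of them.
if __name__ == '__main__':
    extractor = CommentExtractor()                   # hypothetical class name
    say_words = ['说', '表示', '认为', '指出']
    article = '新华社消息,他表示:"我们会继续努力。"'
    for person, say, comment in extractor.extract_comment(article, say_words):
        print(person, say, comment)
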
示例#15
class DSFN:
    """进行自然语言处理,包括分词,词性标注,命名实体识别,依存句法分析
    Attributes:
        default_user_dict_dir:str,用户自定义词典目录
        default_model_dir:str,ltp模型文件目录
    """

    entity_verb_new = entity_verb_new()
    all_entity = entity_verb_new.readAllEntity("../../entity_verb/entity_verb_result/all_entity.json")
    default_model_dir = 'D:\\python-file\\knowledge_extraction-master-tyz\\ltp_data_v3.4.0\\'  # directory of the LTP model files

    def __init__(self, model_dir=default_model_dir, all_entity=all_entity):
        self.default_model_dir = model_dir
        # load the LTP models (use the directory passed in, rather than re-hardcoding it)
        self.segmentor = Segmentor()
        user_dict = "../source/user.txt"
        segmentor_flag = self.segmentor.load_with_lexicon(os.path.join(self.default_model_dir, 'cws.model'), user_dict)
        # segmentor_flag = self.segmentor.load(os.path.join(self.default_model_dir, 'cws.model'))
        # POS tagging model
        self.postagger = Postagger()
        postag_flag = self.postagger.load(os.path.join(self.default_model_dir, 'pos.model'))
        # named entity recognition model
        self.recognizer = NamedEntityRecognizer()
        ner_flag = self.recognizer.load(os.path.join(self.default_model_dir, 'ner.model'))
        # dependency parsing model
        self.parser = Parser()
        parser_flag = self.parser.load(os.path.join(self.default_model_dir, 'parser.model'))

        if segmentor_flag or postag_flag or ner_flag or parser_flag:  # this check may be unreliable
            print('load model failed')

    def segment(self, sentence, entity_postag=dict()):
        words = self.segmentor.segment(sentence)
        lemmas = []
        for lemma in words:
            lemmas.append(lemma)
        return lemmas

    def getPostag(self):
        return self.postagger

    def postag(self, lemmas):
        """
        Parameters
        ----------
        lemmas : List, the segmented words
        Returns
        -------
        words : WordUnit List, the segmentation and POS-tagging results
        """
        words = []
        # POS tagging
        postags = self.postagger.postag(lemmas)
        for i in range(len(lemmas)):
            # store each word as a WordUnit with its POS tag; IDs start at 1
            word = WordUnit(i + 1, lemmas[i], postags[i])
            words.append(word)
        # self.postagger.release() #释放
        return words

    def get_postag(self, word):
        """获得单个词的词性标注
        Args:
            word:str,单词
        Returns:
            pos_tag:str,该单词的词性标注
        """
        pos_tag = self.postagger.postag([word])
        return pos_tag[0]

    def netag(self, words):
        """
        Named entity recognition: run NER on the segmented and POS-tagged words and merge the results
        Parameters
            words : WordUnit list, the segmentation and POS-tagging results
        Returns
            words_netag : WordUnit list, the segmentation, POS-tagging and NER results
        """
        lemmas = []  # the segmented words
        postags = []  # the POS tags
        for word in words:
            lemmas.append(word.lemma)
            postags.append(word.postag)
        # named entity recognition
        netags = self.recognizer.recognize(lemmas, postags)
        words_netag = EntityCombine().combine(words, netags)
        return words_netag

    def parse(self, words):
        """
        Dependency parsing on the segmentation, POS-tagging and (optional) NER results
        Args:
            words: WordUnit list, containing the segmentation, POS-tagging and NER results
        Returns
            *: SentenceUnit, the sentence unit
        """
        lemmas = []  # segmented words
        postags = []  # POS tags
        for word in words:
            lemmas.append(word.lemma)
            postags.append(word.postag)
        # dependency parsing
        arcs = self.parser.parse(lemmas, postags)
        for i in range(len(arcs)):
            words[i].head = arcs[i].head
            words[i].dependency = arcs[i].relation
        return SentenceUnit(words)

    def close(self):
        """
        Close and release the models
        """
        # pynlpir.close()
        self.postagger.release()
        self.recognizer.release()
        self.parser.release()

    def splitSentence(self,text):
        pattern = r'。|!|?|;|='
        result_list = re.split(pattern, text)
        result_list = list(filter(self.not_empty, result_list))
        #    print(result_list)
        return result_list

    def splitSentenceByComma(self,text):
        pattern = r','
        result_list = re.split(pattern, text)
        result_list = list(filter(self.not_empty, result_list))
        return result_list

    def not_empty(self,s):
        return s and "".join(s.split())
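
    # Self-contained illustration of splitSentence plus the not_empty filter,
    # on a made-up sentence:
    #     re.split(r'。|!|?|;|=', '昨天下雨。今天放晴!')
    #     -> ['昨天下雨', '今天放晴', ''] ; not_empty then drops the trailing ''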

    def dsfn1_2_3_4COO(self, sentence, item1, item2):
        allTripes = []

        """
        Check whether the two entities fall under the DSFN1 case and output the triple
        """
        if self.dsfnConstraints3(sentence,item1,item2) and (item1.dependency == "ATT"):
            AttWord = item1.head_word
            AttWordDict = dict()
            AttWordStr = ""
            while AttWord.ID < item2.ID:
                AttWordDict[AttWord.ID] = AttWord.lemma
                # AttWordStr += AttWord.lemma
                if (AttWord.dependency == "ATT"):
                    AttWord = AttWord.head_word
                else:
                    break

            if (AttWord.ID == item2.ID):
                flag = True
                while flag:
                    len1 = len(AttWordDict)
                    AttList = AttWordDict.keys()
                    for id in range(item1.ID + 1, item2.ID):
                        item = sentence.get_word_by_id(id)
                        if item.head_word != None and item.head_word.ID in AttList and (item.dependency == "ATT"):
                            AttWordDict[item.ID] = item.lemma
                    if len1 == len(AttWordDict):
                        flag = False
                    else:
                        flag = True
                AttWordDict = sorted(AttWordDict.items(), key=lambda item: item[0])
                AttWordStr = ""
                for i in AttWordDict:
                    AttWordStr += i[1]
                # print("三元组:(" + item1.lemma + "," + AttWordStr + "," + item2.lemma + ")")
                allTripes.append([item1.lemma, AttWordStr, item2.lemma])


        """
        Handle the DSFN2 case
        """
        if item1.dependency == "SBV":
            pred1 = item1.head_word
            predDict = dict()
            predDict[pred1.ID] = pred1.lemma

            if item2.dependency == "VOB":
                pred2 = item2.head_word
                predDict[pred2.ID] = pred2.lemma
                if (len(predDict) == 1):
                    PredWordStr = ""
                    for i in predDict:
                        PredWordStr += predDict[i]
                    # print("DSFN2三元组:(" + item1.lemma + "," + PredWordStr + "," + item2.lemma + ")")
                    allTripes.append([item1.lemma, PredWordStr, item2.lemma])
                    """
                    Newly added to handle coordinated verbs, e.g. "习近平视察和访问上海"
                    """
                if len(predDict) ==2:
                    num = self.get_entity_num_between(pred1,pred2,sentence)
                    # print("pred1:"+pred1.lemma+",pred2:"+pred2.lemma+",num:"+str(num))
                    if num == 0:
                        # print("DSFN2三元组:(" + item1.lemma + "," + pred1.lemma + "," + item2.lemma + ")")
                        allTripes.append([item1.lemma, pred1.lemma, item2.lemma])
                        # print("DSFN2三元组:(" + item1.lemma + "," + pred2.lemma + "," + item2.lemma + ")")
                        allTripes.append([item1.lemma, pred2.lemma, item2.lemma])



        """
        DSFN3.0
        """
        pred = None
        if item1.dependency == "SBV" and item2.dependency == "POB":
            pred = item1.head_word
            prep = item2.head_word
        elif item1.dependency == "FOB" and item2.dependency == "POB":  # 考虑介词为“被”的情况,如 “小王被小明所陷害”
            pred = item1.head_word
            prep = item2.head_word
            c = item1
            item1 = item2
            item2 = c
        if pred != None and prep != None:
            if prep.dependency == "ADV":
                if prep.head_word.ID == pred.ID:
                    pred2 = None
                    object = None
                    objectForPred2 = None
                    for i in range(pred.ID + 1, len(sentence.words) + 1):
                        item = sentence.get_word_by_id(i)

                        if item.dependency == "VOB" and item.head_word.ID == pred.ID:
                            object = item
                            objectDict = dict()
                            objectDict[object.ID] = object
                            for word in sentence.words:
                                if word.head_word != None and word.dependency == "ATT" and word.head_word.ID == object.ID:
                                    objectDict[word.ID] = word
                            objectDict = sorted(objectDict.items(), key=lambda item: item[0])
                            objectStr = ""
                            for objectItem in objectDict:
                                objectStr += objectItem[1].lemma
                            # print(
                            #     "DSFN3三元组:(" + item1.lemma + "," + pred.lemma + "" + objectStr + "," + item2.lemma + ")")
                            allTripes.append([item1.lemma, pred.lemma + "" + objectStr, item2.lemma])
                            # print("DSFN3三元组:(" + item1.lemma + "," + pred.lemma + "" + object.lemma + "," + item2.lemma + ")")
                            # allTripes.append([item1.lemma, pred.lemma + "" + object.lemma, item2.lemma])
                    if object == None:
                        # print("DSFN3三元组:(" + item1.lemma + "," + pred.lemma + "," + item2.lemma + ")")
                        allTripes.append([item1.lemma, pred.lemma , item2.lemma])
        """
        DSFN4
        """
        pred = None
        prep = None
        prep1 = None
        pred2 = None
        if item1.dependency == "SBV" and item2.dependency == "POB":
            pred = item1.head_word
            prep = item2.head_word
            if prep.dependency == "CMP":
                pred2 = prep.head_word
                if pred2.ID == pred.ID:
                    # print("DSFN4三元组:(" + item1.lemma + "," + pred.lemma + "" + prep.lemma + "," + item2.lemma + ")")
                    allTripes.append([item1.lemma, pred.lemma + "" + prep.lemma, item2.lemma])
                else :
                    num = self.get_entity_num_between(pred1, pred2, sentence)
                    # print("pred1:"+pred1.lemma+",pred2:"+pred2.lemma+",num:"+str(num))
                    if num == 0:
                        for word in sentence.words:
                            if word.dependency == "CMP" and word.head_word.ID == pred.ID:
                                prep1 = word
                        if prep1!=None:
                            # print("DSFN4三元组:(" + item1.lemma + "," + pred.lemma + "" + prep1.lemma + "," + item2.lemma + ")")
                            allTripes.append([item1.lemma, pred.lemma + "" + prep1.lemma, item2.lemma])
                            # print("DSFN4三元组:(" + item1.lemma + "," + pred2.lemma + "" + prep.lemma + "," + item2.lemma + ")")
                            allTripes.append([item1.lemma, pred2.lemma + "" + prep.lemma, item2.lemma])
                        else:
                            # print("DSFN4三元组:(" + item1.lemma + "," + pred.lemma  + "," + item2.lemma + ")")
                            allTripes.append([item1.lemma, pred.lemma , item2.lemma])
                            # print("DSFN4三元组:(" + item1.lemma + "," + pred2.lemma + "" + prep.lemma + "," + item2.lemma + ")")
                            allTripes.append([item1.lemma, pred2.lemma + "" + prep.lemma, item2.lemma])

        """
        DSFN5
        """
        # self.dsfn5and6(rawSentence,sentence,item1,item2)
        return allTripes
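
    # Illustration (hypothetical parse) of the DSFN2 branch above: for the
    # sentence "习近平视察和访问上海" with item1 = 习近平 (SBV) and item2 = 上海 (VOB),
    # the two coordinated verbs yield two triples when no other entity lies
    # between them:
    #     [['习近平', '视察', '上海'], ['习近平', '访问', '上海']]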

    def get_entity_num_between(self,verb1,verb2,sentence):
        """
        Count the entities between two verbs
        Parameters
        ----------
        verb1 : WordUnit, the first verb
        verb2 : WordUnit, the second verb
        Returns:
            num: int, the number of entities between the two verbs
        """
        if verb1.ID > verb2.ID:
            c = verb1
            verb1 = verb2
            verb2 = c
        num = 0
        i = verb1.ID
        while i < verb2.ID-1:
            if self.is_entity(sentence.words[i]):
                num +=1
            i +=1
        return num

    def is_entity(self,entry):
        """判断词单元是否是实体
        Args:
            entry:WordUnit,词单元
        Returns:
            *:bool,实体(True),非实体(False)
        """
        #候选实体词性列表
        entity_postags = ['nh','ni','ns','nz','j','n','v']
        # print(entry.lemma+" : "+entry.postag)
        if entry.postag in entity_postags:
            return True
        else:
            return False
    def dsfnAttCOO(self,sentence,item1,item2):
        item1Att = item1
        item2Att = item2
        while item1Att.dependency == "ATT":
            item1Att = item1Att.head_word

        allTripe = self.dsfn1_2_3_4COO(sentence,item1Att,item2)
        if allTripe == None or len(allTripe) == 0:
            while item2Att.dependency == "ATT":
                item2Att = item2Att.head_word
            allTripe = self.dsfn1_2_3_4COO(sentence,item1,item2Att)
        if allTripe == None or len(allTripe) == 0:
            allTripe = self.dsfn1_2_3_4COO(sentence,item1Att,item2Att)
        for tripe in allTripe:
            if tripe[0] == item1Att.lemma:
                tripe[0] = item1.lemma
            if tripe[2] == item2Att.lemma:
                tripe[2] = item2.lemma
        return allTripe

    def dsfn5COO(self, sentence, item1, item2):
        if item1.dependency == "COO":
            item1COO = item1.head_word
            allTripes1 = self.dsfn1_2_3_4COO(sentence,item1COO,item2)
            # print(allTripes1)
            for tripe in allTripes1:
                if tripe[0] == item1COO.lemma:
                    tripe[0] = item1.lemma
                elif tripe[2] == item1COO.lemma:
                    tripe[2] = item1.lemma
            return allTripes1
            # print("allTripes1"+str(allTripes1))
    def dsfn6COO(self,sentence,item1,item2):
        if item2.dependency == "COO":
            item2COO = item2.head_word
            allTripes2 = self.dsfn1_2_3_4COO(sentence,item1,item2COO)
            for tripe in allTripes2:
                if tripe[2] == item2COO.lemma:
                    tripe[2] = item2.lemma
                elif tripe[0] == item2COO.lemma:
                    tripe[0] = item2.lemma
            return allTripes2
    def dsfn5and6COO(self,sentence,item1,item2):
        if item1.dependency == "COO":
            item1COO = item1.head_word
            if item2.dependency == "COO":
                item2COO = item2.head_word
                allTripe = self.dsfn1_2_3_4COO(sentence,item1COO,item2COO)
                for tripe in allTripe:
                    if tripe[0] == item1COO.lemma and tripe[2] == item2COO.lemma:
                        tripe[0] = item1.lemma
                        tripe[2] = item2.lemma
                    if tripe[2] == item1COO.lemma and tripe[0] == item2COO.lemma:
                        tripe[2] = item1.lemma
                        tripe[0] = item2.lemma
                return allTripe
    def dsfnStartCOO3(self, rawSentence, entity1, entity2,all_entity):
        nounRelatedWithPosition = ['主席','总理','教授','校长']
        resultList = []
        lemmas = self.segment(rawSentence)
        words = self.postag(lemmas)
        words_netag = self.netag(words)
        sentence = self.parse(words_netag)
        print(sentence.to_string())
        Rawitem1 = None
        Rawitem2 = None
        item1 = None
        item2 = None
        Rawitem1Index = -1
        Rawitem2Index = -1
        indexList = [-1,-1]
        for item in sentence.words:
            if (item.lemma == entity1):
                Rawitem1 = item
            if (item.lemma == entity2):
                Rawitem2 = item
            if Rawitem1 != None and Rawitem2 != None and (Rawitem1.ID!=Rawitem1Index or Rawitem2.ID!=Rawitem2Index):
                Rawitem1Index = Rawitem1.ID
                Rawitem2Index = Rawitem2.ID
                # if item1 == None or item2 == None:
                #     return None
                item1 = Rawitem1
                item2 = Rawitem2
                if item1.ID > item2.ID:
                    c = item1
                    item1 = item2
                    item2 = c
                # print(str(item1.ID) + "   " + str(item2.ID))
                itemCopy1 = item1
                itemCopy2 = item2
                if self.dsfnConstraints2(sentence,item1,item2,all_entity) == False:
                    continue
                allTripes = self.dsfnStartCOO2(sentence,item1,item2)
                # print("111"+item2.lemma)
                if allTripes!=None and len(allTripes) == 0:
                    # return None
                    # if item1.postag in ['n', 'nh', 'nl', 'ns', 'nz', 'ni'] and item1.dependency == "ATT":
                    #     item1 = item1.head_word
                    while item1.dependency == "ATT":
                        item1 = item1.head_word
                        # if 'n' in item1.postag and item1.postag not in ['nh', 'ns', 'nz', 'ni']:
                        #     pass
                        # else:
                        #     item1 = itemCopy1

                    # if item2.postag in ['n', 'nh', 'nl', 'ns', 'nz', 'ni'] and item2.dependency == "ATT":
                    #     item2 = item2.head_word
                    while item2.dependency == "ATT":
                        item2 = item2.head_word
                    allTripes = self.dsfnStartCOO2(sentence, item1, item2)
                    if len(allTripes) != 0:
                        for tripe in allTripes:
                            if tripe[1]!= "":
                                if tripe[0] == item1.lemma:
                                    if item1.ID < itemCopy1.ID:
                                        tripe[0] = item1.lemma+""+itemCopy1.lemma
                                    elif item1.ID > itemCopy1.ID:
                                        tripe[0] = itemCopy1.lemma+""+item1.lemma
                                    else:
                                        tripe[0] = itemCopy1.lemma

                                elif tripe[2] == item1.lemma:
                                    if item1.ID < itemCopy1.ID:
                                        tripe[2] = item1.lemma+""+itemCopy1.lemma
                                    elif item1.ID > itemCopy1.ID:
                                        tripe[2] = itemCopy1.lemma+""+item1.lemma
                                    else:
                                        tripe[2] = itemCopy1.lemma
                                    # tripe[2] = itemCopy1.lemma

                                if tripe[0] == item2.lemma:
                                    if item2.ID < itemCopy2.ID:
                                        tripe[0] = item2.lemma + ""+ itemCopy2.lemma
                                    elif item2.ID > itemCopy2.ID:
                                        tripe[0] = itemCopy2.lemma + ""+ item2.lemma
                                    else:
                                        tripe[0] = itemCopy2.lemma
                                elif tripe[2] == item2.lemma:
                                    # print(item2.lemma)
                                    if item2.ID < itemCopy2.ID:
                                        tripe[2] = item2.lemma + ""+ itemCopy2.lemma
                                    elif item2.ID > itemCopy2.ID:
                                        tripe[2] = itemCopy2.lemma + ""+ item2.lemma
                                    else:
                                        tripe[2] = itemCopy2.lemma
                                # print("12345")
                                resultList.append(tripe)
                else:
                    for tripe in allTripes:
                        if tripe[1]!="":
                            resultList.append(tripe)
                    # if len(resultList) > 0:
                    #     return np.array(set([tuple(t) for t in resultList]))
        if item1 == None or item2 == None:
            return None
        if len(resultList) > 0:
            return np.array(set([tuple(t) for t in resultList]))
    def dsfnStartCOO2(self, sentence, item1, item2):
        nounRelatedWithPosition = ['主席', '总理', '教授', '校长']
        resultList = []
        itemCopy1 = item1
        itemCopy2 = item2
        """
        Resolve nouns with an ATT dependency, e.g. 李克强[ATT] <----- 总理[SBV]
        """
        # print(item1.lemma)
        # print(item2.lemma)
        allTripes = self.dsfn1_2_3_4COO(sentence, item1, item2)
        if len(allTripes) == 0:
            # print("11111111")
            allTripes = self.dsfn5COO(sentence, item1, item2)
            if allTripes == None or len(allTripes) == 0:
                # print("2222222")
                allTripes = self.dsfn6COO(sentence, item1, item2)
                if allTripes == None or len(allTripes) == 0:
                    # print("3333333")
                    allTripes = self.dsfn5and6COO(sentence, item1, item2)
                    # if allTripes == None or len(allTripes) == 0:
                    #     print("44444444444")
                    #     allTripes = self.dsfnAttCOO(sentence,item1,item2)
        # print("第一次"+str(allTripes))
        if allTripes != None and len(allTripes) != 0:
            for tripe in allTripes:
                resultList.append(tripe)
        # print("第二次")
        pred1 = None
        subForCoo = None
        for item in sentence.words:
            if item.postag == "v" and item.dependency == "COO":
                pred1 = item.head_word

                for word in sentence.words:
                    if word.dependency == "SBV" and word.head_word.ID == pred1.ID:
                        for phrase in sentence.words:
                            if phrase.dependency == "SBV" and phrase.head_word.ID == item.ID:
                                subForCoo = phrase
                        if subForCoo is None or (
                                subForCoo is not None and subForCoo.ID == word.ID):  # handle verb COO; the coordinated verb must not have its own subject,
                            # e.g. 习近平主席视察厦门,李克强总理访问香港
                            word.head_word = item
                            allTripes = self.dsfn1_2_3_4COO(sentence, item1, item2)
                            if len(allTripes) == 0:
                                # print("11111111")
                                allTripes = self.dsfn5COO(sentence, item1, item2)
                                if allTripes == None or len(allTripes) == 0:
                                    # print("2222222")
                                    allTripes = self.dsfn6COO(sentence, item1, item2)
                                    if allTripes == None or len(allTripes) == 0:
                                        # print("3333333")
                                        allTripes = self.dsfn5and6COO(sentence, item1, item2)
                                        # if allTripes == None or len(allTripes) == 0:
                                        #     allTripes = self.dsfnAttCOO(sentence,item1,item2)
                            # print("第二次"+str(allTripes))
                            if allTripes != None and len(allTripes) != 0:
                                for tripe in allTripes:
                                    resultList.append(tripe)
        # print(np.array(set([tuple(t) for t in resultList])))
        return resultList

    def dsfnConstraints1(self,rawSentence,maxLength):
        """
        :param rawSentence: the original sentence
        :param maxLength: the maximum sentence length
        :return: pieces no longer than maxLength
        """
        newSentence = []
        if len(rawSentence) <= maxLength:
            newSentence.append(rawSentence)
            return newSentence
        else:
            newSentence = self.splitSentenceByComma(rawSentence)
            return newSentence

    def dsfnConstraints2(self,sentence,item1,item2,allEntities):
        countEntity = 0
        countChar = 0
        for index in range(item1.ID+1, item2.ID):
            word = sentence.get_word_by_id(index)
            countChar += len(word.lemma)
            if word.lemma in allEntities:
                countEntity +=1
        if countEntity > 3:
            return False
        elif countChar > 12:
            return False
        else:
            return True

    def dsfnConstraints3(self,sentence,item1,item2):
        countChar = 0
        for index in range(item1.ID+1, item2.ID):
            word = sentence.get_word_by_id(index)
            countChar += len(word.lemma)
        if countChar > 5:
            return False
        else:
            return True
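
    # Taken together, the constraint helpers above prune distant entity pairs:
    # dsfnConstraints2 rejects a pair when more than 3 entities or more than 12
    # characters lie between them, and dsfnConstraints3 (used by the ATT rule)
    # allows at most 5 characters between the pair.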
    def getSPO(self,sentence_list):
        for sentence in sentence_list:
            RawSentence = sentence
            lemmas = self.segment(sentence)
            words = self.postag(lemmas)
            words_netag = self.netag(words)
            sentence = self.parse(words_netag)
            print(sentence.to_string())
            for item in sentence.words:
                if (item.head_word == None and item.postag == "v" ) or (item.postag == "v" and
                                                                      item.dependency == "COO" and item.head_word.head_word == None):
                    relation_verb = item
                    if item.head_word==None:
                        verbId = item.ID
                        verbId2 = None
                    elif item.head_word.head_word == None:
                        verbId = item.ID
                        verbId2 = item.head_word.ID
                    O_dict = dict()
                    S_dict = dict()
                    OBJ = None
                    SUB = None
                    for item in sentence.words:
                        if item.dependency == "SBV" and item.head_word.ID == verbId:
                            # if SUB == None or SUB.lemma != entity:
                            SUB = item
                            S_dict[SUB.ID] = SUB.lemma
                        if (item.dependency == "VOB" and item.head_word.ID == verbId) or (item.dependency == "POB" and item.head_word.ID == verbId)\
                                or (item.dependency == "POB" and item.head_word.postag == "p" and item.head_word.dependency == "CMP"
                                    and item.head_word.head_word.ID== verbId) or(item.dependency == "POB" and item.head_word.postag == "p"\
                        and item.head_word.dependency == "ADV" and item.head_word.head_word.ID== verbId):
                            OBJ = item
                            O_dict[OBJ.ID] = OBJ.lemma
                            # if item.dependency == "POB" and item.head_word.postag == "p" and item.head_word.dependency == "CMP" \
                            #             and item.head_word.head_word.ID == verbId:
                            #     verb_p = item.head_word
                            # O_dict[OBJ.lemma] = OBJ.ID
                    if SUB == None:
                        for item in sentence.words:
                            if item.dependency == "SBV" and item.head_word.ID == verbId2:
                                # if SUB == None or SUB.lemma != entity:
                                SUB = item
                                S_dict[SUB.ID] = SUB.lemma
                    if OBJ == None:
                        for item in sentence.words:
                            if item.dependency == "VOB" and item.head_word.ID == verbId2:
                                OBJ = item
                                O_dict[OBJ.ID] = OBJ.lemma

                    OBJList = []
                    flag = True
                    while flag == True:
                        len1 = len(S_dict)
                        len2 = len(O_dict)
                        for item in sentence.words:
                            if SUB !=None and item.head_word!=None:
                                SUBList = S_dict.keys()
                                if item.head_word.ID in SUBList and (item.dependency =="ATT"
                                        or item.dependency == "COO"):
                                    SUBATT = item
                                    S_dict[SUBATT.ID] = SUBATT.lemma
                            if OBJ != None and item.head_word != None:
                                OBJList = O_dict.keys()
                                if item.head_word.ID in  OBJList and (item.dependency == "ATT" or item.dependency == "COO")  :
                                    OBJATT = item
                                    # if item.dependency!="COO":
                                    O_dict[OBJATT.ID] = OBJATT.lemma
                                    # else:
                                    #     O_dict[OBJATT.ID] = OBJATT.lemma+" "

                            if len(S_dict)!=len1 or len(O_dict)!=len2:
                                flag = True
                            else:
                                flag = False
                    O_dict = sorted(O_dict.items(), key=lambda item: item[0])
                    S_dict = sorted(S_dict.items(), key=lambda item: item[0])
                    Object = ""
                    Subject = ""
                    for i in O_dict:
                        Object += i[1]
                    for i in S_dict:
                        Subject += i[1]
                    if SUB != None :
                        print(RawSentence)
                        print((Subject, relation_verb.lemma, Object))

                    S_dict2 = dict()
                    O_dict2 = dict()
                    SUB_COO = None
                    OBJ_COO = None
                    for item in sentence.words:
                        if item.head_word != None:
                            if SUB != None and item.dependency == "COO" and item.head_word.ID == SUB.ID:
                                # if SUB == None or SUB.lemma != entity:
                                SUB_COO = item
                                S_dict2[SUB_COO.ID] = SUB_COO.lemma
                        if item.head_word != None and OBJ!=None:
                            if item.dependency == "COO" and item.head_word.ID == OBJ.ID:
                                OBJ_COO = item
                                O_dict2[OBJ_COO.ID] = OBJ_COO.lemma

                    flag = True
                    while flag == True:
                        len1 = len(S_dict2)
                        len2 = len(O_dict2)
                        for item in sentence.words:
                            if SUB_COO != None and item.head_word != None:
                                SUBList = S_dict2.keys()
                                if item.head_word.ID in SUBList and item.dependency == "ATT":
                                    SUBATT = item
                                    S_dict2[SUBATT.ID] = SUBATT.lemma
                            if OBJ_COO != None and item.head_word != None:
                                OBJList = O_dict2.keys()
                                if item.head_word.ID in OBJList and item.dependency == "ATT":
                                    OBJATT = item
                                    O_dict2[OBJATT.ID] = OBJATT.lemma
                            if len(S_dict2) != len1 or len(O_dict2) != len2:
                                flag = True
                            else:
                                flag = False
                    O_dict2 = sorted(O_dict2.items(), key=lambda item: item[0])
                    S_dict2 = sorted(S_dict2.items(), key=lambda item: item[0])
                    if len(O_dict2) or len(S_dict2):
                        if len(O_dict2) == 0:
                            O_dict2 = O_dict
                        if len(S_dict2) == 0:
                            S_dict2 = S_dict

                        Object = ""
                        Subject = ""
                        for i in O_dict2:
                            Object += i[1]
                        for i in S_dict2:
                            Subject += i[1]
                        if SUB != None:
                            print("11111111111111111111111"+RawSentence)
                            print((Subject, relation_verb.lemma, Object))

    def getSPO2(self,sentence_list):
        all_result = []
        raw_sentence = []
        for sentence in sentence_list:
            RawSentence = sentence
            lemmas = self.segment(sentence)
            words = self.postag(lemmas)
            words_netag = self.netag(words)
            sentence = self.parse(words_netag)
            # print(sentence.to_string())
            for itemWord in sentence.words:
                # find a verb that is either the HED of the sentence or in a COO relation with the HED
                if (itemWord.head_word == None and itemWord.postag == "v" ) or (itemWord.postag == "v" and
                                                                      itemWord.dependency == "COO" and itemWord.head_word.head_word == None):
                    relation_verb = itemWord   # use this verb as relation_verb
                    relationString = relation_verb.lemma
                    if itemWord.head_word==None:
                        verbId = itemWord.ID   # ID of the relation verb
                        verbId2 = None
                    elif itemWord.head_word.head_word == None:
                        verbId = itemWord.ID   # ID of the relation verb
                        verbId2 = itemWord.head_word.ID   # the HED of the sentence, used to find the SUB
                    O_dict = dict() # all Objects
                    S_dict = dict() # all Subjects
                    verb_dict = dict() # all verbs, mainly for cases like 习近平主席在北京大学发表演讲
                    OBJ = None
                    SUB = None
                    for item in sentence.words:
                        if item.dependency == "SBV" and item.head_word.ID == verbId: #寻找这个动词的主语
                            # if SUB == None or SUB.lemma != entity:
                            SUB = item #找到主语
                            S_dict[SUB.ID] = SUB.lemma #将主语加入到字典中

                        if (item.dependency == "VOB" and item.head_word.ID == verbId):
                            # 找到这个动词的宾语,其中包括:直接宾语,介词宾语(该宾语依赖POB---->介词(词性为p)--ADV or CMP-->动词)
                            OBJ = item
                            O_dict[OBJ.ID] = OBJ.lemma
                            relationString = relation_verb.lemma
                            verb_dict[OBJ.ID] = relationString
                        if (item.dependency == "POB" and item.head_word.postag == "p" and item.head_word.dependency == "CMP"
                                    and item.head_word.head_word.ID== verbId) :
                            # find the object of this verb: direct objects and prepositional objects (POB ----> preposition (tag p) --ADV or CMP--> verb)
                            OBJ = item
                            O_dict[OBJ.ID] = OBJ.lemma
                            relationString = relation_verb.lemma + "" + item.head_word.lemma
                            verb_dict[OBJ.ID] = relationString

                        if (item.dependency == "POB" and item.head_word.postag == "p"\
                            and item.head_word.dependency == "ADV" and item.head_word.head_word.ID== verbId):
                            # find the object of this verb: direct objects and prepositional objects (POB ----> preposition (tag p) --ADV or CMP--> verb)
                            OBJ = item
                            O_dict[OBJ.ID] = OBJ.lemma
                            relationString  = relation_verb.lemma
                            for eachWord in sentence.words:
                                if eachWord.dependency == "VOB" and eachWord.head_word.ID == relation_verb.ID:
                                    relationString = relation_verb.lemma + "" + eachWord.lemma
                            verb_dict[OBJ.ID] = relationString

                    if SUB == None: # if no subject was found, use the subject of verbId2, the verb coordinated with this one
                        for item in sentence.words:
                            if item.dependency == "SBV" and item.head_word.ID == verbId2:
                                # if SUB == None or SUB.lemma != entity:
                                SUB = item
                                S_dict[SUB.ID] = SUB.lemma

                    if OBJ == None:
                        verb_coo = None
                        for item in sentence.words:
                            if item.dependency == "COO" and item.head_word.ID == verbId:
                                verb_coo = item
                                break
                        flag = True
                        if verb_coo != None and self.get_entity_num_between(relation_verb,verb_coo,sentence) == 0:

                            for item in sentence.words:
                                if item.dependency == "SBV" and item.head_word.ID == verb_coo.ID:
                                    flag = False
                            if flag!= False:
                                for item in sentence.words:
                                    if (item.dependency == "VOB" and item.head_word.ID == verb_coo.ID)\
                                            or (item.dependency == "POB" and item.head_word.postag == "p" and item.head_word.dependency == "CMP"
                                    and item.head_word.head_word.ID== verb_coo.ID) or (item.dependency == "POB" and item.head_word.postag == "p"\
                            and item.head_word.dependency == "ADV" and item.head_word.head_word.ID== verb_coo.ID):

                                        OBJ = item
                                        O_dict[OBJ.ID] = OBJ.lemma

                    SUB_COO = None
                    OBJ_COO = None
                    for item in sentence.words:
                        if item.head_word != None:
                            if SUB != None and item.dependency == "COO" and item.head_word.ID == SUB.ID: #获得主语的COO
                                SUB_COO = item
                                S_dict[SUB_COO.ID] = SUB_COO.lemma
                        if item.head_word != None and OBJ!=None:
                            if item.dependency == "COO" and item.head_word.ID == OBJ.ID: #获得宾语的COO
                                OBJ_COO = item
                                O_dict[OBJ_COO.ID] = OBJ_COO.lemma
                    S_new = []

                    for sub in S_dict:
                        if sentence.get_word_by_id(sub).postag == 'r':
                            continue
                        S_dict2 = dict()  # ATT modifiers of the subject
                        S_dict2[sub] = S_dict[sub]
                        flag = True
                        while flag == True:
                            len1 = len(S_dict2)
                            for item in sentence.words:
                                if item.head_word != None:
                                    SUBList = S_dict2.keys()
                                    if item.head_word.ID in SUBList and (item.dependency == "ATT" or item.dependency == "ADV"):
                                        SUBATT = item
                                        S_dict2[SUBATT.ID] = SUBATT.lemma

                                if len(S_dict2) != len1 :
                                    flag = True
                                else:
                                    flag = False
                        S_dict2 = sorted(S_dict2.items(), key=lambda item: item[0])
                        Subject = ""
                        for i in S_dict2:
                            Subject += i[1]
                        S_new.append(Subject)

                    O_new = []
                    V_new = []
                    for obj in O_dict:
                        if sentence.get_word_by_id(obj).postag == 'r':
                            continue
                        O_dict2 = dict()  # ATT modifiers of the object
                        O_dict2[obj] = O_dict[obj]
                        if verb_dict!=None:
                            if obj in verb_dict:
                                relationString2  = verb_dict[obj]
                            else:
                                relationString2 = relation_verb.lemma
                        else:
                            relationString2 = relation_verb.lemma
                        V_new.append(relationString2)
                        flag = True
                        while flag == True:
                            len2 = len(O_dict2)
                            for item in sentence.words:
                                if item.head_word != None:
                                    OBJList = O_dict2.keys()
                                    if item.head_word.ID in OBJList and (item.dependency == "ADV" or item.dependency == "ATT" or item.dependency == "VOB"):
                                        OBJATT = item
                                        O_dict2[OBJATT.ID] = OBJATT.lemma

                                if len(O_dict2) != len2:
                                    flag = True
                                else:
                                    flag = False # keep looping until no new modifier is found
                        O_dict2 = sorted(O_dict2.items(), key=lambda item: item[0])
                        Object = ""
                        for i in O_dict2:
                            Object += i[1]
                        O_new.append(Object)

                    for sub in S_new:
                        for i in range(0,len(O_new)):
                            obj = O_new[i]
                            relationWord = V_new[i]
                            if obj != "":
                                # print(RawSentence)
                                # print((sub, relationWord, obj))
                                all_result.append([sub,relationWord,obj])
                                raw_sentence.append(RawSentence)
        return all_result,raw_sentence

    def hasEntity(self,word,allEntity):
        for entity in allEntity:
            if entity in word:
                # print(entity)
                return True
        return False

    def PostProcessSPO(self,rawSentence,allTripes,allEntity):
        for i in range(0,len(allTripes)):
            tripe = allTripes[i]
            sub = tripe[0]
            obj = tripe[2]
            # print(sub)
            # print(obj)
            if self.hasEntity(sub,allEntity) and self.hasEntity(obj,allEntity):
                print(rawSentence[i])
                print(tripe)
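
# The subject/object expansion in the example above is a small fixed-point loop over the
# dependency arcs: start from the core word and keep absorbing any word whose ATT/ADV arc
# points at a word already collected, until a full pass adds nothing new. Below is a minimal,
# self-contained sketch of that idea; expand_modifiers and the sample (head, relation) arcs
# are illustrative only, not part of the original code or real LTP output.
def expand_modifiers(core_index, arcs, words, relations=("ATT", "ADV")):
    collected = {core_index}
    changed = True
    while changed:          # stop once a complete pass adds no new modifier
        changed = False
        for i, (head, rel) in enumerate(arcs):
            # head is the 1-based index of the parent word (0 means ROOT), as in LTP output
            if rel in relations and head - 1 in collected and i not in collected:
                collected.add(i)
                changed = True
    return "".join(words[i] for i in sorted(collected))

# Example: words = ["中国", "经济", "快速", "发展"],
#          arcs  = [(2, "ATT"), (4, "SBV"), (4, "ADV"), (0, "HED")]
# expand_modifiers(1, arcs, words) -> "中国经济"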
示例#16
0
class Extractor():

    def __init__(self):
        self.__triple_list = []
        self.__segmentor = Segmentor()
        self.__postagger = Postagger()
        self.__recognizer = NamedEntityRecognizer()
        self.__parser = Parser()
        self.__words_full_list = []
        self.__netags_full_list = []
        self.load()

    @property
    def triple_list(self):
        return self.__triple_list

    def load(self):
        ltp_dir=conf.get('config','ltp_dir')
        self.__segmentor.load(ltp_dir+'cws.model')
        self.__postagger.load(ltp_dir+'pos.model')
        self.__recognizer.load(ltp_dir+'ner.model')
        self.__parser.load(ltp_dir+'parser.model')

    def release(self):
        self.__segmentor.release()
        self.__postagger.release()
        self.__recognizer.release()
        self.__parser.release()

    def clear(self):
        self.__triple_list = []
        self.__words_full_list = []
        self.__netags_full_list = []

    def chunk_str(self, data):
        self.clear()
        sents = SentenceSplitter.split(data.strip())
        offset = 0
        for sent in sents:
            try:
                words = self.__segmentor.segment(sent)
                postags = self.__postagger.postag(words)
                netags = self.__recognizer.recognize(words, postags)
                arcs = self.__parser.parse(words, postags)
                self.__words_full_list.extend(list(words))
                self.__netags_full_list.extend(list(netags))
                self.chunk_sent(list(words), list(postags), list(arcs), offset)
                offset += len(list(words))
                
            except Exception as e:
                print(str(e))
                offset += len(list(words))
        return [t.to_list() for t in self.__triple_list]

    def chunk_sent(self, words, postags, arcs, offset):
        root = [i+1 for i,x in enumerate(arcs) if x.relation == 'HED']
        if len(root) > 1:
            raise Exception('More than 1 HEAD arc is detected!')
        root = root[0]
        relations = [i+1 for i, x in enumerate(arcs) if x.head == root and x.relation == 'COO']
        relations.insert(0,root)

        for rel in relations:
            e1=None
            left_arc = [i+1 for i, x in enumerate(arcs) if x.head == rel and x.relation == 'SBV']
            if len(left_arc) == 0:
                for i in range(rel-2,-1,-1):
                    x=arcs[i]
                    if x.head == rel:
                        left_arc=[i+1]
                        break
            if len(left_arc) > 0:
                left_arc = left_arc[-1]
                leftmost = find_farthest_att(arcs, left_arc)
                e1 = Entity(1, [words[i] for i in range(leftmost-1, left_arc)], offset + leftmost-1)

            right_arc = [i+1 for i, x in enumerate(arcs) if x.head == rel and x.relation == 'VOB']

            e2_list = []
            if not right_arc:
                e2 = None
                e2_list.append(e2)
            else:
                right_ext = find_farthest_vob(arcs, right_arc[0])

                items = [i+1 for i, x in enumerate(arcs) if x.head == right_ext and x.relation == 'COO']
                items = right_arc + items

                count = 0
                for item in items:
                    leftmost = find_farthest_att(arcs, item)


                    e2 = None

                    if count == 0:
                        e2 = Entity(2, [words[i] for i in range(leftmost-1, right_ext)], offset+leftmost-1)
                    else:
                        p1 = range(leftmost-1, right_arc[0]-1)
                        p2 = range(item-1, find_farthest_vob(arcs, item))
                        e2 = Entity(2, [words[i] for i in itertools.chain(p1, p2)])

                    e2_list.append(e2)
                    count += 1
            for e2 in e2_list:
                if e1==None:
                    e1=Entity(1,[])
                if e2==None:
                    e2=Entity(2,[])
                r=Relation(words[rel-1])
                t=Triple(e1,e2,r)
                self.__triple_list.append(t)
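
# A minimal usage sketch for the Extractor class above (a hypothetical _demo helper, not part
# of the original example); it assumes the same `conf` object with a [config] ltp_dir entry,
# the LTP 3.x model files on disk, and the Entity/Relation/Triple helpers used by chunk_sent.
def _demo_extractor():
    extractor = Extractor()
    try:
        # chunk_str splits the text into sentences and collects one triple per HED/COO predicate
        for triple in extractor.chunk_str("李克强总理今天考察了上海自贸区,并发表了讲话。"):
            print(triple)       # each item is Triple.to_list()
    finally:
        extractor.release()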
示例#17
0
class myLTP:
    def __init__(self, LTP_DATA_DIR, pattern_dir='pattern.txt'):
        self.LTP_DATA_DIR = LTP_DATA_DIR
        self.ne_pattern = self._read_ne_pattern(pattern_dir)

    def _read_ne_pattern(self, filename):
        ne_pattern = []
        with open(filename, encoding='utf8') as filein:
            for line in filein:
                if line[0] != '#':
                    np = line.split()[:2]
                    ne_pattern.append(np)
        return ne_pattern

    def find_ne_by_pattern(self, text):
        ne_dic = defaultdict(list)
        for ne_type, pattern in self.ne_pattern:
            nes = re.findall(pattern, text)
            text = re.sub(pattern, ne_type, text)
            ne_dic[ne_type].extend(nes)
        return text, ne_dic

    def load(self, index=[1, 1, 1, 1, 1]):
        """分词 词性标注 命名实体识别 句法分析 语义角色分析"""
        if index[0]:
            cws_model_path = os.path.join(self.LTP_DATA_DIR, 'cws.model')
            self.segmentor = Segmentor()
            self.segmentor.load(cws_model_path)

        if index[1]:
            pos_model_path = os.path.join(self.LTP_DATA_DIR, 'pos.model')
            self.postagger = Postagger()
            self.postagger.load(pos_model_path)

        if index[2]:
            ner_model_path = os.path.join(self.LTP_DATA_DIR, 'ner.model')
            self.recognizer = NamedEntityRecognizer()
            self.recognizer.load(ner_model_path)

        if index[3]:
            par_model_path = os.path.join(self.LTP_DATA_DIR, 'parser.model')
            self.parser = Parser()
            self.parser.load(par_model_path)

        if index[4]:
            srl_model_path = os.path.join(self.LTP_DATA_DIR, 'pisrl_win.model')
            self.labeller = SementicRoleLabeller()
            self.labeller.load(srl_model_path)

    def release(self):
        try:
            self.segmentor.release()
        except:
            pass
        try:
            self.postagger.release()
        except:
            pass
        try:
            self.recognizer.release()
        except:
            pass
        try:
            self.parser.release()
        except:
            pass
        try:
            self.labeller.release()
        except:
            pass

    def split_sentence(self, text):
        """分句"""
        return SentenceSplitter.split(text)

    def word_segment(self, sentence):
        """使用结巴分词"""
        # words = self.segmentor.segment(sentence)
        words = jieba.cut(sentence)
        return list(words)

    def pos_tag(self, words):
        """词性标注"""
        postags = self.postagger.postag(words)
        return postags

    def named_entity_recognize(self, words, postags):
        """命名实体识别"""
        netags = self.recognizer.recognize(words, postags)
        return netags

    def parse(self, words, postags):
        """句法分析"""
        arcs = self.parser.parse(words, postags)  # (arc.head, arc.relation)
        return arcs

    def sementic_role_label(self, words, postags, arcs):
        """语义角色分析"""
        roles = self.labeller.label(words, postags, arcs)
        return roles

    def _get_ne_for_sentence(self, sentence):
        """获取实体,包括通过正则表达式定义的一些实体"""

        sentence, ne_dic = self.find_ne_by_pattern(sentence)
        words = list(self.word_segment(sentence))
        postags = self.postagger.postag(words)
        ners = self.named_entity_recognize(words, postags)
        res = {}
        res['words'] = words
        res['ners'] = []
        for index, ner in enumerate(ners):
            if ner != 'O':
                if ner[0] in ('S', 'B'):
                    res['ners'].append([ner[2:], index, index + 1])
                else:
                    res['ners'][-1][-1] += 1
        for ner_type, v in ne_dic.items():
            if v:  # 该类型确实抽取到了实体(注意不能用迭代器本身判断,迭代器恒为真)
                v = iter(v)
                for index, word in enumerate(words):
                    if word == ner_type:
                        words[index] = next(v)
                        res['ners'].append([ner_type, index, index + 1])
        return res

    def _get_dne_for_sentence(self, sentence):
        res = []
        s = self._get_ne_for_sentence(sentence)
        ners = s['ners']
        words = s['words']
        for entity1, entity2 in combinations(ners, 2):
            res.append((entity1, entity2, words))
        return res

    def get_dne(self, text):
        """获取实体对,人名(Nh)地名(Ns)机构名(Ni)"""
        res = []
        sentences = self.split_sentence(text)
        for sentence in sentences:
            r = self._get_dne_for_sentence(sentence)
            res.extend(r)
        return res
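
# A minimal usage sketch for the myLTP class above (a hypothetical _demo helper, not part of
# the original example); the model directory and the pattern.txt contents (one "TYPE regex"
# pair per non-# line) are assumptions. Only the POS and NER models are loaded here, because
# word_segment uses jieba and get_dne needs neither the parser nor the role labeller.
def _demo_myltp():
    ltp = myLTP("/path/to/ltp_data_v3.4.0", pattern_dir="pattern.txt")
    ltp.load(index=[0, 1, 1, 0, 0])
    try:
        for entity1, entity2, words in ltp.get_dne("小明在北京见到了小红。"):
            print(entity1, entity2)     # each entity is [type, start_index, end_index]
    finally:
        ltp.release()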
class MyLtp:

    def __init__(self):
        self.postagger = Postagger()
        pos_model_path = os.path.join(LTP_DATA_DIR, 'pos.model')
        self.postagger.load(pos_model_path)

        self.recognizer = NamedEntityRecognizer()
        ner_model_path = os.path.join(LTP_DATA_DIR, 'ner.model')
        self.recognizer.load(ner_model_path)

        self.parser = Parser()
        par_model_path = os.path.join(LTP_DATA_DIR, 'parser.model')
        self.parser.load(par_model_path)

    def clean(self):
        self.postagger.release()
        self.recognizer.release()
        self.parser.release()

    # 寻找依存树根节点编号
    def get_dependtree_root_index(self, word_list):
        # 词性标注
        postags = self.postagger.postag(word_list)
        # print(list(postags))

        # 命名实体识别
        netags = self.recognizer.recognize(word_list, postags)
        # print(list(netags))

        # 句法依存关系
        arcs = self.parser.parse(word_list, postags)
        # print(' '.join("%d:%s" % (arc.head, arc.relation) for arc in arcs))

        for i in range(len(arcs)):
            if arcs[i].head == 0:
                return i, postags, arcs  # 同时返回词性及依存关系列表
        return -1, postags, arcs

    # 寻找依存关系子节点
    def get_child_index(self, ind, arcs):
        ret = []
        for i in range(len(arcs)):
            if arcs[i].head == ind + 1:
                ret.append(i)

        return ret

    # 获取命名实体索引
    def get_ne_index(self, postags, chd_list):
        ret = []
        for i in chd_list:
            if postags[i] in ['n', 'nh', 'ni']:
                ret.append(i)
        return ret

    # 获取中心词之后的第一个符号的索引
    def get_first_wp_after_index(self, postags, after):
        for i in range(after + 1, len(postags)):
            if postags[i] == 'wp':
                return i
        return 0

    # 获取句号索引列表
    def get_periods_index_after(self, word_list, after):
        ret = []
        for i in range(after + 1, len(word_list)):
            if word_list[i] in ['。', '?', '!']:
                ret.append(i)
        return ret

    # 获取长句中的分句,为下面的句子向量分析作准备
    def get_sent_list(self, word_list, start, periods):
        ret = []
        if len(periods) == 0:
            ret.append(list(word_list[start + 1:]))
        for i, p in enumerate(periods):
            if i == 0:
                ret.append(list(word_list[start + 1:p + 1]))
            else:
                ret.append(list(word_list[periods[i - 1] + 1:p + 1]))
        return ret

    # # 获取语料库TF-IDF vectorizer
    # def get_tfidf_vectorizer(self, corpus_file):
    #     corpus = []
    #     with open(corpus_file, 'r', encoding='utf-8') as f:
    #         while True:
    #             line = f.readline()
    #             l = line.strip()
    #             if l:
    #                 corpus.append(l)
    #             else:
    #                 break
    #
    #     vectorizer = TfidfVectorizer(token_pattern=r'(?u)\b\w+\b')  # 不过滤单汉字
    #     X = vectorizer.fit_transform(corpus)
    #     return vectorizer

    # # 获取句子向量
    # def get_sentence_vec(self, vectorizer, word_list):
    #     trans = vectorizer.transform([' '.join(word_list)])
    #     return trans.toarray()[0]

    # words: 要识别的内容词语列表, talk_sims: “说”的近义词
    def get_character_speech(self, words, talk_sims):
        # 获取中心词,词性列表,依存关系表
        root_index, postags, arcs = self.get_dependtree_root_index(words)
        # print('index:', root_index)
        # print('len words:', len(words))
        # print('root:', words[root_index])

        # 没有找到中心词,或中心词不在近义词列表,返回空值
        if root_index == -1 or words[root_index] not in talk_sims:
            return '', '', []

        wp_index = self.get_first_wp_after_index(postags, root_index)
        if wp_index == 0: wp_index = root_index
        # print('wp_index:', wp_index)

        sent_split_idx = self.get_periods_index_after(words, wp_index)
        # print('split:', sent_split_idx)

        # 分句
        sents = self.get_sent_list(words, wp_index, sent_split_idx)
        # print('sents: ', sents)
        # for sen in sents:
        #     print('sen: ', sen)

        # 获取完整命名实体,针对命名实体词被分割的情况
        children = self.get_child_index(root_index, arcs)
        # print(children)

        ne_list = self.get_ne_index(postags, children)

        oth = []
        for ne in ne_list:
            nechd = self.get_child_index(ne, arcs)
            oth.append(self.get_ne_index(postags, nechd))

        # print('ne_list: ', ne_list)
        # print('oth: ', oth)

        if ne_list:
            for i, n in enumerate(ne_list):
                if oth[i]:
                    ne = words[oth[i][0]] + words[n]
                    # print(words[oth[i][0]] + words[n])
                else:
                    ne = words[n]
                    # print(words[n])
                return ne, words[root_index], sents
        else:
            return '', '', []
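
# A minimal usage sketch for the MyLtp class above (a hypothetical _demo helper, not part of
# the original example); it relies on the module-level LTP_DATA_DIR pointing at the LTP models,
# and the pre-segmented word list plus the "说"-synonym list below are illustrative only.
def _demo_myltp_speech():
    ltp = MyLtp()
    try:
        words = ["小明", "说", ":", "今天", "天气", "很", "好", "。"]
        speaker, verb, sents = ltp.get_character_speech(words, ["说", "表示", "认为"])
        print(speaker, verb, sents)     # empty strings/list when the root verb is not a "say" word
    finally:
        ltp.clean()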
class SentenceParser:
    def __init__(self):
        # LTP_DIR = './ltp_data_v3.4.0'
        print("加载模型路径", LTP_DIR)
        self.segmentor = Segmentor()
        self.segmentor.load(os.path.join(LTP_DIR, "cws.model"))

        self.postagger = Postagger()
        self.postagger.load(os.path.join(LTP_DIR, "pos.model"))

        self.parser = Parser()
        self.parser.load(os.path.join(LTP_DIR, "parser.model"))

        self.recognizer = NamedEntityRecognizer()
        self.recognizer.load(os.path.join(LTP_DIR, "ner.model"))

        self.labeller = SementicRoleLabeller()
        self.labeller.load(os.path.join(LTP_DIR, 'pisrl.model'))
        print("加载完毕")

    '''句法分析---为句子中的每个词语维护一个依存句法依存儿子节点(词的出度)的字典'''
    '''
        句法分析中,每个词只有一个入度,但可能有多个出度。
        为了可以结构化地展示分析结果,方便提取信息,
        对每个词建立一个子节点的字典:
            1) 若该词的出度为0,字典为空
            2) 若该词的出度为n,字典中各依存关系对应的子节点共n个
    '''

    def build_parse_child_dict(self, words, postags, arcs):
        """
        格式化句法分析结果
        :param words: 分词结果
        :param postags: 词性标注结果
        :param arcs: 句法分析结果
        :return: child_dict_list, format_parse_list
        """
        '''
        arcs是一个列表:
            列表元素与句中单词一一对应,每个元素arc包含arc.head, arc.relation信息,
            head为该词父节点的下标(从1开始,0表示Root),relation为父节点和该词的句法关系
            *** 因为每个词只有一个入度,这个arc信息就表示入度信息

        LTP句法分析模型输出arcs:表示每个词的入度信息,即其唯一的父节点信息
        返回:
            child_dict_list:是表示每个词的出度信息,就是子节点信息
            format_parse_list:每个词信息格式化:  与父节点句法关系,该词,该词下标,该词词性,父节点词,父词下标,父词词性
        '''
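        # Illustrative example (not actual LTP output): for the segmented sentence
        # ["总理", "访问", "上海"] with arcs SBV(总理->访问), HED(访问->Root), VOB(上海->访问),
        # child_dict_list would be [{}, {'SBV': [0], 'VOB': [2]}, {}].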

        child_dict_list = []
        format_parse_list = []

        # 对每个词建立子节点信息
        for index in range(len(words)):
            child_dict = dict()
            ## 遍历寻找该词的子节点
            for arc_index in range(len(arcs)):
                ## 如果有指向该词的子节点,则加入child_dict
                if arcs[arc_index].head == index + 1:
                    if arcs[arc_index].relation in child_dict:
                        child_dict[arcs[arc_index].relation].append(arc_index)
                    else:
                        child_dict[arcs[arc_index].relation] = []
                        child_dict[arcs[arc_index].relation].append(arc_index)

            child_dict_list.append(child_dict)

        # 对每个词建立指定信息
        ## 包含: [依存关系,词,下标,POS,父节点词,父节点下标,父节点POS]  # 还可以加上词的NER信息
        rely_id = [arc.head for arc in arcs]  # 提取每个词依存父节点id(其中id为0的是Root)
        relation = [arc.relation for arc in arcs]  # 提取每个词依存关系
        heads = ['Root' if id == 0 else words[id - 1]
                 for id in rely_id]  # 匹配依存父节点词语
        for i in range(len(words)):
            # ['ATT', '李克强', 0, 'nh', '总理', 1, 'n']
            head_idx = rely_id[i] - 1  # 核心词(HED)的父节点是Root,下标记为-1
            a = [
                relation[i], words[i], i, postags[i], heads[i], head_idx,
                postags[head_idx] if head_idx >= 0 else 'Root'
            ]
            format_parse_list.append(a)

        return child_dict_list, format_parse_list

    '''语义角色标注'''
    '''
        只对句子中 谓词 进行论元分析,抽取论元以及标注论元和谓词的关系。
    '''

    def format_labelrole(self, words, postags):
        """
        格式化语义角色标注结果
        :param words:
        :param postags:
        :return:
        """
        arcs = self.parser.parse(words, postags)
        roles = self.labeller.label(words, postags, arcs)
        roles_dict = {}
        '''
        roles中有多个role,每个role代表句子中的一个谓词
            role.index 代表谓词的索引, 
            role.arguments 代表关于该谓词的若干语义角色。(这里的论元可能不是简单的一个词)
                arg.name 表示语义角色类型,
                arg.range.start 表示该语义角色起始词位置的索引,(索引从0开始)
                arg.range.end 表示该语义角色结束词位置的索引。
        roles={
            'r1':{
                'args1':{
                    'name': 语义角色类型,
                    'range':{
                        'start': 语义角色起始词位置的索引,
                        'end': 语义角色结束词位置的索引
                    }
                },
                'args2':{
                    'name': 语义角色类型,
                    'range': {
                        'start': 语义角色起始词位置的索引,
                        'end': 语义角色结束词位置的索引
                    }
                },
                ...
            },
            'r2':{
                'args1': {
                    'name': 语义角色类型,
                    'range': {
                        'start': 语义角色起始词位置的索引,
                        'end': 语义角色结束词位置的索引
                    }
                },
                'args2': {
                    'name': 语义角色类型,
                    'range': {
                        'start': 语义角色起始词位置的索引,
                        'end': 语义角色结束词位置的索引
                    }
                },
                ...
            },
            ...
        }
        '''
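        # Illustrative example (not actual LTP output): for "总理 访问 上海" with the predicate
        # "访问" at index 1, roles_dict would look like {1: {'A0': ['A0', 0, 0], 'A1': ['A1', 2, 2]}}.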
        for role in roles:
            roles_dict[role.index] = {
                arg.name: [arg.name, arg.range.start, arg.range.end]
                for arg in role.arguments
            }
        return roles_dict

    def close(self):
        """关闭与释放模型"""
        self.segmentor.release()
        self.postagger.release()
        self.recognizer.release()
        self.parser.release()
        self.labeller.release()

    '''parser主函数'''
    '''
    将模型的输出进行处理,方便之后数据处理
        模型输出:words, postags, ners, arcs, roles
        处理后信息:
            child_dict_list:句法分析,每个词的子节点信息
            format_parse_list:句法分析,每个词的信息和父节点信息(父节点唯一)
            roles_dict:语义角色标注,按谓词下标组织的论元信息
    '''

    def parser_main(self, sentence):
        '''words, postags, ners, arcs 为LTP模型输出'''
        words = list(self.segmentor.segment(sentence))
        postags = list(self.postagger.postag(words))
        ners = list(self.recognizer.recognize(words, postags))
        arcs = self.parser.parse(words, postags)

        # print("\t".join("%d:%s" % (arc.head, arc.relation) for arc in arcs))
        """
        arcs中有多个arc
            arc.head 表示依存弧的父节点词的索引。ROOT节点的索引是0,第一个词开始的索引依次为1、2、3…
            arc.relation 表示依存弧的关系。
            注意:一个词最多只有一个弧指向它(即只有一个入度),但是一个词可以指向多个词(即有多个出度)
        """
        child_dict_list, format_parse_list = self.build_parse_child_dict(
            words, postags, arcs)
        roles_dict = self.format_labelrole(words, postags)

        return words, postags, ners, child_dict_list, format_parse_list, roles_dict
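
# A minimal usage sketch for the SentenceParser class above (a hypothetical _demo helper, not
# part of the original example); it assumes the module-level LTP_DIR points at an
# ltp_data_v3.4.0 directory containing the cws/pos/parser/ner/pisrl models.
def _demo_sentence_parser():
    sp = SentenceParser()
    words, postags, ners, child_dict_list, format_parse_list, roles_dict = \
        sp.parser_main("李克强总理今天考察了上海自贸区。")
    for item in format_parse_list:
        print(item)             # [relation, word, index, postag, head word, head index, head postag]
    print(roles_dict)           # {predicate index: {role name: [name, start index, end index]}}
    sp.close()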
class EventInfoExtract():
    def __init__(self,modulePath,outfile):
        self.MODELDIR = modulePath
        self.adict = {
        '·' :'',
        '的':'',
        '了':'',
        '“':'',
        '”':'',
        '一次':''
        }
        self.segmentor=None
        self.postagger=None
        self.parser=None
        self.recognizer=None
        self.out_file=outfile
        
        
    def multiple_replace(self, text):
        rx = re.compile('|'.join(map(re.escape, self.adict)))
        def one_xlat(match):
            return self.adict[match.group(0)]
        return rx.sub(one_xlat, text)


    def InitModule(self):
        #print "正在加载LTP模型... ..."
        # 使用构造函数传入的模型目录 self.MODELDIR 加载各模型
        self.segmentor = Segmentor()
        self.segmentor.load(os.path.join(self.MODELDIR, "cws.model"))    #分词模型,单文件

        self.postagger = Postagger()
        self.postagger.load(os.path.join(self.MODELDIR, "pos.model"))    #词性标注模型,单文件

        self.parser = Parser()
        self.parser.load(os.path.join(self.MODELDIR, "parser.model"))    #依存句法分析模型,单文件

        self.recognizer = NamedEntityRecognizer()
        self.recognizer.load(os.path.join(self.MODELDIR, "ner.model"))   #命名实体识别模型,单文件

    def release_module(self):
        '''
        release the model
        '''
        self.segmentor.release()
        self.segmentor=None
        self.postagger.release()
        self.postagger=None
        self.parser.release()
        self.parser=None
        self.recognizer.release()
        self.recognizer=None
        

    def Txtextraction_start(self,txt,out_file):
        """
        事实三元组的控制程序
        Args:
            txt: 待抽取的内容
        """
        txt = txt.strip()
        out_file = open(self.out_file, 'a')
        #try:
        #print "Execute here====-===="
        self.fact_triple_extract(txt,out_file)
        out_file.flush()
        out_file.close()
    
    def addresssTime_extract(self,inputtxt):
        #这个地方先做实体抽取,提取出人物、组织和相关的时间,首先分词,得到分词结果
        #words = self.segmentor.segment(inputtxt)
        sentences = inputtxt.split('。')
        #print sentences
        DataAndTime=[]
        for sentence in sentences:
            if len(sentence)<=1:
                continue
            #sentence = u"北京是中国首都"
            words = self.segmentor.segment(sentence)
            #print '\t'.join(words)
            postags = self.postagger.postag(words)
            netags = self.recognizer.recognize(words, postags)
            #print '\t'.join(postags)
            arcs = self.parser.parse(words, postags)
            #print "sentence;===========132123123123123"
            Dt={'date':'','address':''}
            if (("发生" in sentence or "遭" in sentence) and ("爆炸" in sentence or "事件" in sentence or "袭击" in sentence )) or (("恐怖" in sentence) or ("袭击" in sentence)):
                Flag=False
                #print '\t'.join(words)
                #print '\t'.join(postags)
                #print '\t'.join(postags)
                Addressbackups=[]
                Address =''
                for i in range(len(postags)-1):
                    if Flag==True:
                        if postags[i]=='ns'or postags[i]=='nd' or postags[i]=='n': # ns 地理名 nd方向名词 n一般名词
                            head = arcs[i].head
                            Address=Address+words[i]
                            if postags[head-1]=="n":
                                Address+=words[head-1]
                                head = arcs[head-1].head
                            if(words[head-1]=="在" or words[head-1]=="发生" or  words[head-1]=="袭击"  or words[head-1]=="遭" or words[head-1]=="遭遇" or words[head-1]=="将"):
                                Dt['address']=Address
                                break
                        else:
                            print "地址,",Address
                            Addressbackups.append(Address)
                            Address=''
                            Flag=False
                        continue
                    if postags[i]=='ns' and Flag == False:
                        #这个地方只会第一次进来。
                        head = arcs[i].head
                        Address = Address+words[i]
                        if (words[head-1]=="在" or words[head-1]=="发生" or  words[head-1]=="遭" or words[head-1]=="遭遇"  or words[head-1]=="将"):
                            Dt['address']=Address
                            break
                        #if postags[i+1]!='ns' or postags[i+1]!='nd' or postags[i+1]!='n':
                        #    print "wewewerwer====,",Address
                        #    Addressbackups.append(Address)
                        Flag = True 
                #print Addressbackups[0]
            if ("月" in sentence or '日' in sentence) and ("发生" in sentence or "袭击" in sentence):
                Flag = False
                Date=''
                Datebackup=[]
                for i in range(len(postags)-1):
                    if Flag==True:
                        if postags[i]=='nt':
                            #print words[i]
                            head = arcs[i].head
                            Date=Date+words[i]
                            if words[head-1]=="发生" or words[head-1]=="袭击":
                                Dt['date']=Date
                                break
                        else:
                            Datebackup.append(Date)
                            Date=''
                            Flag=False
                        continue
                    
                    if postags[i]=='nt' and Flag == False:
                        Date = Date+words[i]
                        #获取一下head
                        head = arcs[i].head
                        if words[head-1]=="发生" or words[head-1]=="袭击":
                            Dt['date']=Date
                            break
                        if postags[i+1]!='nt':
                            Datebackup.append(Date)
                        #index=i
                        Flag = True 
                if Dt['date']=='' and len(Datebackup):
                    Dt['date']=Datebackup[-1]
            if Dt['date']!='' or Dt['address']!='':
                DataAndTime.append(Dt)
                
        if len(DataAndTime)>1:
            # 过滤掉日期为"当天"的结果
            DataAndTime = [i for i in DataAndTime if i['date'] != "当天"]
        if len(DataAndTime)==0:
            Dt['date']=''
            Dt['address']=''
            DataAndTime.append(Dt)
        
        return DataAndTime
            
            

    def extraction_start(self, input_txt, out_file_name):
        """
        事实三元组抽取的总控程序
        Args:
            input_txt: 待抽取的文本
            out_file_name: 输出文件的名称
        """
        out_file = open(out_file_name, 'a')

        sentence_number = 0
        # 按行处理传入的文本,跳过空行和长度超过1000的行
        for text_line in input_txt.split('\n'):
            sentence = text_line.strip()
            if sentence == "" or len(sentence) > 1000:
                continue
            try:
                sentence_one = sentence.split(" ")  # "。"
                for num in range(len(sentence_one) - 1):
                    self.fact_triple_extract(sentence, out_file)
                    out_file.flush()
            except:
                pass
            sentence_number += 1
            if sentence_number % 50 == 0:
                print "%d done" % (sentence_number)
        out_file.close()

    def attribute_define0(self,text,keywords):
        words = self.segmentor.segment(text)
        postags = self.postagger.postag(words)#词性标注
        if keywords in text:
            for index in range(len(words)):
                if(words[index]==keywords):               
                    for i in range(index):
                        if(postags[index-i-2][0]=='n'):
                            continue
                        else:
                            print "事件属性:","".join(words[index-i-1:index+1])
                            break

    def attribute_define1(self,text,keywords):
        words = self.segmentor.segment(text)
        postags = self.postagger.postag(words)#词性标注
        if keywords in text:
            for index in range(len(words)):
                if(words[index]==keywords):               
                    for i in range(index):
                        if(postags[index-i-2][0]=='n'):
                            continue
                        else:
                            if(i != 0):
                                print "事件属性:","".join(words[index-i-1:index+1])
                            break

    def num_define(self,text):
        words = self.segmentor.segment(text)
        postags = self.postagger.postag(words)#词性标注
        for index in range(len(words)):
            if(postags[index]=='m'):  
                return words[index]
                        
    def attribute_define2(self,text,keywords):
        words = self.segmentor.segment(text)
        #postags = postagger.postag(words)#词性标注
        if keywords in text:
            for index in range(len(words)):
                if(words[index]==keywords):               
                    for i in range(index):
                        if(words[index-i-1] not in ('发生', '是')):#|(words[index-i-1]!='遭遇'):
                            continue
                        else:
                            if(i != 0):
                                attribute = "".join(words[index-i:index+1])
                                #attribute = multiple_replace(attribute)
                                print '==========='
                                if attribute in '恐怖袭击事件':
                                    return
                                return attribute
                            else:
                                return


    def organization_define(self,text,keywords):
        words = self.segmentor.segment(text)
        postags = self.postagger.postag(words)#词性标注
        
        if keywords in text:
            for index in range(len(words)):
                if(words[index]==keywords):               
                    for i in range(index):
                        if(postags[index-i-1][0]=='n')&(index-i-1 != 0):
                            continue
                        else:
                            if(words[index-1]=='组织')&(postags[index-2][0]!='n'):      
                                continue
                            if(i != 0):
                                print "组织:","".join(words[index-i:index])
                                return "".join(words[index-i:index])
    def organization_define1(self,text,keywords):
        words = self.segmentor.segment(text)
        postags = self.postagger.postag(words)#词性标注
        if keywords in text:
            for index in range(len(words)):
                if(words[index]==keywords):               
                    for i in range(index):
                        if(postags[index-i-1][0]=='n')&(index-i-1 != 0):
                            continue
                        else:
                            if(words[index-1]=='组织')&(postags[index-2][0]!='n'):      
                                continue
                            if(i != 0):
                                #print "组织:","".join(words[index-i:index])
                                return "".join(words[index-i:index])

    def fact_attribute_from_text(self,text):
        """
        """
        text = text.replace(',','。')
        sentence_one = text.split("。")
        
        fact_attribute = []
        for num in range(len(sentence_one)-1):
            if('袭击' in sentence_one[num]):
                #attribute_define0(sentence_one[num],'事件')
                    #print sentence_one[num]
                sentence_temp = self.multiple_replace(sentence_one[num])
                if('发生' in sentence_temp)|('遭遇' in sentence_temp):
                    #print '---------------',sentence_temp
                    temp_atrribut1 = self.attribute_define2(sentence_temp,'事件')
                    #print temp_atrribut1
                    if((temp_atrribut1)==None):
                        temp_atrribut2 = self.attribute_define2(sentence_temp,'袭击')
                        #print temp_atrribut2
                        if temp_atrribut2==None:
                            return
                        fact_attribute.append(str(temp_atrribut2))
                    else:
                        fact_attribute.append(str(temp_atrribut1))
        #print '------------------'
        if(len(fact_attribute)==0):
            #print '事件属性:unkown!'
            return 'None'
        else:
            #print '事件属性1:', len(fact_attribute),''.join(fact_attribute)
            #print '事件属性:',max(fact_attribute, key=len)
            return max(fact_attribute, key=len)

    def organization_from_text(self,text):
        """
        事实三元组抽取的总控程序
        Args:
            in_file_name: 输入文件的名称
            #out_file_name: 输出文件的名称
            begin_line: 读文件的起始行
            end_line: 读文件的结束行
        """
        sentence_one = text.split("。")
        #print '---------------------------',sentence_one[0]   
        ogniz = []
        for num in range(len(sentence_one)-1):
            if('负责' in sentence_one[num]):
                if('宣称' in sentence_one[num]):
                    #print sentence_one[num]
                    sentence_temp = sentence_one[num].replace('“','')
                    sentence_temp = sentence_temp.replace('”','')
                    temp_org = self.organization_define(sentence_temp,'宣称')
                    if(temp_org != None):
                        ogniz.append(temp_org)
            if(len(ogniz)==0):
                if('宣称' in sentence_one[num]):
                    #print sentence_one[num]
                    sentence_temp = sentence_one[num].replace('“','')
                    sentence_temp = sentence_temp.replace('”','')
                    temp_org = self.organization_define1(sentence_temp,'宣称')
                    if(temp_org != None):
                        ogniz.append(temp_org)
        if(len(ogniz)==0):
            #print '组织:unkown!'
            return 'unknown'
        else:
            #print '组织:',max(ogniz, key=len)
            #print ogniz
            return max(ogniz, key=len)

    def death_num_from_text(self,text):
        """
        事实三元组抽取的总控程序
        Args:
            in_file_name: 输入文件的名称
            #out_file_name: 输出文件的名称
            begin_line: 读文件的起始行
            end_line: 读文件的结束行
        """
        text = text.replace(',','。')
        text = text.replace('、','。')
        sentence_one = text.split("。")
        death_num = None
        hurt_num = None
        total_num = None
        #print '---------------------------',sentence_one[0]   
 
        for num in range(len(sentence_one)-1):
            if('死亡' in sentence_one[num])|('丧生' in sentence_one[num]):
                #print sentence_one[num]
                if(death_num == None):
                    death_num = self.num_define(sentence_one[num])
                    #print '死亡人数:',death_num
            if('受伤' in sentence_one[num]):
                #print sentence_one[num]        
                if(hurt_num == None):
                    hurt_num = self.num_define(sentence_one[num])
                    #print '受伤人数:',hurt_num
            if('伤亡' in sentence_one[num]):
                #print sentence_one[num]
                if(total_num == None):
                    total_num = self.num_define(sentence_one[num])
            #print type(death_num),type(hurt_num),type(total_num)
        return death_num,hurt_num,total_num
        


    def fact_triple_extract(self,sentence, out_file):
        #print sentence
        """
        对于给定的句子进行事实三元组抽取
        Args:
            sentence: 要处理的语句
        """
        words = self.segmentor.segment(sentence)
        postags = self.postagger.postag(words)
        netags = self.recognizer.recognize(words, postags)
        arcs = self.parser.parse(words, postags)
        child_dict_list = self.build_parse_child_dict(words, postags, arcs)
        
        Entity_Address=[]
        Entity_Name = []
        
        for index in range(len(postags)):
            e1 = ''
            if netags[index][0] == 'S' or netags[index][0] == 'B':
                if 'Ns' in netags[index]:
                    ni = index
                    if netags[ni][0] == 'B':
                        while netags[ni][0] != 'E':
                            ni += 1
                        e1 = ''.join(words[index:ni+1])
                    else:
                        e1 = words[ni]
                    Entity_Address.append(e1)
                if "Nh" in netags[index]:
                    ni = index
                    if netags[ni][0]=='B':
                        while netags[ni][0]!='E':
                            ni+=1
                        e1= ''.join(words[index:ni+1])
                    else:
                        e1=words[ni]
                        Entity_Name .append(e1)
        Entity_Address = list(set(Entity_Address))
        Entity_Name = list(set(Entity_Name))
        for i in Entity_Name:
            print i
        AddressTp =[]
        LocateAddress = []
        for index in range(len(postags)):
            # 抽取以谓词为中心的事实三元组
            if postags[index] == 'v':
                child_dict = child_dict_list[index]
                # 主谓宾
                Flag = False
                if child_dict.has_key('SBV') and child_dict.has_key('VOB'):
                    e1 = self.complete_e(words, postags, child_dict_list, child_dict['SBV'][0])
                    r = words[index]
                    e2 = self.complete_e(words, postags, child_dict_list, child_dict['VOB'][0])
                    out_file.write("主语谓语宾语关系\t(%s, %s, %s)\n" % (e1, r, e2))
                    for address in Entity_Address:
                        if address in e1 and ( ("袭击" in e1 or "袭击" in e2) or ("事件" in e2 or "事件" in e1)):
                            for name in Entity_Name:
                                if name in e1:
                                    Flag = False
                                    break
                            else:
                                Flag = True
                            if Flag == True:
                                for i in Entity_Address:
                                    if i in e1 or i in e2:
                                        AddressTp.append(i)    
                    out_file.flush()
    
                # 定语后置,动宾关系
                if arcs[index].relation == 'ATT':
                    if child_dict.has_key('VOB'):
                        e1 = self.complete_e(words, postags, child_dict_list, arcs[index].head - 1)
                        r = words[index]
                        e2 = self.complete_e(words, postags, child_dict_list, child_dict['VOB'][0])
                        temp_string = r+e2
                        if temp_string == e1[:len(temp_string)]:
                            e1 = e1[len(temp_string):]
                        if temp_string not in e1:
                            #print "定语后置动宾关系\t(%s, %s, %s)\n" % (e1, r, e2)
                            out_file.write("定语后置动宾关系\t(%s, %s, %s)\n" % (e1, r, e2))
                            out_file.flush()
                # 含有介宾关系的主谓动补关系
                if child_dict.has_key('SBV') and child_dict.has_key('CMP'):
                    #e1 = words[child_dict['SBV'][0]]
                    e1 = self.complete_e(words, postags, child_dict_list, child_dict['SBV'][0])
                    cmp_index = child_dict['CMP'][0]
                    r = words[index] + words[cmp_index]
                    if child_dict_list[cmp_index].has_key('POB'):
                        e2 = self.complete_e(words, postags, child_dict_list, child_dict_list[cmp_index]['POB'][0])
                        #print "介宾关系主谓动补\t(%s, %s, %s)\n" % (e1, r, e2)
                        out_file.write("介宾关系主谓动补\t(%s, %s, %s)\n" % (e1, r, e2))
                        out_file.flush()
    
            # 尝试抽取命名实体有关的三元组
            if netags[index][0] == 'S' or netags[index][0] == 'B':
                ni = index
                if netags[ni][0] == 'B':
                    while netags[ni][0] != 'E':
                        ni += 1
                    e1 = ''.join(words[index:ni+1])
                else:
                    e1 = words[ni]
                if arcs[ni].relation == 'ATT' and postags[arcs[ni].head-1] == 'n' and netags[arcs[ni].head-1] == 'O':
                    r = self.complete_e(words, postags, child_dict_list, arcs[ni].head-1)
                    if e1 in r:
                        r = r[(r.index(e1)+len(e1)):]
                    if arcs[arcs[ni].head-1].relation == 'ATT' and netags[arcs[arcs[ni].head-1].head-1] != 'O':
                        e2 = self.complete_e(words, postags, child_dict_list, arcs[arcs[ni].head-1].head-1)
                        mi = arcs[arcs[ni].head-1].head-1
                        li = mi
                        if netags[mi][0] == 'B':
                            while netags[mi][0] != 'E':
                                mi += 1
                            e = ''.join(words[li+1:mi+1])
                            e2 += e
                        if r in e2:
                            e2 = e2[(e2.index(r)+len(r)):]
                        if r+e2 in sentence:
                            #print "人名//地名//机构\t(%s, %s, %s)\n" % (e1, r, e2)
                            out_file.write("人名//地名//机构\t(%s, %s, %s)\n" % (e1, r, e2))
                            out_file.flush()
                
        AddressTp = list(set(AddressTp))
        # 去掉被更长地名包含的地名,只保留最完整的地点后再拼接输出
        Tp = list(AddressTp)
        for i in AddressTp:
            for k in AddressTp:
                if i != k and (i in k) and i in Tp:
                    Tp.remove(i)
        address = ''
        for w in Tp:
            address += w
        print "地点:", address
                

    def build_parse_child_dict(self,words, postags, arcs):
        """
        为句子中的每个词语维护一个保存句法依存儿子节点的字典
        Args:
            words: 分词列表
            postags: 词性列表
            arcs: 句法依存列表
        """
        child_dict_list = []
        for index in range(len(words)):
            child_dict = dict()
            for arc_index in range(len(arcs)):
                if arcs[arc_index].head == index + 1:
                    if child_dict.has_key(arcs[arc_index].relation):
                        child_dict[arcs[arc_index].relation].append(arc_index)
                    else:
                        child_dict[arcs[arc_index].relation] = []
                        child_dict[arcs[arc_index].relation].append(arc_index)
            #if child_dict.has_key('SBV'):
            #    print words[index],child_dict['SBV']
            child_dict_list.append(child_dict)
        return child_dict_list
    
    def complete_e(self,words, postags, child_dict_list, word_index):
        """
        完善识别的部分实体
        """
        child_dict = child_dict_list[word_index]
        prefix = ''
        if child_dict.has_key('ATT'):
            for i in range(len(child_dict['ATT'])):
                prefix += self.complete_e(words, postags, child_dict_list, child_dict['ATT'][i])
        
        postfix = ''
        if postags[word_index] == 'v':
            if child_dict.has_key('VOB'):
                postfix += self.complete_e(words, postags, child_dict_list, child_dict['VOB'][0])
            if child_dict.has_key('SBV'):
                prefix = self.complete_e(words, postags, child_dict_list, child_dict['SBV'][0]) + prefix
    
        return prefix + words[word_index] + postfix

    def attribute_define0(self,text,keywords):
        words = self.segmentor.segment(text)
        postags = self.postagger.postag(words)#词性标注
        if keywords in text:
            for index in range(len(words)):
                if(words[index]==keywords):               
                    for i in range(index):
                        if(postags[index-i-2][0]=='n'):
                            continue
                        else:
                            print "事件属性:","".join(words[index-i-1:index+1])
                            break
    
    def attribute_define1(self,text,keywords):
        words = self.segmentor.segment(text)
        postags = self.postagger.postag(words)#词性标注
        if keywords in text:
            for index in range(len(words)):
                if(words[index]==keywords):               
                    for i in range(index):
                        if(postags[index-i-2][0]=='n'):
                            continue
                        else:
                            if(i != 0):
                                print "事件属性:","".join(words[index-i-1:index+1])
                            break
    
    def attribute_define2(self,text,keywords):
        #print text
        words = self.segmentor.segment(text)
        print words
        #print self.segmentor
        #print '\t'.join(words)
        #postags = postagger.postag(words)#词性标注
        if keywords in text:
            for index in range(len(words)):
                #print words[index]
                if(words[index]==keywords):  
                    for i in range(index):
                        if(words[index-i-1] not in ('发生', '是')):#|(words[index-i-1]!='遭遇'):
                            continue
                        else:
                            if(i != 0):
                                attribute = "".join(words[index-i:index+1])
                                if attribute in '恐怖袭击事件':
                                    return
                                return attribute
                            else:
                                return
    
    
    def organization_define(self,text,keywords):
        words = self.segmentor.segment(text)
        postags = self.postagger.postag(words)#词性标注
        if keywords in text:
            for index in range(len(words)):
                if(words[index]==keywords):               
                    for i in range(index):
                        if(postags[index-i-1][0]=='n'):
                            continue
                        else:
                            if(words[index-1]=='组织')&(postags[index-2][0]!='n'):      
                                continue
                            if(i != 0):
                                print "组织:","".join(words[index-i:index])
                                return "".join(words[index-i:index])
    
    
    
    def fact_attribute(self,in_file_name, out_file_name, begin_line, end_line):
        """
        事实三元组抽取的总控程序
        Args:
            in_file_name: 输入文件的名称
            #out_file_name: 输出文件的名称
            begin_line: 读文件的起始行
            end_line: 读文件的结束行
        """
        in_file = open(in_file_name, 'r')
        out_file = open(out_file_name, 'a')
        
        line_index = 1
        sentence_number = 0
        text_line = in_file.readline()
        while text_line:
            #小于起始段的直接跳过
            if line_index < begin_line:
                text_line = in_file.readline()
                line_index += 1
                continue
            if end_line != 0 and line_index > end_line:
                break
            sentence = text_line.strip()
            #长段(大于1000)直接跳过
            if sentence == "" or len(sentence) > 1000:
                text_line = in_file.readline()
                line_index += 1
                continue
            sentence_one = sentence.split(" ")#"。"
            
            for num in range(len(sentence_one)-1):
                self.attribute_define0(sentence_one[num],'事件')
                self.attribute_define2(sentence_one[num],'袭击')
            sentence_number += 1
            if sentence_number % 50 == 0:
                print "%d done" % (sentence_number)
            text_line = in_file.readline()
            line_index += 1
        in_file.close()
        out_file.close()
        '''
    
    def fact_attribute_from_text(text):
        """
        事实三元组抽取的总控程序
        Args:
            in_file_name: 输入文件的名称
            #out_file_name: 输出文件的名称
            begin_line: 读文件的起始行
            end_line: 读文件的结束行
        """
        text = text.replace(',','。')
        sentence_one = text.split("。")
        
        fact_attribute = []
        for num in range(len(sentence_one)-1):
            if('袭击' in sentence_one[num]):
                #attribute_define0(sentence_one[num],'事件')
                    #print sentence_one[num]
                sentence_temp = multiple_replace(sentence_one[num])
                if('发生' in sentence_temp)|('遭遇' in sentence_temp):
                    print '---------------',sentence_temp
                    temp_atrribut1 = self.attribute_define2(sentence_temp,'事件')
                    fact_attribute.append(str(temp_atrribut1))
                    if((temp_atrribut1)==None):
                        temp_atrribut2 = self.attribute_define2(sentence_temp,'袭击')
                        fact_attribute.append(str(temp_atrribut2))
        print '------------------'
        if(len(fact_attribute)==0):
            print '事件属性:unkown!'
            return 'unknown'
        else:
            print '事件属性1:', len(fact_attribute),fact_attribute
            print '事件属性:',max(fact_attribute, key=len)
            return max(fact_attribute, key=len)
            '''
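
# A minimal usage sketch for the EventInfoExtract class above (a hypothetical _demo helper, not
# part of the original example); the model directory, output file name, and sample text are
# assumptions, and the class itself is written in Python 2 style.
def _demo_event_info_extract():
    extractor = EventInfoExtract("./3.3.0/ltp_data", "triples_out.txt")
    extractor.InitModule()
    sample = "2015年11月13日,巴黎发生恐怖袭击事件,造成多人死亡。"
    date_address = extractor.addresssTime_extract(sample)     # [{'date': ..., 'address': ...}, ...]
    attribute = extractor.fact_attribute_from_text(sample)    # longest "...事件/袭击" attribute found
    death, hurt, total = extractor.death_num_from_text(sample)
    extractor.release_module()
    return date_address, attribute, death, hurt, total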
class semantic_annotation:
    LTP_DATA_DIR = 'D:/LTP/ltp_data'
    ATT_ADV = ['ATT', 'ADV']
    N = ['a', 'd', 'b']
    dp_arcs = ['VOB', 'SBV', 'FOB']

    def __init__(self):
        cws_model_path = os.path.join(config.LTP_DATA_DIR, 'cws.model')
        self.segmentor = Segmentor()
        self.segmentor.load_with_lexicon(cws_model_path, config.dic_path)
        pos_model_path = os.path.join(config.LTP_DATA_DIR, 'pos.model')
        self.postagger = Postagger()
        self.postagger.load(pos_model_path)
        par_model_path = os.path.join(config.LTP_DATA_DIR,
                                      'parser.model')
        self.parser = Parser()
        self.parser.load(par_model_path)

    def Model_release(self):
        try:
            self.segmentor.release()
            self.postagger.release()
            self.parser.release()
        except Exception as e:
            s = "释放分词,词性标注,句法分析模型运行发生异常Model_releasen" + str(e)
            Logger.log_ERROR.error(s)
            Logger.log_ERROR.exception(sys.exc_info())
            raise TypeError(s)

    def set_sentence(self, sentence):
        try:
            words = self.segmentor.segment(sentence)
            words_list = list(words)
            Logger.log_DEBUG.debug('分词结果:' + str(words_list))
            postags = self.postagger.postag(words)
            postags_list = list(postags)
            Logger.log_DEBUG.debug('词性标注结果:' + str(postags_list))

            arcs = self.parser.parse(words, postags)
            arcs_list = list(arcs)
            s = '句法分析结果:'
            for a in arcs_list:
                s = s + str(a.head) + ":" + a.relation + '  '
            Logger.log_DEBUG.debug(s)

            sen = sentence_class(words_list, postags_list, arcs_list)

            return sen
        except Exception as e:
            s = "设置句子属性发生异常set_sentence" + str(e)
            Logger.log_ERROR.error(s)
            Logger.log_ERROR.exception(sys.exc_info())
            raise TypeError(s)

    def semantic_annotation(self, sentence):
        try:

            sen = self.set_sentence(sentence)

            dic_r = {}

            ap_ID = self.find_ap(sen)
            if ap_ID >= 0:
                dic_r['ap'] = sen.class_word_list[ap_ID].name
                Logger.log_DEBUG.debug('动作属性:' + sen.class_word_list[ap_ID].name)
            else:
                dic_r['ap'] = ''
                Logger.log_DEBUG.debug('没有找到动作属性:')

            indiv_ID = self.find_indiv(sen)
            if indiv_ID >= 0:
                dic_r['indiv'] = sen.class_word_list[indiv_ID].name
                Logger.log_DEBUG.debug('个体:' + sen.class_word_list[indiv_ID].name)
            else:
                dic_r['indiv'] = ''
                Logger.log_DEBUG.debug('没有找到个体:')

            adv = self.find_AdvAdj(sen, ap_ID, 'adv')
            dic_r['adv'] = adv
            Logger.log_DEBUG.debug('状语:' + adv)

            dp_ID = self.find_dp(sen)
            if dp_ID >= 0:
                dic_r['dp'] = sen.class_word_list[dp_ID].name
                Logger.log_DEBUG.debug('数据属性:' + sen.class_word_list[dp_ID].name)
            else:
                dic_r['dp'] = ''
                Logger.log_DEBUG.debug('没有找到数据属性:')

            adj = self.find_AdvAdj(sen, dp_ID, 'adj')
            dic_r['adj'] = adj
            Logger.log_DEBUG.debug('定语:' + adj)

            other = ''
            for w in sen.class_word_list:
                if w.Semantic_markup == 'other':
                    other = other + ',' + w.name
            dic_r['other'] = other
            Logger.log_DEBUG.debug('其他词:' + other)
            return dic_r

        except Exception as e:
            s = "语义标注主函数运行发生异常semantic_annotation" + str(e)
            Logger.log_ERROR.error(s)
            Logger.log_ERROR.exception(sys.exc_info())
            raise TypeError(s)

    def find_dp(self, sen):
        try:
            self.word_merge_A(sen)
            self.word_merge_COO(sen)
            self.word_merge_VOB(sen)
            cwl = sen.class_word_list
            dp = []
            for i in range(len(cwl)):
                if cwl[i].Semantic_markup == 'other' and cwl[i].pos not in semantic_annotation.N and \
                        cwl[i].arcs_relation in semantic_annotation.dp_arcs and cwl[i].arcs_head == sen.ap_ID + 1:
                    dp.append(i)
            if len(dp) <= 0:
                for i in range(len(cwl)):
                    if cwl[i].Semantic_markup == 'other' and cwl[i].pos not in semantic_annotation.N and \
                            cwl[i].pos == 'n':
                        dp.append(i)
            if len(dp) > 0:
                sen.dp_ID = dp[0]
                cwl[dp[0]].Semantic_markup = 'dp'

            return sen.dp_ID
        except Exception as e:
            s = "确定数属发生异常find_dp" + str(e)
            Logger.log_ERROR.error(s)
            Logger.log_ERROR.exception(sys.exc_info())
            raise TypeError(s)

    def word_merge_VOB(self, sen):
        try:
            cwl = sen.class_word_list

            VOB_num = 0
            for i in range(len(cwl)):
                if cwl[i].Semantic_markup == 'other' and \
                        cwl[i].arcs_relation == 'VOB' and cwl[i].arcs_head == sen.hed_ID + 1:
                    VOB_num = VOB_num + 1

            for n in range(VOB_num):
                for i in range(len(cwl)):
                    if cwl[i].Semantic_markup == 'other' and \
                            cwl[i].arcs_relation == 'VOB' and cwl[i].arcs_head == sen.hed_ID + 1:

                        ioc = i + 1
                        for j in range(i, len(cwl)):
                            if cwl[j].Semantic_markup == 'other' and \
                                    cwl[j].arcs_relation == 'VOB' and cwl[j].arcs_head == sen.hed_ID + 1:
                                if self.is_merge(sen, ioc, j + 1):
                                    Logger.log_DEBUG.debug('合并: ' + cwl[j].name + ' 和 ' + cwl[ioc - 1].name)
                                    cwl[ioc - 1].name = cwl[ioc - 1].name + cwl[j].name
                                    cwl[ioc - 1].arcs_relation = 'VOB'
                                    cwl[j].Semantic_markup = 'merge'

        except Exception as e:
            s = "合并核心词宾语发生异常word_merge_VOB" + str(e)
            Logger.log_ERROR.error(s)
            Logger.log_ERROR.exception(sys.exc_info())
            raise TypeError(s)

    def word_merge_COO(self, sen):

        try:
            cwl = sen.class_word_list

            COO_num = 0
            for i in range(len(cwl)):
                if (cwl[i].Semantic_markup) == 'other' and (cwl[i].pos) not in semantic_annotation.N and \
                        cwl[i].arcs_relation == 'COO' and cwl[i].arcs_head == sen.ap_ID + 1:
                    COO_num = COO_num + 1

            for n in range(COO_num):
                for i in range(len(cwl)):
                    if cwl[i].Semantic_markup == 'other' and cwl[i].pos not in semantic_annotation.N and \
                            cwl[i].arcs_relation == 'COO' and cwl[i].arcs_head == sen.ap_ID + 1:

                        ioc = i + 1
                        for j in range(len(cwl)):
                            if cwl[j].Semantic_markup == 'other' and cwl[j].arcs_head == ioc and \
                                    cwl[j].arcs_relation == 'VOB':
                                if self.is_merge(sen, ioc, j + 1):
                                    Logger.log_DEBUG.debug('合并: ' + cwl[j].name + ' 和 ' + cwl[ioc - 1].name)
                                    if ioc > j + 1:
                                        cwl[ioc - 1].name = cwl[j].name + cwl[ioc - 1].name
                                    else:
                                        cwl[ioc - 1].name = cwl[ioc - 1].name + cwl[j].name
                                    cwl[ioc - 1].arcs_relation = 'VOB'
                                    cwl[j].Semantic_markup = 'merge'

        except Exception as e:
            s = "合并与动属并列的动词与其宾语发生异常word_merge_COO" + str(e)
            Logger.log_ERROR.error(s)
            Logger.log_ERROR.exception(sys.exc_info())
            raise TypeError(s)

    def word_merge_A(self, sen):
        try:
            cwl = sen.class_word_list
            A_num = 0
            for i in range(len(cwl)):
                if cwl[i].Semantic_markup == 'other' and cwl[i].pos not in semantic_annotation.N and \
                        cwl[i].arcs_relation in semantic_annotation.ATT_ADV:
                    A_num = A_num + 1

            for n in range(A_num):
                for i in range(len(cwl)):
                    if cwl[i].Semantic_markup == 'other' and cwl[i].pos not in semantic_annotation.N and \
                            cwl[i].arcs_relation in semantic_annotation.ATT_ADV:
                        Ioc = cwl[i].arcs_head
                        if cwl[Ioc - 1].Semantic_markup == 'other' and self.is_merge(sen, i + 1, Ioc):
                            s = '合并: ' + cwl[i].name + ' 和 ' + cwl[Ioc - 1].name
                            Logger.log_DEBUG.debug(s)
                            if i + 1 > Ioc:
                                cwl[Ioc - 1].name = cwl[Ioc - 1].name + cwl[i].name
                            else:
                                cwl[Ioc - 1].name = cwl[i].name + cwl[Ioc - 1].name

                            cwl[i].Semantic_markup = 'merge'
                            break
        except Exception as e:
            s = "合并定中和状中关系发生异常word_merge_A" + str(e)
            Logger.log_ERROR.error(s)
            Logger.log_ERROR.exception(sys.exc_info())
            raise TypeError(s)

    def is_merge(self, sen, A, Ioc):
        try:
            markup_List = ['merge']
            if abs(A - Ioc) == 1:
                return True
            if A > Ioc:
                for i in range(Ioc - 1, A):
                    if sen.class_word_list[i].Semantic_markup in markup_List:
                        return True
            else:
                for i in range(A - 1, Ioc):
                    if sen.class_word_list[i].Semantic_markup in markup_List:
                        return True
            return False
        except Exception as e:
            s = "判断两个词是否可以合并发生异常is_merge" + str(e)
            Logger.log_ERROR.error(s)
            Logger.log_ERROR.exception(sys.exc_info())
            raise TypeError(s)

    def find_AdvAdj(self, sen, ID, s_markup):
        try:
            advAdj = ''
            if ID < 0:
                return advAdj

            cwl = sen.class_word_list

            for i in range(len(cwl)):
                if cwl[i].arcs_head == ID + 1 and cwl[i].pos in semantic_annotation.N and \
                        cwl[i].Semantic_markup == 'other':
                    advAdj = advAdj + cwl[i].name + ","
                    cwl[i].Semantic_markup = s_markup
                if s_markup == 'adj':
                    if cwl[i].pos in semantic_annotation.N and \
                            cwl[i].Semantic_markup == 'other' and cwl[i].name not in advAdj:
                        advAdj = advAdj + cwl[i].name + ","
                        cwl[i].Semantic_markup = s_markup
            return advAdj
        except Exception as e:
            s = "确定状语和定语发生异常find_AdvAdj" + str(e)
            Logger.log_ERROR.error(s)
            Logger.log_ERROR.exception(sys.exc_info())
            raise TypeError(s)

    def find_indiv(self, sen):
        try:
            if len(sen.class_word_list) <= 3:
                for i in range(len(sen.class_word_list)):
                    if sen.class_word_list[i].Semantic_markup == 'other':
                        sen.class_word_list[i].Semantic_markup = 'indiv'
                        sen.indiv_ID = i
                        return sen.indiv_ID

            flag_1 = -1
            flag_2 = -1
            for i in range(len(sen.class_word_list)):
                if sen.class_word_list[i].Semantic_markup == 'other':
                    if flag_1 < 0:
                        flag_1 = i
                        continue
                    elif flag_2 < 0:
                        flag_2 = i
                        break
            if flag_2 - flag_1 != 1:
                sen.class_word_list[flag_1].Semantic_markup = 'indiv'
                sen.indiv_ID = flag_1
                return sen.indiv_ID

            wc0 = sen.class_word_list[flag_1]
            wc1 = sen.class_word_list[flag_2]

            if wc0.arcs_head == flag_2 + 1 and wc0.arcs_relation in semantic_annotation.ATT_ADV:
                if wc1.Semantic_markup == 'other' and wc1.pos == 'n':
                    wc1.name = wc0.name + wc1.name
                    wc1.Semantic_markup = 'indiv'
                    wc0.Semantic_markup = 'indiv_ATT'
                    sen.indiv_ID = flag_2
                else:
                    wc0.Semantic_markup = 'indiv'
                    sen.indiv_ID = flag_1
            else:
                wc0.Semantic_markup = 'indiv'
                sen.indiv_ID = flag_1
            return sen.indiv_ID
        except Exception as e:
            s = "确定个体发生异常find_indiv" + str(e)
            Logger.log_ERROR.error(s)
            Logger.log_ERROR.exception(sys.exc_info())
            raise TypeError(s)

    def find_ap(self, sen):

        try:
            cwl = sen.class_word_list
            for i in range(len(cwl)):
                if cwl[i].pos == 'v':
                    sen.V_ID = i
                    break
            if sen.V_ID < 0:
                Logger.log_DEBUG.debug('该句子分词经过词性标注后没有动词')
                sen.ap_ID = -1
                return sen.ap_ID
            for i in range(len(cwl)):
                if cwl[i].arcs_relation == 'HED':
                    sen.hed_ID = i
                    break
            if sen.hed_ID < 0:
                Logger.log_DEBUG.debug('经过句法分析没有核心词')
                sen.ap_ID = sen.V_ID
                return sen.ap_ID

            if sen.V_ID == sen.hed_ID:
                sen.ap_ID = sen.hed_ID
                cwl[sen.ap_ID].Semantic_markup = 'ap'
                return sen.ap_ID

            if cwl[sen.hed_ID].pos != 'v':
                new_hed = self.find_late(sen.hed_ID, sen.postags_list)
                if new_hed == sen.V_ID:
                    sen.ap_ID = sen.V_ID
                else:
                    if cwl[new_hed].arcs_head == sen.V_ID or cwl[sen.V_ID].arcs_head == new_hed:
                        sen.ap_ID = sen.V_ID
                    else:
                        sen.ap_ID = new_hed
            else:
                if cwl[sen.V_ID].arcs_head == sen.hed_ID + 1:
                    sen.ap_ID = sen.V_ID
                else:
                    sen.ap_ID = sen.hed_ID

            cwl[sen.ap_ID].Semantic_markup = 'ap'
            cwl[sen.hed_ID].arcs_head = cwl[sen.ap_ID].arcs_head
            cwl[sen.hed_ID].arcs_relation = cwl[sen.ap_ID].arcs_relation
            return sen.ap_ID
        except Exception as e:
            s = "确定动作属性发生异常find_ap" + str(e)
            Logger.log_ERROR.error(s)
            Logger.log_ERROR.exception(sys.exc_info())
            raise TypeError(s)

    def find_late(self, flag_hed, postags_list):
        try:
            flag_L = -1
            flag_R = -1
            for i in range(0, flag_hed):
                if postags_list[i] == 'v':
                    flag_L = i
            for i in range(flag_hed, len(postags_list)):
                if postags_list[i] == 'v':
                    flag_R = i
            if flag_L >= 0 and flag_R >= 0:
                if abs(flag_L - flag_hed) <= abs(flag_R - flag_hed):
                    return flag_L
                else:
                    return flag_R
            else:
                if flag_L >= 0:
                    return flag_L
                else:
                    return flag_R

        except Exception as e:
            s = "找最近动词发生异常find_late" + str(e)
            Logger.log_ERROR.error(s)
            Logger.log_ERROR.exception(sys.exc_info())
            raise TypeError(s)
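A small worked illustration of find_late's nearest-verb choice (assuming sa is an instance of this semantic_annotation class; the POS list is made up):

# With flag_hed = 3, the nearest verb on the left is at index 1 and on the
# right at index 4; the right one is closer to the head, so 4 is returned.
sa.find_late(3, ['n', 'v', 'n', 'wp', 'v'])   # -> 4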
示例#22
0
class Semantic_Parser(object):
    def __init__(self):
        self.cws_model_path = '/home/irlab0/LTP/ltp-data/ltp_data/cws.model'
        self.pos_model_path = '/home/irlab0/LTP/ltp-data/ltp_data/pos.model'
        self.parser_model_path = '/home/irlab0/LTP/ltp-data/ltp_data/parser.model'
        self.ner_model_path = '/home/irlab0/LTP/ltp-data/ltp_data/ner.model'
        self.srl_model_path = '/home/irlab0/LTP/ltp-data/ltp_data/srl/'

    def load(self):
        self.segmentor = Segmentor()
        self.segmentor.load(self.cws_model_path)

        self.postagger = Postagger()
        self.postagger.load(self.pos_model_path)

        self.parser = Parser()
        self.parser.load(self.parser_model_path)

        self.recognizer = NamedEntityRecognizer()
        self.recognizer.load(self.ner_model_path)

        self.labeller = SementicRoleLabeller()
        self.labeller.load(self.srl_model_path)

    def release(self):
        self.segmentor.release()
        self.postagger.release()
        self.parser.release()
        self.recognizer.release()
        self.labeller.release()

    def get_cws(self, sentence):
        try:
            cws = self.segmentor.segment(sentence)
        except:
            cws = self.segmentor.segment(sentence.decode('utf8'))
        print(" ".join(cws))
        return cws

    def get_pos(self, cws):
        postags = self.postagger.postag(cws)
        print(" ".join(postags))
        return postags

    def get_arcs(self, cws, postags):
        arcs = self.parser.parse(cws, postags)
        label = " ".join("%s:%d:%s" % (word, arc.head, arc.relation)
                         for word, arc in zip(cws, arcs))
        print(label)
        return arcs

    def get_role(self, cws, postags, arcs):
        netags = self.recognizer.recognize(cws, postags)
        roles = self.labeller.label(cws, postags, netags, arcs)
        for role in roles:
            print(
                role.index, "".join([
                    "%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end)
                    for arg in role.arguments
                ]))

    def get_query(self, cws, arcs):
        '''
        After dependency-parsing the question, extract its main structure:
        first take the HED word, then its SBV and VOB dependents.

        :param cws:
        :param arcs:
        :return:
        '''
        words = [word for word in cws]
        head = [arc.head for arc in arcs]
        relation = [arc.relation for arc in arcs]
        print(words)
        print(head)
        print(relation)
        hed_index = index(head, 0)[0] + 1
        import_index = index(head, hed_index)
        print(import_index)
        sbv = [words[i] for i in import_index if relation[i] == 'SBV']
        vob = [words[i] for i in import_index if relation[i] == 'VOB']
        print(''.join(sbv))
        print(''.join(vob))
        return ''.join(sbv), ''.join(vob)
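A minimal usage sketch for the Semantic_Parser class above (the hard-coded model paths are machine-specific, the sample question is made up, and get_query additionally relies on an index helper that is not shown in this listing):

sem_parser = Semantic_Parser()
sem_parser.load()                              # load all five LTP models
cws = sem_parser.get_cws('姚明的妻子是谁')        # segmentation
postags = sem_parser.get_pos(cws)              # POS tagging
arcs = sem_parser.get_arcs(cws, postags)       # dependency parsing
sbv, vob = sem_parser.get_query(cws, arcs)     # subject and object of the HED word
sem_parser.release()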
示例#23
0
def extract_opinion(document):
    saywords = load_saywords()

    LTP_DATA_DIR = r'../ltp_data/'  # ltp模型目录的路径
    cws_model_path = os.path.join(LTP_DATA_DIR,
                                  'cws.model')  # 分词模型路径,模型名称为`cws.model`
    segmentor = Segmentor()  # 初始化实例
    segmentor.load(cws_model_path)  # 加载模型

    pos_model_path = os.path.join(LTP_DATA_DIR,
                                  'pos.model')  # 词性标注模型路径,模型名称为`pos.model`
    postagger = Postagger()  # 初始化实例
    postagger.load(pos_model_path)  # 加载模型

    ner_model_path = os.path.join(LTP_DATA_DIR,
                                  'ner.model')  # NER model path; the model file is `ner.model`
    recognizer = NamedEntityRecognizer()  # 初始化实例
    recognizer.load(ner_model_path)  # 加载模型

    par_model_path = os.path.join(
        LTP_DATA_DIR, 'parser.model')  # 依存句法分析模型路径,模型名称为`parser.model`
    parser = Parser()  # 初始化实例
    parser.load(par_model_path)  # 加载模型

    # cut to sentences
    sentences = SentenceSplitter.split(document)
    table = []
    for sentence in sentences:
        # cut words
        words = segmentor.segment(sentence)  # 分词
        # postages
        postags = postagger.postag(words)  # 词性标注
        # ner
        netags = recognizer.recognize(words, postags)  # 命名实体识别
        # dependency parsing
        arcs = parser.parse(words, postags)  # 句法分析

        child_dict_list = build_parse_child_dict(words, postags, arcs)
        index = 0
        for arc in arcs:
            if arc.relation == 'HED':
                break
            index += 1

        # 谓语是说一类的词
        predicate = words[index]
        child_dict = child_dict_list[index]
        if ('SBV' in child_dict) and ('VOB' in child_dict):
            e1 = complete_e(words, postags, child_dict_list,
                            child_dict['SBV'][0])
            r = words[index]
            e2 = ''.join(words[index + 1:])
            if r in saywords:
                table.append((e1, r, e2))

    segmentor.release()  # 释放模型
    postagger.release()  # 释放模型
    recognizer.release()  # 释放模型
    parser.release()  # 释放模型

    return table
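A usage sketch for extract_opinion (it assumes load_saywords, build_parse_child_dict and complete_e are defined elsewhere in the project and that the LTP models sit under ../ltp_data/; the sample text is made up):

document = '小明说,今天天气很好。小红表示同意。'
for e1, r, e2 in extract_opinion(document):
    print(e1, r, e2)   # speaker, say-verb, rest of the clause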
示例#24
0
File: main.py  Project: cwscc/NLP
def parser_ltp_arc(word_list, tag_list):
    parser = Parser()  #初始化实例
    parser.load(par_model_path)
    arcs = parser.parse(word_list, tag_list)  #依存句法分析,一条评论的依存句法分析
    parser.release()
    return arcs
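Because this helper reloads parser.model on every call, repeated use is slow; a sketch of a variant that loads the model once and reuses it (par_model_path is assumed to be the same module-level path used above):

_parser = Parser()
_parser.load(par_model_path)

def parser_ltp_arc_cached(word_list, tag_list):
    # reuse the already-loaded dependency parser instead of reloading per call
    return _parser.parse(word_list, tag_list)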
示例#25
0
class Extraction():
    def __init__(self, cws_model_path: str, pos_model_path: str,
                 ner_model_path: str, parser_model_path: str,
                 spoken_words: str, word2vec):
        self.spoken_words = spoken_words
        self.truncate_index = 8

        self.segmentor = Segmentor()
        self.postagger = Postagger()
        self.ner = NamedEntityRecognizer()
        self.parser = Parser()

        self.segmentor.load(str(cws_model_path))
        self.postagger.load(str(pos_model_path))
        self.ner.load(str(ner_model_path))
        self.parser.load(str(parser_model_path))
        self.word2vec = Word2Vec.load(word2vec)

    def release(self):
        self.segmentor.release()
        self.postagger.release()
        self.ner.release()
        self.parser.release()

    def get_next_sentence(self, news, index):
        stop1 = news[index + 1:].find("。")
        stop2 = news[index + 1:].find("!")
        stop3 = news[index + 1:].find("?")
        stop_list = [stop for stop in [stop1, stop2, stop3] if stop != -1]
        if len(stop_list) == 0:  # no further sentence-ending punctuation
            return False

        stop = min(stop_list)
        return news[index:index + stop + 2], index + stop + 2

    def cut(self, string):
        return " ".join(jieba.cut(string))

    def sentence_distance(self, sentence1, sentence2):
        word_list1 = self.cut(sentence1).split()  # split back into words; iterating the joined string would yield characters
        word_list2 = self.cut(sentence2).split()

        vec_1 = 0
        vec_2 = 0
        ### get representation of sentence 1
        for i in range(len(word_list1)):
            if word_list1[i] in self.word2vec.wv.vocab:
                vec_1 += self.word2vec.wv[word_list1[i]]

        ### get representation of sentence 2
        for i in range(len(word_list2)):
            if word_list2[i] in self.word2vec.wv.vocab:
                vec_2 += self.word2vec.wv[word_list2[i]]

        return np.dot(vec_1,
                      vec_2) / (np.linalg.norm(vec_1) * np.linalg.norm(vec_2))

    def get_sentence(self, news, word_list, idx, postag_list, sub_id):
        # 取得 说的内容 及SBV的宾语成分
        # idx 为表示说的词在新闻中的位置信息
        index = len("".join(word_list[:idx + 1]))
        sub_index = len("".join(word_list[:sub_id]))

        if news[index] == "。" or news[index] == "!" or news[index] == "?":
            stop1 = news[:index].rfind("。")
            stop2 = news[:index].rfind("!")
            stop3 = news[:index].rfind("?")
            # 检查是不是后面没有句子了
            stop_list = [stop for stop in [stop1, stop2, stop3] if stop != -1]
            if len(stop_list) == 0:
                stop = 0
            else:
                stop = max(stop_list) + 1

            begin = float("inf")
            end = float("inf")
            if "“" in news[:index] and "”" in news[:index]:
                begin = news[:index].rfind("“")
                end = news[:index].rfind("”")
            # 第一种情况 双引号在stop前面,表示说词后面跟的是双引号的句子 则双引号里的内容即为说的内容
            if end != float("inf") and sub_index - end < self.truncate_index:
                result = news[begin + 1:end]
            else:
                result = news[stop:sub_index]
            # print(result)
        else:
            stop1 = news[index:].find("。")
            stop2 = news[index:].find("!")
            stop3 = news[index:].find("?")
            # 检查是不是后面没有句子了
            stop_list = [stop for stop in [stop1, stop2, stop3] if stop != -1]
            if len(stop_list) == 0:
                return False
            # 返回后面的第一个句子
            stop = min(stop_list)
            sentence = news[index:stop + index + 1]
            if postag_list[idx + 1] == 'wp':
                sentence = sentence[1:]
            if postag_list[idx + 2] == 'wp':
                sentence = sentence[1:]
            result = sentence
            sim = 1
            next_id = stop + index + 2
            # 检查第二个句子是否也是说的内容,通过检查句子相似性来判断 若是相似度大于某个数值则表示 相近 这个句子也是说的内容
            # 如果相似度大于0.7表示 该句话和前一句内容相似 所以这句话 也为说的内容 继续检查下一句话
            while sim > 0.85 and next_id <= len(news):
                next_sentence_id = next_id
                if next_sentence_id <= len(news):
                    next_sentence, next_id = self.get_next_sentence(
                        news, next_sentence_id)
                    sim = self.sentence_distance(sentence, next_sentence)
                if sim > 0.85:
                    result += next_sentence
                    sentence = next_sentence
        return result

    def get_sub_and_view(self, idxs, news, word_list, postag_list):
        sub = []
        speech = []
        for sub_id, spoken_id in idxs:
            sub.append(word_list[sub_id])
            speech.append(
                self.get_sentence(news, word_list, spoken_id, postag_list,
                                  sub_id))
        return sub, speech

    def find_spoken_word_id_and_sub(self, spoken_words, word_list, ner_list,
                                    parser_list):
        #取得 新闻中 包含SBV关系 并且V表示的是说的意思 的主语和谓语的位置
        id_list = []
        for sub_id, parse_relation in enumerate(parser_list):
            index, relation = parse_relation
            if relation == "SBV" and (ner_list[sub_id] == "S-Nh"
                                      or ner_list[sub_id] == "S-Ni"
                                      or ner_list[sub_id] == "S-Ns"):
                spoken_word = word_list[index - 1]
                if spoken_word in spoken_words:
                    word_id = index - 1
                    id_list.append((sub_id, word_id))
        return id_list

    def newsExtraction(self, news):
        word_list = list(self.segmentor.segment(news))
        postag_list = list(self.postagger.postag(word_list))
        ner_list = list(self.ner.recognize(word_list, postag_list))
        arcs = self.parser.parse(word_list, postag_list)
        parser_list = [(arc.head, arc.relation) for arc in arcs]

        idx = self.find_spoken_word_id_and_sub(self.spoken_words, word_list,
                                               ner_list, parser_list)
        # print(idx)
        sub, speech = self.get_sub_and_view(idx, news, word_list, postag_list)
        # for i in range(len(sub)):
        #     print(sub[i], speech[i])
        return sub, speech
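A usage sketch for the Extraction class (every path, the spoken-words list and the word2vec file below are placeholders for illustration; spoken_words is typed as str in the signature but is only used for membership tests, so a list of say-verbs also works):

extractor = Extraction(cws_model_path='ltp_data/cws.model',
                       pos_model_path='ltp_data/pos.model',
                       ner_model_path='ltp_data/ner.model',
                       parser_model_path='ltp_data/parser.model',
                       spoken_words=['说', '表示', '认为'],
                       word2vec='word2vec.model')
subs, speeches = extractor.newsExtraction('新华社报道,小明表示:“今天天气很好。”')
extractor.release()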
示例#26
0
class Senten_Parse:

	def __init__(self, flag=True):
		self.flag = flag  # 是否使用自定义词典,True:使用,False:不使用,默认为True
		self.segmentor = Segmentor()  # 分词模型初始化
		self.postagger = Postagger()  # 词性标注模型初始化
		self.parser = Parser()  # 句法分析模型初始化

		if self.flag:
			if os.path.exists(config_TM.userdict_path):
				# 加载分词模型,使用自定义词典
				self.segmentor.load_with_lexicon(config_TM.cws_model_path, config_TM.userdict_path)
				Logger.log_DEBUG.info('分词模型加载成功,使用自定义词典')
			else:
				Logger.log_ERROR.error('没找到自定义词典文件,请检查路径是否正确')
		else:
			# 加载分词模型,不使用自定义词典
			self.segmentor.load(config_TM.cws_model_path)
			Logger.log_DEBUG.info('分词模型加载成功,不使用自定义词典')

		# 加载词性标注模型
		self.postagger.load(config_TM.pos_model_path)
		Logger.log_DEBUG.info('词性标注模型加载成功')

		# 加载句法分析模型
		self.parser.load(config_TM.par_model_path)
		Logger.log_DEBUG.info('句法分析模型加载成功')

	def sentence_parse(self, sentence):
		"""
		语句分析
		:param sentence: 待处理语句
		:return: 返回分词结果(list)、词性标注结果(list)、句法分析结果(list-tuple)
		"""
		segmentor = self.segmentor
		postagger = self.postagger
		parser = self.parser

		# 分词结果列表
		words_list = list(segmentor.segment(sentence))

		# 词性标注
		postags = postagger.postag(words_list)
		# 词性标注结果列表
		pos_list = [pos for word, pos in zip(words_list, postags)]

		# 句法分析
		arcs = parser.parse(words_list, postags)
		# 句法分析结果列表
		arcs_list = []

		temp_arcs_list = [(arc.head, arc.relation) for arc in arcs]
		arcslist_dic = dict((i, c) for i, c in enumerate(temp_arcs_list))

		words_dic = dict((i, c) for i, c in enumerate(words_list))

		for key in arcslist_dic:
			arcslist_dic_key = arcslist_dic[key]
			if arcslist_dic_key[1] == 'HED':
				temp_list = [words_dic[key], arcslist_dic_key[1], words_dic[key]]
				arcs_list.append(temp_list)
			else:
				temp_list = [words_dic[key], arcslist_dic_key[1], words_dic[arcslist_dic_key[0] - 1]]
				arcs_list.append(temp_list)

		Logger.log_DEBUG.info('语句分析完成!')

		return words_list, pos_list, arcs_list

	def __del__(self):
		"""
		释放模型
		:return:
		"""
		self.segmentor.release() # 分词模型释放
		self.postagger.release() # 词性标注模型释放
		self.parser.release() # 句法分析模型释放
		print('-------')
		print('模型释放完成')
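A usage sketch for Senten_Parse (config_TM is assumed to point at valid model files; the sample sentence is made up):

sp = Senten_Parse(flag=False)             # without the user-defined lexicon
words, pos, arcs = sp.sentence_parse('小明今天去北京开会')
for word, relation, head in arcs:         # each item is [word, relation, head word]
    print(word, relation, head)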
示例#27
0
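The top of this example is truncated in the listing; the fragment below assumes a segmentation and POS-tagging preamble roughly like the following sketch (the cws.model/pos.model file names, the sample sentence and the contents of exceptposttag are assumptions):

from pyltp import Segmentor, Postagger

segmentor = Segmentor()
segmentor.load('/home/curtank/Documents/ltp_data/cws.model')  # assumed file name in the same directory
words = list(segmentor.segment('元芳你怎么看'))
segmentor.release()

postagger = Postagger()
postagger.load('/home/curtank/Documents/ltp_data/pos.model')  # assumed file name
postags = postagger.postag(words)
exceptposttag = ['wp', 'u']   # POS tags to skip when printing words (assumed list)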
for i in range(len(words)):
    if postags[i] in exceptposttag:
        continue
    print(words[i])
print('\t'.join(postags))
postagger.release()  # 释放模型

from pyltp import NamedEntityRecognizer
recognizer = NamedEntityRecognizer()  # 初始化实例
recognizer.load('/home/curtank/Documents/ltp_data/ner.model')  # 加载模型
netags = recognizer.recognize(words, postags)  # 命名实体识别
print('\t'.join(netags))
recognizer.release()  # 释放模型

from pyltp import Parser
parser = Parser()
parser.load('/home/curtank/Documents/ltp_data/parser.model')
arcs = parser.parse(words, postags)  # 句法分析
print("\t".join("%d:%s" % (arc.head, arc.relation) for arc in arcs))
parser.release()  # 释放模型

from pyltp import SementicRoleLabeller
labeller = SementicRoleLabeller()  # 初始化实例
labeller.load('/home/curtank/Documents/ltp_data/srl')  # 加载模型
roles = labeller.label(words, postags, netags, arcs)  # 语义角色标注
for role in roles:
    print(
        role.index, "  ".join([
            "%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end)
            for arg in role.arguments
        ]))
labeller.release()  # 释放模型
示例#28
0
class ExtractViews(object):
    def __init__(self):
        self.sents = self.load()
        self.sent_embd = SentEmbedding()
        self.segmentor = Segmentor()
        self.postagger = Postagger()
        self.parser = Parser()

    def load(self, inpath='./data/news_content.json'):
        data = read_json(inpath)
        all_sents = []
        for news in data:
            for frag in news:
                sents = frag['sents']
                all_sents.append(sents)
        print('finished loading all sentences')
        return all_sents

    def prepare_data(self, sampled_sents=1000):
        print('start preparing items for sentences embedding')
        sentences_sampled = random.sample(self.sents, sampled_sents)
        self.sent_embd.prepare(sentences_sampled)
        print('sentence embedding data prepare finished')

    def prepare_nlp_parser(self):
        self.segmentor.load(r'./ltpmodels/cws.model')
        self.postagger.load(r'./ltpmodels/pos.model')
        self.parser.load(r'./ltpmodels/parser.model')

    def extract_news(self, content):
        content = content.strip()
        paras = content.split('\n')
        sentences = []
        for para in paras:
            sentences.append(cut_sent(para))
        views = self._extract_views(sentences)
        return views

    def _extract_views(self, all_sents):
        nums = len(all_sents)
        views_in_sents = []
        print('{} paragraphs in total to process'.format(nums))
        for i, sents in enumerate(all_sents):
            views_tmp = []
            if i % 100 == 0:
                print('processing paras : {}/{}'.format(i, nums))
            for j, sent in enumerate(sents):
                sent = sent.replace('\\n', '\n').strip()
                # sentence长度达到1000左右时,ltp会报错
                if len(sent) == 0 or len(sent) > 500:
                    continue
                # words = list(jieba.cut(sent))
                words = list(self.segmentor.segment(sent))
                contains = contain_candidates(words)
                if len(contains) == 0:
                    continue
                tags = list(self.postagger.postag(words))
                arcs = list(self.parser.parse(words, tags))
                sbv, head = get_sbv_head(arcs, words, tags)
                if sbv[0] is None or head[0] is None or head[0] not in contains:
                    continue
                subj = sbv[0]
                view = clean_view(words[head[1] + 1:])
                views_tmp.append((subj, view, j))
            views_final = self._get_final_views(sents, views_tmp)
            if len(views_final) > 0:
                views_in_sents.extend(views_final)
        return views_in_sents

    def extract(self):
        all_sents = self.sents
        views_in_sents = self._extract_views(all_sents)
        return views_in_sents

    def _get_final_views(self, sents, views_tmp):
        def _entire_emb(emb: np.array, sents_no_ind):
            dim = emb.shape[1]
            for ind in sents_no_ind:
                # 不存在embedding的句子补0
                emb = np.insert(emb, ind, np.zeros((1, dim)), axis=0)
            return emb

        embeddings, sents_no_ind = self.sent_embd.sents_embedding(sents)
        # 获得所有句子的embeding
        embeddings = _entire_emb(embeddings, sents_no_ind)
        views_final = []
        for i, view in enumerate(views_tmp):
            start = view[2]
            stop = len(views_tmp)
            if i < len(views_tmp) - 1:
                stop = views_tmp[i + 1][2]
            end = self._get_view_end(embeddings, start, stop)
            views_final.append({
                'subj': view[0],
                'view': view[1] + ''.join(sents[start + 1:end])
            })
        return views_final

    def _get_view_end(self, embeddings, start, stop, sim_threshold=0.8):
        # 判断view是不是在尾句或view不存在embedding
        if start + 1 >= stop or np.sum(np.abs(embeddings[start])) == 0:
            return start
        end = start + 1
        for i in range(start + 1, stop):
            sent_emb = embeddings[i]
            curr_emb = np.mean(embeddings[start:i], axis=0)  # mean sentence vector so far, kept as a vector
            sim = self.sent_embd.cos_similarity(curr_emb, sent_emb)
            if sim < sim_threshold:
                break
            end += 1
        return end

    def release_nlp_parser(self):
        self.segmentor.release()
        self.postagger.release()
        self.parser.release()

    def run(self):
        self.prepare_data()
        self.prepare_nlp_parser()
        views = self.extract()
        self.release_nlp_parser()
        write_json('./data/news_views_final.json', views)
        print('finished extracting views')
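A usage sketch for ExtractViews (it assumes ./data/news_content.json, the ./ltpmodels directory and the SentEmbedding, read_json, cut_sent, contain_candidates, get_sbv_head, clean_view and write_json helpers referenced above are all available; the sample text is made up):

ev = ExtractViews()
ev.prepare_data(sampled_sents=1000)
ev.prepare_nlp_parser()
views = ev.extract_news('小明表示,今天天气很好。阳光明媚,适合出门。')
ev.release_nlp_parser()
print(views)    # list of {'subj': ..., 'view': ...} dicts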
示例#29
0
class myLTP(object):
    def __init__(self):

        LTP_DATA_DIR = 'ltp_data_v3.4.0'  # ltp模型目录的路径
        cws_model_path = os.path.join(LTP_DATA_DIR, 'cws.model')
        ner_model_path = os.path.join(LTP_DATA_DIR,
                                      'ner.model')  # NER model path; the model file is `ner.model`
        pos_model_path = os.path.join(LTP_DATA_DIR, 'pos.model')
        par_model_path = os.path.join(LTP_DATA_DIR, 'parser.model')

        self.recognizer = NamedEntityRecognizer()  # 初始化实例
        self.recognizer.load(ner_model_path)  # 加载模型
        self.segmentor = Segmentor()
        self.segmentor.load(cws_model_path)
        self.postagger = Postagger()  # 初始化实例
        self.postagger.load(pos_model_path)  # 加载模型
        self.parser = Parser()  # 初始化实例
        self.parser.load(par_model_path)  # 加载
        #get postags

    def segment(self, text):
        words = list(self.segmentor.segment(text))
        #print(words)
        return words

    def postag(self, words):
        postags = list(self.postagger.postag(words))
        return postags

    def arcs(self, words, postags):
        arcs = list(self.parser.parse(words, postags))  # dependency parsing
        return arcs

    #get time
    def extract_info(self, text):
        words = self.segment(text)
        postags = self.postag(words)
        arcs = self.arcs(words, postags)

        #get time
        time_lst = []
        i = 0
        for tag, word in zip(postags, words):
            if tag == 'nt':
                j = i
                while j < len(postags) and (postags[j] == 'nt' or words[j] in ['至', '到']):
                    j += 1
                time_lst.append(''.join(words[i:j]))
            i += 1

        # 去重子字符串的情形
        remove_lst = []
        for i in time_lst:
            for j in time_lst:
                if i != j and i in j:
                    remove_lst.append(i)

        text_time_lst = []
        for item in time_lst:
            if item not in remove_lst:
                text_time_lst.append(item)

        #get entity
        netags = list(self.recognizer.recognize(words, postags))  # 命名实体识别

        entity_index = [i for i in range(len(netags)) if netags[i] != 'O']
        print(entity_index)
        entity_words = []

        #words = [x for (index,x) in enumerate(words) if index in entity_index]
        #print(words)
        #merge words
        i = 0
        tags = []
        while i < len(entity_index):

            if netags[entity_index[i]][0] == "B":
                tags.append(netags[entity_index[i]][2:])
                begin = entity_index[i]
                end = entity_index[i]
                for j in range(begin + 1, len(netags)):
                    i = i + 1
                    if netags[j][0] == "E":
                        end = j
                        break
                new_word = "".join(words[begin:end + 1])
                entity_words.append(new_word)

            else:
                entity_words.append(words[entity_index[i]])
                tags.append(netags[entity_index[i]][2:])
            i += 1

        print(entity_words)
        #print(text_time_lst)
        # print("\n\n\n")
        relation_lst = []
        child_dict_list = self.build_parse_child_dict(words, postags, arcs)
        for index in range(len(postags)):
            # 抽取以谓词为中心的事实三元组
            if postags[index] == 'v':
                child_dict = child_dict_list[index]
                # 主谓宾
                if 'SBV' in child_dict and 'VOB' in child_dict:
                    e1 = self.complete_e(words, postags, child_dict_list,
                                         child_dict['SBV'][0])
                    r = words[index]
                    e2 = self.complete_e(words, postags, child_dict_list,
                                         child_dict['VOB'][0])
                    if e1 not in entity_words:
                        entity_words.append(e1)
                        tags.append("Nr")  #node regular
                    if r not in entity_words:
                        entity_words.append(r)
                        tags.append("Nv")  #node verb

                    if e2 not in entity_words:
                        entity_words.append(e2)
                        tags.append("Nr")
                    relation_lst.append(Relation(e1, r, "SBV"))  # subject-verb
                    relation_lst.append(Relation(r, e2, "VOB"))  # verb-object

                    print("主语谓语宾语关系\t(%s, %s, %s)\n" % (e1, r, e2))
                    #out_file.write("主语谓语宾语关系\t(%s, %s, %s)\n" % (e1, r, e2))
                    #out_file.flush()
                # 定语后置,动宾关系
                if arcs[index].relation == 'ATT':
                    if 'VOB' in child_dict:
                        e1 = self.complete_e(words, postags, child_dict_list,
                                             arcs[index].head - 1)
                        r = words[index]
                        e2 = self.complete_e(words, postags, child_dict_list,
                                             child_dict['VOB'][0])
                        temp_string = r + e2
                        if temp_string == e1[:len(temp_string)]:
                            e1 = e1[len(temp_string):]
                        if temp_string not in e1:
                            if e1 not in entity_words:
                                entity_words.append(e1)
                                tags.append("Nr")  #node regular
                            if r not in entity_words:
                                entity_words.append(r)
                                tags.append("Nv")  #node verb
                            if e2 not in entity_words:
                                entity_words.append(e2)
                                tags.append("Nr")
                            relation_lst.append(Relation(r, e2, "VOB"))
                            relation_lst.append(Relation(e1, r, "ATT"))

                            print("定语后置动宾关系\t(%s, %s, %s)\n" % (e1, r, e2))
                            # out_file.write("定语后置动宾关系\t(%s, %s, %s)\n" % (e1, r, e2))
                            #out_file.flush()
                # 含有介宾关系的主谓动补关系
                if 'SBV' in child_dict and 'CMP' in child_dict:
                    #e1 = words[child_dict['SBV'][0]]
                    e1 = self.complete_e(words, postags, child_dict_list,
                                         child_dict['SBV'][0])
                    cmp_index = child_dict['CMP'][0]
                    r = words[index] + words[cmp_index]
                    if 'POB' in child_dict_list[cmp_index]:
                        e2 = self.complete_e(
                            words, postags, child_dict_list,
                            child_dict_list[cmp_index]['POB'][0])
                        if e1 not in entity_words:
                            entity_words.append(e1)
                            tags.append("Nr")  #node regular
                        if r not in entity_words:
                            entity_words.append(r)
                            tags.append("Nv")  #node verb
                        if e2 not in entity_words:
                            entity_words.append(e2)
                            tags.append("Nr")
                        relation_lst.append(Relation(
                            e1, r, "SBV"))  # Subject Verb Complement
                        relation_lst.append(Relation(r, e2, "CMP"))
                        print("介宾关系主谓动补\t(%s, %s, %s)\n" % (e1, r, e2))
                        #out_file.write("介宾关系主谓动补\t(%s, %s, %s)\n" % (e1, r, e2))
                        #out_file.flush()

        return (entity_words, tags, relation_lst, text_time_lst)

    def build_parse_child_dict(self, words, postags, arcs):
        """
        为句子中的每个词语维护一个保存句法依存儿子节点的字典
        Args:
            words: 分词列表
            postags: 词性列表
            arcs: 句法依存列表
        """
        child_dict_list = []
        #print(list(words), len(words))
        for index in range(len(words)):
            child_dict = dict()
            for arc_index in range(len(arcs)):
                if arcs[arc_index].head == index + 1:

                    if arcs[arc_index].relation in child_dict:
                        child_dict[arcs[arc_index].relation].append(arc_index)
                    else:
                        child_dict[arcs[arc_index].relation] = []
                        child_dict[arcs[arc_index].relation].append(arc_index)
            #if child_dict.has_key('SBV'):
            #    print words[index],child_dict['SBV']
            child_dict_list.append(child_dict)
        #print(child_dict_list)
        return child_dict_list
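    # Example of the resulting structure (for an assumed parse of "小明 说 你好"
    # with 小明 --SBV--> 说 and 你好 --VOB--> 说): child_dict_list would be
    # [{}, {'SBV': [0], 'VOB': [2]}, {}]; child indices are 0-based while
    # arc.head is 1-based.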

    def complete_e(self, words, postags, child_dict_list, word_index):
        """
        完善识别的部分实体
        """
        child_dict = child_dict_list[word_index]
        prefix = ''
        if 'ATT' in child_dict:
            for i in range(len(child_dict['ATT'])):
                prefix += self.complete_e(words, postags, child_dict_list,
                                          child_dict['ATT'][i])

        postfix = ''
        if postags[word_index] == 'v':
            if 'VOB' in child_dict:
                postfix += self.complete_e(words, postags, child_dict_list,
                                           child_dict['VOB'][0])
            if 'SBV' in child_dict:
                prefix = self.complete_e(words, postags, child_dict_list,
                                         child_dict['SBV'][0]) + prefix

        return prefix + words[word_index] + postfix

    def free_ltp(self):
        self.postagger.release()
        self.recognizer.release()  # 释放模型
        self.segmentor.release()
        self.parser.release()
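A usage sketch for myLTP (it assumes the ltp_data_v3.4.0 model directory from __init__ exists and that the Relation class used in extract_info is defined elsewhere in the project; the sample text is made up):

ltp = myLTP()
entities, tags, relations, times = ltp.extract_info('2019年5月,小明在北京访问了清华大学。')
print(entities)   # entity and predicate nodes
print(times)      # extracted time expressions
ltp.free_ltp()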
示例#30
0
class DSFN:
    """进行自然语言处理,包括分词,词性标注,命名实体识别,依存句法分析
    Attributes:
        default_user_dict_dir:str,用户自定义词典目录
        default_model_dir:str,ltp模型文件目录
    """

    entity_verb_new = entity_verb_new()
    all_entity = entity_verb_new.readAllEntity(
        "../../entity_verb//entity_verb_result\\all_entity.json")
    default_model_dir = 'D:\python-file\knowledge_extraction-master-tyz\\ltp_data_v3.4.0\\'  # LTP模型文件目录

    def __init__(self, model_dir=default_model_dir, all_entity=all_entity):
        self.default_model_dir = model_dir
        # 加载ltp模型
        #
        default_model_dir = 'D:\python-file\knowledge_extraction-master-tyz\\ltp_data_v3.4.0\\'  # LTP模型文件目录
        self.segmentor = Segmentor()
        user_dict = "..\\source\\user.txt"
        segmentor_flag = self.segmentor.load_with_lexicon(
            os.path.join(default_model_dir, 'cws.model'), user_dict)
        # segmentor_flag = self.segmentor.load(os.path.join(default_model_dir, 'cws.model'))
        # 词性标注模型
        self.postagger = Postagger()
        postag_flag = self.postagger.load(
            os.path.join(self.default_model_dir, 'pos.model'))
        # 命名实体识别模型
        self.recognizer = NamedEntityRecognizer()
        ner_flag = self.recognizer.load(
            os.path.join(self.default_model_dir, 'ner.model'))
        # 依存句法分析模型
        self.parser = Parser()
        parser_flag = self.parser.load(
            os.path.join(self.default_model_dir, 'parser.model'))

        if segmentor_flag or postag_flag or ner_flag or parser_flag:  # 可能有错误
            print('load model failed')

    def segment(self, sentence, entity_postag=dict()):
        words = self.segmentor.segment(sentence)
        lemmas = []
        for lemma in words:
            lemmas.append(lemma)
        return lemmas

    def getPostag(self):
        return self.postagger

    def postag(self, lemmas):
        """
        Parameters
        ----------
        lemmas : List,分词后的结果
        entity_dict:Set,实体词典,处理具体的一则判决书的结构化文本时产生
        Returns
        -------
        words:WordUnit List,包括分词与词性标注的结果
        """
        words = []
        # 词性标注
        postags = self.postagger.postag(lemmas)
        for i in range(len(lemmas)):
            # 存储分词与词性标记后的词单元WordUnit,编号从1开始
            word = WordUnit(i + 1, lemmas[i], postags[i])
            words.append(word)
        # self.postagger.release() #释放
        return words

    def get_postag(self, word):
        """获得单个词的词性标注
        Args:
            word:str,单词
        Returns:
            pos_tag:str,该单词的词性标注
        """
        pos_tag = self.postagger.postag([word])
        return pos_tag[0]

    def netag(self, words):
        """
        命名实体识别,并对分词与词性标注后的结果进行命名实体识别与合并
        Parameters
            words : WordUnit list,包括分词与词性标注结果
        Returns
            words_netag:WordUnit list,包含分词,词性标注与命名实体识别的结果
        """
        lemmas = []  # 存储分词后的结果
        postags = []  # 存储词性标注结果
        for word in words:
            lemmas.append(word.lemma)
            postags.append(word.postag)
        # 命名实体识别
        netags = self.recognizer.recognize(lemmas, postags)
        print(netags)
        for netag in netags:
            print(netag)
        words_netag = EntityCombine().combine(words, netags)
        return words_netag

    def parse(self, words):
        """
        对分词,词性标注与命名实体识别后的结果进行依存句法分析(命名实体识别可选)
        Args:
            words_netag:WordUnit list,包含分词,词性标注与命名实体识别结果
        Returns
            *:sentenceUnit 句子单元
        """
        lemmas = []  # 分词结果
        postags = []  # 词性标注结果
        for word in words:
            lemmas.append(word.lemma)
            postags.append(word.postag)
        # 依存句法分析
        arcs = self.parser.parse(lemmas, postags)
        for i in range(len(arcs)):
            words[i].head = arcs[i].head
            words[i].dependency = arcs[i].relation
        return SentenceUnit(words)

    def close(self):
        """
        关闭与释放
        """
        # pynlpir.close()
        self.postagger.release()
        self.recognizer.release()
        self.parser.release()

    def dsfn1_2_3_4COO(self, sentence, item1, item2):
        allTripes = []
        """
        判断两个实体是否属于DSFN1的情况,并输出三元组
        """
        if (item1.dependency == "ATT"):
            AttWord = item1.head_word
            AttWordDict = dict()
            AttWordStr = ""
            while AttWord.ID < item2.ID:
                AttWordDict[AttWord.ID] = AttWord.lemma
                # AttWordStr += AttWord.lemma
                if (AttWord.dependency == "ATT"):
                    AttWord = AttWord.head_word
                else:
                    break

            if (AttWord.ID == item2.ID):
                flag = True
                while flag:
                    len1 = len(AttWordDict)
                    AttList = AttWordDict.keys()
                    for id in range(item1.ID + 1, item2.ID):
                        item = sentence.get_word_by_id(id)
                        if item.head_word != None and item.head_word.ID in AttList and (
                                item.dependency == "ATT"):
                            AttWordDict[item.ID] = item.lemma
                    if len1 == len(AttWordDict):
                        flag = False
                    else:
                        flag = True
                AttWordDict = sorted(AttWordDict.items(),
                                     key=lambda item: item[0])
                AttWordStr = ""
                for i in AttWordDict:
                    AttWordStr += i[1]
                print("三元组:(" + item1.lemma + "," + AttWordStr + "," +
                      item2.lemma + ")")
                allTripes.append([item1.lemma, AttWordStr, item2.lemma])
        """
        判断两个实体是否属于DSFN1的情况,并输出三元组
        """
        """
        考虑DSFN2的情况
        """
        if item1.dependency == "SBV" and item1.head_word.postag == "v":
            pred1 = item1.head_word
            predDict = dict()
            predDict[pred1.ID] = pred1.lemma

            if item2.dependency == "VOB" and item2.head_word.postag == "v":
                pred2 = item2.head_word
                predDict[pred2.ID] = pred2.lemma
                if (len(predDict) == 1):
                    PredWordStr = ""
                    for i in predDict:
                        PredWordStr += predDict[i]
                    print("DSFN2三元组:(" + item1.lemma + "," + PredWordStr +
                          "," + item2.lemma + ")")
                    allTripes.append([item1.lemma, PredWordStr, item2.lemma])
                    """
                    新加,为了考虑“习近平视察和访问上海”的情况
                    """
                if len(predDict) == 2:
                    num = self.get_entity_num_between(pred1, pred2, sentence)
                    flagSBV = True
                    flagVOB = True
                    for word in sentence.words:
                        if word.dependency == "SBV" and word.head_word.ID == pred2.ID:
                            flagSBV = False
                        if word.dependency == "VOB" and word.head_word.ID == pred1.ID:
                            flagVOB = False
                    print("pred1:" + pred1.lemma + ",pred2:" + pred2.lemma +
                          ",num:" + str(num))
                    if num == 0:
                        if flagVOB == True:
                            print("DSFN2三元组:(" + item1.lemma + "," +
                                  pred1.lemma + "," + item2.lemma + ")")
                            allTripes.append(
                                [item1.lemma, pred1.lemma, item2.lemma])
                        if flagSBV == True:
                            print("DSFN2三元组:(" + item1.lemma + "," +
                                  pred2.lemma + "," + item2.lemma + ")")
                            allTripes.append(
                                [item1.lemma, pred2.lemma, item2.lemma])
        """
        DSFN3.0
        """
        pred = None
        prep = None
        if item1.dependency == "SBV" and item1.head_word.postag == "v" and item2.dependency == "POB":
            pred = item1.head_word
            prep = item2.head_word
        elif item1.dependency == "FOB" and item2.dependency == "POB":  # 考虑介词为“被”的情况,如 “小王被小明所陷害”
            pred = item1.head_word
            prep = item2.head_word
            c = item1
            item1 = item2
            item2 = c
        if pred != None and prep != None:
            if prep.dependency == "ADV":
                if prep.head_word.ID == pred.ID:
                    pred2 = None
                    object = None
                    objectForPred2 = None
                    for i in range(pred.ID + 1, len(sentence.words) + 1):
                        item = sentence.get_word_by_id(i)

                        if item.dependency == "VOB" and item.head_word.ID == pred.ID:
                            object = item
                            objectDict = dict()
                            objectDict[object.ID] = object
                            for word in sentence.words:
                                if word.head_word != None and word.dependency == "ATT" and word.head_word.ID == object.ID:
                                    objectDict[word.ID] = word
                            objectDict = sorted(objectDict.items(),
                                                key=lambda item: item[0])
                            objectStr = ""
                            for objectItem in objectDict:
                                objectStr += objectItem[1].lemma
                            print("DSFN3三元组:(" + item1.lemma + "," +
                                  pred.lemma + "" + objectStr + "," +
                                  item2.lemma + ")")
                            allTripes.append([
                                item1.lemma, pred.lemma + "" + objectStr,
                                item2.lemma
                            ])
                    if object == None:
                        print("DSFN3三元组:(" + item1.lemma + "," + pred.lemma +
                              "," + item2.lemma + ")")
                        allTripes.append(
                            [item1.lemma, pred.lemma, item2.lemma])
        """
        DSFN4
        """
        pred = None
        prep = None
        prep1 = None
        pred2 = None
        if item1.dependency == "SBV" and item1.head_word.postag == "v" and item2.dependency == "POB":
            pred = item1.head_word
            prep = item2.head_word
            if prep.dependency == "CMP" and prep.head_word.postag == "v":
                pred2 = prep.head_word
                if pred2.ID == pred.ID:
                    print("DSFN4三元组:(" + item1.lemma + "," + pred.lemma + "" +
                          prep.lemma + "," + item2.lemma + ")")
                    allTripes.append([
                        item1.lemma, pred.lemma + "" + prep.lemma, item2.lemma
                    ])
                else:
                    num = self.get_entity_num_between(pred, pred2, sentence)
                    flagSBV = True
                    flagVOB = True
                    for word in sentence.words:
                        if word.dependency == "SBV" and word.head_word.ID == pred2.ID:
                            flagSBV = False
                        if word.dependency == "VOB" and word.head_word.ID == pred.ID:
                            flagVOB = False
                    # print("pred1:"+pred1.lemma+",pred2:"+pred2.lemma+",num:"+str(num))
                    if num == 0:
                        flag = True
                        for word in sentence.words:
                            if word.dependency == "CMP" and word.head_word.ID == pred.ID:
                                prep1 = word
                        if prep1 != None:
                            if flagVOB == True:
                                # print("DSFN4三元组:(" + item1.lemma + "," + pred.lemma + "" + prep1.lemma + "," + item2.lemma + ")")
                                allTripes.append([
                                    item1.lemma, pred.lemma + "" + prep1.lemma,
                                    item2.lemma
                                ])
                            # print("DSFN4三元组:(" + item1.lemma + "," + pred2.lemma + "" + prep.lemma + "," + item2.lemma + ")")
                            if flagSBV == True:
                                allTripes.append([
                                    item1.lemma, pred2.lemma + "" + prep.lemma,
                                    item2.lemma
                                ])
                        else:
                            if flagVOB == True:
                                # print("DSFN4三元组:(" + item1.lemma + "," + pred.lemma + "," + item2.lemma + ")")
                                allTripes.append(
                                    [item1.lemma, pred.lemma, item2.lemma])
                            if flagSBV == True:
                                # print("DSFN4三元组:(" + item1.lemma + "," + pred2.lemma + "" + prep.lemma + "," + item2.lemma + ")")
                                allTripes.append([
                                    item1.lemma, pred2.lemma + "" + prep.lemma,
                                    item2.lemma
                                ])
        """
        DSFN5
        """
        # self.dsfn5and6(rawSentence,sentence,item1,item2)
        return allTripes

    def get_entity_num_between(self, verb1, verb2, sentence):
        """
        获得两个动词之间的实体数量
        Parameters
        ----------
        entity1 : WordUnit,动词1
        entity2 : WordUnit,动词2
        Returns:
            num:int,两动词间的实体数量
        """
        if verb1.ID > verb2.ID:
            c = verb1
            verb1 = verb2
            verb2 = c
        num = 0
        i = verb1.ID
        while i < verb2.ID - 1:
            if self.is_entity(sentence.words[i]):
                num += 1
            i += 1
        return num

    def is_entity(self, entry):
        """判断词单元是否是实体
        Args:
            entry:WordUnit,词单元
        Returns:
            *:bool,实体(True),非实体(False)
        """
        #候选实体词性列表
        entity_postags = ['nh', 'ni', 'ns', 'nz', 'j', 'n', 'v', 'i']
        print(entry.lemma + " : " + entry.postag)
        if entry.postag in entity_postags:
            return True
        else:
            return False

    def dsfnAttCOO(self, sentence, item1, item2):
        item1Att = item1
        item2Att = item2
        while item1Att.dependency == "ATT":
            item1Att = item1Att.head_word

        allTripe = self.dsfn1_2_3_4COO(sentence, item1Att, item2)
        if allTripe == None or len(allTripe) == 0:
            while item2Att.dependency == "ATT":
                item2Att = item2Att.head_word
            allTripe = self.dsfn1_2_3_4COO(sentence, item1, item2Att)
        if allTripe == None or len(allTripe) == 0:
            allTripe = self.dsfn1_2_3_4COO(sentence, item1Att, item2Att)
        for tripe in allTripe:
            if tripe[0] == item1Att.lemma:
                tripe[0] = item1.lemma
            if tripe[2] == item2Att.lemma:
                tripe[2] = item2.lemma
        return allTripe

    def dsfn5COO(self, sentence, item1, item2):
        if item1.dependency == "COO":
            item1COO = item1.head_word
            allTripes1 = self.dsfn1_2_3_4COO(sentence, item1COO, item2)
            # print(allTripes1)
            for tripe in allTripes1:
                if tripe[0] == item1COO.lemma:
                    tripe[0] = item1.lemma
                elif tripe[2] == item1COO.lemma:
                    tripe[2] = item1.lemma
            return allTripes1
            # print("allTripes1"+str(allTripes1))

    def dsfn6COO(self, sentence, item1, item2):
        if item2.dependency == "COO":
            item2COO = item2.head_word
            allTripes2 = self.dsfn1_2_3_4COO(sentence, item1, item2COO)
            for tripe in allTripes2:
                if tripe[2] == item2COO.lemma:
                    tripe[2] = item2.lemma
                elif tripe[0] == item2COO.lemma:
                    tripe[0] = item2.lemma
            return allTripes2

    def dsfn5and6COO(self, sentence, item1, item2):
        if item1.dependency == "COO":
            item1COO = item1.head_word
            if item2.dependency == "COO":
                item2COO = item2.head_word
                allTripe = self.dsfn1_2_3_4COO(sentence, item1COO, item2COO)
                for tripe in allTripe:
                    if tripe[0] == item1COO.lemma and tripe[
                            2] == item2COO.lemma:
                        tripe[0] = item1.lemma
                        tripe[2] = item2.lemma
                    if tripe[2] == item1COO.lemma and tripe[
                            0] == item2COO.lemma:
                        tripe[2] = item1.lemma
                        tripe[0] = item2.lemma
                return allTripe

    def dsfnStartCOO3(self, rawSentence, entity1, entity2):
        nounRelatedWithPosition = ['主席', '总理', '教授', '校长']
        resultList = []
        lemmas = dsfn.segment(rawSentence)
        words = dsfn.postag(lemmas)
        words_netag = dsfn.netag(words)
        sentence = dsfn.parse(words_netag)
        print(sentence.to_string())
        for item in sentence.words:
            if (item.lemma == entity1):
                item1 = item
            if (item.lemma == entity2):
                item2 = item
        if item1.ID > item2.ID:
            c = item1
            item1 = item2
            item2 = c
        itemCopy1 = item1
        itemCopy2 = item2
        allTripes = self.dsfnStartCOO2(sentence, item1, item2)
        if allTripes != None and len(allTripes) == 0:
            if item1.postag in ['n', 'nh', 'nl', 'ns', 'nz', 'ni'
                                ] and item1.dependency == "ATT":
                item1 = item1.head_word
                while item1.dependency == "ATT":
                    item1 = item1.head_word
                if 'n' in item1.postag and item1.postag not in [
                        'nh', 'ns', 'nz', 'ni'
                ]:
                    pass
                else:
                    item1 = itemCopy1

            if item2.postag in ['n', 'nh', 'nl', 'ns', 'nz', 'ni'
                                ] and item2.dependency == "ATT":
                item2 = item2.head_word
                while item2.dependency == "ATT":
                    item2 = item2.head_word
                if ('n' in item2.postag
                        or 'q' in item2.postag) and item2.postag not in [
                            'nh', 'ns', 'nz', 'ni'
                        ]:
                    pass
                else:
                    item2 = itemCopy2
            allTripes = self.dsfnStartCOO2(sentence, item1, item2)
            print("Note: retry after ATT climb")
            print(allTripes)
            if len(allTripes) != 0:
                for tripe in allTripes:
                    # Map the (possibly ATT-climbed) lemmas back to the entities
                    # the caller asked for.
                    if tripe[0] == item1.lemma:
                        tripe[0] = itemCopy1.lemma
                    elif tripe[2] == item1.lemma:
                        tripe[2] = itemCopy1.lemma

                    if tripe[0] == item2.lemma:
                        tripe[0] = itemCopy2.lemma
                    elif tripe[2] == item2.lemma:
                        tripe[2] = itemCopy2.lemma
                    resultList.append(tripe)
                print("Final result")
                print(np.array(list(set(tuple(t) for t in resultList))))
        else:
            print("Final result")
            print(np.array(list(set(tuple(t) for t in allTripes))))

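    # --- Illustrative usage sketch (not part of the extraction pipeline) ------
    # dsfnStartCOO3 above is the entry point: it parses the raw sentence, locates
    # the two entity words, retries with ATT chains resolved if the direct
    # patterns fail, and prints the deduplicated triples rather than returning
    # them. The sentence and entity pair below are hypothetical.
    def _demo_start_coo3(self):
        raw = '习近平主席和李克强总理视察厦门'
        self.dsfnStartCOO3(raw, '李克强', '厦门')  # prints the final triple set
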
    def dsfnStartCOO2(self, sentence, item1, item2):
        nounRelatedWithPosition = ['主席', '总理', '教授', '校长']
        resultList = []
        itemCopy1 = item1
        itemCopy2 = item2
        # Handles nouns attached through an ATT dependency,
        # e.g. 李克强[ATT] <----- 总理[SBV].
        print(item1.lemma)
        print(item2.lemma)
        # First pass: try the direct patterns, then fall back to the COO variants.
        allTripes = self.dsfn1_2_3_4COO(sentence, item1, item2)
        if len(allTripes) == 0:
            print("trying dsfn5COO")
            allTripes = self.dsfn5COO(sentence, item1, item2)
            if allTripes == None or len(allTripes) == 0:
                print("trying dsfn6COO")
                allTripes = self.dsfn6COO(sentence, item1, item2)
                if allTripes == None or len(allTripes) == 0:
                    print("trying dsfn5and6COO")
                    allTripes = self.dsfn5and6COO(sentence, item1, item2)
                    # if allTripes == None or len(allTripes) == 0:
                    #     allTripes = self.dsfnAttCOO(sentence,item1,item2)
        # print("first pass: " + str(allTripes))
        if allTripes != None and len(allTripes) != 0:
            for tripe in allTripes:
                resultList.append(tripe)
        print("Second pass")
        pred1 = None
        subForCoo = None
        for item in sentence.words:
            if item.postag == "v" and item.dependency == "COO":
                # item is a verb coordinated with its head verb pred1. Reattach
                # pred1's subject to item only when item has no subject of its
                # own (this guards against cases like 习近平主席视察厦门,李克强总理访问香港,
                # where each verb already has a subject), then rerun the extraction.
                pred1 = item.head_word

                for word in sentence.words:
                    if word.dependency == "SBV" and word.head_word.ID == pred1.ID:
                        for phrase in sentence.words:
                            if phrase.dependency == "SBV" and phrase.head_word.ID == item.ID:
                                subForCoo = phrase
                        if subForCoo == None or subForCoo.ID == word.ID:
                            word.head_word = item

                            print(sentence.to_string())
                            allTripes = self.dsfn1_2_3_4COO(
                                sentence, item1, item2)
                            if len(allTripes) == 0:
                                allTripes = self.dsfn5COO(
                                    sentence, item1, item2)
                                if allTripes == None or len(allTripes) == 0:
                                    allTripes = self.dsfn6COO(
                                        sentence, item1, item2)
                                    if allTripes == None or len(
                                            allTripes) == 0:
                                        print("trying dsfn5and6COO")
                                        allTripes = self.dsfn5and6COO(
                                            sentence, item1, item2)
                                        # if allTripes == None or len(allTripes) == 0:
                                        #     allTripes = self.dsfnAttCOO(sentence,item1,item2)
                            # print("second pass: " + str(allTripes))
                            if allTripes != None and len(allTripes) != 0:
                                for tripe in allTripes:
                                    # if tripe[0] == item1.lemma:
                                    #     tripe[0] = itemCopy1.lemma
                                    # elif tripe[2] == item1.lemma:
                                    #     tripe[2] = itemCopy1.lemma
                                    #
                                    # if tripe[0] == item2.lemma:
                                    #     tripe[0] = itemCopy2.lemma
                                    # elif tripe[2] == item2.lemma:
                                    #     tripe[2] = itemCopy2.lemma
                                    resultList.append(tripe)
        print(np.array(list(set(tuple(t) for t in resultList))))
        return resultList
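
    # --- Illustrative sketch (not part of the extraction pipeline) ------------
    # The deduplication idiom used when printing results above: list-valued
    # triples are made hashable as tuples, passed through a set, and converted
    # back to a list so that np.array (the module-level numpy import used above)
    # builds a regular 2-D array. The triple values below are hypothetical.
    @staticmethod
    def _demo_dedup_triples():
        resultList = [['习近平', '视察', '厦门'], ['习近平', '视察', '厦门']]
        unique = [list(t) for t in set(tuple(t) for t in resultList)]
        print(np.array(unique))  # one row per distinct triple
        return unique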