Example #1
 def initWordNet1(self, charArray):
     self.charArray = charArray
     # one linked list per character position, plus the begin/end sentinel slots
     self.vertexes = [LinkedList() for _ in range(2 + len(charArray))]
     length = len(self.vertexes)
     self.vertexes[0].append(Vertex.newB())
     self.vertexes[length - 1].append(Vertex.newE())
     self.size = 2
     return self
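
A quick standalone sketch (plain Python lists instead of the library's LinkedList and Vertex, input string made up) of the structure initWordNet1 builds: one container per character position plus begin/end sentinel slots, and why every slot must be a fresh object:

char_array = list("他说")                         # hypothetical input
slots = [[] for _ in range(2 + len(char_array))]  # a fresh container per slot
slots[0].append("B")                              # begin sentinel, stands in for Vertex.newB()
slots[-1].append("E")                             # end sentinel, stands in for Vertex.newE()

# The pitfall the construction must avoid: "[[]] * n" aliases one shared list.
aliased = [[]] * 3
aliased[0].append("x")
assert aliased[1] == ["x"]                        # every slot is the same object
assert slots[1] == []                             # the comprehension keeps slots independent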
 def hit(self, begin, end, value, wordArray, offsetArray, pd_obj, wordNetOptimum, wordNetAll):
     """
     Called when a pattern string is matched.
     :param begin: start index of the pattern in the source text
     :param end:   end index of the pattern in the source text
     :param value: value associated with the pattern
     :return:      index of the value associated with the pattern
     """
     sbName = ''
     for i in range(begin, end):
         sbName += wordArray[i].decode()
     name = sbName
     # adjust for some bad cases
     for case in Switch(value):
         if case(NRPattern.BCD):
             # the surname and the last character of the given name can never be identical
             if name[0] == name[2]:
                 return
             break
     if pd_obj.isBadCase(name):
         return
     # print "识别出人名:", name, value
     offset = offsetArray[begin]
     wordNetOptimum.insert(offset,
                           Vertex().initVertex(Predefine.TAG_PEOPLE, name, pd_obj.ATTRIBUTE, NRConstant.WORD_ID),
                           wordNetAll)
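
An illustrative sketch of the bookkeeping hit relies on, with plain strings standing in for the decoded wordArray entries and a made-up hit span; offsetArray is the character-offset prefix sum built in parsePattern (Example #10 below):

word_array = ["张", "三", "说"]            # hypothetical coarse tokens
offset_array = [0]
for w in word_array[:-1]:                  # prefix sum of token lengths
    offset_array.append(offset_array[-1] + len(w))

begin, end = 0, 2                          # a pattern hit covering the first two tokens
name = "".join(word_array[begin:end])      # same concatenation as the sbName loop
assert name == "张三"
assert offset_array[begin] == 0            # character offset passed to wordNetOptimum.insert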
    def GenerateWordNet(self, wordNetStorage):
        """
        Generate the unigram word net (lattice)
        :param wordNetStorage:
        """
        charArray = wordNetStorage.charArray
        # query the core dictionary
        searcher = CoreDictionary.trie.getSearcher(charArray, 0)
        while searcher.next_obj():
            s = ''
            for i in range(searcher.begin, searcher.begin + searcher.length):
                s += charArray[i]
            q = Vertex().init2(s, searcher.value, searcher.index)
            wordNetStorage.add(searcher.begin + 1, q)
        # atomic segmentation, to keep the graph connected
        vertexes = wordNetStorage.getVertexes()

        i = 1
        while i < len(vertexes):
            if len(vertexes[i]) == 0:
                j = i + 1
                while j < len(vertexes) - 1:
                    if len(vertexes[j]) != 0:
                        break
                    j += 1

                wordNetStorage.add1(
                    i, Segment.quickAtomSegment(charArray, i - 1, j - 1))
                i = j
            else:
                i += len(vertexes[i][len(vertexes[i]) - 1].realword)
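
A self-contained sketch of the gap-filling scan above, with lists of strings in place of the vertex rows; the empty run between i and j is what the real code hands to Segment.quickAtomSegment (row contents are made up):

rows = [["B"], ["他"], [], [], ["了"], ["E"]]     # hypothetical lattice rows
i = 1
while i < len(rows):
    if not rows[i]:
        j = i + 1
        while j < len(rows) - 1 and not rows[j]:  # find the end of the empty run
            j += 1
        print("atom-segment characters %d..%d" % (i - 1, j - 1))
        i = j
    else:
        i += len(rows[i][-1])                     # jump past the last (longest) word at this position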
 def hit2(self, begin, end, value, pattern, wordArray, od_obj, wordNetOptimum, wordNetAll):
     sbName = ''
     for i in range(begin, end):
         sbName += wordArray[i].decode()
     name = sbName
     # adjust for some bad cases
     if od_obj.isBadCase(name):
         return
     #print "识别出机构名%s %s" % (name, value)
     offset = 0
     for i in range(begin):
         offset += len(wordArray[i].decode())
     wordNetOptimum.insert(offset,
                           Vertex().initVertex(Predefine.TAG_GROUP, name, od_obj.ATTRIBUTE, od_obj.WORD_ID),
                           wordNetAll)
Example #5
 def insertName(name, activeLine, wordNetOptimum, wordNetAll):
     """
     Insert a Japanese person name
     :param name:
     :param activeLine:
     :param wordNetOptimum:
     :param wordNetAll:
     :return:
     """
     if JapanesePersonRecognition.isBadCase(name):
         return
     wordNetOptimum.insert(
         activeLine,
         Vertex().initVertex(Predefine.TAG_PEOPLE, name,
                             CoreDictionary.Attribute().init5(Nature.nrj),
                             NRConstant.WORD_ID), wordNetAll)
Example #6
 def Recognition(segResult, wordNetOptimum, wordNetAll):
     """
     Run the recognition
     :param segResult: the coarse segmentation result
     :param wordNetOptimum: the word net corresponding to the coarse result, to be optimized
     :param wordNetAll: the full word net
     :return:
     """
     sbName = ""
     appendTimes = 0
     i = 0
     # i += 1
     line = 1
     activeLine = 1
     while i < len(segResult) - 1:
         i += 1
         vertex = segResult[i]
         if appendTimes > 0:
             if vertex.guessNature(
             ) == Nature.nrf or TranslatedPersonDictionary.containsKey(
                     vertex.realword):
                 sbName += vertex.realword
                 appendTimes += 1
             else:
                 # end of recognition
                 if appendTimes > 1:
                     wordNetOptimum.insert(
                         activeLine,
                         Vertex().initVertex(
                             Predefine.TAG_PEOPLE, sbName,
                             CoreDictionary.Attribute().init5(Nature.nrf),
                             NRConstant.WORD_ID), wordNetAll)
                 sbName = ""
                 appendTimes = 0
         else:
             # nrf and nsf trigger recognition
             if vertex.guessNature() == Nature.nrf or vertex.getNature(
             ) == Nature.nsf:
                 sbName += vertex.realword
                 appendTimes += 1
                 activeLine = line
         line += len(vertex.realword)
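
A toy, fully standalone version of the accumulate-and-flush loop in Recognition: fragments tagged nrf are concatenated, and a run of two or more fragments is reported as one translated name (the words and tags below are invented; the real code also lets nsf start a run and consults TranslatedPersonDictionary):

tokens = [("乔", "nrf"), ("布", "nrf"), ("斯", "nrf"), ("说", "v")]
name, times, start, line = "", 0, 0, 0
for word, tag in tokens:
    if times > 0:
        if tag == "nrf":                 # keep extending the current candidate
            name += word
            times += 1
        else:                            # run ended: report it if it has 2+ fragments
            if times > 1:
                print("recognized name %r at offset %d" % (name, start))
            name, times = "", 0
    elif tag == "nrf":                   # a new candidate starts here
        name, times, start = word, 1, line
    line += len(word)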
 def hit1(self, begin, end, value, pattern, wordArray, pld_obj, wordNetOptimum, wordNetAll):
     """
     Called when a pattern string is matched.
     :param begin: start index of the pattern in the source text
     :param end:   end index of the pattern in the source text
     :param value: value associated with the pattern
     :return:      index of the value associated with the pattern
     """
     sbName = ''
     for i in range(begin, end):
         sbName += wordArray[i].decode()
     name = sbName
     # adjust for some bad cases
     if pld_obj.isBadCase(name):
         return
     offset = 0
     for i in range(begin):
         offset += len(wordArray[i].decode())
     wordNetOptimum.insert(offset,
                           Vertex().initVertex(Predefine.TAG_PLACE, name, pld_obj.ATTRIBUTE, pld_obj.WORD_ID),
                           wordNetAll)
Example #8
 def combineWords(wordNet, start, end, value):
     """
     Merge consecutive words into a single one
     :param wordNet: the word net (lattice)
     :param start: start index (inclusive)
     :param end: end index (exclusive)
     :param value: the new attribute
     :return:
     """
     # small optimization: with only one word there is nothing to merge, just apply the new attribute
     if start + 1 == end:
         wordNet[start].attribute = value
     else:
         sbTerm = ""
         for j in range(start, end):
             if wordNet[j] is None:
                 continue
             realWord = wordNet[j].realword
             sbTerm += realWord
             wordNet[j] = None
         wordNet[start] = Vertex().init4(sbTerm, value)
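
The same merge written as a self-contained function, with plain dicts standing in for Vertex objects (the field names realword/attribute mirror the snippet above; the sample words are made up):

def combine(word_net, start, end, value):
    if start + 1 == end:                         # single word: just retag it
        word_net[start]["attribute"] = value
        return
    merged = ""
    for j in range(start, end):                  # concatenate and blank out the merged slots
        if word_net[j] is None:
            continue
        merged += word_net[j]["realword"]
        word_net[j] = None
    word_net[start] = {"realword": merged, "attribute": value}

net = [{"realword": w, "attribute": "n"} for w in ["中国", "科学", "院"]]
combine(net, 0, 3, "ntc")
print(net)    # [{'realword': '中国科学院', 'attribute': 'ntc'}, None, None]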
Example #9
 def add1(self, line, atomSegment):
     """
     Add vertices produced by atomic segmentation
     :param line:
     :param atomSegment:
     :return:
     """
     # store the atomic parts into m_segGraph
     offset = 0
     # Init the cost array
     for atomNode in atomSegment:
         # init the word
         sWord = atomNode.sWord
         nature = Nature.n
         id = -1
         for case in Switch(atomNode.nPOS):
             if case(Predefine.CT_CHINESE):
                 break
             if case(Predefine.CT_INDEX) or case(Predefine.CT_NUM):
                 nature = Nature.m
                 sWord = '未##数'
                 id = CoreDictionary.M_WORD_ID
                 break
             if case(Predefine.CT_DELIMITER) or case(Predefine.CT_OTHER):
                 nature = Nature.w
                 break
             if case(Predefine.CT_SINGLE):
                 nature = Nature.nx
                 sWord = '未##串'
                 id = CoreDictionary.X_WORD_ID
                 break
             if case():
                 break
         # these generic placeholders all occur on the order of 100,000 times
         self.add(
             line + offset,
             Vertex().initVertex(
                 sWord, atomNode.sWord,
                 CoreDictionary.Attribute().init3(nature, 10000), id))
         offset += len(atomNode.sWord)
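
The "for case in Switch(...)" blocks in these examples depend on a small helper whose source is not shown here. The sketch below is the classic switch/case recipe such code is usually built on, included only as an assumption so the examples read as complete; the project's actual Switch may differ.

class Switch:
    """Classic switch/case recipe: one pass, with fall-through after a hit."""
    def __init__(self, value):
        self.value = value
        self.fall = False

    def __iter__(self):
        yield self.match                 # lets "for case in Switch(x)" run exactly once

    def match(self, *args):
        if self.fall or not args:        # empty args is the default branch
            return True
        if self.value in args:
            self.fall = True             # allow fall-through until a break
            return True
        return False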
Example #10
    def parsePattern(nrList, vertexList, wordNetOptimum, wordNetAll, pd_obj):
        """
        Pattern matching
        :param nrList:         the determined role-tag sequence
        :param vertexList:     the original sequence without role tags
        :param wordNetOptimum: the word net to be optimized
        :param wordNetAll:     the full word net
        """
        # split U and V
        # iterate over vertexList indices
        i = -1
        sbPattern = ""
        preNR = NR.A
        backUp = False
        index = 0
        for nr in nrList:
            index += 1
            i += 1
            current = vertexList[i]
            if nr == NR.U:
                if not backUp:
                    i = index - 1
                    backUp = True
                sbPattern += str(NR.K)
                sbPattern += str(NR.B)
                preNR = NR.B

                nowK = current.realword[0:len(current.realword.decode()) - 1]
                nowB = current.realword[len(current.realword.decode()) - 1:]
                vertexList[i] = Vertex().init1(nowK)

                i += 1
                vertexList.insert(i, Vertex().init1(nowB))
                continue
            elif nr == NR.V:
                if not backUp:
                    i = index - 1
                    backUp = True
                if preNR == NR.B:
                    # BE
                    sbPattern += str(NR.E)
                else:
                    # CD
                    sbPattern += str(NR.D)
                sbPattern += str(NR.L)
                # also adjust the string itself
                # i -= 1
                nowED = current.realword[len(current.realword) - 1:]
                nowL = current.realword[0:len(current.realword) - 1]
                vertexList[i] = Vertex().init1(nowED)
                vertexList.insert(i, Vertex().init1(nowL))
                i += 1
                # i += 1
                continue
            else:

                sbPattern += str(nr)

            # i += 1
            preNR = nr

        pattern = str(sbPattern)
        wordList = []
        for i in range(len(vertexList)):
            wordList.append(vertexList[i].realword)
        wordArray = np.array(wordList)

        offsetArray = [0] * len(wordArray)
        offsetArray[0] = 0

        for i in range(1, len(wordArray)):
            offsetArray[i] = offsetArray[i - 1] + len(wordArray[i - 1])

        PersonDictionary.trie.parseText(pattern, wordArray, offsetArray,
                                        pd_obj, wordNetOptimum, wordNetAll)
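
A standalone sketch of the U-split performed above: a fused token whose last character is a surname is cut into a context part (role K) and a surname part (role B), and the two role letters are appended to the pattern string. Unicode strings are used directly for clarity, whereas the snippet above works on byte strings plus decode(); the values are made up.

fused = "来到王"                           # hypothetical realword of a vertex tagged U
now_k, now_b = fused[:-1], fused[-1:]      # context part and surname part
assert (now_k, now_b) == ("来到", "王")
pattern_piece = "KB"                       # the two role letters appended to sbPattern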