def initWordNet1(self, charArray):
    """
    Initialize the word net for a sentence's character array.

    Allocates one linked list per slot — one slot per character plus the
    two begin/end sentinel slots — then seeds slot 0 with the synthetic
    B (begin) vertex and the last slot with the E (end) vertex.

    :param charArray: the characters of the sentence being segmented
    :return: self, allowing fluent construction
    """
    self.charArray = charArray
    # One *independent* LinkedList per slot.  The original code built
    # ``[LinkedList()] * n`` (n aliases of one list) and then overwrote
    # every slot in a loop; the comprehension does it in one pass.
    self.vertexes = [LinkedList() for _ in range(2 + len(charArray))]
    self.vertexes[0].append(Vertex.newB())
    self.vertexes[len(self.vertexes) - 1].append(Vertex.newE())
    # Only the two sentinel vertexes exist so far.
    self.size = 2
    return self
def hit(self, begin, end, value, wordArray, offsetArray, pd_obj, wordNetOptimum, wordNetAll):
    """
    Callback fired when a person-name pattern matches.

    :param begin: start index of the matched pattern in the source text (inclusive)
    :param end: end index of the matched pattern in the source text (exclusive)
    :param value: the pattern value (an NRPattern member) of the match
    :param wordArray: the words of the source text (byte strings)
    :param offsetArray: character offset of each word in the source text
    :param pd_obj: person-dictionary helper (bad-case filter and attributes)
    :param wordNetOptimum: the word net being optimised; the hit is inserted here
    :param wordNetAll: the full word net, passed through to insert()
    :return: None
    """
    # Build the candidate name in one pass instead of quadratic ``+=``.
    name = ''.join(wordArray[i].decode() for i in range(begin, end))
    # Adjust for some known bad cases.
    for case in Switch(value):
        if case(NRPattern.BCD):
            # The surname can never equal the last given-name character.
            if name[0] == name[2]:
                return
            break
    if pd_obj.isBadCase(name):
        return
    offset = offsetArray[begin]
    wordNetOptimum.insert(
        offset,
        Vertex().initVertex(Predefine.TAG_PEOPLE, name, pd_obj.ATTRIBUTE,
                            NRConstant.WORD_ID),
        wordNetAll)
def GenerateWordNet(self, wordNetStorage):
    """
    Build the unigram word net.

    First adds a vertex for every core-dictionary hit, then runs atomic
    segmentation over any uncovered stretches so the word graph stays
    connected.

    :param wordNetStorage: the word-net storage being populated
    :return: None
    """
    charArray = wordNetStorage.charArray
    # Core dictionary lookup: one vertex per dictionary hit.
    searcher = CoreDictionary.trie.getSearcher(charArray, 0)
    while searcher.next_obj():
        # Join once instead of quadratic ``+=`` concatenation.
        word = ''.join(charArray[i]
                       for i in range(searcher.begin,
                                      searcher.begin + searcher.length))
        vertex = Vertex().init2(word, searcher.value, searcher.index)
        wordNetStorage.add(searcher.begin + 1, vertex)
    # Atomic segmentation guarantees connectivity where the dictionary
    # produced no coverage.
    vertexes = wordNetStorage.getVertexes()
    i = 1
    while i < len(vertexes):
        if len(vertexes[i]) == 0:
            # Find the end of this empty stretch.
            j = i + 1
            while j < len(vertexes) - 1:
                if len(vertexes[j]) != 0:
                    break
                j += 1
            wordNetStorage.add1(
                i, Segment.quickAtomSegment(charArray, i - 1, j - 1))
            i = j
        else:
            # Skip past the longest word that starts at this position.
            i += len(vertexes[i][len(vertexes[i]) - 1].realword)
def hit2(self, begin, end, value, pattern, wordArray, od_obj, wordNetOptimum, wordNetAll):
    """
    Callback fired when an organisation-name pattern matches.

    :param begin: start index of the matched pattern in the word array (inclusive)
    :param end: end index of the matched pattern in the word array (exclusive)
    :param value: the pattern value of the match
    :param pattern: the matched pattern string (unused here)
    :param wordArray: the words of the source text (byte strings)
    :param od_obj: organisation-dictionary helper (bad-case filter and attributes)
    :param wordNetOptimum: the word net being optimised; the hit is inserted here
    :param wordNetAll: the full word net, passed through to insert()
    :return: None
    """
    # Build the candidate name in one pass instead of quadratic ``+=``.
    name = ''.join(wordArray[i].decode() for i in range(begin, end))
    # Adjust for some known bad cases.
    if od_obj.isBadCase(name):
        return
    # Character offset of the match = total decoded length of all
    # preceding words.
    offset = sum(len(wordArray[i].decode()) for i in range(begin))
    wordNetOptimum.insert(
        offset,
        Vertex().initVertex(Predefine.TAG_GROUP, name, od_obj.ATTRIBUTE,
                            od_obj.WORD_ID),
        wordNetAll)
def insertName(name, activeLine, wordNetOptimum, wordNetAll):
    """
    Insert a recognised Japanese person name into the word net.

    :param name: the recognised name
    :param activeLine: line (offset) at which the name starts
    :param wordNetOptimum: the word net being optimised
    :param wordNetAll: the full word net, passed through to insert()
    :return: None
    """
    # Names on the known bad-case list are skipped outright.
    if JapanesePersonRecognition.isBadCase(name):
        return
    attribute = CoreDictionary.Attribute().init5(Nature.nrj)
    vertex = Vertex().initVertex(Predefine.TAG_PEOPLE, name, attribute,
                                 NRConstant.WORD_ID)
    wordNetOptimum.insert(activeLine, vertex, wordNetAll)
def Recognition(segResult, wordNetOptimum, wordNetAll):
    """
    Run translated-person-name (nrf) recognition over a coarse result.

    Scans the coarse segmentation, gluing together consecutive vertexes
    whose nature is nrf (or that appear in the translated-person
    dictionary) and inserting the joined run as a single TAG_PEOPLE
    vertex once the run ends.

    :param segResult: coarse segmentation result (list of vertexes)
    :param wordNetOptimum: word net corresponding to the coarse result
    :param wordNetAll: the full word net
    :return: None
    """
    sbName = ""       # name fragment accumulated so far
    appendTimes = 0   # how many vertexes the current fragment spans
    i = 0
    line = 1          # character offset of the *next* vertex
    activeLine = 1    # character offset where the current fragment began
    # Note: starts at segResult[1] (i is incremented before use) and
    # runs through the last element.
    while i < len(segResult) - 1:
        i += 1
        vertex = segResult[i]
        if appendTimes > 0:
            if vertex.guessNature(
            ) == Nature.nrf or TranslatedPersonDictionary.containsKey(
                    vertex.realword):
                # Still inside a name run: keep appending.
                sbName += vertex.realword
                appendTimes += 1
            else:
                # Run ended: only multi-vertex runs are inserted as names.
                if appendTimes > 1:
                    wordNetOptimum.insert(
                        activeLine,
                        Vertex().initVertex(
                            Predefine.TAG_PEOPLE, sbName,
                            CoreDictionary.Attribute().init5(Nature.nrf),
                            NRConstant.WORD_ID), wordNetAll)
                sbName = ""
                appendTimes = 0
        else:
            # An nrf or nsf vertex triggers recognition of a new run.
            if vertex.guessNature() == Nature.nrf or vertex.getNature(
            ) == Nature.nsf:
                sbName += vertex.realword
                appendTimes += 1
                activeLine = line
        line += len(vertex.realword)
def hit1(self, begin, end, value, pattern, wordArray, pld_obj, wordNetOptimum, wordNetAll):
    """
    Callback fired when a place-name pattern matches.

    :param begin: start index of the matched pattern in the word array (inclusive)
    :param end: end index of the matched pattern in the word array (exclusive)
    :param value: the pattern value of the match
    :param pattern: the matched pattern string (unused here)
    :param wordArray: the words of the source text (byte strings)
    :param pld_obj: place-dictionary helper (bad-case filter and attributes)
    :param wordNetOptimum: the word net being optimised; the hit is inserted here
    :param wordNetAll: the full word net, passed through to insert()
    :return: None
    """
    # Build the candidate name in one pass instead of quadratic ``+=``.
    name = ''.join(wordArray[i].decode() for i in range(begin, end))
    # Adjust for some known bad cases.
    if pld_obj.isBadCase(name):
        return
    # Character offset of the match = total decoded length of all
    # preceding words.
    offset = sum(len(wordArray[i].decode()) for i in range(begin))
    wordNetOptimum.insert(
        offset,
        Vertex().initVertex(Predefine.TAG_PLACE, name, pld_obj.ATTRIBUTE,
                            pld_obj.WORD_ID),
        wordNetAll)
def combineWords(wordNet, start, end, value):
    """
    Merge the consecutive vertexes wordNet[start:end] into a single one.

    :param wordNet: word graph (list of vertexes, possibly with None holes)
    :param start: start index (inclusive)
    :param end: end index (exclusive)
    :param value: the attribute assigned to the merged vertex
    :return: None (wordNet is modified in place)
    """
    # Fast path: a single word needs no merging — just apply the new
    # attribute in place.
    if start + 1 == end:
        wordNet[start].attribute = value
    else:
        # Collect the surviving real words, blanking out every consumed
        # slot, and join once to avoid quadratic ``+=`` concatenation.
        parts = []
        for j in range(start, end):
            if wordNet[j] is None:
                continue
            parts.append(wordNet[j].realword)
            wordNet[j] = None
        wordNet[start] = Vertex().init4("".join(parts), value)
def add1(self, line, atomSegment):
    """
    Add vertexes produced by atomic segmentation.

    Stores each atom node into the segmentation graph, mapping its
    coarse POS code to a nature, a placeholder word and a word id.

    :param line: base line (offset) in the word net
    :param atomSegment: atom nodes from quickAtomSegment
    :return: None
    """
    # Store the atomic pieces into the segmentation graph.
    offset = 0
    for atomNode in atomSegment:
        sWord = atomNode.sWord
        nature = Nature.n
        # Renamed from ``id`` so the builtin is not shadowed.
        word_id = -1
        for case in Switch(atomNode.nPOS):
            if case(Predefine.CT_CHINESE):
                break
            if case(Predefine.CT_INDEX) or case(Predefine.CT_NUM):
                nature = Nature.m
                sWord = '未##数'
                word_id = CoreDictionary.M_WORD_ID
                break
            if case(Predefine.CT_DELIMITER) or case(Predefine.CT_OTHER):
                nature = Nature.w
                break
            if case(Predefine.CT_SINGLE):
                nature = Nature.nx
                sWord = '未##串'
                word_id = CoreDictionary.X_WORD_ID
                break
            if case():
                break
        # These generic placeholder tokens all have frequency on the
        # order of 100k, hence the fixed frequency used here.
        self.add(
            line + offset,
            Vertex().initVertex(
                sWord, atomNode.sWord,
                CoreDictionary.Attribute().init3(nature, 10000), word_id))
        offset += len(atomNode.sWord)
def parsePattern(nrList, vertexList, wordNetOptimum, wordNetAll, pd_obj): """ 模式匹配 :param nrList 确定的标注序列 :param vertexList 原始的未加角色标注的序列 :param wordNetOptimum 待优化的图 :param wordNetAll 全词图 """ # 拆分UV # 遍历vertextList的下标 i = -1 sbPattern = "" preNR = NR.A backUp = False index = 0 for nr in nrList: index += 1 i += 1 current = vertexList[i] if nr == NR.U: if not backUp: i = index - 1 backUp = True sbPattern += str(NR.K) sbPattern += str(NR.B) preNR = NR.B nowK = current.realword[0:len(current.realword.decode()) - 1] nowB = current.realword[len(current.realword.decode()) - 1:] vertexList[i] = Vertex().init1(nowK) i += 1 vertexList.insert(i, Vertex().init1(nowB)) continue elif nr == NR.V: if not backUp: i = index - 1 backUp = True if preNR == NR.B: # BE sbPattern += str(NR.E) else: # CD sbPattern += str(NR.D) sbPattern += str(NR.L) # 对串也做一些修改 # i -= 1 nowED = current.realword[len(current.realword) - 1:] nowL = current.realword[0:len(current.realword) - 1] vertexList[i] = Vertex().init1(nowED) vertexList.insert(i, Vertex().init1(nowL)) i += 1 # i += 1 continue else: sbPattern += str(nr) # i += 1 preNR = nr pattern = str(sbPattern) wordList = [] for i in range(len(vertexList)): wordList.append(vertexList[i].realword) wordArray = np.array(wordList) offsetArray = [int()] * len(wordArray) offsetArray[0] = 0 for i in range(1, len(wordArray)): offsetArray[i] = offsetArray[i - 1] + len(wordArray[i - 1]) PersonDictionary.trie.parseText(pattern, wordArray, offsetArray, pd_obj, wordNetOptimum, wordNetAll)