def loadtxt(path, defaultNature, map, customNatureCollector):
    """Load a user dictionary from a plain-text file (appending).

    Each non-empty line has the form ``word [nature frequency]*``.

    :param path: dictionary file path
    :param defaultNature: nature used when a line carries no nature/frequency pairs
    :param map: receives the resulting TreeMap of word -> Attribute
    :param customNatureCollector: collects user-defined natures
                                  # NOTE(review): unused in this function — confirm
    :return: (False, map)
             # NOTE(review): returns False even on success; callers appear to
             # rely only on the map part — confirm before changing.
    """
    try:
        initdict = OrderedDict()
        # 'with' guarantees the handle is closed even if parsing raises
        # (the original leaked the file object on every path).
        with open(path, 'r') as br:
            while 1:
                # NOTE(review): .encode() on a Py2 str does an implicit ASCII
                # round-trip and raises on non-ASCII bytes (caught below);
                # looks like it should be a decode — confirm with callers.
                line = br.readline().encode().strip()
                if not line:
                    break
                param = line.split(" ")
                # '//' keeps this an int on Python 3 as well (Py2 '/' already floors).
                natureCount = (len(param) - 1) // 2
                if natureCount == 0:
                    # No explicit natures: fall back to the default nature.
                    attribute = CoreDictionary.Attribute().init5(defaultNature)
                else:
                    attribute = CoreDictionary.Attribute().init1(natureCount)
                    for i in range(natureCount):
                        attribute.nature[i] = Nature.valueOf(param[1 + 2 * i])
                        attribute.frequency[i] = int(param[2 + 2 * i])
                        attribute.totalFrequency += attribute.frequency[i]
                initdict[param[0]] = attribute
        map = TreeMap(initdict)
    except Exception as e:
        Predefine.logger.warning("自定义词典%s读取错误%s" % (path, e))
    return False, map
def loadDat(path):
    """Load the custom dictionary's double-array trie from disk.

    Tries the pickled cache first; on any failure falls back to the original
    ``.bin`` file and rewrites the pickle cache.

    :param path: base path without extension
    :return: True on success, False otherwise
    """
    try:
        # 'with' closes the handle the original leaked.
        with open(path + Predefine.PIC_EXT, 'rb') as pic:
            byteArray = pickle.load(pic)
    except Exception:
        # Cache miss or unreadable cache: rebuild from .bin and refresh the cache.
        byteArray = ByteArray.createByteArray(path + Predefine.BIN_EXT)
        with open(path + Predefine.PIC_EXT, 'wb') as out:
            pickle.dump(byteArray, out)
    if byteArray is None:
        return False
    size = byteArray.nextInt()
    # Compatibility measure: a negative size means the header stores -size
    # user natures. NOTE(review): that branch is not implemented — a negative
    # size falls through and yields an empty attribute list; confirm the
    # serialized data never uses it.
    if size < 0:
        pass
    attributes = [None] * size
    natureIndexArray = list(Nature)
    for i in range(size):
        # Per entry: total frequency first, then the number of natures.
        currentTotalFrequency = byteArray.nextInt()
        length = byteArray.nextInt()
        attributes[i] = CoreDictionary.Attribute().init1(length)
        attributes[i].totalFrequency = currentTotalFrequency
        for j in range(length):
            attributes[i].nature[j] = natureIndexArray[byteArray.nextInt()]
            attributes[i].frequency[j] = byteArray.nextInt()
    if not CustomDictionary.dat.load(byteArray, attributes):
        return False
    return True
def getAttribute(word):
    """Look up a word's attribute: core dictionary first, then the custom one.

    :param word: the word to look up
    :return: the word's Attribute, or None if neither dictionary knows it
    """
    core_attr = CoreDictionary.get(word)
    return core_attr if core_attr is not None else CustomDictionary.get(word)
def initVertex(self, word, realWord, attribute, wordID):
    """Fill in this vertex's fields and return self (fluent initializer).

    :param word: compiled word (equivalence tag); derived from realWord when None
    :param realWord: the real surface word; must be non-empty
    :param attribute: word attribute; defaults to a fresh (Nature.n, 1) attribute
    :param wordID: equivalent word ID (may be overwritten by compileRealWord)
    :return: self
    """
    # Fall back to a generic noun attribute with frequency 1.
    if attribute is None:
        attribute = CoreDictionary.Attribute().init3(Nature.n, 1)
    self.wordID = wordID
    self.attribute = attribute
    # compileRealWord may overwrite self.wordID/self.attribute, so it must run
    # after the two assignments above.
    compiled = word if word is not None else self.compileRealWord(realWord, attribute)
    assert len(realWord) > 0
    self.word = compiled
    self.realword = realWord.decode()
    return self
def insertName(name, activeLine, wordNetOptimum, wordNetAll):
    """Insert a recognized Japanese person name into the optimum word net.

    :param name: the recognized name
    :param activeLine: offset at which the name starts
    :param wordNetOptimum: word net that receives the new vertex
    :param wordNetAll: the full word net
    :return: None
    """
    # Known bad cases are rejected outright.
    if JapanesePersonRecognition.isBadCase(name):
        return
    name_vertex = Vertex().initVertex(
        Predefine.TAG_PEOPLE, name,
        CoreDictionary.Attribute().init5(Nature.nrj),
        NRConstant.WORD_ID)
    wordNetOptimum.insert(activeLine, name_vertex, wordNetAll)
def Recognition(segResult, wordNetOptimum, wordNetAll):
    """Recognize transliterated person names in a coarse segmentation and
    insert multi-vertex names into the optimum word net.

    :param segResult: coarse segmentation result (index 0 is skipped —
                      presumably the begin vertex, confirm with callers)
    :param wordNetOptimum: word net corresponding to the coarse result
    :param wordNetAll: the full word net
    :return: None
    """
    sbName = ""        # name fragment accumulated so far
    appendTimes = 0    # number of vertices merged into sbName
    i = 0
    line = 1           # offset of the current vertex in the sentence
    activeLine = 1     # offset where the current name candidate starts
    while i < len(segResult) - 1:
        i += 1
        vertex = segResult[i]
        if appendTimes > 0:
            # Already inside a candidate: extend it while the vertex still
            # looks like a transliteration fragment.
            if vertex.guessNature() == Nature.nrf or \
                    TranslatedPersonDictionary.containsKey(vertex.realword):
                sbName += vertex.realword
                appendTimes += 1
            else:
                # Recognition finished; only names built from more than one
                # vertex are inserted.
                if appendTimes > 1:
                    wordNetOptimum.insert(
                        activeLine,
                        Vertex().initVertex(
                            Predefine.TAG_PEOPLE, sbName,
                            CoreDictionary.Attribute().init5(Nature.nrf),
                            NRConstant.WORD_ID), wordNetAll)
                sbName = ""
                appendTimes = 0
        else:
            # nrf and nsf natures trigger recognition.
            if vertex.guessNature() == Nature.nrf or \
                    vertex.getNature() == Nature.nsf:
                sbName += vertex.realword
                appendTimes += 1
                activeLine = line
        line += len(vertex.realword)
def add1(self, line, atomSegment):
    """Add vertices produced by atomic segmentation to the word net.

    :param line: starting offset of the atom segment in the sentence
    :param atomSegment: list of atom nodes (sWord + nPOS) to convert
    :return: None
    """
    # Store the atomic parts into the segmentation graph.
    offset = 0
    for atomNode in atomSegment:
        surface = atomNode.sWord
        pos = atomNode.nPOS
        # Defaults: plain noun with no equivalent word ID.
        sWord = surface
        nature = Nature.n
        word_id = -1
        if pos == Predefine.CT_CHINESE:
            pass  # ordinary Chinese text keeps the defaults
        elif pos == Predefine.CT_INDEX or pos == Predefine.CT_NUM:
            nature = Nature.m
            sWord = '未##数'
            word_id = CoreDictionary.M_WORD_ID
        elif pos == Predefine.CT_DELIMITER or pos == Predefine.CT_OTHER:
            nature = Nature.w
        elif pos == Predefine.CT_SINGLE:
            nature = Nature.nx
            sWord = '未##串'
            word_id = CoreDictionary.X_WORD_ID
        # The frequencies of these generic tags are all on the order of 100k
        # (original note; the value used here is 10000).
        self.add(
            line + offset,
            Vertex().initVertex(
                sWord, surface,
                CoreDictionary.Attribute().init3(nature, 10000), word_id))
        offset += len(surface)
class NRConstant(object):
    """Constants shared by person-name recognition."""
    # ID of the word this dictionary focuses on (the person-name tag word)
    WORD_ID = CoreDictionary.getWordID(Predefine.TAG_PEOPLE)
    # core-dictionary attribute of that word
    ATTRIBUTE = CoreDictionary.get2(WORD_ID)
class PlaceDictionary(object):
    """Place-name (ns) recognition dictionary: role dictionary, transition
    matrix, and an Aho-Corasick trie over the recognized role patterns."""

    # place-name role dictionary
    dictionary = NSDictionary()
    # transition matrix dictionary
    transformMatrixDictionary = TransformMatrixDictionary()
    # trie used by the Aho-Corasick algorithm
    trie = AhoCorasickDoubleArrayTrie()
    # ID of the word this dictionary focuses on
    WORD_ID = CoreDictionary.getWordID(Predefine.TAG_PLACE)
    # core-dictionary attribute of that word
    ATTRIBUTE = CoreDictionary.get2(WORD_ID)

    def __init__(self):
        self.load()

    def load(self):
        """Load the role dictionary and the transition matrix, then build the
        pattern trie; logs and prints the elapsed time."""
        start = time()
        PlaceDictionary.dictionary.load(Config.PlaceDictionaryPath)
        Predefine.logger.info("%s加载成功,耗时%fms" %
                              (Config.PlaceDictionaryPath,
                               (time() - start) * 1000))
        print "%s加载成功,耗时%fms" % (Config.PlaceDictionaryPath,
                                 (time() - start) * 1000)
        PlaceDictionary.transformMatrixDictionary = PlaceDictionary.transformMatrixDictionary.init1(
            NS)
        PlaceDictionary.transformMatrixDictionary.load(
            Config.PlaceDictionaryTrPath)
        # The role patterns recognized as place names (keys double as the
        # values stored in the trie).
        init_dict = {}
        init_dict["CDEH"] = "CDEH"
        init_dict["CDH"] = "CDH"
        init_dict["CH"] = "CH"
        init_dict["GH"] = "GH"
        PlaceDictionary.trie.build(TreeMap(init_dict))

    @staticmethod
    def parsePattern(nsList, vertexList, wordNetOptimum, wordNetAll, pld_obj):
        """Pattern matching over the NS role sequence.

        :param nsList: the determined role (NS tag) sequence
        :param vertexList: the original, unlabeled vertex sequence
        :param wordNetOptimum: word net to be optimized
        :param wordNetAll: the full word net
        :param pld_obj: PlaceDictionary instance used by the trie callback
        :return: None
        """
        sbPattern = ""
        for ns in nsList:
            sbPattern += str(ns)
        pattern = str(sbPattern)
        wordList = []
        for i in range(len(vertexList)):
            wordList.append(vertexList[i].realword)
        wordArray = np.array(wordList)
        PlaceDictionary.trie.parseText1(pattern, wordArray, pld_obj,
                                        wordNetOptimum, wordNetAll)

    @staticmethod
    def isBadCase(name):
        """Whether *name* is a known bad case, i.e. its role entry carries the
        NS.Z label. (No algorithm solves 100% of cases; bad cases are added to
        the dictionary in the form "盖公章 A 1".)

        :param name: candidate place name
        :return: True if the name should be rejected
        """
        nrEnumItem = None
        place_list = PlaceDictionary.dictionary.get(name)
        if place_list is not None:
            initdict = dict(place_list)
            nrEnumItem = EnumItem().init3(initdict)
        if nrEnumItem is None:
            return False
        return nrEnumItem.containsLabel(NS.Z)
class PersonDictionary(object):
    """Chinese person-name (nr) recognition dictionary: role dictionary,
    transition matrix, and an Aho-Corasick trie over NR role patterns."""

    # person-name role dictionary
    dictionary = NRDictionary()
    # transition matrix dictionary
    transformMatrixDictionary = TransformMatrixDictionary()
    # trie used by the Aho-Corasick algorithm
    trie = AhoCorasickDoubleArrayTrie()
    # default attribute for recognized person names
    ATTRIBUTE = CoreDictionary.Attribute().init3(Nature.nr, 100)

    def __init__(self):
        self.logger = Predefine.logger
        self.wordArray = None
        self.offsetArray = None
        self.wordNetOptimum = None
        self.wordNetAll = None
        self.init()

    def init(self):
        """Load the role dictionary and transition matrix and build the
        NR-pattern trie; exits the process when the dictionary is missing."""
        start = time()
        if not PersonDictionary.dictionary.load(Config.PersonDictionaryPath):
            self.logger.error("人名词典加载失败:%s" % Config.PersonDictionaryPath)
            sys.exit(0)
        PersonDictionary.transformMatrixDictionary.init1(NR)
        PersonDictionary.transformMatrixDictionary.load(
            Config.PersonDictionaryTrPath)
        initdict = {}
        for pattern in NRPattern:
            initdict[str(pattern)] = pattern
        map = TreeMap(initdict).sort()
        PersonDictionary.trie.build(map)
        self.logger.info("%s加载成功,耗时%fms" %
                         (Config.PersonDictionaryPath,
                          (time() - start) * 1000))

    @staticmethod
    def parsePattern(nrList, vertexList, wordNetOptimum, wordNetAll, pd_obj):
        """Pattern matching over the NR role sequence.

        :param nrList: the determined role (NR tag) sequence
        :param vertexList: the original, unlabeled vertex sequence (mutated in
                           place when U/V roles are split)
        :param wordNetOptimum: word net to be optimized
        :param wordNetAll: the full word net
        :param pd_obj: PersonDictionary instance used by the trie callback
        :return: None
        """
        # Split fused roles: U (surname fused with name head) and V (name tail
        # fused with the following word) each become two vertices.
        # i walks vertexList, index walks nrList; once a split happens the two
        # diverge, signalled by backUp.
        i = -1
        sbPattern = ""
        preNR = NR.A
        backUp = False
        index = 0
        for nr in nrList:
            index += 1
            i += 1
            current = vertexList[i]
            if nr == NR.U:
                if not backUp:
                    i = index - 1
                    backUp = True
                sbPattern += str(NR.K)
                sbPattern += str(NR.B)
                preNR = NR.B
                # split into K (all but the last char) and B (the last char)
                nowK = current.realword[0:len(current.realword.decode()) - 1]
                nowB = current.realword[len(current.realword.decode()) - 1:]
                vertexList[i] = Vertex().init1(nowK)
                i += 1
                vertexList.insert(i, Vertex().init1(nowB))
                continue
            elif nr == NR.V:
                if not backUp:
                    i = index - 1
                    backUp = True
                if preNR == NR.B:
                    # BE pattern
                    sbPattern += str(NR.E)
                else:
                    # CD pattern
                    sbPattern += str(NR.D)
                sbPattern += str(NR.L)
                # also adjust the vertex sequence: last char vs. the rest
                nowED = current.realword[len(current.realword) - 1:]
                nowL = current.realword[0:len(current.realword) - 1]
                vertexList[i] = Vertex().init1(nowED)
                vertexList.insert(i, Vertex().init1(nowL))
                i += 1
                continue
            else:
                sbPattern += str(nr)
            preNR = nr
        pattern = str(sbPattern)
        wordList = []
        for i in range(len(vertexList)):
            wordList.append(vertexList[i].realword)
        wordArray = np.array(wordList)
        # offsetArray[i] = offset of wordArray[i] within the sentence
        offsetArray = [int()] * len(wordArray)
        offsetArray[0] = 0
        for i in range(1, len(wordArray)):
            offsetArray[i] = offsetArray[i - 1] + len(wordArray[i - 1])
        PersonDictionary.trie.parseText(pattern, wordArray, offsetArray,
                                        pd_obj, wordNetOptimum, wordNetAll)

    def isBadCase(self, name):
        """Whether *name* is a known bad case, i.e. its role entry carries the
        NR.A label. (No algorithm solves 100% of cases; bad cases are added to
        the dictionary in the form "盖公章 A 1".)

        :param name: candidate person name
        :return: True if the name should be rejected
        """
        nrEnumItem = None
        name_list = PersonDictionary.dictionary.get(name)
        if name_list is not None:
            initdict = dict(name_list)
            nrEnumItem = EnumItem().init3(initdict)
        if nrEnumItem is None:
            return False
        return nrEnumItem.containsLabel(NR.A)
out = file(path + Predefine.PIC_EXT, 'wb') cPickle.dump(byteArray, out) if byteArray is None: return False size = byteArray.nextInt() # 一种兼容措施,当size小于零表示文件头部储存了-size个用户词性 if size < 0: pass attributes = [None] * size natureIndexArray = list(Nature) for i in range(size): # 第一个是全部词频,第二个是词性个数 currentTotalFrequency = byteArray.nextInt() length = byteArray.nextInt() attributes[i] = CoreDictionary.Attribute().init1(length) attributes[i].totalFrequency = currentTotalFrequency for j in range(length): attributes[i].nature[j] = natureIndexArray[byteArray.nextInt()] attributes[i].frequency[j] = byteArray.nextInt() if not CustomDictionary.dat.load(byteArray, attributes): return False return True @staticmethod def get(key): attribute = CustomDictionary.dat.get(key) if attribute is not None: return attribute if CustomDictionary.trie is None:
def newE():
    """Build the auxiliary end-of-sentence vertex.

    :return: a Vertex tagged with Predefine.TAG_END
    """
    end_attr = CoreDictionary.Attribute().init3(Nature.end,
                                                Predefine.MAX_FREQUENCY / 10)
    return Vertex().initVertex(Predefine.TAG_END, ' ', end_attr,
                               CoreDictionary.getWordID(Predefine.TAG_END))
def newB():
    """Build the auxiliary begin-of-sentence vertex.

    :return: a Vertex tagged with Predefine.TAG_BIGIN
    """
    begin_attr = CoreDictionary.Attribute().init3(Nature.begin,
                                                  Predefine.MAX_FREQUENCY / 10)
    return Vertex().initVertex(Predefine.TAG_BIGIN, ' ', begin_attr,
                               CoreDictionary.getWordID(Predefine.TAG_BIGIN))
class Vertex(object):
    """A vertex in the word lattice."""

    cd = CoreDictionary()
    cbtd = CoreBiGramTableDictionary()

    def __init__(self):
        # the word or its equivalence-class tag (e.g. 未##数)
        self.word = ''
        # the real surface word; never contains ##
        self.realword = ''
        # the word's attribute; mutate its internals with care because the
        # object may be shared with the dictionary — create a new Attribute
        # when a change is needed
        self.attribute = None
        # equivalent word ID, also the index of the Attribute
        self.wordID = int()
        # index in the one-dimensional vertex array; acts as this vertex's id
        self.index = int()
        # predecessor vertex on the shortest path to this one
        self.fromnode = None
        # weight of that shortest path
        self.weight = float()

    @staticmethod
    def newB():
        """Build the auxiliary begin-of-sentence vertex."""
        return Vertex().initVertex(
            Predefine.TAG_BIGIN, ' ',
            CoreDictionary.Attribute().init3(Nature.begin,
                                             Predefine.MAX_FREQUENCY / 10),
            CoreDictionary.getWordID(Predefine.TAG_BIGIN))

    @staticmethod
    def newE():
        """Build the auxiliary end-of-sentence vertex."""
        return Vertex().initVertex(
            Predefine.TAG_END, ' ',
            CoreDictionary.Attribute().init3(Nature.end,
                                             Predefine.MAX_FREQUENCY / 10),
            CoreDictionary.getWordID(Predefine.TAG_END))

    def updateFrom(self, fromnode):
        """Relax this vertex via *fromnode* if that gives a lighter path."""
        weight = fromnode.weight + MathTools.calculateWeight(fromnode, self)
        if self.fromnode is None or self.weight > weight:
            self.fromnode = fromnode
            self.weight = weight

    def initVertex(self, word, realWord, attribute, wordID):
        """Fill in all fields and return self (fluent initializer).

        :param word: compiled word (equivalence tag); derived from realWord
                     when None
        :param realWord: the real surface word; must be non-empty
        :param attribute: word attribute; defaults to a fresh (n, 1) attribute
        :param wordID: equivalent word ID (compileRealWord may overwrite it)
        :return: self
        """
        if attribute is None:
            attribute = CoreDictionary.Attribute().init3(Nature.n, 1)
        self.wordID = wordID
        self.attribute = attribute
        if word is None:
            word = self.compileRealWord(realWord, attribute)
        assert len(realWord) > 0
        self.word = word
        self.realword = realWord.decode()
        return self

    def init1(self, realWord):
        """Automatically build a reasonable vertex from the real word alone.

        :param realWord:
        :return: self
        """
        return self.initVertex(None, realWord, Vertex.cd.get(realWord), -1)

    def init2(self, realWord, attribute, wordID):
        # real word with explicit attribute and word ID
        return self.initVertex(None, realWord, attribute, wordID)

    def init3(self, word, realWord, attribute):
        """The most complete constructor.

        :param word: compiled word
        :param realWord: real word
        :param attribute: attribute
        :return: self
        """
        return self.initVertex(word, realWord, attribute, -1)

    def init4(self, realWord, attribute):
        """Constructor for when the real word equals the compiled word.

        :param realWord:
        :param attribute:
        :return: self
        """
        return self.init3(None, realWord, attribute)

    def compileRealWord(self, realword, attribute):
        # Map single-nature special words onto their equivalence-class tags
        # and word IDs (people/places/numbers/...); note this overwrites
        # self.wordID and, for some natures, self.attribute as a side effect.
        if len(attribute.nature) == 1:
            for case in Switch(attribute.nature[0]):
                if case(Nature.nr) or case(Nature.nr1) or case(
                        Nature.nr2) or case(Nature.nrf) or case(Nature.nrj):
                    self.wordID = Vertex.cd.NR_WORD_ID
                    return Predefine.TAG_PEOPLE
                if case(Nature.ns) or case(Nature.nsf):
                    self.wordID = Vertex.cd.NS_WORD_ID
                    return Predefine.TAG_PLACE
                if case(Nature.nx):
                    self.wordID = Vertex.cd.NX_WORD_ID
                    self.attribute = Vertex.cd.get1(Vertex.cd.NX_WORD_ID)
                    return Predefine.TAG_PROPER
                if case(Nature.nt) or case(Nature.ntc) or case(
                        Nature.ntcf) or case(Nature.ntcb) or case(
                            Nature.ntch) or case(Nature.nto) or case(
                                Nature.ntu) or case(Nature.nts) or case(
                                    Nature.nth) or case(Nature.nit):
                    self.wordID = Vertex.cd.NT_WORD_ID
                    # self.attribute = Vertex.cd.get1(Vertex.cd.NT_WORD_ID)
                    return Predefine.TAG_GROUP
                if case(Nature.m) or case(Nature.mq):
                    self.wordID = Vertex.cd.M_WORD_ID
                    self.attribute = Vertex.cd.get1(Vertex.cd.M_WORD_ID)
                    return Predefine.TAG_NUMBER
                if case(Nature.x):
                    self.wordID = Vertex.cd.X_WORD_ID
                    self.attribute = Vertex.cd.get1(Vertex.cd.X_WORD_ID)
                    return Predefine.TAG_CLUSTER
                if case(Nature.t):
                    self.wordID = Vertex.cd.T_WORD_ID
                    self.attribute = Vertex.cd.get1(Vertex.cd.T_WORD_ID)
                    return Predefine.TAG_TIME
        return realword

    def getNature(self):
        """Return this vertex's nature, or None when it is still ambiguous.

        :return: the single nature, or None
        """
        if len(self.attribute.nature) == 1:
            return self.attribute.nature[0]
        return None

    def guessNature(self):
        """Guess the most likely nature — the most frequent one of this node.

        :return: the first (highest-frequency) nature
        """
        return self.attribute.nature[0]

    def getAttribute(self):
        """Return the word's attribute.

        :return: the Attribute instance
        """
        return self.attribute
# NOTE(review): the statements below are the truncated tail of a builder
# method whose beginning lies outside this chunk (they appear to walk failure
# links while constructing the Aho-Corasick automaton — confirm against the
# full file); reproduced as-is.
traceFailureState = traceFailureState.getFailure()
newFailureState = traceFailureState.nextState(transition)
targetState.setFailure(newFailureState, self.ac.fail)
targetState.addEmit1(newFailureState.getEmit())
self.constructOutput(targetState)

def loseWeight(self):
    """Release spare memory: shrink base/check down to size + 65535 slots.

    :return: None
    """
    nbase = [int()] * (self.ac.size + 65535)
    nbase[:self.ac.size] = self.ac.base[:self.ac.size]
    self.ac.base = nbase
    ncheck = [int()] * (self.ac.size + 65535)
    ncheck[:self.ac.size] = self.ac.check[:self.ac.size]
    self.ac.check = ncheck

# Exception used to break out of nested loops.
class Getoutofloop(Exception):
    pass

if __name__ == "__main__":
    # Smoke test: load the core dictionary and resize a trie builder.
    a = CoreDictionary()
    ac = AhoCorasickDoubleArrayTrie()
    ac.Builder(ac).resize(9)