Пример #1
0
 def addState(self, character):
     """
     Return the child state reached by ``character``, creating it if absent.

     The character is UTF-8 encoded before it is used as the transition key.
     A missing child is created through ``State().init1(self.depth + 1)``,
     stored in the goto table's backing dict, and the goto table is then
     rebuilt from that dict with ``sort()`` applied.

     :param character: character to transition on
     :return: the existing or the newly created child state
     """
     key = character.encode('utf-8')
     child = self.nextStateIgnoreRootState(key)
     if child is not None:
         return child
     child = State().init1(self.depth + 1)
     self.success.result[key] = child
     # Rebuild the goto table from its backing dict after the insertion.
     self.success = TreeMap(inputDict=self.success.result).sort()
     return child
Пример #2
0
 def init2(self, *args):
     """
     Register every given label with a frequency of 1.

     :param args: labels to record
     :return: this instance, so calls can be chained
     """
     counts = self.initdict
     for tag in args:
         counts[tag] = 1
     # The label map is rebuilt from the updated dict.
     self.labelMap = TreeMap(counts)
     return self
Пример #3
0
 def __init__(self):
     """Create a state with depth 0, no emits, an empty goto table and no failure link."""
     # Length of the pattern string, which is also the depth of this state.
     self.depth = 0
     # Pattern strings recorded for this state (None until one is added).
     self.emits = None
     # Goto table (transition function): maps the next character to the
     # next state.
     self.success = TreeMap(dict())
     # Index of this state in the double array.
     self.index = 0
     # Failure link: the state to jump to when no transition matches.
     self.failure = None
Пример #4
0
    def load(self, path):
        """
        Load the core dictionary from ``path``.

        The cached form is tried first through ``loadDat``. When that fails
        the tab-separated text file is parsed (a word followed by
        nature/frequency pairs), the trie is built from the entries and a
        cache file is written.

        :param path: path of the text dictionary
        :return: True on success, False when reading or caching fails
        """
        self.logger.info("核心词典开始加载:%s" % path)
        print("核心词典开始加载:%s" % path)
        if self.loadDat(path):
            return True

        initdict = OrderedDict()
        try:
            max_frequency = 0
            start = time()
            # The context manager closes the handle even if parsing raises.
            with open(path, 'r') as f:
                for raw in f:
                    line = raw.strip(' \n\t\r')
                    if not line:
                        # A blank line ends the read, exactly as EOF does.
                        break
                    param = line.split('\t')
                    # Fields after the word come in (nature, frequency) pairs.
                    natureCount = (len(param) - 1) // 2
                    attribute = CoreDictionary.Attribute().init1(natureCount)
                    for i in range(natureCount):
                        attribute.nature[i] = Nature.valueOf(param[1 + 2 * i])
                        attribute.frequency[i] = int(param[2 + 2 * i])
                        attribute.totalFrequency += attribute.frequency[i]
                    initdict[param[0]] = attribute
                    max_frequency += attribute.totalFrequency
            word_map = TreeMap(initdict)
            self.logger.info("核心词典读入词条%i,全部频次%i,耗时%fms" % (word_map.size(), max_frequency, (time() - start) * 1000))
            print("核心词典读入词条%i,全部频次%i,耗时%fms" % (word_map.size(), max_frequency, (time() - start) * 1000))
            self.trie.build(word_map)
            self.logger.info("核心词典加载成功:%i个词条,下面将写入缓存" % self.trie.size1())
            print("核心词典加载成功:%i个词条,下面将写入缓存" % self.trie.size1())

            try:
                # NOTE(review): the cache name is derived from self.path, not
                # from the ``path`` argument - confirm this is intended.
                with open(self.path + Predefine.BIN_EXT, 'w+') as out:
                    attributeList = word_map.values()
                    out.writelines(Convert.convert(len(attributeList)))
                    for attribute in attributeList:
                        out.writelines(Convert.convert(attribute.totalFrequency))
                        out.writelines(Convert.convert(len(attribute.nature)))
                        for i in range(len(attribute.nature)):
                            out.writelines(Convert.convert(Nature.ordinal(attribute.nature[i])))
                            out.writelines(Convert.convert(attribute.frequency[i]))

                    self.trie.save(out)
            except Exception as e:
                self.logger.warning("保存失败%s" % str(e))
                return False
        except IOError as e:
            self.logger.warning("核心词典%s不存在或读取错误!" % str(e))
            return False
        return True
Пример #5
0
    def load(self, path):
        """
        Load the values and the key trie that belong to ``path``.

        Values come from ``onLoadValue``. The trie is read from
        ``path + '.trie.dat'`` when ``loadDat`` succeeds; otherwise the keys
        are read from the text file (first space-separated field of each
        line), the trie is built and then saved to that cache file.

        :param path: path of the text dictionary
        :return: False when the values cannot be loaded, True otherwise
        """
        start = time()
        valueArray = self.onLoadValue(path)
        if valueArray is None:
            self.logger.warning("加载值%s.value.dat失败,耗时%fms" %
                                (path, (time() - start) * 1000))
            return False
        self.logger.info("加载值%s.value.dat成功,耗时%fms" %
                         (path, (time() - start) * 1000))
        print("加载值%s.value.dat成功,耗时%fms" % (path, (time() - start) * 1000))

        start = time()

        if self.loadDat(path + '.trie.dat', valueArray):
            self.logger.info("加载键%s.trie.dat成功,耗时%fms" %
                             (path, (time() - start) * 1000))
            print("加载键%s.trie.dat成功,耗时%fms" % (path, (time() - start) * 1000))
            return True

        keyList = []

        try:
            # The context manager closes the handle even when a line fails.
            with open(path, 'r') as br:
                for raw in br:
                    # NOTE(review): on Python 3 encode() yields bytes and the
                    # str argument to strip() raises TypeError, which lands in
                    # the handler below - confirm the intended text type.
                    line = raw.encode('utf-8').strip(' \n\t\r')
                    if not line:
                        # A blank line ends the read, exactly as EOF does.
                        break
                    keyList.append(line.split(' ')[0])
        except Exception as e:
            # Best effort: the keys collected so far are still used below.
            self.logger.warning("读取%s失败%s" % (path, str(e)))
        resultcode = self.trie.kvbuild(keyList, valueArray)

        if resultcode != 0:
            self.logger.warning("trie建立失败%i,正在尝试排序后重载" % resultcode)
            initdict = {}
            for i, value in enumerate(valueArray):
                initdict[keyList[i]] = value
            sorted_map = TreeMap(initdict).sort()
            self.trie.build(sorted_map)
            # Write the values back in the order of the sorted map.
            for i, value in enumerate(sorted_map.values()):
                valueArray[i] = value
        self.trie.save(path + '.trie.dat')
        self.logger.info(path + "加载成功")
        return True
Пример #6
0
 def loadtxt(path, defaultNature, map, customNatureCollector):
     """
     Load a user dictionary text file into a TreeMap.

     Each line holds a word followed by optional nature/frequency pairs,
     separated by single spaces. A word without pairs gets ``defaultNature``.

     :param path: dictionary path
     :param defaultNature: nature used for words that list none
     :param map: map returned unchanged when loading fails; on success the
         name is rebound to a new TreeMap built from the file
     :param customNatureCollector: collector for user natures (not used in
         the code visible here)
     :return: ``(False, map)`` when an exception occurs
     """
     try:
         initdict = OrderedDict()
         # The context manager closes the handle even if parsing raises.
         with open(path, 'r') as br:
             for raw in br:
                 # NOTE(review): on Python 3 encode() yields bytes, so the
                 # str-based split below raises TypeError - confirm the
                 # intended text type.
                 line = raw.encode().strip()
                 if not line:
                     # A blank line ends the read, exactly as EOF does.
                     break
                 param = line.split(" ")
                 # Floor division keeps the count an int on Python 2 and 3.
                 natureCount = (len(param) - 1) // 2
                 if natureCount == 0:
                     attribute = CoreDictionary.Attribute().init5(defaultNature)
                 else:
                     attribute = CoreDictionary.Attribute().init1(natureCount)
                     for i in range(natureCount):
                         attribute.nature[i] = Nature.valueOf(param[1 + 2 * i])
                         attribute.frequency[i] = int(param[2 + 2 * i])
                         attribute.totalFrequency += attribute.frequency[i]
                 initdict[param[0]] = attribute
         map = TreeMap(initdict)
     except Exception as e:
         Predefine.logger.warning("自定义词典%s读取错误%s" % (path, e))
         return False, map
     # NOTE(review): nothing is returned on the success path in the code
     # visible here, while callers unpack a (success, map) pair - confirm
     # against the full source.
Пример #7
0
class EnumItem(Enum):
    """An item that records a frequency for each of its labels."""

    def __init__(self):
        Enum.__init__(self)
        # Plain dict of label -> frequency; the source for labelMap.
        self.initdict = {}
        # TreeMap built from initdict.
        self.labelMap = TreeMap({})

    def getFrequency(self, label):
        """
        Return the frequency stored for ``label``.

        :param label: label to look up
        :return: the stored frequency, or 0 when the lookup yields None
        """
        frequency = self.labelMap.get(label)
        if frequency is None:
            return 0
        return frequency

    def init1(self, label, frequency):
        """
        Record one label with the given frequency.

        :param label: label to record
        :param frequency: frequency stored for the label
        :return: this instance, so calls can be chained
        """
        self.initdict[label] = frequency
        self.labelMap = TreeMap(self.initdict)
        return self

    def init2(self, *args):
        """
        Record every given label with a frequency of 1.

        :param args: labels to record
        :return: this instance, so calls can be chained
        """
        for label in args:
            self.initdict[label] = 1
        self.labelMap = TreeMap(self.initdict)
        return self

    def init3(self, initdict):
        """
        Replace the label dict and rebuild the label map from it.

        :param initdict: mapping of label -> frequency
        :return: this instance, so calls can be chained
        """
        self.initdict = initdict
        self.labelMap = TreeMap(self.initdict)
        return self

    def init4(self, initdict):
        """
        Replace the label dict and rebuild the label map from it.

        Behaves exactly like ``init3``; kept as a separate entry point.

        :param initdict: mapping of label -> frequency
        :return: this instance, so calls can be chained
        """
        self.initdict = initdict
        self.labelMap = TreeMap(self.initdict)
        return self

    def containsLabel(self, label):
        """Return True when ``label`` is a key of the label map's ``result``."""
        return label in self.labelMap.result.keys()

    @staticmethod
    def create(param):
        """
        Parse a space-separated ``"word label freq [label freq ...]"`` string.

        :param param: the line to parse, or None
        :return: None for None input, otherwise the result of ``create1``
        """
        if param is None:
            return None
        array = param.split(' ')
        return EnumItem.create1(array)

    @staticmethod
    def create1(param):
        """
        Build ``{word: [{label: frequency}, ...]}`` from a token list.

        :param param: list whose first element is the word, followed by
            label/frequency pairs
        :return: None when the token count is even (no word plus whole
            pairs), otherwise the single-key dict described above
        """
        if len(param) % 2 == 0:
            return None

        # Floor division keeps the count an int on Python 2 and 3; true
        # division would make the list repetition below fail on Python 3.
        natureCount = (len(param) - 1) // 2
        entries = [None] * natureCount
        for i in range(natureCount):
            entries[i] = {param[1 + 2 * i]: int(param[2 + 2 * i])}
        return {param[0]: entries}
 def load(self):
     """
     Load the transliterated person-name dictionary.

     A fresh DoubleArrayTrie is installed first. When ``loadDat`` succeeds
     nothing else is done; otherwise every line of the text dictionary
     becomes a key mapped to True, the trie is built from those keys and
     ``saveDat`` is called.

     :return: True
     """
     TranslatedPersonDictionary.trie = DoubleArrayTrie()
     if self.loadDat():
         return True
     initdict = OrderedDict()
     # The context manager closes the handle even if reading raises.
     with open(TranslatedPersonDictionary.path, 'r') as br:
         for raw in br:
             # NOTE(review): on Python 3 encode() turns the key into bytes -
             # confirm the intended text type.
             line = raw.encode().strip()
             if not line:
                 # A blank line ends the read, exactly as EOF does.
                 break
             initdict[line] = True
     name_map = TreeMap(initdict)
     Predefine.logger.info("音译人名词典%s开始构建双数组..." %
                           TranslatedPersonDictionary.path)
     print("音译人名词典%s开始构建双数组..." % TranslatedPersonDictionary.path)
     TranslatedPersonDictionary.trie.build(name_map)
     Predefine.logger.info("音译人名词典%s开始编译DAT文件..." %
                           TranslatedPersonDictionary.path)
     print("音译人名词典%s开始编译DAT文件..." % TranslatedPersonDictionary.path)
     save_result = self.saveDat()
     Predefine.logger.info(
         "音译人名词典%s编译结果:%s" %
         (TranslatedPersonDictionary.path, save_result))
     return True
Пример #9
0
    def init(self):
        """
        Load the person dictionary, its transform matrix and the pattern trie.

        The process exits through ``sys.exit(0)`` when the person dictionary
        cannot be loaded. Otherwise the transform matrix is initialised with
        ``NR`` and loaded, and the trie is built from every ``NRPattern``
        member keyed by its string form.
        """
        begin = time()
        loaded = PersonDictionary.dictionary.load(Config.PersonDictionaryPath)
        if not loaded:
            self.logger.error("人名词典加载失败:%s" % Config.PersonDictionaryPath)
            sys.exit(0)

        PersonDictionary.transformMatrixDictionary.init1(NR)
        PersonDictionary.transformMatrixDictionary.load(
            Config.PersonDictionaryTrPath)

        # Key every pattern by its string form.
        patterns = {str(item): item for item in NRPattern}
        PersonDictionary.trie.build(TreeMap(patterns).sort())
        elapsed_ms = (time() - begin) * 1000
        self.logger.info("%s加载成功,耗时%fms" % (Config.PersonDictionaryPath,
                                            elapsed_ms))
Пример #10
0
 def load(self):
     """
     Load the place dictionary, its transform matrix and the pattern trie.

     The transform matrix is re-initialised with ``NS`` and loaded from
     ``Config.PlaceDictionaryTrPath``; the trie is built from four fixed
     patterns, each mapped to itself.
     """
     start = time()
     PlaceDictionary.dictionary.load(Config.PlaceDictionaryPath)
     # NOTE(review): the success message is emitted without checking the
     # return value of load() - confirm that is intended.
     message = "%s加载成功,耗时%fms" % (Config.PlaceDictionaryPath,
                                  (time() - start) * 1000)
     Predefine.logger.info(message)
     # print() with a single argument behaves the same on Python 2 and 3.
     print(message)
     PlaceDictionary.transformMatrixDictionary = PlaceDictionary.transformMatrixDictionary.init1(
         NS)
     PlaceDictionary.transformMatrixDictionary.load(
         Config.PlaceDictionaryTrPath)
     init_dict = {
         "CDEH": "CDEH",
         "CDH": "CDH",
         "CH": "CH",
         "GH": "GH",
     }
     PlaceDictionary.trie.build(TreeMap(init_dict))
Пример #11
0
class CustomNatureUtility(object):
    """Holder for user-defined natures (part-of-speech tags)."""

    # Emitted when the class body is executed.
    Predefine.logger.warning("已激活自定义词性功能,用户需对本地环境的兼容性和稳定性负责!\n")
    # Class-level map queried by name in addNature.
    extraValueMap = TreeMap({})
    enumBuster = EnumBuster()

    def __init__(self):
        pass

    def addNature(self, name):
        """
        Look up the custom nature stored under ``name``.

        NOTE(review): despite the name, the code visible here only performs
        a lookup and never registers a new nature - confirm against the
        full source.

        :param name: nature name
        :return: whatever ``extraValueMap.get(name)`` yields (the original
            code returned that same value on both of its branches)
        """
        return self.extraValueMap.get(name)
 def load(self):
     """
     Load the Japanese person-name dictionary.

     A fresh DoubleArrayTrie is installed first. When ``loadDat`` succeeds
     nothing else is done; otherwise each line of the text dictionary is
     split on single spaces into a key (first field) and a value (second
     field), the trie is built and ``saveDat`` is called with the map.

     :return: True
     """
     JapanesePersonDictionary.trie = DoubleArrayTrie()
     if self.loadDat():
         return True
     initdict = OrderedDict()
     # The context manager closes the handle even if a line fails to parse.
     with open(JapanesePersonDictionary.path, 'r') as br:
         for raw in br:
             # NOTE(review): on Python 3 encode() yields bytes, so the
             # str-based split below raises TypeError - confirm the intended
             # text type.
             line = raw.encode().strip()
             if not line:
                 # A blank line ends the read, exactly as EOF does.
                 break
             param = line.split(" ")
             initdict[param[0]] = param[1]
     name_map = TreeMap(initdict)
     Predefine.logger.info("日本人名词典%s开始构建双数组..." % JapanesePersonDictionary.path)
     JapanesePersonDictionary.trie.build(name_map)
     Predefine.logger.info("日本人名词典%s开始编译DAT文件..." % JapanesePersonDictionary.path)
     save_result = self.saveDat(name_map)
     Predefine.logger.info("日本人名词典%s编译结果:%s" % (JapanesePersonDictionary.path, str(save_result)))
     return True
Пример #13
0
    def loadMainDictionary(mainPath):
        """
        Load the custom dictionary, preferring the cached form.

        When ``CustomDictionary.loadDat`` fails, every path in
        ``CustomDictionary.path`` is read through ``loadtxt`` with
        ``Nature.n`` as the default nature.

        NOTE(review): the body visible here ends after the exception handler
        and returns nothing on that path - confirm against the full source.

        :param mainPath: path of the main dictionary
        :return: True when the cached form was loaded
        """
        Predefine.logger.info("自定义词典开始加载:%s" % mainPath)
        # print() with a single argument behaves the same on Python 2 and 3.
        print("自定义词典开始加载:%s" % mainPath)
        if CustomDictionary.loadDat(mainPath):
            return True
        CustomDictionary.dat = DoubleArrayTrie()

        map = TreeMap({})
        customNatureCollector = set()
        try:
            for p in CustomDictionary.path:
                defaultNature = Nature.n
                Predefine.logger.info("以默认词性[%s]加载自定义词典%s中……" %
                                      (str(defaultNature), p))
                print("以默认词性[%s]加载自定义词典%s中……" % (str(defaultNature), p))
                success, map = CustomDictionary.loadtxt(
                    p, defaultNature, map, customNatureCollector)
                if not success:
                    Predefine.logger.warning("失败:%s" % p)
        except IOError as e:
            Predefine.logger.error("自定义词典%s不存在或读取错误!%s" % (mainPath, e))
Пример #14
0
    def loadMainDictionary(mainPath):
        """
        Load the custom dictionary and cache it next to ``mainPath``.

        The cached form is tried first through ``CustomDictionary.loadDat``.
        Otherwise every path in ``CustomDictionary.path`` is read through
        ``loadtxt`` with ``Nature.n`` as the default nature, a
        DoubleArrayTrie is built from the resulting map, and the values plus
        the trie are written to ``mainPath + Predefine.BIN_EXT``.

        :param mainPath: path of the main dictionary
        :return: True
        """
        Predefine.logger.info("自定义词典开始加载:%s" % mainPath)
        print("自定义词典开始加载:%s" % mainPath)
        if CustomDictionary.loadDat(mainPath):
            return True
        CustomDictionary.dat = DoubleArrayTrie()

        word_map = TreeMap({})
        # Passed to loadtxt; it is not written to the cache in this method.
        customNatureCollector = set()
        try:
            for p in CustomDictionary.path:
                defaultNature = Nature.n
                Predefine.logger.info("以默认词性[%s]加载自定义词典%s中……" %
                                      (str(defaultNature), p))
                print("以默认词性[%s]加载自定义词典%s中……" % (str(defaultNature), p))
                success, word_map = CustomDictionary.loadtxt(
                    p, defaultNature, word_map, customNatureCollector)
                if not success:
                    Predefine.logger.warning("失败:%s" % p)
        except IOError as e:
            Predefine.logger.error("自定义词典%s不存在或读取错误!%s" % (mainPath, e))
        except Exception as e:
            Predefine.logger.error("自定义词典%s缓存失败!%s\n" % (mainPath, e))
        if word_map.size() == 0:
            Predefine.logger.warning("没有加载到任何词条")
            # Used as a blank placeholder entry.
            # NOTE(review): the placeholder value is None, and save() is
            # called on every value below - confirm the empty case works.
            word_map.put(Predefine.TAG_OTHER, None)
        Predefine.logger.info("正在构建DoubleArrayTrie……")
        CustomDictionary.dat.build(word_map)
        # Cache as a dat file so the next load is much faster.
        Predefine.logger.info("正在缓存词典为dat文件……")
        # Values are cached first, in map iteration order.
        attributeList = [value for _key, value in word_map.items()]
        # The context manager closes the cache file even if a write raises.
        with open(mainPath + Predefine.BIN_EXT, 'w+') as out:
            out.writelines(Convert.convert(len(attributeList)))
            for attribute in attributeList:
                attribute.save(out)
            CustomDictionary.dat.save1(out)

        return True
Пример #15
0
 def init4(self, initdict):
     """
     Replace the label dict and rebuild the label map from it.

     :param initdict: mapping used as the new label dict
     :return: this instance, so calls can be chained
     """
     self.initdict = initdict
     rebuilt = TreeMap(self.initdict)
     self.labelMap = rebuilt
     return self
    def load(self, path):
        """
        Load the bigram table from ``path``.

        The cached form at ``CoreBiGramTableDictionary.datPath`` is tried
        first. Otherwise each line ``"a@b freq"`` is read, both words are
        resolved to ids through the core dictionary trie (lines with an
        unknown word are skipped), and the counts are flattened into the
        class-level ``start`` and ``pair`` arrays.

        NOTE(review): nothing is returned after a successful text load in
        the code visible here - confirm against the full source.

        :param path: path of the text bigram dictionary
        :return: True when the cached form was loaded, False on IOError
        """
        if self.loadDat(CoreBiGramTableDictionary.datPath):
            return True

        # Maps the id of the first word to a TreeMap of second-word id -> frequency.
        bigrams = TreeMap({})
        try:
            total = 0
            # The context manager closes the handle even if parsing raises.
            with open(path, 'r') as br:
                maxWordId = CoreDictionary.trie.size1()
                for raw in br:
                    line = raw.strip("\n\r\t ")
                    if not line:
                        # A blank line ends the read, exactly as EOF does.
                        break

                    params = re.split(' ', line)
                    twoWord = params[0].split("@")

                    idA = CoreDictionary.trie.exactMatchSearch(twoWord[0])
                    if idA == -1:
                        continue
                    idB = CoreDictionary.trie.exactMatchSearch(twoWord[1])
                    if idB == -1:
                        continue
                    freq = int(params[1])
                    biMap = bigrams.get(idA)
                    if biMap is None:
                        biMap = TreeMap({})

                    biMap.put(int(idB), freq)
                    bigrams.put(int(idA), biMap)

                    # Two slots per accepted line: second-word id and frequency.
                    total += 2

            for k, v in bigrams.items():
                bigrams.put(k, v.sort_long())

            bigrams.sort_long()

            CoreBiGramTableDictionary.start = [int()] * (maxWordId + 1)
            # total is twice the number of accepted lines.
            CoreBiGramTableDictionary.pair = [int()] * total
            offset = 0
            for i in range(maxWordId):
                bMap = bigrams.get(i)
                if bMap is not None:
                    for k, v in bMap.items():
                        index = offset << 1
                        CoreBiGramTableDictionary.pair[index] = k
                        CoreBiGramTableDictionary.pair[index + 1] = v
                        offset += 1
                # start[i + 1] holds the number of pairs stored for ids 0..i.
                CoreBiGramTableDictionary.start[i + 1] = offset

            # The message has a %s placeholder; fill it with the path.
            self.logger.info("二元词典读取完毕:%s" % path)
        except IOError as e:
            # Log through a logger method; the logger object is not called
            # directly.
            self.logger.warning("二元词典%s不存在或读取错误!%s" % (path, e))
            return False
Пример #17
0
class State(object):
    """One node of the trie / automaton: depth, emits, goto table and failure link."""

    def __init__(self):
        # Length of the pattern string, which is also the depth of this state.
        self.depth = int()
        # Pattern ids recorded for this state (None until one is added).
        self.emits = None
        # Goto table (transition function): maps the next character to the
        # next state.
        self.success = TreeMap({})
        # Index of this state in the double array.
        self.index = int()
        # Failure link: the state to jump to when no transition matches.
        self.failure = None

    def init1(self, depth):
        """
        Set the depth of this node.

        :param depth: depth to assign
        :return: this instance, so calls can be chained
        """
        self.depth = depth
        return self

    def isAcceptable(self):
        """
        Tell whether this is a terminal state.

        :return: True when the depth is positive and emits have been recorded
        """
        return self.depth > 0 and self.emits is not None

    def getDepth(self):
        """
        Get the depth of this node.

        :return: the depth
        """
        return self.depth

    def getLargestValueId(self):
        """
        Get the largest recorded emit.

        :return: None when nothing is recorded, otherwise the first element
            of ``emits`` (``addEmit`` keeps it sorted in descending order)
        """
        if self.emits is None or len(self.emits) == 0:
            return None
        # The builtin next() works on Python 2.6+ and 3; the iterator's
        # .next() method exists on Python 2 only.
        return next(iter(self.emits))

    def addEmit(self, keyword):
        """
        Record one matched pattern for this state.

        The emits are de-duplicated and stored as a list sorted in
        descending order.

        :param keyword: value to record
        """
        merged = set() if self.emits is None else set(self.emits)
        merged.add(keyword)
        self.emits = sorted(merged, reverse=True)

    def addEmit1(self, emits):
        """
        Record several matched patterns.

        :param emits: iterable of values to record
        """
        for emit in emits:
            self.addEmit(emit)

    def nextStateIgnoreRootState(self, character):
        """
        Follow ``character`` without the fall-back to this state.

        :param character: transition key
        :return: the next state, or None when there is no transition
        """
        return self.nextState1(character, True)

    def addState(self, character):
        """
        Return the child reached by ``character``, creating it if absent.

        The character is UTF-8 encoded before it is used as the key. A new
        child gets depth ``self.depth + 1``, and the goto table is rebuilt
        from its backing dict with ``sort()`` applied.

        :param character: character to transition on
        :return: the existing or the newly created child state
        """
        character = character.encode('utf-8')
        nextState = self.nextStateIgnoreRootState(character)
        if nextState is None:
            nextState = State().init1(self.depth + 1)
            self.success.result[character] = nextState
            self.success = TreeMap(inputDict=self.success.result).sort()
        return nextState

    def getSuccess(self):
        """
        Get the goto table.

        :return: the goto table
        """
        return self.success

    def setIndex(self, index):
        """Set the index of this state."""
        self.index = index

    def getIndex(self):
        """Return the index of this state."""
        return self.index

    def getStates(self):
        """Return the values of the goto table (the child states)."""
        return self.success.values()

    def setFailure(self, failState, fail):
        """
        Set the failure state and record it in the ``fail`` table.

        :param failState: state to fall back to
        :param fail: indexable table; the slot at this state's index
            receives the index of ``failState``
        """
        self.failure = failState
        fail[self.index] = failState.index

    def getEmit(self):
        """
        Get the pattern(s) recorded for this node.

        :return: an empty set when nothing is recorded, otherwise the
            stored emits
        """
        if self.emits is None:
            return set()
        return self.emits

    def getTransitions(self):
        """Return the keys of the goto table as a set."""
        return set(self.success.keys())

    def nextState(self, character):
        """
        Follow ``character``; a state of depth 0 returns itself when the
        transition is missing.

        :param character: transition key
        :return: the next state
        """
        return self.nextState1(character, False)

    def nextState1(self, character, ignoreRootState):
        """
        Move to the next state.

        :param character: transition key
        :param ignoreRootState: when False, a state of depth 0 returns
            itself instead of None for a missing transition
        :return: the transition result, possibly None
        """
        nextState = self.success.get(character)
        if not ignoreRootState and nextState is None and self.depth == 0:
            nextState = self
        return nextState

    def getFailure(self):
        """
        Get the failure state.

        :return: the failure state
        """
        return self.failure
Пример #18
0
 def init1(self, label, frequency):
     """
     Record one label with the given frequency.

     :param label: label to record
     :param frequency: frequency stored for the label
     :return: this instance, so calls can be chained
     """
     counts = self.initdict
     counts[label] = frequency
     # The label map is rebuilt from the updated dict.
     self.labelMap = TreeMap(counts)
     return self
Пример #19
0
 def __init__(self):
     """Initialise the Enum base, then start with no labels recorded."""
     Enum.__init__(self)
     # Plain dict of label -> frequency.
     self.initdict = dict()
     # TreeMap object built over an empty dict.
     self.labelMap = TreeMap(dict())
Пример #20
0
 def __init__(self):
     """Start with an empty TreeMap in ``extraValueMap``."""
     self.extraValueMap = TreeMap(dict())
Пример #21
0
            return False


if __name__ == '__main__':
    # Manual smoke test: build a trie from a small sorted map, then load a
    # cached trie file from a fixed local path.
    sample_entries = {
        'aaa': 'aaa',
        'fff': 'fff',
        'bbb': 'bbb',
        '111': '111',
        '11': '11',
        'ccc': 'ddd',
        'ddd': 'ddd',
        'd': 'd'
    }
    tree_map = TreeMap(sample_entries)
    tree_map.sort()

    double_array = DoubleArrayTrie()
    print(double_array.size)
    double_array.build(tree_map.result)

    dat_path = "D:/liepin_project/py-segmentation/data/dictionary/person/nr.txt.trie.dat"
    DoubleArrayTrie().loadBaseAndCheckByFileChannel(dat_path)