示例#1
0
class CustomDictionary(object):
    """User dictionary: a DoubleArrayTrie loaded from files plus a BinTrie.

    All state is kept on the class itself; creating an instance simply
    triggers :meth:`load`.
    """

    # Binary trie used to store entries inserted dynamically by the user.
    trie = BinTrie()
    # Double-array trie filled from the dictionary files.
    dat = DoubleArrayTrie()
    # The first path is the main dictionary, the others are secondary ones.
    path = Config.CustomDictionaryPath

    # Timestamp taken when the class body is executed.
    start = time()

    def __init__(self):
        """Load the configured dictionaries."""
        CustomDictionary.load()

    @staticmethod
    def load():
        """
        Load the dictionary automatically from ``CustomDictionary.path``.

        The outcome is logged and, on success, also printed.

        :return: always True, whether or not loading succeeded
        """
        start = time()
        if not CustomDictionary.loadMainDictionary(CustomDictionary.path[0]):
            Predefine.logger.warning("自定义词典%s加载失败" %
                                     (" ".join(CustomDictionary.path)))
        else:
            Predefine.logger.info("自定义词典加载成功:%i个词条,耗时%fms" %
                                  (CustomDictionary.dat.size,
                                   (time() - start) * 1000))
            # Parenthesised print and "except ... as" below are valid on both
            # Python 2.6+ and Python 3.
            print("自定义词典%s加载成功:%i个词条,耗时%fms" % (
                " ".join(CustomDictionary.path),
                CustomDictionary.dat.size1(),
                (time() - start) * 1000))
        return True

    @staticmethod
    def loadMainDictionary(mainPath):
        """
        Load the main dictionary, preferring the cached binary form.

        If ``CustomDictionary.loadDat(mainPath)`` succeeds the method returns
        True immediately. Otherwise every file in ``CustomDictionary.path``
        is read through ``CustomDictionary.loadtxt`` with the default
        nature ``Nature.n``.

        NOTE(review): after reading the text files nothing is built from the
        collected entries and the method falls off the end (returning None),
        so :meth:`load` reports a failure on this path. The trie-building and
        caching steps appear to be missing from this copy.

        :param mainPath: path of the main dictionary
        :return: True when the cached form was loaded, otherwise None
        """
        Predefine.logger.info("自定义词典开始加载:%s" % mainPath)
        print("自定义词典开始加载:%s" % mainPath)
        if CustomDictionary.loadDat(mainPath):
            return True
        CustomDictionary.dat = DoubleArrayTrie()

        entries = TreeMap({})
        customNatureCollector = set()
        try:
            for p in CustomDictionary.path:
                defaultNature = Nature.n
                Predefine.logger.info("以默认词性[%s]加载自定义词典%s中……" %
                                      (str(defaultNature), p))
                print("以默认词性[%s]加载自定义词典%s中……" % (str(defaultNature), p))
                success, entries = CustomDictionary.loadtxt(
                    p, defaultNature, entries, customNatureCollector)
                if not success:
                    Predefine.logger.warning("失败:%s" % p)
        except IOError as e:
            Predefine.logger.error("自定义词典%s不存在或读取错误!%s" % (mainPath, e))
        except Exception as e:
            Predefine.logger.error("自定义词典%s缓存失败!%s\n" % (mainPath, e))
 def load(self):
     """
     Build the transliterated-person-name trie.

     The cached form is tried first through ``self.loadDat()``. If that
     fails, ``TranslatedPersonDictionary.path`` is read line by line, every
     line becomes a key mapped to True, the double-array trie is built and
     the result is stored through ``self.saveDat()``.

     :return: True (I/O and build errors propagate as exceptions)
     """
     TranslatedPersonDictionary.trie = DoubleArrayTrie()
     if self.loadDat():
         return True
     initdict = OrderedDict()
     # A disabled experiment used to live here: it counted the characters of
     # every name and added "." plus the characters seen at least 10 times as
     # extra keys. It is not performed.
     # The with-block guarantees the dictionary file is closed, also when
     # reading fails half-way.
     with open(TranslatedPersonDictionary.path, 'r') as br:
         for raw in br:
             # NOTE(review): on Python 3 ``encode()`` turns the line into
             # bytes, so the keys handed to the trie are bytes objects --
             # confirm that this is what the trie expects.
             line = raw.encode().strip()
             # Reading stops at the first blank line (or at end of file).
             if not line:
                 break
             initdict[line] = True
     entries = TreeMap(initdict)
     Predefine.logger.info("音译人名词典%s开始构建双数组..." %
                           TranslatedPersonDictionary.path)
     print("音译人名词典%s开始构建双数组..." % TranslatedPersonDictionary.path)
     TranslatedPersonDictionary.trie.build(entries)
     Predefine.logger.info("音译人名词典%s开始编译DAT文件..." %
                           TranslatedPersonDictionary.path)
     print("音译人名词典%s开始编译DAT文件..." % TranslatedPersonDictionary.path)
     Predefine.logger.info(
         "音译人名词典%s编译结果:%s" %
         (TranslatedPersonDictionary.path, self.saveDat()))
     return True
示例#3
0
    def loadMainDictionary(mainPath):
        """
        Load the main dictionary and cache it in binary form.

        The cached form is tried first through ``CustomDictionary.loadDat``.
        Otherwise every file in ``CustomDictionary.path`` is read through
        ``CustomDictionary.loadtxt`` with the default nature ``Nature.n``,
        the double-array trie is built and the attributes plus the trie are
        written to ``mainPath + Predefine.BIN_EXT``.

        NOTE(review): when no entry was loaded, a placeholder whose value is
        None is inserted; ``attribute.save(out)`` would then be called on
        None while writing the cache. Confirm how an empty dictionary should
        be cached.

        :param mainPath: path of the main dictionary
        :return: True
        """
        Predefine.logger.info("自定义词典开始加载:%s" % mainPath)
        print("自定义词典开始加载:%s" % mainPath)
        if CustomDictionary.loadDat(mainPath):
            return True
        CustomDictionary.dat = DoubleArrayTrie()

        entries = TreeMap({})
        customNatureCollector = set()
        try:
            for p in CustomDictionary.path:
                defaultNature = Nature.n
                Predefine.logger.info("以默认词性[%s]加载自定义词典%s中……" %
                                      (str(defaultNature), p))
                print("以默认词性[%s]加载自定义词典%s中……" % (str(defaultNature), p))
                success, entries = CustomDictionary.loadtxt(
                    p, defaultNature, entries, customNatureCollector)
                if not success:
                    Predefine.logger.warning("失败:%s" % p)
        except IOError as e:
            Predefine.logger.error("自定义词典%s不存在或读取错误!%s" % (mainPath, e))
        except Exception as e:
            Predefine.logger.error("自定义词典%s缓存失败!%s\n" % (mainPath, e))
        if entries.size() == 0:
            Predefine.logger.warning("没有加载到任何词条")
            # Used as a blank placeholder.
            entries.put(Predefine.TAG_OTHER, None)
        Predefine.logger.info("正在构建DoubleArrayTrie……")
        CustomDictionary.dat.build(entries)
        # Cache as a dat file; the next load will be much faster.
        Predefine.logger.info("正在缓存词典为dat文件……")
        # Values to cache, in the iteration order of the map.
        attributeList = [value for _, value in entries.items()]
        # The with-block closes the cache file even when a write fails.
        with open(mainPath + Predefine.BIN_EXT, 'w+') as out:
            # Caching of user-defined natures is disabled:
            # IOUtil.writeCustomNature(out, customNatureCollector)
            # Cache the body: attribute count, attributes, then the trie.
            out.writelines(Convert.convert(len(attributeList)))
            for attribute in attributeList:
                attribute.save(out)
            CustomDictionary.dat.save1(out)

        return True
 def load(self):
     """
     Build the Japanese-person-name trie.

     The cached form is tried first through ``self.loadDat()``. Otherwise
     ``JapanesePersonDictionary.path`` is read line by line; each line is
     split on a single space into a key (first field) and a value (second
     field), the double-array trie is built and the result is stored through
     ``self.saveDat(map)``.

     :return: True (I/O, parsing and build errors propagate as exceptions)
     """
     JapanesePersonDictionary.trie = DoubleArrayTrie()
     if self.loadDat():
         return True
     initdict = OrderedDict()
     # The with-block guarantees the dictionary file is closed, also when a
     # malformed line raises.
     with open(JapanesePersonDictionary.path, 'r') as br:
         for raw in br:
             # NOTE(review): on Python 3 ``encode()`` yields bytes, and
             # splitting bytes with the str separator " " raises TypeError.
             # Confirm the interpreter version this is meant to run on.
             line = raw.encode().strip()
             # Reading stops at the first blank line (or at end of file).
             if not line:
                 break
             param = line.split(" ")
             # A line without a second field raises IndexError here.
             initdict[param[0]] = param[1]
     entries = TreeMap(initdict)
     Predefine.logger.info("日本人名词典%s开始构建双数组..." % JapanesePersonDictionary.path)
     JapanesePersonDictionary.trie.build(entries)
     Predefine.logger.info("日本人名词典%s开始编译DAT文件..." % JapanesePersonDictionary.path)
     Predefine.logger.info("日本人名词典%s编译结果:%s" % (JapanesePersonDictionary.path, str(self.saveDat(entries))))
     return True
示例#5
0
    def loadMainDictionary(mainPath):
        """
        Load the main dictionary, preferring the cached binary form.

        If ``CustomDictionary.loadDat(mainPath)`` succeeds the method returns
        True immediately. Otherwise every file in ``CustomDictionary.path``
        is read through ``CustomDictionary.loadtxt`` with the default
        nature ``Nature.n``.

        NOTE(review): this copy ends after the IOError handler, so on the
        text-loading path nothing is built from the collected entries and the
        method implicitly returns None.

        :param mainPath: path of the main dictionary
        :return: True when the cached form was loaded, otherwise None
        """
        Predefine.logger.info("自定义词典开始加载:%s" % mainPath)
        # Parenthesised print and "except ... as" are valid on both
        # Python 2.6+ and Python 3.
        print("自定义词典开始加载:%s" % mainPath)
        if CustomDictionary.loadDat(mainPath):
            return True
        CustomDictionary.dat = DoubleArrayTrie()

        entries = TreeMap({})
        customNatureCollector = set()
        try:
            for p in CustomDictionary.path:
                defaultNature = Nature.n
                Predefine.logger.info("以默认词性[%s]加载自定义词典%s中……" %
                                      (str(defaultNature), p))
                print("以默认词性[%s]加载自定义词典%s中……" % (str(defaultNature), p))
                success, entries = CustomDictionary.loadtxt(
                    p, defaultNature, entries, customNatureCollector)
                if not success:
                    Predefine.logger.warning("失败:%s" % p)
        except IOError as e:
            Predefine.logger.error("自定义词典%s不存在或读取错误!%s" % (mainPath, e))
示例#6
0
class CoreDictionary(object):
    """
    Core dictionary implemented on top of a DoubleArrayTrie.

    The trie is stored on the class, so it is shared by all instances.
    Creating an instance loads the dictionary from ``Config.CoreDictionaryPath``
    and exits the process if loading fails.
    """

    class Attribute(object):
        """
        Attributes of a word in the core dictionary: its natures
        (part-of-speech tags) and the frequency of each of them.
        """

        def __init__(self):
            # List of natures.
            self.nature = [Nature]
            # Frequency of each nature, index-aligned with ``nature``.
            self.frequency = [int()]
            self.totalFrequency = int()
            self.logger = Predefine.logger

        def init1(self, size):
            """Allocate room for ``size`` natures/frequencies; return self."""
            self.nature = [Nature] * int(size)
            self.frequency = [int()] * int(size)
            return self

        def init2(self, nature, frequency):
            """Adopt the given nature and frequency lists; return self."""
            self.nature = nature
            self.frequency = frequency
            return self

        def init3(self, nature, frequency):
            """Set up a single nature with the given frequency; return self."""
            self.init1(1)
            self.nature[0] = nature
            self.frequency[0] = frequency
            self.totalFrequency = frequency
            return self

        def init4(self, nature, frequency, totalFrequency):
            """
            Adopt the given lists and total frequency.

            Returns self like the other ``initN`` helpers (it used to return
            None, which made chained construction impossible).
            """
            self.nature = nature
            self.frequency = frequency
            self.totalFrequency = totalFrequency
            return self

        def init5(self, nature):
            """
            Set up a single nature with the default frequency of 1000.
            :param nature:
            :return: self
            """
            return self.init3(nature, 1000)

        def create(self, natureWithFrequency):
            """
            Build a new Attribute from a string of space-separated
            "nature frequency" pairs.
            :param natureWithFrequency: e.g. "<nature> <freq> <nature> <freq>"
            :return: the new Attribute, or None when parsing fails
            """
            try:
                param = natureWithFrequency.strip().split(' ')
                # Floor division: "/" yields a float on Python 3, which
                # range() rejects, so every call used to end in the handler.
                natureCount = len(param) // 2
                attribute = CoreDictionary.Attribute().init1(natureCount)
                for i in range(natureCount):
                    attribute.nature[i] = Nature.valueOf(param[2 * i])
                    attribute.frequency[i] = int(param[1 + 2 * i])
                    attribute.totalFrequency += attribute.frequency[i]
                return attribute
            except Exception:
                # Parsing is best effort: log and signal failure with None.
                self.logger.warning("使用字符串" + natureWithFrequency + "创建词条属性失败!")
                return None

        def bcreate(self, byteArray, natureIndexArray):
            """
            Load an Attribute from a byte stream.

            Reads, in order: total frequency, number of natures, then for
            each nature its index into ``natureIndexArray`` and its frequency.
            :param byteArray: object providing ``nextInt()``
            :param natureIndexArray: sequence mapping an index to a nature
            :return: the new Attribute
            """
            currentTotalFrequency = byteArray.nextInt()
            length = byteArray.nextInt()
            attribute = CoreDictionary.Attribute().init1(length)
            attribute.totalFrequency = currentTotalFrequency
            for i in range(length):
                attribute.nature[i] = natureIndexArray[byteArray.nextInt()]
                attribute.frequency[i] = byteArray.nextInt()

            return attribute

        def getNatureFrequency(self, nature):
            """
            Get the frequency of a nature.
            :param nature: the nature to look up
            :return: its frequency, or 0 when the word does not have it
            """
            for i, pos in enumerate(self.nature):
                if nature == pos:
                    return self.frequency[i]
            return 0

        def hasNature(self, nature):
            """
            Tell whether the word has the given nature.
            :param nature:
            :return: boolean
            """
            return self.getNatureFrequency(nature) > 0

        def hasNatureStartsWith(self, prefix):
            """
            Tell whether any nature starts with the given prefix.
            :param prefix: nature prefix, e.g. "u" matches ude, uzhe, ...
            :return: boolean
            """
            # NOTE(review): relies on the nature objects exposing a
            # ``startsWith`` method; this cannot be verified from this file.
            return any(n.startsWith(prefix) for n in self.nature)

        def toString(self):
            """
            Render the natures and frequencies as "nature freq " pairs.
            :return: the rendered string
            """
            return "".join(
                "%s %s " % (str(self.nature[i]), str(self.frequency[i]))
                for i in range(len(self.nature)))

        def save(self, out):
            """
            Write this attribute to ``out``: total frequency, number of
            natures, then the ordinal and frequency of every nature.
            """
            out.writelines(Convert.convert(self.totalFrequency))
            out.writelines(Convert.convert(len(self.nature)))
            for i in range(len(self.nature)):
                out.writelines(Convert.convert(Nature.ordinal(self.nature[i])))
                out.writelines(Convert.convert(self.frequency[i]))

    trie = DoubleArrayTrie()
    attribute = Attribute()
    NR_WORD_ID = None
    NS_WORD_ID = None
    NT_WORD_ID = None
    T_WORD_ID = None
    X_WORD_ID = None
    M_WORD_ID = None
    NX_WORD_ID = None

    def __init__(self):
        """Load the core dictionary and resolve the special word IDs."""
        self.path = Config.CoreDictionaryPath
        self.logger = Predefine.logger
        self.totalFrequency = 221894

        # Load the dictionary automatically.
        start = time()
        if not self.load(self.path):
            self.logger.error('核心词典%s加载失败' % self.path)
            sys.exit(1)
        else:
            end = time()
            # Report the real entry count (a literal 2 used to be logged).
            self.logger.info('%s加载成功%i个词条,耗时%fms' % (
                self.path, self.trie.size1(), (end - start) * 1000))

        # Some special WORD_IDs.
        CoreDictionary.NR_WORD_ID = CoreDictionary.getWordID(Predefine.TAG_PEOPLE)
        CoreDictionary.NS_WORD_ID = CoreDictionary.getWordID(Predefine.TAG_PLACE)
        CoreDictionary.NT_WORD_ID = CoreDictionary.getWordID(Predefine.TAG_GROUP)
        CoreDictionary.T_WORD_ID = CoreDictionary.getWordID(Predefine.TAG_TIME)
        CoreDictionary.X_WORD_ID = CoreDictionary.getWordID(Predefine.TAG_CLUSTER)
        CoreDictionary.M_WORD_ID = CoreDictionary.getWordID(Predefine.TAG_NUMBER)
        CoreDictionary.NX_WORD_ID = CoreDictionary.getWordID(Predefine.TAG_PROPER)

    @staticmethod
    def get(key):
        """
        Get an entry through the trie's ``get2``.
        :param key:
        :return: whatever ``trie.get2(key)`` returns
        """
        return CoreDictionary.trie.get2(key)

    @staticmethod
    def getWordID(a):
        """
        Get the ID of a word.
        :param a: the word
        :return: the ID; -1 if the word does not exist
        """
        return CoreDictionary.trie.exactMatchSearch(a)

    def load(self, path):
        """
        Load the core dictionary.

        The cached form is tried first through :meth:`loadDat`. Otherwise the
        tab-separated text file at ``path`` is parsed (word, then
        nature/frequency pairs), the trie is built and the result is cached
        at ``path + Predefine.BIN_EXT``.

        :param path: path of the text dictionary
        :return: True on success; False when reading the text file raises
                 IOError or when writing the cache fails
        """
        self.logger.info("核心词典开始加载:%s" % path)
        print("核心词典开始加载:%s" % path)
        if self.loadDat(path):
            return True

        initdict = OrderedDict()
        try:
            MAX_FREQUENCY = 0
            start = time()
            # The with-block closes the dictionary file even on errors.
            with open(path, 'r') as f:
                for raw in f:
                    line = raw.strip(' \n\t\r')
                    # Reading stops at the first blank line (or at EOF).
                    if not line:
                        break
                    param = line.split('\t')
                    natureCount = (len(param) - 1) // 2
                    attribute = CoreDictionary.Attribute().init1(natureCount)
                    for i in range(natureCount):
                        attribute.nature[i] = Nature.valueOf(param[1 + 2 * i])
                        attribute.frequency[i] = int(param[2 + 2 * i])
                        attribute.totalFrequency += attribute.frequency[i]
                    initdict[param[0]] = attribute
                    MAX_FREQUENCY += attribute.totalFrequency
            entries = TreeMap(initdict)
            self.logger.info("核心词典读入词条%i,全部频次%i,耗时%fms" % (entries.size(), MAX_FREQUENCY, (time() - start) * 1000))
            print("核心词典读入词条%i,全部频次%i,耗时%fms" % (entries.size(), MAX_FREQUENCY, (time() - start) * 1000))
            self.trie.build(entries)
            self.logger.info("核心词典加载成功:%i个词条,下面将写入缓存" % self.trie.size1())
            print("核心词典加载成功:%i个词条,下面将写入缓存" % self.trie.size1())

            try:
                # Cache next to the file that was actually read; loadDat()
                # looks for the cache at ``path + BIN_EXT`` as well.
                with open(path + Predefine.BIN_EXT, 'w+') as out:
                    attributeList = entries.values()
                    out.writelines(Convert.convert(len(attributeList)))
                    for attribute in attributeList:
                        attribute.save(out)
                    # NOTE(review): other dictionaries in this code base pass
                    # a stream to ``save1`` and a path to ``save``; confirm
                    # that ``save`` accepts a file object.
                    self.trie.save(out)
            except Exception as e:
                self.logger.warning("保存失败%s" % str(e))
                return False
        except IOError as e:
            self.logger.warning("核心词典%s不存在或读取错误!" % str(e))
            return False
        return True

    def loadDat(self, path):
        """
        Load the double-array trie from disk.

        A pickled byte array at ``path + Predefine.PIC_EXT`` is preferred;
        if it is missing, unreadable or holds None, the byte array is created
        from ``path + Predefine.BIN_EXT`` and then pickled for the next run.

        :param path: path of the text dictionary (the cache suffixes are
                     appended to it)
        :return: True when the trie was loaded, False otherwise
        """
        start = time()
        try:
            byteArray = None
            try:
                # NOTE: pickle can execute arbitrary code while loading; the
                # cache file must come from a trusted location.
                with open(path + Predefine.PIC_EXT, 'rb') as cached:
                    byteArray = pickle.load(cached)
            except Exception:
                byteArray = None
            if byteArray is None:
                byteArray = ByteArray().createByteArray(path + Predefine.BIN_EXT)
                if byteArray is None:
                    return False
                # Only a usable byte array is cached: pickling None would
                # make every later run give up before trying the .bin file.
                with open(path + Predefine.PIC_EXT, 'wb') as out:
                    pickle.dump(byteArray, out)
            size = byteArray.nextInt()
            # List holding the Attribute objects.
            attributes = [None] * size
            natureIndexArray = list(Nature)
            for i in range(size):
                # First the total frequency, then the number of natures.
                currentTotalFrequency = byteArray.nextInt()
                length = byteArray.nextInt()
                attributes[i] = CoreDictionary.Attribute().init1(length)
                attributes[i].totalFrequency = currentTotalFrequency
                for j in range(length):
                    attributes[i].nature[j] = natureIndexArray[byteArray.nextInt()]
                    attributes[i].frequency[j] = byteArray.nextInt()
            if not self.trie.load(byteArray, attributes) or byteArray.hasMore():
                return False
        except Exception as e:
            self.logger.warning("读取失败,问题发生在%s" % (str(e)))
            return False
        print("核心词典加载成功%s,耗时%fms" % (path + Predefine.BIN_EXT, (time() - start) * 1000))
        return True

    @staticmethod
    def get1(key):
        """
        Get an entry through the trie's ``get``.
        :param key:
        :return: whatever ``trie.get(key)`` returns
        """
        return CoreDictionary.trie.get(key)

    @staticmethod
    def get2(wordID):
        """
        Get an entry through the trie's ``get``.
        :param wordID:
        :return: whatever ``trie.get(wordID)`` returns
        """
        return CoreDictionary.trie.get(wordID)

    def getTermFrequency(self, term):
        """
        Get the total frequency of a term.
        :param term:
        :return: the total frequency, or 0 when the term is unknown
        """
        attribute = self.get1(term)
        if attribute is None:
            return 0
        return attribute.totalFrequency

    def contains(self, key):
        """
        Tell whether the dictionary contains a word.
        :param key:
        :return: boolean
        """
        return self.trie.get(key) is not None
示例#7
0
 def __init__(self):
     """Create an empty double-array trie and attach the shared logger."""
     trie = DoubleArrayTrie()
     self.trie = trie
     self.logger = Predefine.logger
示例#8
0
class CommonDictionary(object, metaclass=ABCMeta):
    """
    Base class for dictionaries stored as a text file of keys plus a
    separately loaded array of values, indexed by a DoubleArrayTrie.

    Subclasses implement :meth:`onLoadValue` and :meth:`onSaveValue`.
    """

    def __init__(self):
        self.trie = DoubleArrayTrie()
        self.logger = Predefine.logger

    def load(self, path):
        """
        Load the dictionary at ``path``.

        Values come from :meth:`onLoadValue`. The key trie is loaded from
        ``path + '.trie.dat'`` when possible; otherwise the keys (first
        space-separated field of each line) are read from ``path``, the trie
        is built and then saved to ``path + '.trie.dat'``.

        :param path: path of the text dictionary
        :return: False when the values cannot be loaded, True otherwise
        """
        start = time()
        valueArray = self.onLoadValue(path)
        if valueArray is None:
            self.logger.warning("加载值%s.value.dat失败,耗时%fms" %
                                (path, (time() - start) * 1000))
            return False
        self.logger.info("加载值%s.value.dat成功,耗时%fms" %
                         (path, (time() - start) * 1000))
        print("加载值%s.value.dat成功,耗时%fms" % (path, (time() - start) * 1000))

        start = time()

        if self.loadDat(path + '.trie.dat', valueArray):
            self.logger.info("加载键%s.trie.dat成功,耗时%fms" %
                             (path, (time() - start) * 1000))
            print("加载键%s.trie.dat成功,耗时%fms" % (path, (time() - start) * 1000))
            return True

        keyList = []

        try:
            # Decode while reading and keep working on str. Encoding each
            # line to bytes and stripping it with a str argument raises
            # TypeError on Python 3, which used to leave keyList empty.
            with open(path, 'r', encoding='utf-8') as br:
                for raw in br:
                    line = raw.strip(' \n\t\r')
                    # Reading stops at the first blank line (or at EOF).
                    if not line:
                        break
                    keyList.append(line.split(' ')[0])
        except Exception as e:
            # Best effort: the build below runs with the keys read so far.
            self.logger.warning("读取%s失败%s" % (path, str(e)))
        resultcode = self.trie.kvbuild(keyList, valueArray)

        if resultcode != 0:
            self.logger.warning("trie建立失败%i,正在尝试排序后重载" % resultcode)
            # Pair keys with values, sort, rebuild, then write the values
            # back in the order of the sorted map.
            initdict = {keyList[i]: valueArray[i]
                        for i in range(len(list(valueArray)))}
            entries = TreeMap(initdict).sort()
            self.trie.build(entries)
            for i, v in enumerate(entries.values()):
                valueArray[i] = v
        self.trie.save(path + '.trie.dat')
        self.logger.info(path + "加载成功")
        return True

    @abstractmethod
    def onLoadValue(self, path):
        """
        Implement this method to load the values.
        :param path: path of the text dictionary
        :return: the value array; load() treats None as failure
        """
        pass

    @abstractmethod
    def onSaveValue(self, valueArray, path):
        """
        Implement this method to save the values.
        :param valueArray:
        :param path:
        :return:
        """
        pass

    def loadDat(self, path, valueArray):
        """Load the trie from ``path``; return True on success."""
        if self.trie.load1(path, valueArray):
            return True
        return False

    def get(self, key):
        """
        Look up a word.
        :param key:
        :return: the entry for the word, as returned by ``trie.get2``
        """

        return self.trie.get2(key)
示例#9
0
class CustomDictionary(object):
    """User dictionary: a DoubleArrayTrie loaded from files plus a BinTrie.

    All state is kept on the class itself; creating an instance simply
    triggers :meth:`load`.
    """

    # Binary trie used to store entries inserted dynamically by the user.
    trie = BinTrie()
    # Double-array trie filled from the dictionary files.
    dat = DoubleArrayTrie()
    # The first path is the main dictionary, the others are secondary ones.
    path = Config.CustomDictionaryPath

    # Timestamp taken when the class body is executed.
    start = time()

    def __init__(self):
        """Load the configured dictionaries."""
        CustomDictionary.load()

    @staticmethod
    def load():
        """
        Load the dictionary automatically from ``CustomDictionary.path``.

        The outcome is logged and, on success, also printed.

        :return: always True, whether or not loading succeeded
        """
        start = time()
        if not CustomDictionary.loadMainDictionary(CustomDictionary.path[0]):
            Predefine.logger.warning("自定义词典%s加载失败" %
                                     (" ".join(CustomDictionary.path)))
        else:
            Predefine.logger.info("自定义词典加载成功:%i个词条,耗时%fms" %
                                  (CustomDictionary.dat.size,
                                   (time() - start) * 1000))
            print(
                "自定义词典%s加载成功:%i个词条,耗时%fms" %
                (" ".join(CustomDictionary.path), CustomDictionary.dat.size1(),
                 (time() - start) * 1000))
        return True

    @staticmethod
    def loadMainDictionary(mainPath):
        """
        Load the main dictionary and cache it in binary form.

        The cached form is tried first through :meth:`loadDat`. Otherwise
        every file in ``CustomDictionary.path`` is read through
        :meth:`loadtxt` with the default nature ``Nature.n``, the
        double-array trie is built and the attributes plus the trie are
        written to ``mainPath + Predefine.BIN_EXT``.

        NOTE(review): when no entry was loaded, a placeholder whose value is
        None is inserted; ``attribute.save(out)`` would then be called on
        None while writing the cache. Confirm how an empty dictionary should
        be cached.

        :param mainPath: path of the main dictionary
        :return: True
        """
        Predefine.logger.info("自定义词典开始加载:%s" % mainPath)
        print("自定义词典开始加载:%s" % mainPath)
        if CustomDictionary.loadDat(mainPath):
            return True
        CustomDictionary.dat = DoubleArrayTrie()

        entries = TreeMap({})
        customNatureCollector = set()
        try:
            for p in CustomDictionary.path:
                defaultNature = Nature.n
                Predefine.logger.info("以默认词性[%s]加载自定义词典%s中……" %
                                      (str(defaultNature), p))
                print("以默认词性[%s]加载自定义词典%s中……" % (str(defaultNature), p))
                success, entries = CustomDictionary.loadtxt(
                    p, defaultNature, entries, customNatureCollector)
                if not success:
                    Predefine.logger.warning("失败:%s" % p)
        except IOError as e:
            Predefine.logger.error("自定义词典%s不存在或读取错误!%s" % (mainPath, e))
        except Exception as e:
            Predefine.logger.error("自定义词典%s缓存失败!%s\n" % (mainPath, e))
        if entries.size() == 0:
            Predefine.logger.warning("没有加载到任何词条")
            # Used as a blank placeholder.
            entries.put(Predefine.TAG_OTHER, None)
        Predefine.logger.info("正在构建DoubleArrayTrie……")
        CustomDictionary.dat.build(entries)
        # Cache as a dat file; the next load will be much faster.
        Predefine.logger.info("正在缓存词典为dat文件……")
        # Values to cache, in the iteration order of the map.
        attributeList = [value for _, value in entries.items()]
        # The with-block closes the cache file even when a write fails.
        with open(mainPath + Predefine.BIN_EXT, 'w+') as out:
            # Caching of user-defined natures is disabled:
            # IOUtil.writeCustomNature(out, customNatureCollector)
            # Cache the body: attribute count, attributes, then the trie.
            out.writelines(Convert.convert(len(attributeList)))
            for attribute in attributeList:
                attribute.save(out)
            CustomDictionary.dat.save1(out)

        return True

    @staticmethod
    def loadtxt(path, defaultNature, map, customNatureCollector):
        """
        Load a user dictionary file and append its entries to ``map``.

        Each line holds a word optionally followed by space-separated
        nature/frequency pairs; a word without pairs gets ``defaultNature``
        with the default frequency.

        :param path: dictionary path
        :param defaultNature: default nature
        :param map: entries collected so far; they are carried over into the
                    returned map, and a word read here replaces an earlier
                    entry with the same key
        :param customNatureCollector: collects user natures (not used here)
        :return: ``(True, new_map)`` on success, ``(False, map)`` with the
                 untouched input map when reading or parsing fails
        """
        try:
            # Start from the entries already collected. The result used to be
            # built from this file alone, so with several dictionary files
            # only the last one survived.
            initdict = OrderedDict(map.items())
            # The with-block closes the file even when a line fails to parse.
            with open(path, 'r', encoding='utf-8') as br:
                for raw in br:
                    line = raw.strip()
                    # Reading stops at the first blank line (or at EOF).
                    if not line:
                        break
                    param = line.split(" ")
                    natureCount = (len(param) - 1) // 2
                    if natureCount == 0:
                        attribute = CoreDictionary.Attribute().init5(defaultNature)
                    else:
                        attribute = CoreDictionary.Attribute().init1(natureCount)
                        for i in range(natureCount):
                            attribute.nature[i] = Nature.valueOf(param[1 + 2 * i])
                            attribute.frequency[i] = int(param[2 + 2 * i])
                            attribute.totalFrequency += attribute.frequency[i]
                    initdict[param[0]] = attribute
            merged = TreeMap(initdict)
        except Exception as e:
            Predefine.logger.warning("自定义词典%s读取错误%s" % (path, e))
            return False, map
        return True, merged

    @staticmethod
    def loadDat(path):
        """
        Load the double-array trie from disk.

        A pickled byte array at ``path + Predefine.PIC_EXT`` is preferred;
        if it is missing, unreadable or holds None, the byte array is created
        from ``path + Predefine.BIN_EXT`` and then pickled for the next run.

        :param path: path of the main dictionary (the cache suffixes are
                     appended to it)
        :return: True when the trie was loaded, False otherwise
        """
        byteArray = None
        try:
            # NOTE: pickle can execute arbitrary code while loading; the
            # cache file must come from a trusted location.
            with open(path + Predefine.PIC_EXT, 'rb') as cached:
                byteArray = pickle.load(cached)
        except Exception:
            byteArray = None
        if byteArray is None:
            byteArray = ByteArray.createByteArray(path + Predefine.BIN_EXT)
            if byteArray is None:
                return False
            # Only a usable byte array is cached: pickling None would make
            # every later run give up before trying the .bin file.
            with open(path + Predefine.PIC_EXT, 'wb') as out:
                pickle.dump(byteArray, out)

        size = byteArray.nextInt()
        # Compatibility measure: a negative size means the file header stores
        # -size user natures. That case is not handled here; a negative size
        # simply yields no attributes.
        attributes = [None] * size
        natureIndexArray = list(Nature)
        for i in range(size):
            # First the total frequency, then the number of natures.
            currentTotalFrequency = byteArray.nextInt()
            length = byteArray.nextInt()
            attributes[i] = CoreDictionary.Attribute().init1(length)
            attributes[i].totalFrequency = currentTotalFrequency
            for j in range(length):
                attributes[i].nature[j] = natureIndexArray[byteArray.nextInt()]
                attributes[i].frequency[j] = byteArray.nextInt()
        if not CustomDictionary.dat.load(byteArray, attributes):
            return False

        return True

    @staticmethod
    def get(key):
        """
        Look up a word, first in the file-backed trie and then in the
        dynamically filled one.
        :param key: the word
        :return: its attribute, or None when neither trie knows it
        """
        attribute = CustomDictionary.dat.get(key)
        if attribute is not None:
            return attribute
        if CustomDictionary.trie is None:
            return None
        return CustomDictionary.trie.get(key)