Exemplo n.º 1
0
 def generate():
     """Build the character-type table, write it to disk and load it back.

     Scans code points 0..65535, groups consecutive code points that
     ``TextUtility.charType`` classifies identically into
     ``[first, last, type]`` runs, writes every run to
     ``Config.CharTypePath`` and returns the result of
     ``ByteArray.createByteArray`` on that file.

     :return: whatever ``ByteArray.createByteArray(Config.CharTypePath)``
         returns.
     """
     # Type assumed for the run that starts at code point 0.
     # NOTE(review): 5 is taken as-is from the original code; its meaning is
     # defined by TextUtility.charType -- confirm.
     preType = 5
     preChar = 0
     typeList = []
     # 65536 so that code point 65535 itself is classified as well.
     for i in range(65536):
         charType = TextUtility.charType(i)
         if charType != preType:
             # The previous run ended at i - 1; a new run starts at i.
             typeList.append([preChar, i - 1, preType])
             preChar = i
         # Track the type of the current run so the next comparison and the
         # next recorded run use the right value.
         preType = charType
     # Close the last open run at the end of the scanned range.
     typeList.append([preChar, 65535, preType])

     # ``with`` guarantees the file is closed even if a conversion fails.
     with open(Config.CharTypePath, 'w+') as out:
         for first, last, kind in typeList:
             out.writelines(Convert.convert_char(first))
             out.writelines(Convert.convert_char(last))
             out.writelines(Convert.convert_byte(kind))
     byteArray = ByteArray.createByteArray(Config.CharTypePath)
     return byteArray
Exemplo n.º 2
0
 def save(self, path):
     """Write ``size`` followed by the ``base``/``check`` pairs to ``path``.

     :param path: destination file path.
     :return: ``True`` when everything was written, ``False`` if any
         exception occurred while opening or writing the file.
     """
     try:
         # ``with`` closes the file even when a write raises.
         with open(path, 'w+') as out:
             out.writelines(Convert.convert(self.size))
             for i in range(self.size):
                 out.writelines(Convert.convert(self.base[i]))
                 out.writelines(Convert.convert(self.check[i]))
     except Exception:
         return False
     # Report success explicitly so callers that test the result do not
     # mistake the implicit ``None`` for a failure.
     return True
Exemplo n.º 3
0
    def load(self, path):
        """Load the core dictionary from ``path`` and write a binary cache.

        Returns early when ``self.loadDat(path)`` succeeds. Otherwise the
        text file is parsed line by line (tab-separated: word, then
        nature/frequency pairs), the trie is built from the parsed entries,
        and the attributes plus the trie are written to
        ``self.path + Predefine.BIN_EXT``.

        :param path: path of the text dictionary.
        :return: ``True`` on success, ``False`` when the dictionary cannot be
            read or the cache cannot be written.
        """
        self.logger.info("核心词典开始加载:%s" % path)
        print("核心词典开始加载:%s" % path)
        if self.loadDat(path):
            return True

        initdict = OrderedDict()
        try:
            # ``with`` closes the input file, which was previously left open.
            with open(path, 'r') as f:
                MAX_FREQUENCY = 0
                start = time()
                for raw in f:
                    line = raw.strip(' \n\t\r')
                    # As before, parsing stops at the first blank line.
                    if not line:
                        break
                    param = line.split('\t')
                    natureCount = int((len(param) - 1) / 2)
                    attribute = CoreDictionary.Attribute().init1(natureCount)
                    for i in range(natureCount):
                        attribute.nature[i] = Nature.valueOf(param[1 + 2 * i])
                        attribute.frequency[i] = int(param[2 + 2 * i])
                        attribute.totalFrequency += attribute.frequency[i]
                    initdict[param[0]] = attribute
                    MAX_FREQUENCY += attribute.totalFrequency

            wordMap = TreeMap(initdict)
            readMessage = "核心词典读入词条%i,全部频次%i,耗时%fms" % (
                wordMap.size(), MAX_FREQUENCY, (time() - start) * 1000)
            self.logger.info(readMessage)
            print(readMessage)
            self.trie.build(wordMap)
            builtMessage = "核心词典加载成功:%i个词条,下面将写入缓存" % self.trie.size1()
            self.logger.info(builtMessage)
            print(builtMessage)

            try:
                # NOTE(review): the cache is written next to ``self.path``,
                # not the ``path`` argument -- confirm they are meant to match.
                with open(self.path + Predefine.BIN_EXT, 'w+') as out:
                    attributeList = wordMap.values()
                    out.writelines(Convert.convert(len(attributeList)))
                    for attribute in attributeList:
                        out.writelines(Convert.convert(attribute.totalFrequency))
                        out.writelines(Convert.convert(len(attribute.nature)))
                        for i in range(len(attribute.nature)):
                            out.writelines(Convert.convert(Nature.ordinal(attribute.nature[i])))
                            out.writelines(Convert.convert(attribute.frequency[i]))

                    self.trie.save(out)
            except Exception as e:
                self.logger.warning("保存失败%s" % str(e))
                return False
        except IOError as e:
            self.logger.warning("核心词典%s不存在或读取错误!" % str(e))
            return False
        return True
    def saveDat(self, map):
        """
        Save the dictionary values and the trie to disk.

        Writes the entry count and then one converted character per value to
        ``JapanesePersonDictionary.path + Predefine.VALUE_EXT``, then saves
        the trie to ``JapanesePersonDictionary.path + Predefine.TRIE_EXT``.

        :param map: mapping exposing ``size()`` and ``items()``; each value
            is passed to ``ord``, so it is presumably a single character.
        :return: the result of ``JapanesePersonDictionary.trie.save``.
        """
        # ``with`` closes the value file even if a conversion or write fails.
        with open(JapanesePersonDictionary.path + Predefine.VALUE_EXT, 'w+') as out:
            out.writelines(Convert.convert(map.size()))
            for _, c in map.items():
                out.writelines(Convert.convert_char(ord(c)))

        return JapanesePersonDictionary.trie.save(JapanesePersonDictionary.path + Predefine.TRIE_EXT)
Exemplo n.º 5
0
 def save1(self, out):
     """
     Write ``size`` and the ``base``/``check`` pairs to an open file.

     :param out: writable file-like object; it is not closed here.
     :return: ``True`` when everything was written, ``False`` if any
         exception occurred while converting or writing.
     """
     try:
         out.writelines(Convert.convert(self.size))
         for i in range(self.size):
             out.writelines(Convert.convert(self.base[i]))
             out.writelines(Convert.convert(self.check[i]))
     except Exception:
         return False
     # Report success explicitly instead of falling through with ``None``.
     return True
Exemplo n.º 6
0
 def saveDat(self, path, valueArray):
     """Write ``valueArray`` to ``path``.

     The output holds the number of items, then for each item its length
     followed by, per entry, the ordinal of the ``NR`` tag named by
     ``entry[0]`` and the integer value of ``entry[1]``.

     :param path: destination file path.
     :param valueArray: sequence of items, each a sequence of
         ``(tag name, count)`` entries.
     :return: ``True`` on success, ``False`` if anything fails (the error is
         logged as a warning).
     """
     try:
         # ``open`` works on Python 2 and 3 (``file`` is Python 2 only) and
         # ``with`` closes the handle even when a write raises.
         with open(path, 'w+') as out:
             out.writelines(Convert.convert(len(valueArray)))
             for item in valueArray:
                 out.writelines(Convert.convert(len(item)))
                 for entry in item:
                     out.writelines(Convert.convert(NR.ordinal(NR.valueOf(entry[0]))))
                     out.writelines(Convert.convert(int(entry[1])))
     except Exception as e:
         self.logger.warning("保存失败%s" % str(e))
         return False
     return True
Exemplo n.º 7
0
    def saveDat(self, path, valueArray):
        """Write ``valueArray`` to ``path`` as UTF-8 text.

        The output holds the number of items, then for each item its length
        followed by, per entry, the ordinal of the ``NS`` tag named by
        ``entry[0]`` and the integer value of ``entry[1]``.

        :param path: destination file path.
        :param valueArray: sequence of items, each a sequence of
            ``(tag name, count)`` entries.
        :return: ``True`` on success, ``False`` if anything fails (the error
            is logged as a warning).
        """
        try:
            # ``with`` closes the handle even when a conversion or write raises.
            with open(path, 'w+', encoding='utf-8') as out:
                out.writelines(Convert.convert(len(valueArray)))
                for item in valueArray:
                    out.writelines(Convert.convert(len(item)))
                    for entry in item:
                        out.writelines(Convert.convert(NS.ordinal(NS.valueOf(entry[0]))))
                        out.writelines(Convert.convert(int(entry[1])))
        except Exception as e:
            self.logger.warning("保存失败%s" % str(e))
            return False

        return True
Exemplo n.º 8
0
    def loadMainDictionary(mainPath):
        """Load the custom dictionary and cache it as a binary file.

        Returns early when ``CustomDictionary.loadDat(mainPath)`` succeeds.
        Otherwise every path in ``CustomDictionary.path`` is loaded through
        ``CustomDictionary.loadtxt``, a ``DoubleArrayTrie`` is built from the
        collected entries, and the attributes plus the trie are written to
        ``mainPath + Predefine.BIN_EXT``.

        :param mainPath: path of the main custom dictionary.
        :return: ``True``; read errors are logged, errors while writing the
            cache propagate to the caller.
        """
        Predefine.logger.info("自定义词典开始加载:%s" % mainPath)
        print("自定义词典开始加载:%s" % mainPath)
        if CustomDictionary.loadDat(mainPath):
            return True
        CustomDictionary.dat = DoubleArrayTrie()

        wordMap = TreeMap({})
        customNatureCollector = set()
        try:
            for p in CustomDictionary.path:
                defaultNature = Nature.n
                Predefine.logger.info("以默认词性[%s]加载自定义词典%s中……" %
                                      (str(defaultNature), p))
                print("以默认词性[%s]加载自定义词典%s中……" % (str(defaultNature), p))
                success, wordMap = CustomDictionary.loadtxt(
                    p, defaultNature, wordMap, customNatureCollector)
                if not success:
                    Predefine.logger.warning("失败:%s" % p)
        except IOError as e:
            Predefine.logger.error("自定义词典%s不存在或读取错误!%s" % (mainPath, e))
        except Exception as e:
            Predefine.logger.error("自定义词典%s缓存失败!%s\n" % (mainPath, e))
        if wordMap.size() == 0:
            Predefine.logger.warning("没有加载到任何词条")
            # Insert a blank placeholder entry so the trie is not built empty.
            # NOTE(review): the placeholder value is None, yet every value has
            # ``save`` called on it below -- confirm this path is handled.
            wordMap.put(Predefine.TAG_OTHER, None)
        Predefine.logger.info("正在构建DoubleArrayTrie……")
        CustomDictionary.dat.build(wordMap)
        # Cache as a dat file so the next load is much faster.
        Predefine.logger.info("正在缓存词典为dat文件……")
        # Values to cache, in the map's iteration order.
        attributeList = [value for _, value in wordMap.items()]
        # ``with`` closes the cache file even when a write raises.
        with open(mainPath + Predefine.BIN_EXT, 'w+') as out:
            # Caching of user-defined natures is currently disabled:
            # IOUtil.writeCustomNature(out, customNatureCollector)
            # Cache the entries themselves.
            out.writelines(Convert.convert(len(attributeList)))
            for attribute in attributeList:
                attribute.save(out)
            CustomDictionary.dat.save1(out)

        return True
Exemplo n.º 9
0
            Predefine.logger.warning("没有加载到任何词条")
            # 当做空白占位符
            map.put(Predefine.TAG_OTHER, None)
        Predefine.logger.info("正在构建DoubleArrayTrie……")
        CustomDictionary.dat.build(map)
        # 缓存成dat文件,下次加载会快很多
        Predefine.logger.info("正在缓存词典为dat文件……")
        # 缓存值文件
        attributeList = []
        for key, value in map.items():
            attributeList.append(value)
        out = file(mainPath + Predefine.BIN_EXT, 'w+')
        # 缓存用户词性
        # IOUtil.writeCustomNature(out, customNatureCollector)
        # 缓存正文
        out.writelines(Convert.convert(len(attributeList)))
        for attribute in attributeList:
            attribute.save(out)
        CustomDictionary.dat.save1(out)
        out.close()

        return True

    @staticmethod
    def loadtxt(path, defaultNature, map, customNatureCollector):
        """
        加载用户词典(追加)
        :param path: 词典路径
        :param defaultNature: 默认词性
        :param map:
        :param customNatureCollector: 收集用户词性
Exemplo n.º 10
0
 def save(self, out):
     """Serialize this attribute to the open file ``out``.

     Writes the total frequency, the number of natures, and then for each
     nature its ordinal followed by the frequency at the same index.

     :param out: writable file-like object; it is not closed here.
     """
     emit = out.writelines
     emit(Convert.convert(self.totalFrequency))
     emit(Convert.convert(len(self.nature)))
     for index, nature in enumerate(self.nature):
         ordinal = Nature.ordinal(nature)
         emit(Convert.convert(ordinal))
         emit(Convert.convert(self.frequency[index]))