示例#1
0
    def writeCompact(self, articleFormat):
        """Build StarDict dictionary with sametypesequence option specified.

        Every item definition consists of a single article.
        All articles have the same format, specified in articleFormat parameter.

        The definitions are written back to back into the ".dict" file.  For
        every word the ".idx" file receives the word, a NUL byte, and then the
        offset and the length of its definition inside the ".dict" file, each
        encoded with intToBinStr(value, 4).  Alternates found under the 'alts'
        key of an item's third element are handed to writeSynFile.

        Parameters:
        articleFormat - format of article definition: h - html, m - plain text
        """
        dictMark = 0  # offset of the next definition inside the .dict file
        idxParts = []
        dictParts = []
        alternates = []  # contains tuples ('alternate', index-of-word)
        # Collect the pieces in lists and join them once, instead of growing
        # two strings with "+=" for every item.
        for i, item in enumerate(self.glos.data):
            word, defi = item[:2]
            if len(item) > 2 and 'alts' in item[2]:
                alternates += [(x, i) for x in item[2]['alts']]
            dictParts.append(defi)
            defiLen = len(defi)
            idxParts.append(
                word + '\x00' + intToBinStr(dictMark, 4) +
                intToBinStr(defiLen, 4))
            dictMark += defiLen
        dictStr = ''.join(dictParts)
        idxStr = ''.join(idxParts)
        del dictParts, idxParts
        with open(self.fileBasePath + '.dict', 'wb') as f:
            f.write(dictStr)
        with open(self.fileBasePath + '.idx', 'wb') as f:
            f.write(idxStr)
        indexFileSize = len(idxStr)
        # Release the (potentially large) buffers before the remaining steps.
        del idxStr, dictStr

        self.writeSynFile(alternates)
        self.writeIfoFile(indexFileSize, len(alternates), articleFormat)
示例#2
0
    def writeCompact(self, articleFormat):
        """Build StarDict dictionary with sametypesequence option specified.

        Every item definition consists of a single article.
        All articles have the same format, specified in articleFormat parameter.

        The ".dict" file is the concatenation of all definitions.  Each ".idx"
        record is the word, a NUL byte, the offset of the definition in the
        ".dict" file and the definition length (both produced by
        intToBinStr(value, 4)).  Items whose third element has an 'alts' key
        contribute (alternate, item-index) pairs to the ".syn" file.

        Parameters:
        articleFormat - format of article definition: h - html, m - plain text
        """
        dictMark = 0  # running offset into the .dict file
        idxChunks = []
        dictChunks = []
        alternates = []  # contains tuples ('alternate', index-of-word)
        # Accumulate chunks and join once at the end; repeated "+=" on a
        # string inside the loop is avoided.
        for i, item in enumerate(self.glos.data):
            word, defi = item[:2]
            if len(item) > 2 and 'alts' in item[2]:
                alternates += [(x, i) for x in item[2]['alts']]
            defiLen = len(defi)
            dictChunks.append(defi)
            idxChunks.append(
                word + '\x00' + intToBinStr(dictMark, 4) +
                intToBinStr(defiLen, 4))
            dictMark += defiLen
        dictStr = ''.join(dictChunks)
        idxStr = ''.join(idxChunks)
        del dictChunks, idxChunks
        with open(self.fileBasePath+'.dict', 'wb') as f:
            f.write(dictStr)
        with open(self.fileBasePath+'.idx', 'wb') as f:
            f.write(idxStr)
        indexFileSize = len(idxStr)
        # Free the buffers before writing the remaining files.
        del idxStr, dictStr

        self.writeSynFile(alternates)
        self.writeIfoFile(indexFileSize, len(alternates), articleFormat)
示例#3
0
    def writeGeneral(self):
        """
            Build StarDict dictionary in general case.
            Every item definition may consist of an arbitrary number of articles.
            sametypesequence option is not used.

            For every entry, each definition is written to the ".dict" file as
            the format character ('m' or 'h'), the definition and a NUL byte.
            The ".idx" file receives the first word of the entry, a NUL byte,
            and then the offset and length of the entry's data in the ".dict"
            file (both encoded with intToBinStr(value, 4)).  The remaining
            words of an entry are collected as alternates for writeSynFile.
        """
        dictMark = 0  # offset of the next entry's data inside the .dict file
        alternates = []  # contains tuples ('alternate', index-of-word)
        indexFileSize = 0
        wordCount = 0

        # "with" closes both files even if an entry raises inside the loop.
        with open(self.fileBasePath + '.dict', 'wb') as dictFp, \
                open(self.fileBasePath + '.idx', 'wb') as idxFp:
            for i, entry in enumerate(self.glos):
                words = entry.getWords()  # list
                word = words[0]
                defis = entry.getDefis()  # list

                # Any format other than plain text / html falls back to 'm'.
                defiFormat = entry.getDefiFormat()
                if defiFormat not in ('m', 'h'):
                    defiFormat = 'm'

                for altWord in words[1:]:
                    alternates.append((altWord, i))

                # defis[0] is accessed explicitly: an entry without any
                # definition raises IndexError, as it always did.
                dictBlock = toBytes(defiFormat + defis[0]) + b'\x00'
                for altDefi in defis[1:]:
                    dictBlock += toBytes(defiFormat + altDefi) + b'\x00'
                dictFp.write(dictBlock)

                dataLen = len(dictBlock)
                idxBlock = (
                    toBytes(word) + b'\x00' +
                    intToBinStr(dictMark, 4) + intToBinStr(dataLen, 4)
                )
                # NOTE(review): idxBlock is already bytes, so toBytes() here
                # is presumably a no-op; kept from the original - confirm.
                idxFp.write(toBytes(idxBlock))

                dictMark += dataLen
                indexFileSize += len(idxBlock)
                wordCount += 1

        self.writeSynFile(alternates)
        self.writeIfoFile(wordCount, indexFileSize, len(alternates))
示例#4
0
    def writeGeneral(self):
        """
            Build StarDict dictionary in general case.
            Every item definition may consist of an arbitrary number of articles.
            sametypesequence option is not used.

            Each definition of an entry goes into the ".dict" file as the
            format character ('m' or 'h') + definition + NUL byte.  Each
            ".idx" record is the entry's first word, a NUL byte, and the
            offset and length of the entry's ".dict" data, both encoded with
            intToBinStr(value, 4).  Words after the first one are passed to
            writeSynFile as (alternate, entry-index) tuples.
        """
        dictMark = 0  # running offset into the .dict file
        alternates = []  # contains tuples ('alternate', index-of-word)
        indexFileSize = 0
        wordCount = 0

        # Context managers make sure both files get closed even when
        # processing an entry fails half-way.
        with open(self.fileBasePath+'.dict', 'wb') as dictFp, \
                open(self.fileBasePath+'.idx', 'wb') as idxFp:
            for i, entry in enumerate(self.glos):
                words = entry.getWords()  # list
                word = words[0]
                defis = entry.getDefis()  # list

                # Unknown formats are written as plain text ('m').
                defiFormat = entry.getDefiFormat()
                if defiFormat not in ('m', 'h'):
                    defiFormat = 'm'

                for altWord in words[1:]:
                    alternates.append((altWord, i))

                # The explicit defis[0] keeps the IndexError for an entry
                # that has no definition at all.
                dictBlock = toBytes(defiFormat + defis[0]) + b'\x00'
                for altDefi in defis[1:]:
                    dictBlock += toBytes(defiFormat + altDefi) + b'\x00'
                dictFp.write(dictBlock)

                dataLen = len(dictBlock)
                idxBlock = (
                    toBytes(word) + b'\x00' +
                    intToBinStr(dictMark, 4) + intToBinStr(dataLen, 4)
                )
                # NOTE(review): idxBlock is bytes already; toBytes() is kept
                # from the original and is presumably a no-op - confirm.
                idxFp.write(toBytes(idxBlock))

                dictMark += dataLen
                indexFileSize += len(idxBlock)
                wordCount += 1

        self.writeSynFile(alternates)
        self.writeIfoFile(wordCount, indexFileSize, len(alternates))
示例#5
0
    def writeSynFile(self, altIndexList: List[Tuple[bytes, int]]) -> None:
        """
        Write the ".syn" file for the given alternates.

        altIndexList holds (alternate-as-bytes, word-index) tuples.  The list
        is sorted in place using sortKeyBytes of the alternate, and every
        tuple becomes one record: the alternate, a NUL byte and
        intToBinStr(wordIndex, 4).  Nothing is written for an empty list.
        """
        if not altIndexList:
            return

        synCount = len(altIndexList)

        def altSortKey(pair):
            # Order by the alternate itself; the word index is ignored.
            return sortKeyBytes(pair[0])

        log.info(f"Sorting {synCount} synonyms...")
        sortStart = now()
        # Timings noted by the original author:
        #   28 seconds with old sort key (converted from custom cmp)
        #   0.63 seconds with the new sort key
        #   0.20 seconds without key function (default sort)
        altIndexList.sort(key=altSortKey)
        log.info(
            f"Sorting {synCount} synonyms took {now()-sortStart:.2f} seconds",
        )

        log.info(f"Writing {synCount} synonyms...")
        writeStart = now()
        with open(self._filename + ".syn", "wb") as synFile:
            synData = b"".join([
                b_alt + b"\x00" + intToBinStr(wordIndex, 4)
                for b_alt, wordIndex in altIndexList
            ])
            synFile.write(synData)
        log.info(
            f"Writing {synCount} synonyms took {now()-writeStart:.2f} seconds",
        )
示例#6
0
	def writeSynFile(self, altIndexList):
		"""
		Write the ".syn" file for the given alternates.

		altIndexList holds (alternate-as-bytes, word-index) tuples.  The list
		is sorted in place using sortKeyBytes of the alternate, and every
		tuple becomes one record: the alternate, a NUL byte and
		intToBinStr(wordIndex, 4).  Nothing is written for an empty list.
		"""
		if not altIndexList:
			return

		synCount = len(altIndexList)

		def altSortKey(pair):
			# Order by the alternate itself; the word index is ignored.
			return sortKeyBytes(pair[0])

		log.info("Sorting %s synonyms..." % synCount)
		sortStart = now()
		# Timings noted by the original author:
		#   28 seconds with old sort key (converted from custom cmp)
		#   0.63 seconds with the new sort key
		#   0.20 seconds without key function (default sort)
		altIndexList.sort(key=altSortKey)
		log.info("Sorting %s synonyms took %.2f seconds" % (
			synCount,
			now() - sortStart,
		))

		log.info("Writing %s synonyms..." % synCount)
		writeStart = now()
		with open(self._filename+".syn", "wb") as synFile:
			synData = b"".join([
				b_alt + b"\x00" + intToBinStr(wordIndex, 4)
				for b_alt, wordIndex in altIndexList
			])
			synFile.write(synData)
		log.info("Writing %s synonyms took %.2f seconds" % (
			synCount,
			now() - writeStart,
		))
示例#7
0
    def writeGeneral(self):
        """Build StarDict dictionary in general case.

        Every item definition may consist of an arbitrary number of articles.
        sametypesequence option is not used.

        Each article is written to the ".dict" file as a one-character format,
        the definition and a NUL byte.  The main article takes its format from
        the 'defiFormat' key of the item's third element ('m' when missing or
        not one of 'm'/'h'); additional articles come from the 'defis' key as
        (definition, format) records.  Each ".idx" record is the word, a NUL
        byte, and the offset and total length of the item's articles in the
        ".dict" file, both encoded with intToBinStr(value, 4).
        """
        dictMark = 0  # offset of the next item's data inside the .dict file
        idxParts = []
        dictParts = []
        alternates = []  # contains tuples ('alternate', index-of-word)
        # Pieces are collected in lists and joined once at the end instead of
        # growing two strings with "+=" for every item.
        for i, item in enumerate(self.glos.data):
            word, defi = item[:2]
            # Optional third element carrying 'alts', 'defiFormat', 'defis'.
            extra = item[2] if len(item) > 2 else {}
            if 'alts' in extra:
                alternates += [(x, i) for x in extra['alts']]
            if 'defiFormat' in extra:
                articleFormat = extra['defiFormat']
                # Tuple membership, not a substring test against 'mh': the
                # latter let '' and 'mh' through (tripping the assert below)
                # and raised TypeError for non-string values.
                if articleFormat not in ('m', 'h'):
                    articleFormat = 'm'
            else:
                articleFormat = 'm'
            assert isinstance(articleFormat, str) and len(articleFormat) == 1
            dictParts.append(articleFormat)
            dictParts.append(defi + '\x00')
            # format character + definition + terminating NUL
            dataLen = 1 + len(defi) + 1
            if 'defis' in extra:
                for rec in extra['defis']:
                    extraDefi, t = rec[:2]
                    assert isinstance(t, str) and len(t) == 1
                    dictParts.append(t)
                    dictParts.append(extraDefi + '\x00')
                    dataLen += 1 + len(extraDefi) + 1
            idxParts.append(
                word + '\x00' + intToBinStr(dictMark, 4) +
                intToBinStr(dataLen, 4))
            dictMark += dataLen
        dictStr = ''.join(dictParts)
        idxStr = ''.join(idxParts)
        del dictParts, idxParts
        with open(self.fileBasePath + '.dict', 'wb') as f:
            f.write(dictStr)
        with open(self.fileBasePath + '.idx', 'wb') as f:
            f.write(idxStr)
        indexFileSize = len(idxStr)
        # Release the (potentially large) buffers before the remaining steps.
        del idxStr, dictStr

        self.writeSynFile(alternates)
        self.writeIfoFile(indexFileSize, len(alternates))
示例#8
0
    def writeGeneral(self):
        """Build StarDict dictionary in general case.

        Every item definition may consist of an arbitrary number of articles.
        sametypesequence option is not used.

        Every article lands in the ".dict" file as format character +
        definition + NUL byte.  The format of the main article is read from
        the 'defiFormat' key of the item's third element and falls back to
        'm' when it is missing or not 'm'/'h'; further articles are read from
        the 'defis' key as (definition, format) records.  The ".idx" record
        of an item is the word, a NUL byte, and the offset and total length
        of its articles, both encoded with intToBinStr(value, 4).
        """
        dictMark = 0  # running offset into the .dict file
        idxChunks = []
        dictChunks = []
        alternates = []  # contains tuples ('alternate', index-of-word)
        # Accumulate chunks and join once; repeated "+=" on a string inside
        # the loop is avoided.
        for i, item in enumerate(self.glos.data):
            word, defi = item[:2]
            # Optional third element carrying 'alts', 'defiFormat', 'defis'.
            extra = item[2] if len(item) > 2 else {}
            if 'alts' in extra:
                alternates += [(x, i) for x in extra['alts']]
            if 'defiFormat' in extra:
                articleFormat = extra['defiFormat']
                # Compare against the two allowed values; a substring test
                # on 'mh' accepted '' and 'mh' (failing the assert below)
                # and raised TypeError for non-string values.
                if articleFormat not in ('m', 'h'):
                    articleFormat = 'm'
            else:
                articleFormat = 'm'
            assert isinstance(articleFormat, str) and len(articleFormat) == 1
            dictChunks.append(articleFormat)
            dictChunks.append(defi + '\x00')
            # format character + definition + terminating NUL
            dataLen = 1 + len(defi) + 1
            if 'defis' in extra:
                for rec in extra['defis']:
                    extraDefi, t = rec[:2]
                    assert isinstance(t, str) and len(t) == 1
                    dictChunks.append(t)
                    dictChunks.append(extraDefi + '\x00')
                    dataLen += 1 + len(extraDefi) + 1
            idxChunks.append(
                word + '\x00' + intToBinStr(dictMark, 4) +
                intToBinStr(dataLen, 4))
            dictMark += dataLen
        dictStr = ''.join(dictChunks)
        idxStr = ''.join(idxChunks)
        del dictChunks, idxChunks
        with open(self.fileBasePath+'.dict', 'wb') as f:
            f.write(dictStr)
        with open(self.fileBasePath+'.idx', 'wb') as f:
            f.write(idxStr)
        indexFileSize = len(idxStr)
        # Free the buffers before writing the remaining files.
        del idxStr, dictStr

        self.writeSynFile(alternates)
        self.writeIfoFile(indexFileSize, len(alternates))
示例#9
0
 def writeSynFile(self, alternates):
     """Build .syn file

     alternates is a list of (alternate, index-of-word) tuples.  It is
     sorted in place by the alternate using stardict_strcmp, and every tuple
     is written as the alternate, a NUL byte and intToBinStr(index, 4).
     No file is written when there are no alternates.
     """
     if not alternates:
         return
     # Python 2 form of list.sort: (cmp, key) - the comparison function is
     # applied to the keys, i.e. to the alternate of each tuple.
     alternates.sort(stardict_strcmp, lambda x: x[0])
     # Join once instead of growing the string with "+=" per alternate.
     synStr = ''.join(
         item[0] + '\x00' + intToBinStr(item[1], 4) for item in alternates)
     with open(self.fileBasePath + '.syn', 'wb') as f:
         f.write(synStr)
示例#10
0
 def writeSynFile(self, alternates):
     """Build .syn file

     alternates is a list of (alternate, index-of-word) tuples.  The list is
     sorted in place (stardict_strcmp applied to the alternate) and each
     tuple becomes one record: alternate, NUL byte, intToBinStr(index, 4).
     Nothing is written when the list is empty.
     """
     if not alternates:
         return
     # Python 2 list.sort(cmp, key): stardict_strcmp compares the keys,
     # which are the alternates themselves.
     alternates.sort(stardict_strcmp, lambda x: x[0])
     # Build the whole payload with a single join rather than "+=" in a loop.
     synStr = ''.join(
         item[0] + '\x00' + intToBinStr(item[1], 4) for item in alternates)
     with open(self.fileBasePath+'.syn', 'wb') as f:
         f.write(synStr)
示例#11
0
 def writeSynFile(self, alternates):
     """
         Build .syn file

         alternates is a list of (alternate, index-of-word) tuples.  It is
         sorted in place by sortKey of the alternate; each tuple is written
         as toBytes(alternate), a NUL byte and intToBinStr(index, 4).
         No file is written when there are no alternates.
     """
     if not alternates:
         return
     alternates.sort(key=lambda x: sortKey(x[0]))
     # bytes objects are immutable, so "+=" in a loop copies the whole
     # buffer every time; join builds the payload in one pass.
     synBytes = b''.join(
         toBytes(item[0]) + b'\x00' + intToBinStr(item[1], 4)
         for item in alternates
     )
     with open(self.fileBasePath+'.syn', 'wb') as f:
         f.write(synBytes)
示例#12
0
 def writeSynFile(self, alternates):
     """
         Build .syn file

         alternates is a list of (alternate, index-of-word) tuples.  The list
         is sorted in place by sortKey of the alternate and every tuple
         becomes one record: toBytes(alternate), NUL byte,
         intToBinStr(index, 4).  Nothing is written for an empty list.
     """
     if not alternates:
         return
     alternates.sort(key=lambda x: sortKey(x[0]))
     # A single join replaces the per-item "+=", which re-copied the
     # growing bytes object on every iteration.
     synBytes = b''.join(
         toBytes(item[0]) + b'\x00' + intToBinStr(item[1], 4)
         for item in alternates
     )
     with open(self.fileBasePath + '.syn', 'wb') as f:
         f.write(synBytes)
示例#13
0
    def writeGeneral(self) -> None:
        """
        Build StarDict dictionary in general case.
        Every item definition may consist of an arbitrary number of articles.
        sametypesequence option is not used.

        Data entries (entry.isData()) are saved into self._resDir and do not
        take part in the index; the directory is created when missing and
        removed again if it ends up empty.  For every other entry each
        definition is written to the ".dict" file as format character ("m" or
        "h") + definition + NUL byte, UTF-8 encoded, and the ".idx" file
        receives the first word, a NUL byte, and the offset and length of the
        entry's block (both via intToBinStr(value, 4)).  The remaining words
        are passed to writeSynFile as (alternate bytes, entry index) tuples.
        """
        dictMark = 0  # offset of the next block inside the .dict file
        altIndexList = []  # list of tuples (b"alternate", wordIndex)
        indexFileSize = 0
        wordCount = 0
        defiFormatCounter = Counter()

        # "with" closes both files even if an entry raises inside the loop.
        with open(self._filename + ".dict", "wb") as dictFile, \
                open(self._filename + ".idx", "wb") as idxFile:
            t0 = now()
            if not isdir(self._resDir):
                os.mkdir(self._resDir)

            entryI = -1  # index among non-data entries only
            for entry in self._glos:
                if entry.isData():
                    entry.save(self._resDir)
                    continue
                entryI += 1

                words = entry.getWords()  # list of strs
                word = words[0]  # str
                defis = entry.getDefis()  # list of strs

                entry.detectDefiFormat()  # call no more than once
                defiFormat = entry.getDefiFormat()
                # Count the detected format before falling back to "m".
                defiFormatCounter[defiFormat] += 1
                if defiFormat not in ("m", "h"):
                    defiFormat = "m"

                for alt in words[1:]:
                    altIndexList.append((alt.encode("utf-8"), entryI))

                # defis[0] is accessed explicitly: an entry without any
                # definition raises IndexError, as it always did.
                b_dictBlock = (defiFormat + defis[0]).encode("utf-8") + b"\x00"
                for altDefi in defis[1:]:
                    b_dictBlock += (
                        (defiFormat + altDefi).encode("utf-8") + b"\x00"
                    )
                dictFile.write(b_dictBlock)

                blockLen = len(b_dictBlock)
                b_idxBlock = (
                    word.encode("utf-8") + b"\x00" +
                    intToBinStr(dictMark, 4) +
                    intToBinStr(blockLen, 4)
                )
                idxFile.write(b_idxBlock)

                dictMark += blockLen
                indexFileSize += len(b_idxBlock)
                wordCount += 1

        if not os.listdir(self._resDir):
            os.rmdir(self._resDir)
        log.info(f"Writing dict file took {now()-t0:.2f} seconds")
        log.debug("defiFormatsCount = " +
                  pformat(defiFormatCounter.most_common()))

        self.writeSynFile(altIndexList)
        self.writeIfoFile(wordCount, indexFileSize, len(altIndexList))
示例#14
0
    def writeCompact(self, defiFormat):
        """
        Build StarDict dictionary with sametypesequence option specified.
        Every item definition consists of a single article.
        All articles have the same format, specified in defiFormat parameter.

        Data entries (entry.isData()) are saved into self._resDir and do not
        take part in the index; the directory is created when missing and
        removed again if it ends up empty.  For every other entry the
        definitions are UTF-8 encoded and written to the ".dict" file,
        separated (not terminated) by NUL bytes; the ".idx" file receives the
        first word, a NUL byte, and the offset and length of the entry's
        block (both via intToBinStr(value, 4)).  The remaining words are
        passed to writeSynFile as (alternate bytes, entry index) tuples.

        Parameters:
        defiFormat - format of article definition: h - html, m - plain text
        """
        dictMark = 0  # offset of the next block inside the .dict file
        altIndexList = []  # list of tuples (b"alternate", wordIndex)
        indexFileSize = 0
        wordCount = 0

        # "with" closes both files even if an entry raises inside the loop.
        with open(self._filename + ".dict", "wb") as dictFile, \
                open(self._filename + ".idx", "wb") as idxFile:
            t0 = now()
            if not isdir(self._resDir):
                os.mkdir(self._resDir)

            entryI = -1  # index among non-data entries only
            for entry in self._glos:
                if entry.isData():
                    entry.save(self._resDir)
                    continue
                entryI += 1

                words = entry.getWords()  # list of strs
                word = words[0]  # str
                defis = entry.getDefis()  # list of strs

                for alt in words[1:]:
                    altIndexList.append((alt.encode("utf-8"), entryI))

                # defis[0] is accessed explicitly: an entry without any
                # definition raises IndexError, as it always did.
                b_dictBlock = defis[0].encode("utf-8")
                for altDefi in defis[1:]:
                    b_dictBlock += b"\x00" + altDefi.encode("utf-8")
                dictFile.write(b_dictBlock)

                blockLen = len(b_dictBlock)
                b_idxBlock = (
                    word.encode("utf-8") + b"\x00" +
                    intToBinStr(dictMark, 4) +
                    intToBinStr(blockLen, 4)
                )
                idxFile.write(b_idxBlock)

                dictMark += blockLen
                indexFileSize += len(b_idxBlock)
                wordCount += 1

        if not os.listdir(self._resDir):
            os.rmdir(self._resDir)
        log.info(f"Writing dict file took {now()-t0:.2f} seconds")
        log.debug("defiFormat = " + pformat(defiFormat))

        self.writeSynFile(altIndexList)
        self.writeIfoFile(
            wordCount,
            indexFileSize,
            len(altIndexList),
            defiFormat,
        )
示例#15
0
    def writeGeneral(self) -> None:
        """
        Build StarDict dictionary in general case.
        Every item definition may consist of an arbitrary number of articles.
        sametypesequence option is not used.

        This is a generator used as a coroutine: entries are received through
        "yield" (i.e. sent in by the caller) and a received None ends the
        input, after which the ".syn" and ".ifo" files are written.
        NOTE(review): the "-> None" annotation does not describe a generator;
        left unchanged here - confirm and adjust together with the callers.

        Data entries (entry.isData()) are saved into self._resDir and do not
        take part in the index; the directory is created when missing and
        removed again if it ends up empty.  For every other entry the
        definition (after self.fixDefi) is written to the ".dict" file as
        format character + definition + NUL byte, UTF-8 encoded, and the
        ".idx" file receives the first word, a NUL byte, and the offset and
        length of the block (both via intToBinStr(value, 4)).
        """
        dictMark = 0  # offset of the next block inside the .dict file
        altIndexList = []  # list of tuples (b"alternate", entryIndex)
        indexFileSize = 0
        wordCount = 0
        defiFormatCounter = Counter()

        # "with" closes both files even if an entry raises or the generator
        # is closed before the terminating None arrives.
        with open(self._filename + ".dict", "wb") as dictFile, \
                open(self._filename + ".idx", "wb") as idxFile:
            t0 = now()
            if not isdir(self._resDir):
                os.mkdir(self._resDir)

            entryIndex = -1  # index among non-data entries only
            while True:
                entry = yield
                if entry is None:
                    break
                if entry.isData():
                    entry.save(self._resDir)
                    continue
                entryIndex += 1

                entry.detectDefiFormat()  # call no more than once
                defiFormat = entry.defiFormat
                # Count the detected format before any fallback to "m".
                defiFormatCounter[defiFormat] += 1
                if defiFormat not in ("m", "h", "x"):
                    log.error(f"invalid defiFormat={defiFormat}, using 'm'")
                    defiFormat = "m"

                words = entry.l_word  # list of strs
                word = words[0]  # str
                defi = self.fixDefi(entry.defi, defiFormat)
                # defi is str

                for alt in words[1:]:
                    altIndexList.append((alt.encode("utf-8"), entryIndex))

                b_dictBlock = (defiFormat + defi).encode("utf-8") + b"\x00"
                dictFile.write(b_dictBlock)
                blockLen = len(b_dictBlock)

                b_idxBlock = (
                    word.encode("utf-8") + b"\x00" +
                    intToBinStr(dictMark, 4) +
                    intToBinStr(blockLen, 4)
                )
                idxFile.write(b_idxBlock)

                dictMark += blockLen
                indexFileSize += len(b_idxBlock)
                wordCount += 1

        if not os.listdir(self._resDir):
            os.rmdir(self._resDir)
        log.info(f"Writing dict file took {now()-t0:.2f} seconds")
        log.debug("defiFormatsCount = " +
                  pformat(defiFormatCounter.most_common()))

        self.writeSynFile(altIndexList)
        self.writeIfoFile(
            wordCount,
            indexFileSize,
            len(altIndexList),
        )
示例#16
0
	def writeGeneral(self):
		"""
		Build StarDict dictionary in general case.
		Every item definition may consist of an arbitrary number of articles.
		sametypesequence option is not used.

		Data entries (entry.isData()) are saved into self._resDir and do not
		take part in the index; the directory is created when missing and
		removed again if it ends up empty.  For every other entry each
		definition is written to the ".dict" file as format character ("m" or
		"h") + definition + NUL byte, UTF-8 encoded, and the ".idx" file
		receives the first word, a NUL byte, and the offset and length of the
		entry's block (both via intToBinStr(value, 4)).  The remaining words
		are passed to writeSynFile as (alternate bytes, entry index) tuples.
		"""
		dictMark = 0  # offset of the next block inside the .dict file
		altIndexList = []  # list of tuples (b"alternate", wordIndex)
		indexFileSize = 0
		wordCount = 0
		defiFormatCounter = Counter()

		# "with" closes both files even if an entry raises inside the loop.
		with open(self._filename+".dict", "wb") as dictFile, \
				open(self._filename+".idx", "wb") as idxFile:
			t0 = now()
			if not isdir(self._resDir):
				os.mkdir(self._resDir)

			entryI = -1  # index among non-data entries only
			for entry in self._glos:
				if entry.isData():
					entry.save(self._resDir)
					continue
				entryI += 1

				words = entry.getWords()  # list of strs
				word = words[0]  # str
				defis = entry.getDefis()  # list of strs

				entry.detectDefiFormat()  # call no more than once
				defiFormat = entry.getDefiFormat()
				# Count the detected format before falling back to "m".
				defiFormatCounter[defiFormat] += 1
				if defiFormat not in ("m", "h"):
					defiFormat = "m"
				assert isinstance(defiFormat, str) and len(defiFormat) == 1

				for alt in words[1:]:
					altIndexList.append((alt.encode("utf-8"), entryI))

				# defis[0] is accessed explicitly: an entry without any
				# definition raises IndexError, as it always did.
				b_dictBlock = (defiFormat + defis[0]).encode("utf-8") + b"\x00"
				for altDefi in defis[1:]:
					b_dictBlock += (defiFormat + altDefi).encode("utf-8") + b"\x00"
				dictFile.write(b_dictBlock)

				blockLen = len(b_dictBlock)
				b_idxBlock = (
					word.encode("utf-8") + b"\x00" +
					intToBinStr(dictMark, 4) +
					intToBinStr(blockLen, 4)
				)
				idxFile.write(b_idxBlock)

				dictMark += blockLen
				indexFileSize += len(b_idxBlock)
				wordCount += 1

		if not os.listdir(self._resDir):
			os.rmdir(self._resDir)
		log.info("Writing dict file took %.2f seconds", now() - t0)
		log.debug("defiFormatsCount = " + pformat(defiFormatCounter.most_common()))

		self.writeSynFile(altIndexList)
		self.writeIfoFile(wordCount, indexFileSize, len(altIndexList))