def writeCompact(self, articleFormat):
    """Build StarDict dictionary with sametypesequence option specified.

    Every item definition consists of a single article.
    All articles have the same format, specified in articleFormat parameter.

    Writes self.fileBasePath + '.dict' (all definitions concatenated in
    entry order) and self.fileBasePath + '.idx' (one record per entry),
    then calls writeSynFile with the collected alternates and writeIfoFile
    with the index size, the number of alternates and articleFormat.

    Parameters:
    articleFormat - format of article definition: h - html, m - plain text
    """
    dictMark = 0  # running offset of the next definition in the .dict data
    idxParts = []
    dictParts = []
    alternates = []  # contains tuples ('alternate', index-of-word)
    for i, item in enumerate(self.glos.data):
        word, defi = item[:2]
        # an optional third element may carry alternate spellings under 'alts'
        if len(item) > 2 and 'alts' in item[2]:
            alternates += [(x, i) for x in item[2]['alts']]
        dictParts.append(defi)
        defiLen = len(defi)
        # index record: word, NUL, then offset and length via intToBinStr
        idxParts.append(
            word + '\x00' + intToBinStr(dictMark, 4) + intToBinStr(defiLen, 4)
        )
        dictMark += defiLen
    # join once instead of growing two strings with `+=` per entry
    idxStr = ''.join(idxParts)
    with open(self.fileBasePath + '.dict', 'wb') as f:
        f.write(''.join(dictParts))
    with open(self.fileBasePath + '.idx', 'wb') as f:
        f.write(idxStr)
    self.writeSynFile(alternates)
    self.writeIfoFile(len(idxStr), len(alternates), articleFormat)
def writeCompact(self, articleFormat):
    """Build StarDict dictionary with sametypesequence option specified.

    Every item definition consists of a single article.
    All articles have the same format, specified in articleFormat parameter.

    Output: the '.dict' file receives every definition back to back, the
    '.idx' file receives one record per entry (both paths are derived from
    self.fileBasePath). Afterwards writeSynFile and writeIfoFile are called.

    Parameters:
    articleFormat - format of article definition: h - html, m - plain text
    """
    dictMark = 0  # offset at which the current definition starts
    idxChunks = []
    dictChunks = []
    alternates = []  # contains tuples ('alternate', index-of-word)
    for i, item in enumerate(self.glos.data):
        word, defi = item[:2]
        if len(item) > 2 and 'alts' in item[2]:
            alternates.extend([(x, i) for x in item[2]['alts']])
        defiLen = len(defi)
        dictChunks.append(defi)
        # record layout: word + NUL + intToBinStr(offset) + intToBinStr(size)
        idxChunks.append(
            word + '\x00' + intToBinStr(dictMark, 4) + intToBinStr(defiLen, 4)
        )
        dictMark += defiLen
    # a single join keeps the build linear; repeated `+=` may be quadratic
    dictStr = ''.join(dictChunks)
    idxStr = ''.join(idxChunks)
    with open(self.fileBasePath + '.dict', 'wb') as f:
        f.write(dictStr)
    with open(self.fileBasePath + '.idx', 'wb') as f:
        f.write(idxStr)
    indexFileSize = len(idxStr)
    self.writeSynFile(alternates)
    self.writeIfoFile(indexFileSize, len(alternates), articleFormat)
def writeGeneral(self):
    """
    Build StarDict dictionary in general case.
    Every item definition may consist of an arbitrary number of articles.
    sametypesequence option is not used.

    Streams each entry of self.glos to self.fileBasePath + '.dict' and
    '.idx', then calls writeSynFile with the collected alternates and
    writeIfoFile with the word count, index size and alternate count.
    A format other than 'm' or 'h' is written as 'm'.
    """
    dictMark = 0  # offset of the next data block in the .dict file
    alternates = []  # contains tuples ('alternate', index-of-word)
    indexFileSize = 0
    wordCount = 0
    # `with` closes both files even when an entry raises part-way through
    with open(self.fileBasePath + '.dict', 'wb') as dictFp, \
            open(self.fileBasePath + '.idx', 'wb') as idxFp:
        for i, entry in enumerate(self.glos):
            words = entry.getWords()  # list
            word = words[0]
            defis = entry.getDefis()  # list
            defiFormat = entry.getDefiFormat()
            if defiFormat not in ('m', 'h'):
                defiFormat = 'm'
            # every word after the first is an alternate of this entry
            for altWord in words[1:]:
                alternates.append((altWord, i))
            # each article: format char + text + NUL terminator
            dictBlock = toBytes(defiFormat + defis[0]) + b'\x00'
            for altDefi in defis[1:]:
                dictBlock += toBytes(defiFormat + altDefi) + b'\x00'
            dictFp.write(dictBlock)
            dataLen = len(dictBlock)
            idxBlock = toBytes(word) + b'\x00' + intToBinStr(
                dictMark, 4) + intToBinStr(dataLen, 4)
            idxFp.write(toBytes(idxBlock))
            dictMark += dataLen
            indexFileSize += len(idxBlock)
            wordCount += 1
    self.writeSynFile(alternates)
    self.writeIfoFile(wordCount, indexFileSize, len(alternates))
def writeGeneral(self):
    """
    Build StarDict dictionary in general case.
    Every item definition may consist of an arbitrary number of articles.
    sametypesequence option is not used.

    For every entry yielded by self.glos one data block is appended to the
    '.dict' file and one record to the '.idx' file (paths derived from
    self.fileBasePath). Once both files are closed, writeSynFile and
    writeIfoFile are called.
    """
    dictMark = 0  # where the current entry's block starts in the .dict file
    alternates = []  # contains tuples ('alternate', index-of-word)
    indexFileSize = 0
    wordCount = 0
    # context managers guarantee the handles are released on any error
    with open(self.fileBasePath + '.dict', 'wb') as dictFp, \
            open(self.fileBasePath + '.idx', 'wb') as idxFp:
        for i, entry in enumerate(self.glos):
            words = entry.getWords()  # list
            word = words[0]
            defis = entry.getDefis()  # list
            defiFormat = entry.getDefiFormat()
            # anything other than plain text / html is written as plain text
            if defiFormat not in ('m', 'h'):
                defiFormat = 'm'
            alternates.extend([(altWord, i) for altWord in words[1:]])
            # block = (format char + article + NUL) repeated per article
            dictBlock = toBytes(defiFormat + defis[0]) + b'\x00'
            for altDefi in defis[1:]:
                dictBlock += toBytes(defiFormat + altDefi) + b'\x00'
            dictFp.write(dictBlock)
            dataLen = len(dictBlock)
            idxBlock = (
                toBytes(word) + b'\x00'
                + intToBinStr(dictMark, 4)
                + intToBinStr(dataLen, 4)
            )
            idxFp.write(toBytes(idxBlock))
            dictMark += dataLen
            indexFileSize += len(idxBlock)
            wordCount += 1
    self.writeSynFile(alternates)
    self.writeIfoFile(wordCount, indexFileSize, len(alternates))
def writeSynFile(self, altIndexList: List[Tuple[bytes, int]]) -> None:
    """
    Build .syn file

    Does nothing when altIndexList is empty. Otherwise sorts altIndexList
    in place by sortKeyBytes of the alternate, then writes one record per
    pair (alternate bytes, NUL, intToBinStr(wordIndex, 4)) to
    self._filename + ".syn". Both phases are timed and logged.
    """
    if not altIndexList:
        return

    def byAlternate(pair):
        return sortKeyBytes(pair[0])

    log.info(f"Sorting {len(altIndexList)} synonyms...")
    sortStart = now()
    # timings noted by the original author for this sort:
    # 28 seconds with old sort key (converted from custom cmp)
    # 0.63 seconds with my new sort key
    # 0.20 seconds without key function (default sort)
    altIndexList.sort(key=byAlternate)
    log.info(
        f"Sorting {len(altIndexList)} synonyms took {now()-sortStart:.2f} seconds",
    )

    log.info(f"Writing {len(altIndexList)} synonyms...")
    writeStart = now()
    with open(self._filename + ".syn", "wb") as synFile:
        records = [
            b_alt + b"\x00" + intToBinStr(wordIndex, 4)
            for b_alt, wordIndex in altIndexList
        ]
        synFile.write(b"".join(records))
    log.info(
        f"Writing {len(altIndexList)} synonyms took {now()-writeStart:.2f} seconds",
    )
def writeSynFile(self, altIndexList):
    """
    Build .syn file

    Returns immediately for an empty altIndexList. Otherwise the list is
    sorted in place (key: sortKeyBytes of the alternate) and every
    (alternate, wordIndex) pair is written to self._filename + ".syn" as
    alternate + NUL + intToBinStr(wordIndex, 4). Elapsed times are logged.
    """
    if not altIndexList:
        return

    def alternateKey(pair):
        return sortKeyBytes(pair[0])

    log.info("Sorting %s synonyms..." % len(altIndexList))
    started = now()
    # timings noted by the original author for this sort:
    # 28 seconds with old sort key (converted from custom cmp)
    # 0.63 seconds with my new sort key
    # 0.20 seconds without key function (default sort)
    altIndexList.sort(key=alternateKey)
    sortReport = "Sorting %s synonyms took %.2f seconds" % (
        len(altIndexList),
        now() - started,
    )
    log.info(sortReport)

    log.info("Writing %s synonyms..." % len(altIndexList))
    started = now()
    with open(self._filename + ".syn", "wb") as synFile:
        records = [
            b_alt + b"\x00" + intToBinStr(wordIndex, 4)
            for b_alt, wordIndex in altIndexList
        ]
        synFile.write(b"".join(records))
    writeReport = "Writing %s synonyms took %.2f seconds" % (
        len(altIndexList),
        now() - started,
    )
    log.info(writeReport)
def writeGeneral(self):
    """Build StarDict dictionary in general case.

    Every item definition may consist of an arbitrary number of articles.
    sametypesequence option is not used.

    Each item of self.glos.data is (word, defi) optionally followed by a
    mapping that may hold 'alts' (alternate words), 'defiFormat' (format
    of the main article) and 'defis' (extra (text, format) records).
    A main-article format other than 'm' or 'h' is written as 'm'.
    Writes the '.dict' and '.idx' files under self.fileBasePath, then
    calls writeSynFile and writeIfoFile.
    """
    dictMark = 0  # running offset of the next data block
    idxParts = []
    dictParts = []
    alternates = []  # contains tuples ('alternate', index-of-word)
    for i, item in enumerate(self.glos.data):
        word, defi = item[:2]
        extra = item[2] if len(item) > 2 else {}
        if 'alts' in extra:
            alternates += [(x, i) for x in extra['alts']]
        if 'defiFormat' in extra:
            articleFormat = extra['defiFormat']
            # compare against the two allowed values; a substring test
            # against 'mh' would also accept '' and 'mh'
            if articleFormat not in ('m', 'h'):
                articleFormat = 'm'
        else:
            articleFormat = 'm'
        assert isinstance(articleFormat, str) and len(articleFormat) == 1
        # each article: one format char, the text, a NUL terminator
        dictParts.append(articleFormat)
        dictParts.append(defi + '\x00')
        dataLen = 1 + len(defi) + 1
        if 'defis' in extra:
            for rec in extra['defis']:
                defi, t = rec[:2]
                assert isinstance(t, str) and len(t) == 1
                dictParts.append(t)
                dictParts.append(defi + '\x00')
                dataLen += 1 + len(defi) + 1
        idxParts.append(
            word + '\x00' + intToBinStr(dictMark, 4) + intToBinStr(dataLen, 4)
        )
        dictMark += dataLen
    # join once instead of growing two strings with `+=` per article
    idxStr = ''.join(idxParts)
    with open(self.fileBasePath + '.dict', 'wb') as f:
        f.write(''.join(dictParts))
    with open(self.fileBasePath + '.idx', 'wb') as f:
        f.write(idxStr)
    self.writeSynFile(alternates)
    self.writeIfoFile(len(idxStr), len(alternates))
def writeGeneral(self):
    """Build StarDict dictionary in general case.

    Every item definition may consist of an arbitrary number of articles.
    sametypesequence option is not used.

    Reads items from self.glos.data. The first two elements of an item are
    the word and its main definition; an optional third element is looked
    up for the keys 'alts', 'defiFormat' and 'defis'. The main article's
    format falls back to 'm' unless it is exactly 'm' or 'h'. The result
    goes to the '.dict' and '.idx' files under self.fileBasePath, followed
    by calls to writeSynFile and writeIfoFile.
    """
    dictMark = 0  # offset at which the current item's data begins
    idxChunks = []
    dictChunks = []
    alternates = []  # contains tuples ('alternate', index-of-word)
    for i, item in enumerate(self.glos.data):
        word, defi = item[:2]
        hasExtra = len(item) > 2
        if hasExtra and 'alts' in item[2]:
            alternates.extend([(x, i) for x in item[2]['alts']])
        articleFormat = 'm'
        if hasExtra and 'defiFormat' in item[2]:
            articleFormat = item[2]['defiFormat']
            # membership in a tuple, not a substring test on 'mh', so that
            # '' and 'mh' also fall back to plain text
            if articleFormat not in ('m', 'h'):
                articleFormat = 'm'
        assert isinstance(articleFormat, str) and len(articleFormat) == 1
        dictChunks.append(articleFormat)
        dictChunks.append(defi + '\x00')
        # format char + text + NUL
        dataLen = 1 + len(defi) + 1
        if hasExtra and 'defis' in item[2]:
            for rec in item[2]['defis']:
                defi, t = rec[:2]
                assert isinstance(t, str) and len(t) == 1
                dictChunks.append(t)
                dictChunks.append(defi + '\x00')
                dataLen += 1 + len(defi) + 1
        idxChunks.append(
            word + '\x00' + intToBinStr(dictMark, 4) + intToBinStr(dataLen, 4)
        )
        dictMark += dataLen
    # a single join keeps the build linear; repeated `+=` may be quadratic
    dictStr = ''.join(dictChunks)
    idxStr = ''.join(idxChunks)
    with open(self.fileBasePath + '.dict', 'wb') as f:
        f.write(dictStr)
    with open(self.fileBasePath + '.idx', 'wb') as f:
        f.write(idxStr)
    indexFileSize = len(idxStr)
    self.writeSynFile(alternates)
    self.writeIfoFile(indexFileSize, len(alternates))
def writeSynFile(self, alternates):
    """Build .syn file

    Does nothing when alternates is empty. Otherwise sorts alternates in
    place, comparing the first element of each item with stardict_strcmp,
    and writes item[0] + NUL + intToBinStr(item[1], 4) for every item to
    self.fileBasePath + '.syn'.
    """
    if not alternates:
        return
    # imported locally so this method does not depend on the file header
    from functools import cmp_to_key
    # list.sort() only accepts a positional cmp function on Python 2;
    # wrapping it with cmp_to_key works on 2.7 and on Python 3
    wordKey = cmp_to_key(stardict_strcmp)
    alternates.sort(key=lambda x: wordKey(x[0]))
    # join once instead of growing the string with `+=` per item
    synStr = ''.join(
        item[0] + '\x00' + intToBinStr(item[1], 4) for item in alternates
    )
    with open(self.fileBasePath + '.syn', 'wb') as f:
        f.write(synStr)
def writeSynFile(self, alternates):
    """Build .syn file

    An empty alternates list produces no file. A non-empty list is sorted
    in place by its items' first element, using stardict_strcmp as the
    comparison, and written to self.fileBasePath + '.syn' as a sequence of
    records: item[0], a NUL character, then intToBinStr(item[1], 4).
    """
    if not alternates:
        return
    # local import keeps this method self-contained
    from functools import cmp_to_key
    # the positional (cmp, key) form of list.sort() exists only on
    # Python 2; a cmp_to_key wrapper gives the same ordering everywhere
    compareKey = cmp_to_key(stardict_strcmp)

    def sortKey(item):
        return compareKey(item[0])

    alternates.sort(key=sortKey)
    # collect the records and join once; `+=` in a loop may be quadratic
    records = [
        item[0] + '\x00' + intToBinStr(item[1], 4) for item in alternates
    ]
    with open(self.fileBasePath + '.syn', 'wb') as f:
        f.write(''.join(records))
def writeSynFile(self, alternates):
    """
    Build .syn file

    Does nothing when alternates is empty. Otherwise sorts alternates in
    place by sortKey of each item's first element and writes
    toBytes(item[0]) + NUL + intToBinStr(item[1], 4) for every item to
    self.fileBasePath + '.syn'.
    """
    if not alternates:
        return
    alternates.sort(key=lambda x: sortKey(x[0]))
    # bytes are immutable, so `+=` in a loop copies the whole buffer each
    # time; joining once keeps the build linear
    synBytes = b''.join(
        toBytes(item[0]) + b'\x00' + intToBinStr(item[1], 4)
        for item in alternates
    )
    with open(self.fileBasePath + '.syn', 'wb') as f:
        f.write(synBytes)
def writeSynFile(self, alternates):
    """
    Build .syn file

    An empty alternates list produces no file. A non-empty list is sorted
    in place (key: sortKey of the item's first element) and written to
    self.fileBasePath + '.syn', one record per item: toBytes(item[0]),
    a NUL byte, then intToBinStr(item[1], 4).
    """
    if not alternates:
        return
    alternates.sort(key=lambda x: sortKey(x[0]))
    # build every record first and join once, rather than re-copying a
    # growing bytes object on each `+=`
    records = [
        toBytes(item[0]) + b'\x00' + intToBinStr(item[1], 4)
        for item in alternates
    ]
    with open(self.fileBasePath + '.syn', 'wb') as f:
        f.write(b''.join(records))
def writeGeneral(self) -> None:
    """
    Build StarDict dictionary in general case.
    Every item definition may consist of an arbitrary number of articles.
    sametypesequence option is not used.

    Iterates self._glos. Data entries are saved into self._resDir and
    skipped; every other entry adds one block to self._filename + ".dict"
    and one record to self._filename + ".idx". A format other than "m" or
    "h" is written as "m". An empty self._resDir is removed afterwards,
    then writeSynFile and writeIfoFile are called.
    """
    dictMark = 0  # offset of the next block in the .dict file
    altIndexList = []  # list of tuples (b"alternate", wordIndex)
    indexFileSize = 0
    wordCount = 0
    defiFormatCounter = Counter()

    # `with` closes both files even when an entry raises part-way through
    with open(self._filename + ".dict", "wb") as dictFile, \
            open(self._filename + ".idx", "wb") as idxFile:
        t0 = now()
        if not isdir(self._resDir):
            os.mkdir(self._resDir)

        for entry in self._glos:
            if entry.isData():
                entry.save(self._resDir)
                continue

            # wordCount is incremented once per written entry, so before
            # the increment it is this entry's index
            entryI = wordCount

            words = entry.getWords()  # list of strs
            word = words[0]  # str
            defis = entry.getDefis()  # list of strs

            entry.detectDefiFormat()  # call no more than once
            defiFormat = entry.getDefiFormat()
            # the counter records the format as reported, before fallback
            defiFormatCounter[defiFormat] += 1
            if defiFormat not in ("m", "h"):
                defiFormat = "m"

            for alt in words[1:]:
                altIndexList.append((alt.encode("utf-8"), entryI))

            # each article: format char + text, UTF-8 encoded, NUL-terminated
            b_dictBlock = (defiFormat + defis[0]).encode("utf-8") + b"\x00"
            for altDefi in defis[1:]:
                b_dictBlock += (defiFormat + altDefi).encode("utf-8") + b"\x00"
            dictFile.write(b_dictBlock)

            blockLen = len(b_dictBlock)
            b_idxBlock = word.encode("utf-8") + b"\x00" + \
                intToBinStr(dictMark, 4) + \
                intToBinStr(blockLen, 4)
            idxFile.write(b_idxBlock)

            dictMark += blockLen
            indexFileSize += len(b_idxBlock)
            wordCount += 1

    # drop the resource directory again if no data entry was saved into it
    if not os.listdir(self._resDir):
        os.rmdir(self._resDir)
    log.info(f"Writing dict file took {now()-t0:.2f} seconds")
    log.debug("defiFormatsCount = " + pformat(defiFormatCounter.most_common()))

    self.writeSynFile(altIndexList)
    self.writeIfoFile(wordCount, indexFileSize, len(altIndexList))
def writeCompact(self, defiFormat):
    """
    Build StarDict dictionary with sametypesequence option specified.
    Every item definition consists of a single article.
    All articles have the same format, specified in defiFormat parameter.

    Iterates self._glos. Data entries are saved into self._resDir and
    skipped; every other entry adds one block to self._filename + ".dict"
    and one record to self._filename + ".idx". An empty self._resDir is
    removed afterwards, then writeSynFile and writeIfoFile are called.

    Parameters:
    defiFormat - format of article definition: h - html, m - plain text
    """
    dictMark = 0  # offset of the next block in the .dict file
    altIndexList = []  # list of tuples (b"alternate", wordIndex)
    indexFileSize = 0
    wordCount = 0

    # `with` closes both files even when an entry raises part-way through
    with open(self._filename + ".dict", "wb") as dictFile, \
            open(self._filename + ".idx", "wb") as idxFile:
        t0 = now()
        if not isdir(self._resDir):
            os.mkdir(self._resDir)

        entryI = -1  # index among written (non-data) entries
        for entry in self._glos:
            if entry.isData():
                entry.save(self._resDir)
                continue
            entryI += 1

            words = entry.getWords()  # list of strs
            word = words[0]  # str
            defis = entry.getDefis()  # list of strs

            for alt in words[1:]:
                altIndexList.append((alt.encode("utf-8"), entryI))

            # no per-article format char here; NUL separates the
            # definitions rather than terminating them
            b_dictBlock = defis[0].encode("utf-8")
            for altDefi in defis[1:]:
                b_dictBlock += b"\x00" + altDefi.encode("utf-8")
            dictFile.write(b_dictBlock)

            blockLen = len(b_dictBlock)
            b_idxBlock = word.encode("utf-8") + b"\x00" + \
                intToBinStr(dictMark, 4) + \
                intToBinStr(blockLen, 4)
            idxFile.write(b_idxBlock)

            dictMark += blockLen
            indexFileSize += len(b_idxBlock)
            wordCount += 1

    # drop the resource directory again if no data entry was saved into it
    if not os.listdir(self._resDir):
        os.rmdir(self._resDir)
    log.info(f"Writing dict file took {now()-t0:.2f} seconds")
    log.debug("defiFormat = " + pformat(defiFormat))

    self.writeSynFile(altIndexList)
    self.writeIfoFile(
        wordCount,
        indexFileSize,
        len(altIndexList),
        defiFormat,
    )
def writeGeneral(self) -> None:
    """
    Build StarDict dictionary in general case.
    Every item definition may consist of an arbitrary number of articles.
    sametypesequence option is not used.

    This is a generator: each entry is received through `yield`, and
    receiving None ends the input. Data entries are saved into
    self._resDir and skipped; every other entry adds one block to
    self._filename + ".dict" and one record to self._filename + ".idx".
    A format other than "m", "h" or "x" is logged as an error and written
    as "m". After the input ends, an empty self._resDir is removed and
    writeSynFile and writeIfoFile are called.
    """
    # NOTE(review): the body contains `yield`, so calling this returns a
    # generator; the `-> None` annotation is kept for interface stability
    dictMark = 0  # offset of the next block in the .dict file
    altIndexList = []  # list of tuples (b"alternate", entryIndex)
    indexFileSize = 0
    wordCount = 0
    defiFormatCounter = Counter()

    # `with` closes both files if an entry raises or the generator is
    # closed before the terminating None arrives
    with open(self._filename + ".dict", "wb") as dictFile, \
            open(self._filename + ".idx", "wb") as idxFile:
        t0 = now()
        if not isdir(self._resDir):
            os.mkdir(self._resDir)

        entryIndex = -1  # index among written (non-data) entries
        while True:
            entry = yield
            if entry is None:
                break
            if entry.isData():
                entry.save(self._resDir)
                continue
            entryIndex += 1

            entry.detectDefiFormat()  # call no more than once
            defiFormat = entry.defiFormat
            # the counter records the format as reported, before fallback
            defiFormatCounter[defiFormat] += 1
            if defiFormat not in ("m", "h", "x"):
                log.error(f"invalid defiFormat={defiFormat}, using 'm'")
                defiFormat = "m"

            words = entry.l_word  # list of strs
            word = words[0]  # str
            defi = self.fixDefi(entry.defi, defiFormat)
            # defi is str

            for alt in words[1:]:
                altIndexList.append((alt.encode("utf-8"), entryIndex))

            # one article per entry: format char + text, NUL-terminated
            b_dictBlock = (defiFormat + defi).encode("utf-8") + b"\x00"
            dictFile.write(b_dictBlock)
            blockLen = len(b_dictBlock)

            b_idxBlock = word.encode("utf-8") + b"\x00" + \
                intToBinStr(dictMark, 4) + \
                intToBinStr(blockLen, 4)
            idxFile.write(b_idxBlock)

            dictMark += blockLen
            indexFileSize += len(b_idxBlock)
            wordCount += 1

    # drop the resource directory again if no data entry was saved into it
    if not os.listdir(self._resDir):
        os.rmdir(self._resDir)
    log.info(f"Writing dict file took {now()-t0:.2f} seconds")
    log.debug("defiFormatsCount = " + pformat(defiFormatCounter.most_common()))

    self.writeSynFile(altIndexList)
    self.writeIfoFile(
        wordCount,
        indexFileSize,
        len(altIndexList),
    )
def writeGeneral(self):
    """
    Build StarDict dictionary in general case.
    Every item definition may consist of an arbitrary number of articles.
    sametypesequence option is not used.

    Walks self._glos: data entries are saved to self._resDir and skipped,
    all other entries are appended to the ".dict" and ".idx" files derived
    from self._filename. A format other than "m" or "h" is written as "m".
    An empty self._resDir is removed at the end, then writeSynFile and
    writeIfoFile are called.
    """
    dictMark = 0  # where the current entry's block starts in the .dict file
    altIndexList = []  # list of tuples (b"alternate", wordIndex)
    indexFileSize = 0
    wordCount = 0
    defiFormatCounter = Counter()

    # context managers guarantee the handles are released on any error
    with open(self._filename + ".dict", "wb") as dictFile, \
            open(self._filename + ".idx", "wb") as idxFile:
        t0 = now()
        if not isdir(self._resDir):
            os.mkdir(self._resDir)

        entryI = -1  # index among written (non-data) entries
        for entry in self._glos:
            if entry.isData():
                entry.save(self._resDir)
                continue
            entryI += 1

            words = entry.getWords()  # list of strs
            word = words[0]  # str
            defis = entry.getDefis()  # list of strs

            entry.detectDefiFormat()  # call no more than once
            defiFormat = entry.getDefiFormat()
            # counted as reported, i.e. before the fallback below
            defiFormatCounter[defiFormat] += 1
            if defiFormat not in ("m", "h"):
                defiFormat = "m"
            assert isinstance(defiFormat, str) and len(defiFormat) == 1

            altIndexList.extend(
                [(alt.encode("utf-8"), entryI) for alt in words[1:]]
            )

            # block = (format char + article, UTF-8) + NUL, per article
            b_dictBlock = (defiFormat + defis[0]).encode("utf-8") + b"\x00"
            for altDefi in defis[1:]:
                b_dictBlock += (defiFormat + altDefi).encode("utf-8") + b"\x00"
            dictFile.write(b_dictBlock)

            blockLen = len(b_dictBlock)
            b_idxBlock = word.encode("utf-8") + b"\x00" + \
                intToBinStr(dictMark, 4) + \
                intToBinStr(blockLen, 4)
            idxFile.write(b_idxBlock)

            dictMark += blockLen
            indexFileSize += len(b_idxBlock)
            wordCount += 1

    # remove the resource directory if nothing was saved into it
    if not os.listdir(self._resDir):
        os.rmdir(self._resDir)
    log.info("Writing dict file took %.2f seconds", now() - t0)
    log.debug("defiFormatsCount = " + pformat(defiFormatCounter.most_common()))

    self.writeSynFile(altIndexList)
    self.writeIfoFile(wordCount, indexFileSize, len(altIndexList))