def readIdxFile(self) -> List[Tuple[bytes, int, int]]:
	"""
	Load the whole index file into memory and split it into entries.

	The gzip-compressed "<filename>.idx.gz" is used when it exists,
	otherwise the plain "<filename>.idx" is opened.

	Every record is a null-terminated word followed by two 4-byte
	fields (offset, then size), each decoded with uint32FromBytes.

	Return a list of (b_word, offset, size) tuples. If a record is
	truncated, an error is logged and the entries parsed so far are
	returned.
	"""
	gzPath = self._filename + ".idx.gz"
	if isfile(gzPath):
		idxFile = gzip.open(gzPath)
	else:
		idxFile = open(self._filename + ".idx", "rb")
	with idxFile:
		rawIndex = idxFile.read()

	rawLen = len(rawIndex)
	entries = []
	cursor = 0
	while cursor < rawLen:
		# the word runs from cursor up to (not including) the next null
		nullPos = rawIndex.find(b"\x00", cursor)
		if nullPos < 0:
			log.error("Index file is corrupted")
			break
		fieldsStart = nullPos + 1
		fieldsEnd = fieldsStart + 8
		# both 4-byte fields must be fully present
		if fieldsEnd > rawLen:
			log.error("Index file is corrupted")
			break
		entries.append((
			rawIndex[cursor:nullPos],
			uint32FromBytes(rawIndex[fieldsStart:fieldsStart + 4]),
			uint32FromBytes(rawIndex[fieldsStart + 4:fieldsEnd]),
		))
		cursor = fieldsEnd
	return entries
def parseDefiBlockGeneral(self, b_block: bytes) -> List[Tuple[bytes, int]]:
	"""
	Split a definition block into its fields, for the case where the
	sametypesequence option is not specified.

	Each field starts with a one-byte type code:
		- lowercase letter: the data is terminated by a null byte
		- uppercase letter: the data is preceded by a 4-byte size,
			decoded with uint32FromBytes

	Return a list of (b_defi, defiFormatCode) tuples, where b_defi is
	bytes and defiFormatCode is int, so:
		defiFormat = chr(defiFormatCode)

	Return None if the block is malformed (type code is not a letter,
	missing null terminator, or truncated size/data).
	"""
	blockLen = len(b_block)
	fields = []
	pos = 0
	while pos < blockLen:
		typeCode = b_block[pos]
		typeByte = bytes([typeCode])
		if not typeByte.isalpha():
			return None
		pos += 1

		if typeByte.islower():
			# null-terminated field
			end = b_block.find(b"\x00", pos)
			if end < 0:
				return None
			fields.append((b_block[pos:end], typeCode))
			pos = end + 1
			continue

		# size-prefixed field
		assert typeByte.isupper()
		dataStart = pos + 4
		if dataStart > blockLen:
			return None
		size = uint32FromBytes(b_block[pos:dataStart])
		dataEnd = dataStart + size
		if dataEnd > blockLen:
			return None
		fields.append((b_block[dataStart:dataEnd], typeCode))
		pos = dataEnd

	return fields
def parseDefiBlockCompact(
	self,
	b_block: bytes,
	sametypesequence: str,
) -> List[Tuple[bytes, int]]:
	"""
	Split a definition block into its fields, for the case where the
	sametypesequence option is specified.

	The type codes are taken from sametypesequence instead of being
	stored in the block. For every type except the last one:
		- lowercase letter: the data is terminated by a null byte
		- uppercase letter: the data is preceded by a 4-byte size,
			decoded with uint32FromBytes
	The last field has neither terminator nor size: it takes all the
	remaining bytes (and must be non-empty).

	Return a list of (b_defi, defiFormatCode) tuples, where b_defi is
	bytes and defiFormatCode is int, so:
		defiFormat = chr(defiFormatCode)

	Return None if the block is malformed.
	"""
	typeCodes = sametypesequence.encode("utf-8")
	assert len(typeCodes) > 0

	blockLen = len(b_block)
	fields = []
	pos = 0
	for typeCode in typeCodes[:-1]:
		if pos >= blockLen:
			return None
		typeByte = bytes([typeCode])

		if typeByte.islower():
			# null-terminated field
			end = b_block.find(b"\x00", pos)
			if end < 0:
				return None
			fields.append((b_block[pos:end], typeCode))
			pos = end + 1
			continue

		# size-prefixed field
		assert typeByte.isupper()
		dataStart = pos + 4
		if dataStart > blockLen:
			return None
		size = uint32FromBytes(b_block[pos:dataStart])
		dataEnd = dataStart + size
		if dataEnd > blockLen:
			return None
		fields.append((b_block[dataStart:dataEnd], typeCode))
		pos = dataEnd

	# nothing left for the last field
	if pos >= blockLen:
		return None

	lastCode = typeCodes[-1]
	lastByte = bytes([lastCode])
	remaining = b_block[pos:]
	if lastByte.islower():
		# a null byte inside the last lowercase-typed field is rejected
		if b"\x00" in remaining:
			return None
	else:
		assert lastByte.isupper()
	fields.append((remaining, lastCode))
	return fields
def readSynFile(self) -> "Dict[int, List[str]]":
	"""
	Read "<filename>.syn" and group the alternates by entry index.

	Every record is a null-terminated alternate word followed by a
	4-byte entry index, decoded with uint32FromBytes.

	Return synDict, a dict { entryIndex -> altList }.
	Return an empty dict if the synonym file does not exist.

	A truncated record logs an error and stops the parsing. A record
	whose entry index is not below self._wordCount logs an error and
	is skipped.
	"""
	synPath = self._filename + ".syn"
	if not isfile(synPath):
		return {}

	decodeErrors = self._unicode_errors
	with open(synPath, "rb") as synFile:
		rawSyn = synFile.read()

	rawLen = len(rawSyn)
	altsByIndex = {}
	cursor = 0
	while cursor < rawLen:
		# the alternate runs from cursor up to the next null byte
		nullPos = rawSyn.find(b"\x00", cursor)
		if nullPos < 0:
			log.error("Synonym file is corrupted")
			break
		indexStart = nullPos + 1
		indexEnd = indexStart + 4
		if indexEnd > rawLen:
			log.error("Synonym file is corrupted")
			break

		b_alt = rawSyn[cursor:nullPos]  # bytes
		entryIndex = uint32FromBytes(rawSyn[indexStart:indexEnd])
		cursor = indexEnd

		if entryIndex >= self._wordCount:
			log.error(
				f"Corrupted synonym file. Word {b_alt} references invalid item"
			)
			continue

		s_alt = b_alt.decode("utf-8", errors=decodeErrors)  # str
		altsByIndex.setdefault(entryIndex, []).append(s_alt)

	return altsByIndex