def readEntryDefi(self, block, pos, b_word):
    """
    Read the definition part of an entry, starting at `pos` in block.data.

    Layout, as read here:
        <2 byte definition length><definition bytes>

    Returns a 4-tuple:
        (False, None, None, None) on error
        (True, pos, u_defi, b_defi) on success, where
            pos is the offset just past the definition,
            u_defi is the result of self.processDefi(b_defi, b_word),
            b_defi is the raw slice of block.data

    Also raises self.defiMaxBytes to len(b_defi) when that is larger.
    """
    failure = (False, None, None, None)
    data = block.data
    dataLen = len(data)

    # the two length bytes must fit inside the block
    sizeEnd = pos + 2
    if sizeEnd > dataLen:
        log.error(
            f"reading block offset={block.offset:#02x}"
            f", reading defi size: pos + 2 > len(block.data)"
        )
        return failure
    defiLen = uintFromBytes(data[pos:sizeEnd])

    # ... and so must the definition itself
    defiEnd = sizeEnd + defiLen
    if defiEnd > dataLen:
        log.error(
            f"reading block offset={block.offset:#02x}"
            f", block.type={block.type}"
            f", reading defi: pos + Len > len(block.data)"
        )
        return failure

    b_defi = data[sizeEnd:defiEnd]
    u_defi = self.processDefi(b_defi, b_word)
    self.defiMaxBytes = max(self.defiMaxBytes, len(b_defi))
    return True, defiEnd, u_defi, b_defi
def openGzip(self):
    """
    Validate the file header and open the embedded gzip stream.

    Header layout, as read here (first 6 bytes of self._filename):
        4 bytes: signature, hex 12 34 00 01 or 12 34 00 02
        2 bytes: offset of the gzip data inside the file

    On success, sets self.gzipOffset and self.file and returns True.
    On a short or unrecognized header, or an offset smaller than the
    header itself, logs an error and returns False.
    """
    # Only the header is needed from this handle; it is closed before
    # the file is opened again through FileOffS below.
    # (open() either raises or returns a usable file object, so there is
    # no "empty file pointer" case to test for.)
    with open(self._filename, "rb") as bglFile:
        b_head = bglFile.read(6)

    if len(b_head) < 6 or b_head[:4] not in (
        b"\x12\x34\x00\x01",
        b"\x12\x34\x00\x02",
    ):
        log.error(f"invalid header: {b_head[:6]!r}")
        return False

    self.gzipOffset = gzipOffset = uintFromBytes(b_head[4:6])
    log.debug(f"Position of gz header: {gzipOffset}")

    # the gzip data cannot start inside the 6-byte header
    if gzipOffset < 6:
        log.error(f"invalid gzip header position: {gzipOffset}")
        return False

    self.file = BGLGzipFile(
        fileobj=FileOffS(self._filename, gzipOffset),
        closeFileobj=True,
    )
    return True
def languageInfoDecode(b_value):
    """
    Decode a language code and look it up in languageByCode.

    b_value holds the language code as an unsigned integer in bytes form
    (decoded with uintFromBytes).

    Returns languageByCode[code] (a BabylonLanguage instance), or None
    after logging a warning when the code is not known.
    """
    intValue = uintFromBytes(b_value)
    try:
        return languageByCode[intValue]
    except LookupError:
        # LookupError is the common base of KeyError (mapping lookup) and
        # IndexError (sequence lookup), so an unknown code is reported
        # instead of raised whichever container languageByCode is.
        log.warning(f"read_type_3: unknown language code = {intValue}")
        return None
def readType0(self, block):
    """
    Handle a block of type 0, dispatching on the first byte of its data.

    code 2: an integer is decoded from the remaining bytes (not stored)
    code 8: the remaining bytes are decoded with charsetInfoDecode and
        stored as self.defaultCharset; a warning is logged if the
        result is falsy
    other:  the block is passed to self.logUnknownBlock

    Returns True for codes 2 and 8, False otherwise.
    """
    code = block.data[0]

    if code not in (2, 8):
        self.logUnknownBlock(block)
        return False

    if code == 2:
        # this number is very close to self.bgl_numEntries,
        # but is not always equal to the number of entries
        # see self.readType3, code == 12 as well
        _approxNumEntries = uintFromBytes(block.data[1:])
        return True

    # code == 8
    self.defaultCharset = charsetInfoDecode(block.data[1:])
    if not self.defaultCharset:
        log.warning("defaultCharset is not valid")
    return True
def readType3(self, block):
    """
    Read a block of type 3 and record the info item it carries.

    Layout, as read here:
        <2 byte info code><value bytes>

    The code is looked up in infoType3ByCode. Depending on the item:
        - a name ending in ".ico": (name, raw value) is appended to
          self.iconDataList
        - otherwise the value is passed through item.decode (when set);
          a falsy result is dropped, a dict is merged into self.info,
          and any other value is set as an attribute of self (when
          item.attr is truthy) or stored as self.info[name]

    Blocks with an empty value or an unknown code are ignored (unknown
    codes with a non-null value are logged at debug level).

    Returns None.
    """
    code, b_value = uintFromBytes(block.data[:2]), block.data[2:]
    if not b_value:
        return
    # if not b_value.strip(b"\x00"): return  # FIXME

    try:
        item = infoType3ByCode[code]
    except KeyError:
        if b_value.strip(b"\x00"):
            log.debug(
                f"Unknown info type code={code:#02x}, b_value={b_value!r}",
            )
        return

    name = item.name
    decoder = item.decode

    if name.endswith(".ico"):
        self.iconDataList.append((name, b_value))
        return

    value = b_value if decoder is None else decoder(b_value)

    # `value` can be None, str, bytes or dict
    if not value:
        return

    if isinstance(value, dict):
        self.info.update(value)
    elif item.attr:
        setattr(self, name, value)
    else:
        self.info[name] = value
def readBytes(self, num):
    """
    Read `num` bytes (1 to 4) from self.file and decode them as an
    unsigned integer with uintFromBytes.

    Returns the decoded integer, or -1 when:
        - `num` is outside 1..4 (logged as error)
        - nothing could be read, i.e. end of file (logged at debug level)
        - fewer than `num` bytes were read (logged as error)
    """
    if num < 1 or num > 4:
        log.error(f"invalid argument num={num}")
        return -1

    self.file.flush()
    buf = self.file.read(num)
    bufLen = len(buf)

    if bufLen == num:
        return uintFromBytes(buf)

    if bufLen == 0:
        log.debug("readBytes: end of file: len(buf)==0")
    else:
        log.error(
            f"readBytes: expected to read {num} bytes"
            f", but found {len(buf)} bytes"
        )
    return -1
def flagsInfoDecode(b_value):
    """
    Decode the glossary flags bitfield into a dict.

    Returned keys:
        utf8Encoding
            True when bit 0x8000 is set.
            when this flag is set utf8 encoding is used for all articles
            when false, the encoding is set according to the source
            and target alphabet
        bgl_spellingAlternatives
            True when bit 0x10000 is NOT set.
            determines whether the glossary offers spelling alternatives
            for searched terms
        bgl_caseSensitive
            True when bit 0x1000 is set.
            defines if the search for terms in this glossary is
            case sensitive
            see code 0x20 as well
    """
    flags = uintFromBytes(b_value)
    utf8Encoding = bool(flags & 0x8000)
    spellingAlternatives = not (flags & 0x10000)
    caseSensitive = bool(flags & 0x1000)
    return {
        "utf8Encoding": utf8Encoding,
        "bgl_spellingAlternatives": spellingAlternatives,
        "bgl_caseSensitive": caseSensitive,
    }
def utf16InfoDecode(b_value):
    """
    Decode a UTF-16 info value (block type 3).

    b_value is a bytes-like object laid out, as read here:
        byte 0:     must be 0
        byte 1:     0 -> no message follows (b_value should end here)
                    1 -> a message follows
        bytes 2-5:  message length, counted in 2-byte units
        bytes 6-7:  expected to be zero
        bytes 8-:   message encoded as UTF-16

    Returns the decoded str, or None on errors: value shorter than
    2 bytes, unexpected leading bytes, no message present, or bytes
    that are not valid UTF-16. A non-zero filler at 6:8 or a length
    mismatch is only logged; decoding is still attempted.
    """
    if len(b_value) < 2:
        # the two leading bytes are indexed below; bail out instead of
        # raising IndexError on a truncated value
        log.warning(
            f"utf16InfoDecode: b_value={b_value!r}, too short",
        )
        return None

    if b_value[0] != 0:
        log.warning(
            f"utf16InfoDecode: b_value={b_value!r}, null expected at 0",
        )
        return None

    if b_value[1] == 0:
        if len(b_value) > 2:
            log.warning(
                f"utf16InfoDecode: unexpected b_value size: {len(b_value)}",
            )
        return None

    if b_value[1] > 1:
        log.warning(
            f"utf16InfoDecode: b_value={b_value!r}, unexpected byte at 1",
        )
        return None

    # now b_value[1] == 1
    size = 2 * uintFromBytes(b_value[2:6])
    if tuple(b_value[6:8]) != (0, 0):
        log.warning(
            f"utf16InfoDecode: b_value={b_value!r}, null expected at 6:8",
        )
    if size != len(b_value) - 8:
        log.warning(
            f"utf16InfoDecode: b_value={b_value!r}, size does not match",
        )

    try:
        return b_value[8:].decode("utf16")  # str
    except UnicodeDecodeError as e:
        # honour the "None on errors" contract instead of propagating
        log.warning(
            f"utf16InfoDecode: b_value={b_value!r}, {e}",
        )
        return None
def decodeBglBinTime(b_value):
    """
    Decode a timestamp stored as a number of minutes counted from
    1970/01/01 00:00 and format it.

    b_value is decoded with uintFromBytes; whole days are converted to a
    calendar date through the `gregorian` module.

    Returns a str formatted as "YYYY/MM/DD, HH:MM".
    """
    epochJd = gregorian.to_jd(1970, 1, 1)
    totalMinutes = uintFromBytes(b_value)

    minutesPerDay = 24 * 60
    days, minuteOfDay = divmod(totalMinutes, minutesPerDay)
    hour, minute = divmod(minuteOfDay, 60)

    year, month, day = gregorian.jd_to(days + epochJd)
    return f"{year:04d}/{month:02d}/{day:02d}, {hour:02d}:{minute:02d}"
def readEntry_Type11(self, block):
    """
    Parse an entry stored in a block of type 11.

    Layout, as read here:
        <5 byte headword length><headword>
        <4 byte alternates count>
        up to "alternates count" times: <4 byte length><alternate>
            (a zero length stops the list early)
        <4 byte definition length><definition>

    Side effects: updates self.wordLenMax and self.defiMaxBytes.

    Returns (succeed, u_word, u_alts, u_defi), where u_alts is a sorted
    list without duplicates and without the headword itself.
    On error returns (False, None, None, None).
    """
    failure = (False, None, None, None)
    data = block.data
    dataLen = len(data)
    pos = 0

    # reading headword
    if pos + 5 > dataLen:
        log.error(
            f"reading block offset={block.offset:#02x}"
            f", reading word size: pos + 5 > len(block.data)"
        )
        return failure
    wordLen = uintFromBytes(data[pos:pos + 5])
    pos += 5
    if pos + wordLen > dataLen:
        log.error(
            f"reading block offset={block.offset:#02x}"
            f", block.type={block.type}"
            f", reading word: pos + wordLen > len(block.data)"
        )
        return failure
    b_word = data[pos:pos + wordLen]
    u_word = self.processKey(b_word)
    pos += wordLen
    self.wordLenMax = max(self.wordLenMax, len(u_word))

    # reading alts and defi
    if pos + 4 > dataLen:
        log.error(
            f"reading block offset={block.offset:#02x}"
            f", reading defi size: pos + 4 > len(block.data)"
        )
        return failure
    altsCount = uintFromBytes(data[pos:pos + 4])
    pos += 4

    # reading alts
    # use set instead of list to prevent duplicates
    altSet = set()
    for _ in range(altsCount):
        if pos + 4 > dataLen:
            log.error(
                f"reading block offset={block.offset:#02x}"
                f", reading alt size: pos + 4 > len(block.data)"
            )
            return failure
        altLen = uintFromBytes(data[pos:pos + 4])
        pos += 4

        if altLen == 0:
            # a zero-length alternate ends the list; it is only
            # remarkable when it is not at the very end of the data
            if pos + altLen != dataLen:
                # no evidence
                log.warning(
                    f"reading block offset={block.offset:#02x}"
                    f", reading alt size: pos + altLen != len(block.data)"
                )
            break

        if pos + altLen > dataLen:
            log.error(
                f"reading block offset={block.offset:#02x}"
                f", block.type={block.type}"
                f", reading alt: pos + altLen > len(block.data)"
            )
            return failure
        b_alt = data[pos:pos + altLen]
        # Like entry key, alt is not processed as html by babylon,
        # so do we.
        altSet.add(self.processAlternativeKey(b_alt, b_word))
        pos += altLen

    altSet.discard(u_word)
    u_alts = sorted(altSet)

    # reading defi
    # NOTE: the 4 length bytes are read without a bounds check; when the
    # data ends here the short (possibly empty) slice is decoded as-is.
    defiLen = uintFromBytes(data[pos:pos + 4])
    pos += 4
    if pos + defiLen > dataLen:
        log.error(
            f"reading block offset={block.offset:#02x}"
            f", block.type={block.type}"
            f", reading defi: pos + defiLen > len(block.data)"
        )
        return failure
    b_defi = data[pos:pos + defiLen]
    u_defi = self.processDefi(b_defi, b_word)
    self.defiMaxBytes = max(self.defiMaxBytes, len(b_defi))
    pos += defiLen

    return True, u_word, u_alts, u_defi
def collectDefiFields(self, b_defi, b_key, fields):
    r"""
    Split the trailing fields off an entry definition and store the
    pieces as attributes of `fields`.

    entry definition structure:
    <main definition>['\x14'[{field_code}{field_data}]*]
    {field_code} is one byte
    {field_data} has arbitrary length

    b_defi: raw definition bytes
    b_key: raw entry key, used only in log messages
    fields: object receiving the results; fields.partOfSpeech,
        fields.b_field_06 and fields.b_title are read (duplicate
        detection) before being written

    Always sets fields.b_defi (the main definition). Field codes handled:
        0x02  part of speech       -> fields.partOfSpeech
        0x06  one byte             -> fields.b_field_06
        0x07  two bytes            -> fields.b_field_07
        0x13  1-byte length + data -> fields.b_field_13
        0x18  1-byte length + text -> fields.b_title
        0x1a  1-byte length + text -> fields.b_field_1a
        0x28  2-byte length + text -> fields.b_title_trans
        0x40-0x4f  one byte + text -> only logged
        0x50  code, 1-byte length, data -> fields.code_transcription_50,
                                           fields.b_transcription_50
        0x60  code, 2-byte length, text -> fields.code_transcription_60,
                                           fields.b_transcription_60

    Parsing stops (keeping what was collected so far) at the first
    truncated or unknown field; every such problem is logged at debug
    level. Returns None.
    """

    def report(detail):
        # single place for the debug message format, so that every
        # problem is reported with the same, repr()-formatted context
        log.debug(
            f"collecting definition fields, b_defi = {b_defi!r}"
            f"\nb_key = {b_key!r}:\n{detail}"
        )

    # d0 is index of the '\x14 char in b_defi
    # d0 may be the last char of the string
    d0 = self.findDefiFieldsStart(b_defi)
    if d0 == -1:
        fields.b_defi = b_defi
        return

    fields.b_defi = b_defi[:d0]

    size = len(b_defi)
    i = d0 + 1
    while i < size:
        code = b_defi[i]
        if self.metadata2:
            self.metadata2.defiTrailingFields[code] += 1

        if code == 0x02:
            # part of speech
            # "\x02" <one char - part of speech>
            if fields.partOfSpeech:
                report("duplicate part of speech item")
            if i + 1 >= size:
                report("b_defi ends after \\x02")
                return

            posCode = b_defi[i + 1]
            try:
                fields.partOfSpeech = partOfSpeechByCode[posCode]
            except KeyError:
                report(f"unknown part of speech code = {posCode:#02x}")
                return
            i += 2

        elif code == 0x06:
            # \x06<one byte>
            if fields.b_field_06:
                report("duplicate type 6")
            if i + 1 >= size:
                report("b_defi ends after \\x06")
                return
            fields.b_field_06 = b_defi[i + 1]
            i += 2

        elif code == 0x07:
            # \x07<two bytes>
            # Found in 4 Hebrew dictionaries. I do not understand.
            if i + 3 > size:
                report("too few data after \\x07")
                return
            fields.b_field_07 = b_defi[i + 1:i + 3]
            i += 3

        elif code == 0x13:
            # "\x13"<one byte - length><data>
            # known values:
            # 03 06 0D C7
            # 04 00 00 00 44
            # ...
            # 04 00 00 00 5F
            if i + 1 >= size:
                report("too few data after \\x13")
                return
            fieldLen = b_defi[i + 1]
            i += 2
            if fieldLen == 0:
                report("blank data after \\x13")
                continue
            if i + fieldLen > size:
                report("too few data after \\x13")
                return
            fields.b_field_13 = b_defi[i:i + fieldLen]
            i += fieldLen

        elif code == 0x18:
            # \x18<one byte - title length><entry title>
            if fields.b_title:
                report("duplicate entry title item")
            if i + 1 >= size:
                report("b_defi ends after \\x18")
                return
            i += 1
            fieldLen = b_defi[i]
            i += 1
            if fieldLen == 0:
                # blank entry title: skipped without logging
                continue
            if i + fieldLen > size:
                report("title is too long")
                return
            fields.b_title = b_defi[i:i + fieldLen]
            i += fieldLen

        elif code == 0x1a:
            # "\x1a"<one byte - length><text>
            # found only in Hebrew dictionaries, I do not understand.
            if i + 1 >= size:
                report("too few data after \\x1a")
                return
            fieldLen = b_defi[i + 1]
            i += 2
            if fieldLen == 0:
                report("blank data after \\x1a")
                continue
            if i + fieldLen > size:
                report("too few data after \\x1a")
                return
            fields.b_field_1a = b_defi[i:i + fieldLen]
            i += fieldLen

        elif code == 0x28:
            # "\x28" <two bytes - length><html text>
            # title with transcription?
            if i + 2 >= size:
                report("too few data after \\x28")
                return
            i += 1
            fieldLen = uintFromBytes(b_defi[i:i + 2])
            i += 2
            if fieldLen == 0:
                report("blank data after \\x28")
                continue
            if i + fieldLen > size:
                report("too few data after \\x28")
                return
            fields.b_title_trans = b_defi[i:i + fieldLen]
            i += fieldLen

        elif 0x40 <= code <= 0x4f:
            # [\x41-\x4f] <one byte> <text>
            # often contains digits as text:
            # 56
            # ælps - key Alps
            # 48@i
            # has no apparent influence on the article
            # the text length is derived from the field code itself
            fieldLen = code - 0x3f
            if i + 2 + fieldLen > size:
                report("too few data after \\x40+")
                return
            i += 2
            b_text = b_defi[i:i + fieldLen]
            i += fieldLen
            log.debug(
                f"unknown definition field {code:#02x}, b_text={b_text!r}"
            )

        elif code == 0x50:
            # \x50 <one byte> <one byte - length><data>
            if i + 2 >= size:
                report("too few data after \\x50")
                return
            fields.code_transcription_50 = b_defi[i + 1]
            fieldLen = b_defi[i + 2]
            i += 3
            if fieldLen == 0:
                report("blank data after \\x50")
                continue
            if i + fieldLen > size:
                report("too few data after \\x50")
                return
            fields.b_transcription_50 = b_defi[i:i + fieldLen]
            i += fieldLen

        elif code == 0x60:
            # "\x60" <one byte> <two bytes - length> <text>
            if i + 4 > size:
                report("too few data after \\x60")
                return
            fields.code_transcription_60 = b_defi[i + 1]
            i += 2
            fieldLen = uintFromBytes(b_defi[i:i + 2])
            i += 2
            if fieldLen == 0:
                report("blank data after \\x60")
                continue
            if i + fieldLen > size:
                report("too few data after \\x60")
                return
            fields.b_transcription_60 = b_defi[i:i + fieldLen]
            i += fieldLen

        else:
            report(f"unknown control char. Char code = {code:#02x}")
            return