def processAlternativeKey(self, b_word, b_key):
    """
    Clean up an alternative key and return it as a str instance.

    b_word is the raw alternative key, as a bytes instance
    b_key is the raw main key; it is only used in the debug message
        logged when strict decoding of b_word fails

    returns the decoded and cleaned-up alternative key, as str instance
    """
    # the second item (number of stripped indexes) is not needed here
    raw_alt, _strip_count = stripDollarIndexes(b_word)

    # convert to unicode: strict mode logs a decoding failure first,
    # but in both modes the final fallback is decoding with "ignore"
    if not self.strictStringConvertion:
        alt = raw_alt.decode(self.sourceEncoding, "ignore")
    else:
        try:
            alt = raw_alt.decode(self.sourceEncoding)
        except UnicodeError:
            log.debug(
                "processAlternativeKey(%s)\n" % b_word
                + "key = %s:\n" % b_key
                + "conversion error:\n%s" % excMessage()
            )
            alt = raw_alt.decode(self.sourceEncoding, "ignore")

    # strip "/" before words
    alt = re.sub(self.stripSlashAltKeyPattern, r"\1\2", alt)

    if self.processHtmlInKey:
        alt = replaceHtmlEntriesInKeys(stripHtmlTags(alt))

    alt = removeNewlines(removeControlChars(alt))
    return alt.lstrip().rstrip(self.keyRStripChars)
def processKey(self, b_word):
    """
    Clean up a main key and return it as a str instance.

    b_word is the raw key, as a bytes instance

    returns the decoded and cleaned-up key, as str instance
    """
    raw_key, dollar_count = stripDollarIndexes(b_word)
    if dollar_count > 1:
        log.debug(
            "processKey(%s):\n" % b_word
            + "number of dollar indexes = %s" % dollar_count,
        )

    # convert to unicode: strict mode logs a decoding failure first,
    # but in both modes the final fallback is decoding with "ignore"
    if not self.strictStringConvertion:
        key = raw_key.decode(self.sourceEncoding, "ignore")
    else:
        try:
            key = raw_key.decode(self.sourceEncoding)
        except UnicodeError:
            log.debug(
                "processKey(%s):\n" % b_word
                + "conversion error:\n%s" % excMessage()
            )
            key = raw_key.decode(self.sourceEncoding, "ignore")

    if self.processHtmlInKey:
        key = replaceHtmlEntriesInKeys(stripHtmlTags(key))

    key = removeNewlines(removeControlChars(key))
    return key.lstrip().rstrip(self.keyRStripChars)
def processKey(self, b_word):
    """
    Clean up a main key and return it as a str instance.

    b_word is the raw key, as a bytes instance

    returns the decoded and cleaned-up key, as str instance
    """
    raw_key, strip_count = stripDollarIndexes(b_word)
    if strip_count > 1:
        log.debug(
            f"processKey({b_word}):\n"
            f"number of dollar indexes = {strip_count}",
        )

    # convert to unicode: strict mode logs a decoding failure first,
    # but in both modes the final fallback is decoding with "ignore"
    if not self.strictStringConvertion:
        key = raw_key.decode(self.sourceEncoding, "ignore")
    else:
        try:
            key = raw_key.decode(self.sourceEncoding)
        except UnicodeError:
            log.debug(
                f"processKey({b_word}):\nconversion error:\n"
                + excMessage()
            )
            key = raw_key.decode(self.sourceEncoding, "ignore")

    if self.processHtmlInKey:
        key = replaceHtmlEntriesInKeys(stripHtmlTags(key))

    key = removeNewlines(removeControlChars(key)).lstrip()

    # right-strip only when a non-empty set of characters is configured
    if not self.keyRStripChars:
        return key
    return key.rstrip(self.keyRStripChars)
def processAlternativeKey(self, b_word, b_key):
    """
    Clean up an alternative key and return it as a str instance.

    b_word is the raw alternative key, as a bytes instance
    b_key is the raw main key; it is only used in the debug message
        logged when strict decoding of b_word fails

    returns the decoded and cleaned-up alternative key, as str instance
    """
    # the second item (number of stripped indexes) is not needed here
    raw_alt, _strip_count = stripDollarIndexes(b_word)

    # convert to unicode: strict mode logs a decoding failure first,
    # but in both modes the final fallback is decoding with "ignore"
    if not self.strictStringConvertion:
        alt = raw_alt.decode(self.sourceEncoding, "ignore")
    else:
        try:
            alt = raw_alt.decode(self.sourceEncoding)
        except UnicodeError:
            log.debug(
                f"processAlternativeKey({b_word})\nkey = {b_key}"
                f":\nconversion error:\n"
                + excMessage()
            )
            alt = raw_alt.decode(self.sourceEncoding, "ignore")

    # strip "/" before words
    alt = re.sub(self.stripSlashAltKeyPattern, r"\1\2", alt)

    if self.processHtmlInKey:
        alt = replaceHtmlEntriesInKeys(stripHtmlTags(alt))

    alt = removeNewlines(removeControlChars(alt))
    return alt.lstrip().rstrip(self.keyRStripChars)
def decodeCharsetTags(self, b_text, defaultEncoding):
    """
    Decode html text taking into account charset tags and default encoding.

    b_text is a bytes
    defaultEncoding is the encoding used for text that is not inside
        any <charset ...> tag

    Return value: (u_text, defaultEncodingOnly)

    u_text is str
    defaultEncodingOnly parameter is false if the text contains parts
    encoded with non-default encoding (babylon character references
    '<CHARSET c="T">00E6;</CHARSET>' do not count).
    """
    # The loop below treats the split result as repeating triples:
    #   i % 3 == 0: text block
    #   i % 3 == 1: <charset ...> or </charset> tag
    #   i % 3 == 2: value of the c attribute (read via b_parts[i + 1])
    # NOTE(review): this relies on charsetDecodePattern having exactly
    # two capture groups -- confirm against its definition.
    b_parts = re.split(charsetDecodePattern, b_text)
    u_parts = []  # decoded fragments, joined once at the end
    encodings = []  # stack of encodings
    defaultEncodingOnly = True
    for i, b_part in enumerate(b_parts):
        if i % 3 == 0:  # text block
            encoding = encodings[-1] if encodings else defaultEncoding
            b_text2 = b_part
            if encoding == "babylon-reference":
                # ";"-separated list of 4-digit hex code points
                b_refs = b_text2.split(b";")
                for i_ref, b_ref in enumerate(b_refs):
                    if not b_ref:
                        # an empty item is only expected after the
                        # trailing ";", anywhere else it is reported
                        if i_ref != len(b_refs) - 1:
                            log.debug(
                                "decoding charset tags"
                                + ", b_text=%r\n" % b_text
                                + "blank <charset c=t> character"
                                + " reference (%r)\n" % b_text2
                            )
                        continue
                    if not re.match(b"^[0-9a-fA-F]{4}$", b_ref):
                        log.debug(
                            "decoding charset tags, b_text=%r\n" % b_text
                            + "invalid <charset c=t> character"
                            + " reference (%r)\n" % b_text2
                        )
                        continue
                    u_parts.append(chr(int(b_ref, 16)))
            else:
                self.charReferencesStat(b_text2, encoding)
                if encoding == "cp1252":
                    b_text2 = replaceAsciiCharRefs(b_text2, encoding)
                if self.strictStringConvertion:
                    try:
                        u_text2 = b_text2.decode(encoding)
                    except UnicodeError:
                        log.debug(
                            "decoding charset tags"
                            + ", b_text=%r" % b_text
                            + "\nfragment: %r" % b_text2
                            + "\nconversion error:\n%s" % excMessage()
                        )
                        # fall back to a lossy decode of the same fragment
                        # (this used to reference an undefined name and
                        # raised NameError instead of falling back)
                        u_text2 = b_text2.decode(encoding, "replace")
                else:
                    u_text2 = b_text2.decode(encoding, "replace")
                u_parts.append(u_text2)
                if encoding != defaultEncoding:
                    defaultEncodingOnly = False
        elif i % 3 == 1:  # <charset...> or </charset>
            if b_part.startswith(b"</"):
                # </charset>
                if encodings:
                    encodings.pop()
                else:
                    log.debug(
                        "decoding charset tags, b_text=%r\n" % b_text
                        + "unbalanced </charset> tag\n"
                    )
            else:
                # <charset c="?">
                b_type = b_parts[i + 1].lower()
                # b_type is a bytes instance, with length 1
                if b_type == b"t":
                    encodings.append("babylon-reference")
                elif b_type == b"u":
                    encodings.append("utf-8")
                elif b_type == b"k":
                    encodings.append(self.sourceEncoding)
                elif b_type == b"e":
                    encodings.append(self.sourceEncoding)
                elif b_type == b"g":
                    # gbk or gb18030 encoding
                    # (not enough data to make distinction)
                    encodings.append("gbk")
                else:
                    log.debug(
                        "decoding charset tags, text = %r\n" % b_text
                        + "unknown charset code = %#.2x\n" % ord(b_type)
                    )
                    # add any encoding to prevent
                    # "unbalanced </charset> tag" error
                    encodings.append(defaultEncoding)
        else:
            # c attribute of charset tag if the previous tag was charset
            pass
    if encodings:
        log.debug(
            "decoding charset tags, text=%s\n" % b_text
            + "unclosed <charset...> tag\n"
        )
    return "".join(u_parts), defaultEncodingOnly
def decodeCharsetTags(self, b_text, defaultEncoding):
    """
    Decode html text taking into account charset tags and default encoding.

    b_text is a bytes
    defaultEncoding is the encoding used for text that is not inside
        any <charset ...> tag

    Return value: (u_text, defaultEncodingOnly)

    u_text is str
    defaultEncodingOnly parameter is false if the text contains parts
    encoded with non-default encoding (babylon character references
    '<CHARSET c="T">00E6;</CHARSET>' do not count).
    """
    # The loop below treats the split result as repeating triples:
    #   i % 3 == 0: text block
    #   i % 3 == 1: <charset ...> or </charset> tag
    #   i % 3 == 2: value of the c attribute (read via b_parts[i + 1])
    # NOTE(review): this relies on re_charset_decode having exactly
    # two capture groups -- confirm against its definition.
    b_parts = re_charset_decode.split(b_text)
    u_parts = []  # decoded fragments, joined once at the end
    encodings = []  # stack of encodings
    defaultEncodingOnly = True
    for i, b_part in enumerate(b_parts):
        if i % 3 == 0:  # text block
            encoding = encodings[-1] if encodings else defaultEncoding
            b_text2 = b_part
            if encoding == "babylon-reference":
                # ";"-separated list of character references, each one
                # validated with re_b_reference and parsed as hex
                b_refs = b_text2.split(b";")
                for i_ref, b_ref in enumerate(b_refs):
                    if not b_ref:
                        # an empty item is only expected after the
                        # trailing ";", anywhere else it is reported
                        if i_ref != len(b_refs) - 1:
                            log.debug(
                                f"decoding charset tags, b_text={b_text!r}"
                                f"\nblank <charset c=t> character"
                                f" reference ({b_text2!r})\n"
                            )
                        continue
                    if not re_b_reference.match(b_ref):
                        log.debug(
                            f"decoding charset tags, b_text={b_text!r}"
                            f"\ninvalid <charset c=t> character"
                            f" reference ({b_text2!r})\n"
                        )
                        continue
                    u_parts.append(chr(int(b_ref, 16)))
            else:
                self.charReferencesStat(b_text2, encoding)
                if encoding == "cp1252":
                    b_text2 = replaceAsciiCharRefs(b_text2, encoding)
                if self.strictStringConvertion:
                    try:
                        u_text2 = b_text2.decode(encoding)
                    except UnicodeError:
                        log.debug(
                            f"decoding charset tags, b_text={b_text!r}"
                            f"\nfragment: {b_text2!r}"
                            f"\nconversion error:\n"
                            + excMessage()
                        )
                        # fall back to a lossy decode of the same fragment
                        # (this used to reference an undefined name and
                        # raised NameError instead of falling back)
                        u_text2 = b_text2.decode(encoding, "replace")
                else:
                    u_text2 = b_text2.decode(encoding, "replace")
                u_parts.append(u_text2)
                if encoding != defaultEncoding:
                    defaultEncodingOnly = False
        elif i % 3 == 1:  # <charset...> or </charset>
            if b_part.startswith(b"</"):
                # </charset>
                if encodings:
                    encodings.pop()
                else:
                    log.debug(
                        f"decoding charset tags, b_text={b_text!r}"
                        f"\nunbalanced </charset> tag\n"
                    )
            else:
                # <charset c="?">
                b_type = b_parts[i + 1].lower()
                # b_type is a bytes instance, with length 1
                if b_type == b"t":
                    encodings.append("babylon-reference")
                elif b_type == b"u":
                    encodings.append("utf-8")
                elif b_type == b"k":
                    encodings.append(self.sourceEncoding)
                elif b_type == b"e":
                    encodings.append(self.sourceEncoding)
                elif b_type == b"g":
                    # gbk or gb18030 encoding
                    # (not enough data to make distinction)
                    encodings.append("gbk")
                else:
                    log.debug(
                        f"decoding charset tags, text = {b_text!r}"
                        f"\nunknown charset code = {ord(b_type):#02x}\n"
                    )
                    # add any encoding to prevent
                    # "unbalanced </charset> tag" error
                    encodings.append(defaultEncoding)
        else:
            # c attribute of charset tag if the previous tag was charset
            pass
    if encodings:
        log.debug(
            f"decoding charset tags, text={b_text}"
            f"\nunclosed <charset...> tag\n"
        )
    return "".join(u_parts), defaultEncodingOnly