Exemplo n.º 1
0
	def processAlternativeKey(self, b_word, b_key):
		"""
			Turn a raw alternative key into its cleaned-up text form

			b_word is the alternative key, a bytes instance
			b_key is only used in the debug message logged when strict
			decoding of b_word fails
			returns the processed alternative key, as str instance
		"""
		# stripDollarIndexes returns a pair; the second item is unused here
		b_stripped, _count = stripDollarIndexes(b_word)

		# convert to unicode: in strict mode a failed decode is logged and
		# then redone with the "ignore" error handler
		u_alt = None
		if self.strictStringConvertion:
			try:
				u_alt = b_stripped.decode(self.sourceEncoding)
			except UnicodeError:
				log.debug("".join((
					"processAlternativeKey(%s)\n" % b_word,
					"key = %s:\n" % b_key,
					"conversion error:\n%s" % excMessage(),
				)))
		if u_alt is None:
			u_alt = b_stripped.decode(self.sourceEncoding, "ignore")

		# strip "/" before words
		u_alt = re.sub(self.stripSlashAltKeyPattern, r"\1\2", u_alt)

		if self.processHtmlInKey:
			u_alt = replaceHtmlEntriesInKeys(stripHtmlTags(u_alt))
		u_alt = removeNewlines(removeControlChars(u_alt))
		# leading whitespace goes first, then the configured trailing chars
		return u_alt.lstrip().rstrip(self.keyRStripChars)
Exemplo n.º 2
0
	def processKey(self, b_word):
		"""
			Turn a raw entry key into its cleaned-up text form

			b_word is the key, a bytes instance
			returns the processed key, as str instance
		"""
		b_stripped, dollar_count = stripDollarIndexes(b_word)
		if dollar_count > 1:
			log.debug("".join((
				"processKey(%s):\n" % b_word,
				"number of dollar indexes = %s" % dollar_count,
			)))

		# convert to unicode: in strict mode a failed decode is logged and
		# then redone with the "ignore" error handler
		u_key = None
		if self.strictStringConvertion:
			try:
				u_key = b_stripped.decode(self.sourceEncoding)
			except UnicodeError:
				log.debug("".join((
					"processKey(%s):\n" % b_word,
					"conversion error:\n%s" % excMessage(),
				)))
		if u_key is None:
			u_key = b_stripped.decode(self.sourceEncoding, "ignore")

		if self.processHtmlInKey:
			u_key = replaceHtmlEntriesInKeys(stripHtmlTags(u_key))
		u_key = removeNewlines(removeControlChars(u_key))
		# leading whitespace goes first, then the configured trailing chars
		return u_key.lstrip().rstrip(self.keyRStripChars)
Exemplo n.º 3
0
	def processKey(self, b_word):
		"""
			Turn a raw entry key into its cleaned-up text form

			b_word is the key, a bytes instance
			returns the processed key, as str instance
		"""
		b_stripped, dollar_count = stripDollarIndexes(b_word)
		if dollar_count > 1:
			log.debug(
				f"processKey({b_word}):\nnumber of dollar indexes = {dollar_count}"
			)

		# convert to unicode: in strict mode a failed decode is logged and
		# then redone with the "ignore" error handler
		u_key = None
		if self.strictStringConvertion:
			try:
				u_key = b_stripped.decode(self.sourceEncoding)
			except UnicodeError:
				header = f"processKey({b_word}):\nconversion error:\n"
				log.debug(header + excMessage())
		if u_key is None:
			u_key = b_stripped.decode(self.sourceEncoding, "ignore")

		if self.processHtmlInKey:
			u_key = replaceHtmlEntriesInKeys(stripHtmlTags(u_key))
		u_key = removeNewlines(removeControlChars(u_key)).lstrip()

		# trailing characters are only stripped when a set is configured
		if not self.keyRStripChars:
			return u_key
		return u_key.rstrip(self.keyRStripChars)
Exemplo n.º 4
0
	def processAlternativeKey(self, b_word, b_key):
		"""
			Turn a raw alternative key into its cleaned-up text form

			b_word is the alternative key, a bytes instance
			b_key is only used in the debug message logged when strict
			decoding of b_word fails
			returns the processed alternative key, as str instance
		"""
		# stripDollarIndexes returns a pair; the second item is unused here
		b_stripped, _count = stripDollarIndexes(b_word)

		# convert to unicode: in strict mode a failed decode is logged and
		# then redone with the "ignore" error handler
		u_alt = None
		if self.strictStringConvertion:
			try:
				u_alt = b_stripped.decode(self.sourceEncoding)
			except UnicodeError:
				header = (
					f"processAlternativeKey({b_word})\nkey = {b_key}:\n"
					f"conversion error:\n"
				)
				log.debug(header + excMessage())
		if u_alt is None:
			u_alt = b_stripped.decode(self.sourceEncoding, "ignore")

		# strip "/" before words
		u_alt = re.sub(self.stripSlashAltKeyPattern, r"\1\2", u_alt)

		if self.processHtmlInKey:
			u_alt = replaceHtmlEntriesInKeys(stripHtmlTags(u_alt))
		u_alt = removeNewlines(removeControlChars(u_alt))
		# leading whitespace goes first, then the configured trailing chars
		return u_alt.lstrip().rstrip(self.keyRStripChars)
Exemplo n.º 5
0
	def decodeCharsetTags(self, b_text, defaultEncoding):
		"""
		Decode html text taking into account charset tags and default encoding

		b_text is a bytes instance
		defaultEncoding is the encoding applied to text that is not inside
		any charset tag (and inside tags with an unknown charset code)

		Return value: (u_text, defaultEncodingOnly)
		u_text is str
		defaultEncodingOnly parameter is false if the text contains parts
		encoded with non-default encoding (babylon character references
		'<CHARSET c="T">00E6;</CHARSET>' do not count).

		Problems in the input (blank or invalid character references,
		unbalanced or unclosed tags, unknown charset codes, undecodable
		bytes in strict mode) are reported with log.debug and skipped or
		decoded leniently; they are not raised.
		"""
		# the loop below treats the split result as repeating triples:
		# text block, charset tag, c attribute of that tag
		b_parts = re.split(charsetDecodePattern, b_text)
		u_parts = []  # decoded fragments, joined once at the end
		encodings = []  # stack of encodings
		defaultEncodingOnly = True
		for i, b_part in enumerate(b_parts):
			if i % 3 == 0:  # text block
				encoding = encodings[-1] if encodings else defaultEncoding
				b_text2 = b_part
				if encoding == "babylon-reference":
					# ";"-separated list of 4-digit hex code points
					b_refs = b_text2.split(b";")
					for i_ref, b_ref in enumerate(b_refs):
						if not b_ref:
							# an empty last item is just what follows the
							# final ";" -- only empty items elsewhere are
							# worth reporting
							if i_ref != len(b_refs) - 1:
								log.debug(
									"decoding charset tags" +
									", b_text=%r\n" % b_text +
									"blank <charset c=t> character" +
									" reference (%r)\n" % b_text2
								)
							continue
						if not re.match(b"^[0-9a-fA-F]{4}$", b_ref):
							log.debug(
								"decoding charset tags, b_text=%r\n" % b_text +
								"invalid <charset c=t> character" +
								" reference (%r)\n" % b_text2
							)
							continue
						u_parts.append(chr(int(b_ref, 16)))
				else:
					self.charReferencesStat(b_text2, encoding)
					if encoding == "cp1252":
						b_text2 = replaceAsciiCharRefs(b_text2, encoding)
					if self.strictStringConvertion:
						try:
							u_text2 = b_text2.decode(encoding)
						except UnicodeError:
							log.debug(
								"decoding charset tags" +
								", b_text=%r" % b_text +
								"\nfragment: %r" % b_text2 +
								"\nconversion error:\n%s" % excMessage()
							)
							# fall back to a lenient decode of the same
							# fragment (this used to reference an
							# undefined name and raised NameError)
							u_text2 = b_text2.decode(encoding, "replace")
					else:
						u_text2 = b_text2.decode(encoding, "replace")
					u_parts.append(u_text2)
					if encoding != defaultEncoding:
						defaultEncodingOnly = False
			elif i % 3 == 1:  # <charset...> or </charset>
				if b_part.startswith(b"</"):
					# </charset>
					if encodings:
						encodings.pop()
					else:
						log.debug(
							"decoding charset tags, b_text=%r\n" % b_text +
							"unbalanced </charset> tag\n"
						)
				else:
					# <charset c="?">
					b_type = b_parts[i + 1].lower()
					# b_type is a bytes instance, with length 1
					if b_type == b"t":
						encodings.append("babylon-reference")
					elif b_type == b"u":
						encodings.append("utf-8")
					elif b_type in (b"k", b"e"):
						encodings.append(self.sourceEncoding)
					elif b_type == b"g":
						# gbk or gb18030 encoding
						# (not enough data to make distinction)
						encodings.append("gbk")
					else:
						log.debug(
							"decoding charset tags, text = %r\n" % b_text +
							"unknown charset code = %#.2x\n" % ord(b_type)
						)
						# add any encoding to prevent
						# "unbalanced </charset> tag" error
						encodings.append(defaultEncoding)
			else:
				# c attribute of charset tag if the previous tag was charset
				pass
		if encodings:
			log.debug(
				"decoding charset tags, text=%r\n" % b_text +
				"unclosed <charset...> tag\n"
			)
		return "".join(u_parts), defaultEncodingOnly
Exemplo n.º 6
0
	def decodeCharsetTags(self, b_text, defaultEncoding):
		"""
		Decode html text taking into account charset tags and default encoding

		b_text is a bytes instance
		defaultEncoding is the encoding applied to text that is not inside
		any charset tag (and inside tags with an unknown charset code)

		Return value: (u_text, defaultEncodingOnly)
		u_text is str
		defaultEncodingOnly parameter is false if the text contains parts
		encoded with non-default encoding (babylon character references
		'<CHARSET c="T">00E6;</CHARSET>' do not count).

		Problems in the input (blank or invalid character references,
		unbalanced or unclosed tags, unknown charset codes, undecodable
		bytes in strict mode) are reported with log.debug and skipped or
		decoded leniently; they are not raised.
		"""
		# the loop below treats the split result as repeating triples:
		# text block, charset tag, c attribute of that tag
		b_parts = re_charset_decode.split(b_text)
		u_parts = []  # decoded fragments, joined once at the end
		encodings = []  # stack of encodings
		defaultEncodingOnly = True
		for i, b_part in enumerate(b_parts):
			if i % 3 == 0:  # text block
				encoding = encodings[-1] if encodings else defaultEncoding
				b_text2 = b_part
				if encoding == "babylon-reference":
					# ";"-separated list of character references
					b_refs = b_text2.split(b";")
					for i_ref, b_ref in enumerate(b_refs):
						if not b_ref:
							# an empty last item is just what follows the
							# final ";" -- only empty items elsewhere are
							# worth reporting
							if i_ref != len(b_refs) - 1:
								log.debug(
									f"decoding charset tags, b_text={b_text!r}"
									f"\nblank <charset c=t> character"
									f" reference ({b_text2!r})\n"
								)
							continue
						if not re_b_reference.match(b_ref):
							log.debug(
								f"decoding charset tags, b_text={b_text!r}"
								f"\ninvalid <charset c=t> character"
								f" reference ({b_text2!r})\n"
							)
							continue
						u_parts.append(chr(int(b_ref, 16)))
				else:
					self.charReferencesStat(b_text2, encoding)
					if encoding == "cp1252":
						b_text2 = replaceAsciiCharRefs(b_text2, encoding)
					if self.strictStringConvertion:
						try:
							u_text2 = b_text2.decode(encoding)
						except UnicodeError:
							log.debug(
								f"decoding charset tags, b_text={b_text!r}"
								f"\nfragment: {b_text2!r}"
								f"\nconversion error:\n" + excMessage()
							)
							# fall back to a lenient decode of the same
							# fragment (this used to reference an
							# undefined name and raised NameError)
							u_text2 = b_text2.decode(encoding, "replace")
					else:
						u_text2 = b_text2.decode(encoding, "replace")
					u_parts.append(u_text2)
					if encoding != defaultEncoding:
						defaultEncodingOnly = False
			elif i % 3 == 1:  # <charset...> or </charset>
				if b_part.startswith(b"</"):
					# </charset>
					if encodings:
						encodings.pop()
					else:
						log.debug(
							f"decoding charset tags, b_text={b_text!r}"
							f"\nunbalanced </charset> tag\n"
						)
				else:
					# <charset c="?">
					b_type = b_parts[i + 1].lower()
					# b_type is a bytes instance, with length 1
					if b_type == b"t":
						encodings.append("babylon-reference")
					elif b_type == b"u":
						encodings.append("utf-8")
					elif b_type in (b"k", b"e"):
						encodings.append(self.sourceEncoding)
					elif b_type == b"g":
						# gbk or gb18030 encoding
						# (not enough data to make distinction)
						encodings.append("gbk")
					else:
						# width 4 = "0x" prefix + two zero-padded hex digits
						log.debug(
							f"decoding charset tags, text = {b_text!r}"
							f"\nunknown charset code = {ord(b_type):#04x}\n"
						)
						# add any encoding to prevent
						# "unbalanced </charset> tag" error
						encodings.append(defaultEncoding)
			else:
				# c attribute of charset tag if the previous tag was charset
				pass
		if encodings:
			log.debug(
				f"decoding charset tags, text={b_text!r}"
				f"\nunclosed <charset...> tag\n"
			)
		return "".join(u_parts), defaultEncodingOnly