Пример #1
0
def replaceHtmlEntryNoEscapeCB(u_match):
    """
        Replacement callback for re.sub that decodes one HTML entity.

        u_match: regex match object; group(0) is the whole entity text
            (for example '&amp;' or '&#x41;') and group(1) is the entity
            name or the digits of a numeric character reference.

        Returns:
            - the decoded character for a valid numeric reference or a
              known named entity;
            - U+FFFD (replacement character) for a numeric reference that
              cannot be converted to a character;
            - the original matched text, unchanged, for an unknown named
              entity (reported once per distinct entity via log.debug).

        Raises ArgumentError when the matched text does not start with '&'.
    """
    import html.entities
    from pyglossary.html_utils import name2codepoint

    whole = u_match.group(0)
    name = u_match.group(1)
    if log.isDebug():
        assert isinstance(whole, str) and isinstance(name, str)

    if whole[:2] == '&#':
        # Numeric character reference: '&#x...' is hexadecimal,
        # anything else is decimal.
        radix = 16 if whole[2:3].lower() == 'x' else 10
        try:
            code = int(name, radix)
            if code <= 0:
                raise ValueError()
            return chr(code)
        except (ValueError, OverflowError):
            # Unparsable, non-positive or out-of-range code point.
            return chr(0xFFFD)

    if whole[0] != '&':
        # NOTE(review): ArgumentError is not a Python builtin and is not
        # defined in the visible part of this file - confirm that it is
        # available at module level.
        raise ArgumentError()

    # Named entity: try the standard table with the exact name first.
    std_code = html.entities.name2codepoint.get(name)
    if std_code is not None:
        return chr(std_code)

    # Then fall back to the project table, looked up by lower-cased name.
    try:
        return chr(name2codepoint[name.lower()])
    except KeyError:
        pass

    # Babylon dictionaries contain a lot of non-standard entity references,
    # for example csdot, fllig, nsm, cancer, thlig, tsdot, upslur...
    # This is not just a typo: these entries repeat over and over again.
    # Perhaps they had a meaning in the source dictionary that was converted
    # to Babylon, but that meaning is now lost. Babylon renders them as is
    # (for example '&csdot;'), even though other references such as '&amp;'
    # are replaced with the corresponding characters. So the text is kept
    # untouched and each distinct unknown entity is logged only once.
    if whole not in unkownHtmlEntries:
        log.debug('unknown html entity %s'%whole)
        unkownHtmlEntries.add(whole)
    return whole
Пример #2
0
def stripDollarIndexes(b_word):
	"""
	Remove ``$<digits>$`` index markers and runs of dollar signs from a key.

	b_word: the key as a bytes object.

	Returns a tuple ``(cleaned, count)`` where ``cleaned`` is the key with
	every ``$<digits>$`` marker and every run of two or more consecutive
	``$`` removed, and ``count`` is the number of ``$<digits>$`` markers
	that were removed (dollar runs are not counted).

	A single unpaired ``$``, or a ``$...$`` pair enclosing anything other
	than digits, is left in the result.
	"""
	if log.isDebug():
		assert isinstance(b_word, bytes)

	pieces = []  # fragments of b_word that are kept, in order
	found = 0  # number of $<digits>$ markers removed
	pos = 0  # start of the part of b_word not yet processed
	while True:
		start = b_word.find(b"$", pos)
		end = b_word.find(b"$", start + 1) if start != -1 else -1
		if end == -1:
			# No dollar at all, or no closing dollar: keep the remainder.
			pieces.append(b_word[pos:])
			break

		if end == start + 1:
			# Two adjacent dollars. Keys (or alternative keys) such as
			#   sur l'arbre$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
			#   obscurantiste$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
			# are padded with b"$" up to 60 chars, and there are also keys
			# like
			#   extremidade-$$$-$$$-linha
			#   .FIRM$$$$$$$$$$$$$
			# Summary: any sequence of dollar signs longer than one char
			# must be removed.
			pieces.append(b_word[pos:start])
			tail = b_word[end + 1:]
			# Resume right after the last dollar of this run.
			pos = len(b_word) - len(tail.lstrip(b"$"))
			if pos >= len(b_word):
				break
			continue

		between = b_word[start + 1:end]
		if not between.isdigit():
			# At least one non-digit char between the dollars: keep the
			# text up to (not including) the second dollar, which is then
			# examined again as a possible opening dollar.
			pieces.append(b_word[pos:end])
			pos = end
			continue

		after = end + 1
		if after < len(b_word) and b_word[after] != 0x20:
			# The marker is normally followed by a space; log when it is
			# not. Examples:
			#   make do$4$/make /do
			#   potere$1$<BR><BR>
			#   See also <a href='file://ITAL-ENG POTERE 1249-1250.pdf'>notes...</A>
			#   volere$1$<BR><BR>
			#   See also <a href='file://ITAL-ENG VOLERE 1469-1470.pdf'>notes...</A>
			#   Ihre$1$Ihres
			log.debug(
				f"stripDollarIndexes({b_word}):\n"
				f"second $ is followed by non-space"
			)

		# Drop the whole $<digits>$ marker.
		pieces.append(b_word[pos:start])
		pos = after
		found += 1

	return b"".join(pieces), found
Пример #3
0
def stripDollarIndexes(b_word):
	"""
	Strip ``$<index>$`` sequences and repeated dollar signs from a key.

	b_word: the key as a bytes object.

	Returns ``(b_word_main, strip_count)``: the key without any
	``$<digits>$`` sequence and without any run of two or more ``$``,
	and the number of ``$<digits>$`` sequences that were stripped
	(runs of dollars do not add to the count).

	A lone ``$`` without a partner and a ``$...$`` pair around non-digit
	content are both preserved.
	"""
	if log.isDebug():
		assert isinstance(b_word, bytes)

	kept = []  # slices of b_word that survive, in original order
	n_stripped = 0  # number of $<digits>$ sequences found
	cursor = 0  # everything before this offset has been handled
	while True:
		first = b_word.find(b"$", cursor)
		second = -1 if first == -1 else b_word.find(b"$", first + 1)
		if second == -1:
			# Either no "$" is left or the "$" has no pair: copy the rest.
			kept.append(b_word[cursor:])
			break

		if second - first == 1:
			# "$$" found. You may find keys (or alternative keys) like:
			#   sur l'arbre$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
			#   obscurantiste$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
			# They all end on a sequence of b"$"; the key length including
			# the dollars is always 60 chars. Other examples:
			#   extremidade-$$$-$$$-linha
			#   .FIRM$$$$$$$$$$$$$
			# Summary: remove any sequence of dollar signs longer than
			# one char.
			kept.append(b_word[cursor:first])
			cursor = second + 1
			while cursor < len(b_word) and b_word[cursor:cursor + 1] == b"$":
				cursor += 1
			if cursor >= len(b_word):
				break
			continue

		inner = b_word[first + 1:second]
		if inner.strip(b"0123456789"):
			# Something other than digits sits between the two dollars:
			# keep everything before the second "$" and rescan from it.
			kept.append(b_word[cursor:second])
			cursor = second
			continue

		if second + 1 < len(b_word) and b_word[second + 1] != 0x20:
			# Only reported, the sequence is stripped anyway. Examples:
			#   make do$4$/make /do
			#   potere$1$<BR><BR>
			#   See also <a href='file://ITAL-ENG POTERE 1249-1250.pdf'>notes...</A>
			#   volere$1$<BR><BR>
			#   See also <a href='file://ITAL-ENG VOLERE 1469-1470.pdf'>notes...</A>
			#   Ihre$1$Ihres
			log.debug(
				"stripDollarIndexes(%s):\n" % b_word +
				"second $ is followed by non-space"
			)

		# Remove the $<digits>$ sequence itself.
		kept.append(b_word[cursor:first])
		cursor = second + 1
		n_stripped += 1

	return b"".join(kept), n_stripped