def unescape(result: "MarkedUpText") -> None:
    """
    Replace HTML character references (such as ``&gt;``) in ``result.text``
    with the characters they stand for.

    The text is rewritten in place on ``result``. Every replacement whose
    length differs from the reference it replaces is reported to
    ``result.apply_transformations`` as a pair
    ``((src_start, src_end), (src_start, src_start + len(replacement)))``,
    where the source span is measured in the text as it was before this
    call. ``apply_transformations`` is not called when no replacement
    changed the length.

    :param result: MarkedUpText containing resulting plain text
    """
    source = result.text
    # Collect the output as pieces and join once at the end; growing a
    # string with "+=" inside the loop copies the accumulated text each time.
    pieces = []
    transformations = []  # type: List[Tuple[Tuple[int, int], Tuple[int, int]]]
    last_stop = 0
    for match in _charref.finditer(source):
        replacement = _replace_charref(match)
        src_s, src_e = match.span()
        end_e = src_s + len(replacement)
        if end_e != src_e:
            # Only length-changing replacements shift later offsets.
            transformations.append(((src_s, src_e), (src_s, end_e)))
        pieces.append(source[last_stop:src_s])
        pieces.append(replacement)
        last_stop = src_e
    # Tail after the last reference (the whole text when nothing matched).
    pieces.append(source[last_stop:])
    result.text = "".join(pieces)
    if transformations:
        result.apply_transformations(transformations)
def convert_to_styled(self, data):
    """
    Scan an incoming note for possible html and turn it into styled text.

    The work is done in two passes. The first decodes HTML character
    references found in ``data._string``. The second splits the decoded
    text with ``self.tok_regex`` and handles each token according to the
    name of the regex group that matched: some tokens are dropped, some
    are replaced by plain text, and some produce italic, bold, underline,
    link or highlight ranges over the output text. Notes of this type
    occur in data from FTM and ancestry.com. Result is a much cleaner
    note.

    NOTE(review): the first pass assigns to ``data._string``, so the
    object passed in by the caller is modified -- confirm callers do not
    reuse it afterwards.

    @param data: text possibly containing html
    @type data: object exposing ``_string`` and slicing (presumably
                StyledText, judging by the StyledText().join() calls --
                confirm)
    @return: StyledText built from the cleaned string and the merge of
             the tags carried by the joined chunks with the tags
             created here
    """
    prev = 0            # end offset (in data) of the previous match
    chunkpos = 0        # current length of the output text built so far
    chunks = []         # output pieces, joined once a pass is finished
    italics = []        # (start, end) output ranges for each style
    bolds = []
    unders = []
    links = []          # (link text, start, end) tuples
    reds = []           # ranges flagged as invalid/unknown markup
    bldpos = -1         # output offset of an open table header, -1 if none
    # data = html.unescape(data)  # clean up escaped html "&lt;" etc.
    # The disabled call above is replaced by the loop below, which decodes
    # character references while still slicing `data` itself
    # (presumably so that existing styling survives -- confirm).
    for mo in re.finditer(html._charref, data._string):
        out = html._replace_charref(mo)
        in_start = mo.start()
        in_end = mo.end()
        # Overwrite the first len(out) characters of the reference with its
        # replacement; the string keeps its length, so offsets of the
        # remaining matches stay valid.
        data._string = (data._string[:in_start] + out +
                        data._string[(in_start + len(out)):])
        if prev != in_start + len(out):
            # Keep everything up to and including the replacement ...
            chunks.append(data[prev:(in_start + len(out))])
            chunkpos += (in_start - prev + len(out))
        # ... and skip the leftover tail of the reference.
        prev = in_end
    chunks.append(data[prev:])
    data = StyledText().join(chunks)

    # Second pass: tokenize the decoded text. chunkpos from the first pass
    # is not used; the bookkeeping starts over.
    prev = 0
    chunkpos = 0
    chunks = []
    for mo in re.finditer(self.tok_regex, data._string,
                          flags=(re.DOTALL | re.I)):
        kind = mo.lastgroup     # name of the regex group that matched
        st_txt = mo.group(kind)
        in_start = mo.start()
        in_end = mo.end()
        if kind == 'SKIP' or kind == 'TABLE':
            # Matched text is dropped; only the text before it is kept.
            if prev != in_start:
                chunks.append(data[prev:in_start])
                chunkpos += (in_start - prev)
        elif kind == 'PARAEND':
            # Matched text is replaced by a single newline.
            chunks.append(data[prev:in_start] + '\n')
            chunkpos += (in_start - prev + 1)
        elif kind == 'ITALIC':
            # Drop the first 3 characters of the match (presumably the
            # opening tag such as "<i>" -- confirm against tok_regex) and
            # record the rest as an italic range.
            chunks.append(data[prev:in_start] +
                          data[(in_start + 3):in_end])
            newpos = chunkpos - prev + in_end - 3
            italics.append((chunkpos + in_start - prev, newpos))
            chunkpos = newpos
        elif kind == 'BOLD':
            # Same handling as ITALIC, recorded as a bold range.
            chunks.append(data[prev:in_start] +
                          data[(in_start + 3):in_end])
            newpos = chunkpos - prev + in_end - 3
            bolds.append((chunkpos + in_start - prev, newpos))
            chunkpos = newpos
        elif kind == 'UNDER':
            # Same handling as ITALIC, recorded as an underline range.
            chunks.append(data[prev:in_start] +
                          data[(in_start + 3):in_end])
            newpos = chunkpos - prev + in_end - 3
            unders.append((chunkpos + in_start - prev, newpos))
            chunkpos = newpos
        elif kind == 'HTTP':  # HTTP found
            st_txt = mo.group('HTTP')
            oldpos = chunkpos + in_start - prev
            # The full match stays in the text ...
            chunks.append(data[prev:in_start] + st_txt)
            chunkpos += (in_start - prev + len(st_txt))
            # ... but trailing ' ', '.', ':' and ')' are left out of the
            # link target and of the link range.
            st_txt = st_txt.rstrip(' .:)')
            newpos = oldpos + len(st_txt)
            links.append((st_txt, oldpos, newpos))
        elif kind == 'HREF':  # HREF found
            st_txt = mo.group('HREFT')
            lk_txt = mo.group('HREFL')
            # fix up relative links emitted by ancestry.com
            if (lk_txt.startswith("/search/dbextra") or
                    lk_txt.startswith("/handler/domain")):
                lk_txt = "http://search.ancestry.com" + lk_txt
            oldpos = chunkpos + in_start - prev
            # if tag text minus its last character (meant to ignore a
            # trailing '.') is a substring of the link
            if st_txt[0:-1] in lk_txt:
                st_txt = lk_txt  # just use the link
            else:  # use link and tag
                st_txt = " " + lk_txt + " (" + st_txt + ")"
            newpos = oldpos + len(st_txt)
            chunks.append(data[prev:in_start] + st_txt)
            chunkpos += (in_start - prev + len(st_txt))
            links.append((lk_txt, oldpos, newpos))
        elif kind == 'TBLCELL' or kind == 'TBLHDRC':  # Table cell break
            # NOTE(review): the separator literal as shown here is two
            # characters long while chunkpos advances by 3 -- confirm the
            # literal and the offset agree, otherwise every later tag
            # range is shifted by one.
            chunks.append(data[prev:in_start] + ': ')
            chunkpos += (in_start - prev + 3)
        elif kind == 'TBLHDRB':  # header start
            if prev != in_start:
                chunks.append(data[prev:in_start])
                chunkpos += (in_start - prev)
            # Remember where the header text starts in the output.
            bldpos = chunkpos
        elif kind == 'TBLHDRE':  # Header end
            if bldpos == -1:
                # Closing tag without an opening one: keep the tag text
                # and flag it.
                # NOTE(review): when the tag directly follows the previous
                # token (prev == in_start) nothing is kept or flagged and
                # the tag text is dropped -- confirm this is intended.
                if prev != in_start:
                    chunks.append(data[prev:in_end])
                    newpos = chunkpos - prev + in_end
                    reds.append((chunkpos + in_start - prev, newpos))
                    chunkpos = newpos
                print('Invalid table header, no start tag found')
            else:
                if prev != in_start:
                    chunks.append(data[prev:in_start])
                    chunkpos += (in_start - prev)
                # Everything since the header start becomes bold.
                bolds.append((bldpos, chunkpos))
                bldpos = -1
        elif kind == 'UNKNWN':
            # Keep the matched text in the output and flag it.
            chunks.append(data[prev:in_end])
            newpos = chunkpos - prev + in_end
            reds.append((chunkpos + in_start - prev, newpos))
            chunkpos = newpos
            print('Unexpected or unimplemented HTML tag', st_txt)
        else:
            print("shouldn't get here")
        prev = in_end
    # Text after the last token.
    chunks.append(data[prev:])
    result = StyledText().join(chunks)

    # Build one LINK tag per link and one tag per style holding all of
    # that style's ranges.
    tags = []
    for link in links:
        tags.append(
            StyledTextTag(StyledTextTagType.LINK, link[0],
                          [(link[1], link[2])]))
    if italics:
        tags.append(StyledTextTag(StyledTextTagType.ITALIC, False, italics))
    if bolds:
        tags.append(StyledTextTag(StyledTextTagType.BOLD, False, bolds))
    if unders:
        tags.append(
            StyledTextTag(StyledTextTagType.UNDERLINE, False, unders))
    if reds:
        # Flagged ranges are shown with a '#FFFF00' highlight.
        tags.append(
            StyledTextTag(StyledTextTagType.HIGHLIGHT, '#FFFF00', reds))
    return StyledText(result._string, tag_merge(result._tags, tags))