def member_table_def(g):
    """Render one matched member-table entry as a Groff ``.IP`` paragraph.

    ``g`` is a regex match object (presumably passed in by ``re.sub`` when
    this function is used as a replacement callback -- confirm at the call
    site).  Group 1 becomes the ``.IP`` label, group 2 is emitted on the
    line below it, and group 3 holds the table rows, which are wrapped in
    ``<table>`` tags and converted by ``parse_table``.
    """
    rows_html = '<table>%s</table>' % str(g.group(3))
    rendered = parse_table(rows_html)

    # Escape column with '.' as prefix: a 'T{ ... T}' cell whose text starts
    # with a dot gets '\E ' put in front of it, so the line no longer begins
    # with '.'.
    dot_cell = re.compile(r'T{\n(\..*?)\nT}', re.S)
    rendered = dot_cell.sub(r'T{\n\\E \1\nT}', rendered)

    label, text = g.group(1), g.group(2)
    return '\n.IP "%s"\n%s\n%s\n' % (label, text, rendered)
def html2groff(data, name):
    """Convert HTML text from cppreference.com to Groff-formatted text.

    Args:
        data: HTML source of one page, as text.
        name: Page name (e.g. ``std::vector``).  It is used as the prefix
            for member entries and substituted for ``{{name}}``.

    Returns:
        The Groff-formatted text.

    Relies on the module-level ``rps`` sequence of
    ``(pattern, replacement, flags)`` triples and on ``parse_table``.
    """
    # Remove header and footer.  This is best effort: when a marker is
    # missing the text is kept as it is at that point.
    footer = '<div class="printfooter">'
    try:
        data = data[data.index('<div id="cpp-content-base">'):]
        data = data[:data.index(footer) + len(footer)]
    except ValueError:
        pass

    printable = frozenset(string.printable)

    # Remove non-printable characters
    data = ''.join([x for x in data if x in printable])

    # Escape column with '.' as prefix: a 'T{ ... T}' cell starting with a
    # dot gets '\E ' put in front of it.
    dot_cell = re.compile(r'T{\n(\..*?)\nT}', re.S)
    for table in re.findall(
            r'<table class="(?:wikitable|dsctable)"[^>]*>.*?</table>',
            data, re.S):
        tbl = parse_table(table)
        tbl = dot_cell.sub(r'T{\n\\E \1\nT}', tbl)
        data = data.replace(table, tbl)

    # Pre replace all
    for rp in rps:
        data = re.compile(rp[0], rp[2]).sub(rp[1], data)

    # Remove non-printable characters (the replacements may add some)
    data = ''.join([x for x in data if x in printable])

    # Upper case all section headers.  The dot is escaped so that only real
    # '.SH' lines are touched, not any text that merely contains 'SH '.
    data = re.sub(r'\.SH .*\n', lambda m: m.group(0).upper(), data)

    # Add tags to member/inherited member functions
    # e.g. insert -> vector::insert
    #
    # '.SE' is a pseudo macro meaning 'SECTION END'.  re.findall only
    # returns non-overlapping matches, so a pattern that ends a section at
    # the next '.SH' would consume that header and skip every other
    # section.  Ending each section at '.SE' leaves the next '.SH' free to
    # start the following match.
    try:
        idx = data.index('.IEND')
    except ValueError:
        idx = None

    def add_header_multi(prefix, g):
        # 'a, b' -> 'prefix::a, prefix::b'; a single name is prefixed as is.
        if ',' in g.group(1):
            res = ', '.join(['%s::%s' % (prefix, x.strip())
                             for x in g.group(1).split(',')])
        else:
            res = '%s::%s' % (prefix, g.group(1))
        return '\n.IP "%s"' % res

    # 'is not None' rather than truthiness: index 0 is a valid position.
    if idx is not None:
        class_name = name
        if class_name.startswith('std::'):
            normalized_class_name = class_name[len('std::'):]
        else:
            normalized_class_name = class_name

        # Everything before the first '.IEND' is the class's own content.
        class_member_content = data[:idx]
        secs = re.findall(r'\.SH "(.+?)"(.+?)\.SE', class_member_content,
                          re.S)

        for sec, content in secs:
            # Member functions
            if ('MEMBER' in sec and
                    'NON-MEMBER' not in sec and
                    'INHERITED' not in sec and
                    sec != 'MEMBER TYPES'):
                content2 = re.sub(r'\n\.IP "([^:]+?)"',
                                  partial(add_header_multi, class_name),
                                  content)
                # Replace (constructor) (destructor)
                content2 = re.sub(r'\(constructor\)',
                                  r'%s' % normalized_class_name, content2)
                content2 = re.sub(r'\(destructor\)',
                                  r'~%s' % normalized_class_name, content2)
                data = data.replace(content, content2)

    blocks = re.findall(r'\.IBEGIN\s*(.+?)\s*\n(.+?)\.IEND', data, re.S)

    for inherited_class, block in blocks:
        retitled = re.sub(r'\.SH "(.+?)"',
                          r'\n.SH "\1 INHERITED FROM %s"'
                          % inherited_class.upper(),
                          block)
        data = data.replace(block, retitled)

        # Sections are looked up in the original block text on purpose:
        # 'sec' then still holds the un-suffixed title, and the section
        # bodies are unchanged by the retitling above.
        for sec, content in re.findall(r'\.SH "(.+?)"(.+?)\.SE', block,
                                       re.S):
            # Inherited member functions
            if 'MEMBER' in sec and sec != 'MEMBER TYPES':
                content2 = re.sub(r'\n\.IP "(.+)"',
                                  partial(add_header_multi, inherited_class),
                                  content)
                data = data.replace(content, content2)

    # Remove unneeded pseudo macros (dots escaped so that ordinary lines
    # such as one starting with 'USE' are left alone)
    data = re.sub(r'(?:\n\.SE|\.IBEGIN.*?\n|\n\.IEND)', '', data)

    # Replace all macros
    desc_re = re.search(r'\.SH "DESCRIPTION"\n.*?([^\n\s].*?)\n', data)
    shortdesc = ''

    # not empty description
    if desc_re and not desc_re.group(1).startswith('.SH'):
        shortdesc = '- ' + desc_re.group(1)

    def dereference(g):
        # Unknown '{{...}}' placeholders are dropped from the output.
        d = dict(name=name, shortdesc=shortdesc)
        return d.get(g.group(1), '')

    data = re.sub('{{(.*?)}}', dereference, data)

    return data
def member_table_def(g):
    """Render one matched member-table entry as a Groff ``.IP`` paragraph.

    Args:
        g: A regex match object.  Group 1 is used as the ``.IP`` label,
            group 2 is emitted on the line below it, and group 3 holds the
            table rows, which are wrapped in ``<table>`` tags and converted
            by ``parse_table``.

    Returns:
        The Groff text for the entry, starting and ending with a newline.
    """
    tbl = parse_table("<table>%s</table>" % str(g.group(3)))

    # Escape column with '.' as prefix: a 'T{ ... T}' cell whose text starts
    # with a dot gets '\E ' put in front of it.  The backslash must be
    # doubled in the replacement template: a bare '\E' is an unknown escape
    # and makes re raise "bad escape \E" on Python 3.7+.
    tbl = re.compile(r"T{\n(\..*?)\nT}", re.S).sub(r"T{\n\\E \1\nT}", tbl)

    return '\n.IP "%s"\n%s\n%s\n' % (g.group(1), g.group(2), tbl)
def html2groff(data, name):
    """Convert HTML text from cppreference.com to Groff-formatted text.

    Args:
        data: HTML source of one page, as text.
        name: Page name (e.g. ``std::vector``).  It is used as the prefix
            for member entries and substituted for ``{{name}}``.

    Returns:
        The Groff-formatted text.

    Relies on the module-level ``rps`` sequence of
    ``(pattern, replacement, flags)`` triples and on ``parse_table``.
    """
    # Remove header and footer.  This is best effort: when a marker is
    # missing the text is kept as it is at that point.
    footer = '<div class="printfooter">'
    try:
        data = data[data.index('<div id="cpp-content-base">') :]
        data = data[: data.index(footer) + len(footer)]
    except ValueError:
        pass

    printable = frozenset(string.printable)

    # Remove non-printable characters
    data = "".join([x for x in data if x in printable])

    # Escape column with '.' as prefix: a 'T{ ... T}' cell starting with a
    # dot gets '\E ' put in front of it.  The backslash is doubled in the
    # replacement template because a bare '\E' is an unknown escape, which
    # re rejects with "bad escape \E" on Python 3.7+.
    dot_cell = re.compile(r"T{\n(\..*?)\nT}", re.S)
    for table in re.findall(
        r'<table class="(?:wikitable|dsctable)"[^>]*>.*?</table>', data, re.S
    ):
        tbl = parse_table(table)
        tbl = dot_cell.sub(r"T{\n\\E \1\nT}", tbl)
        data = data.replace(table, tbl)

    # Pre replace all
    for rp in rps:
        data = re.compile(rp[0], rp[2]).sub(rp[1], data)

    # Remove non-printable characters (the replacements may add some)
    data = "".join([x for x in data if x in printable])

    # Upper case all section headers.  The dot is escaped so that only real
    # '.SH' lines are touched, not any text that merely contains 'SH '.
    data = re.sub(r"\.SH .*\n", lambda m: m.group(0).upper(), data)

    # Add tags to member/inherited member functions
    # e.g. insert -> vector::insert
    #
    # '.SE' is a pseudo macro meaning 'SECTION END'.  re.findall only
    # returns non-overlapping matches, so a pattern that ends a section at
    # the next '.SH' would consume that header and skip every other
    # section.  Ending each section at '.SE' leaves the next '.SH' free to
    # start the following match.
    try:
        idx = data.index(".IEND")
    except ValueError:
        idx = None

    def add_header_multi(prefix, g):
        # 'a, b' -> 'prefix::a, prefix::b'; a single name is prefixed as is.
        if "," in g.group(1):
            res = ", ".join(
                ["%s::%s" % (prefix, x.strip()) for x in g.group(1).split(",")]
            )
        else:
            res = "%s::%s" % (prefix, g.group(1))
        return '\n.IP "%s"' % res

    # 'is not None' rather than truthiness: index 0 is a valid position.
    if idx is not None:
        class_name = name
        if class_name.startswith("std::"):
            normalized_class_name = class_name[len("std::") :]
        else:
            normalized_class_name = class_name

        # Everything before the first '.IEND' is the class's own content.
        class_member_content = data[:idx]
        secs = re.findall(r'\.SH "(.+?)"(.+?)\.SE', class_member_content, re.S)

        for sec, content in secs:
            # Member functions
            if (
                "MEMBER" in sec
                and "NON-MEMBER" not in sec
                and "INHERITED" not in sec
                and sec != "MEMBER TYPES"
            ):
                content2 = re.sub(
                    r'\n\.IP "([^:]+?)"',
                    partial(add_header_multi, class_name),
                    content,
                )
                # Replace (constructor) (destructor)
                content2 = re.sub(
                    r"\(constructor\)", r"%s" % normalized_class_name, content2
                )
                content2 = re.sub(
                    r"\(destructor\)", r"~%s" % normalized_class_name, content2
                )
                data = data.replace(content, content2)

    blocks = re.findall(r"\.IBEGIN\s*(.+?)\s*\n(.+?)\.IEND", data, re.S)

    for inherited_class, block in blocks:
        retitled = re.sub(
            r'\.SH "(.+?)"',
            r'\n.SH "\1 INHERITED FROM %s"' % inherited_class.upper(),
            block,
        )
        data = data.replace(block, retitled)

        # Sections are looked up in the original block text on purpose:
        # 'sec' then still holds the un-suffixed title, and the section
        # bodies are unchanged by the retitling above.
        for sec, content in re.findall(r'\.SH "(.+?)"(.+?)\.SE', block, re.S):
            # Inherited member functions
            if "MEMBER" in sec and sec != "MEMBER TYPES":
                content2 = re.sub(
                    r'\n\.IP "(.+)"',
                    partial(add_header_multi, inherited_class),
                    content,
                )
                data = data.replace(content, content2)

    # Remove unneeded pseudo macros (dots escaped so that ordinary lines
    # such as one starting with 'USE' are left alone)
    data = re.sub(r"(?:\n\.SE|\.IBEGIN.*?\n|\n\.IEND)", "", data)

    # Replace all macros
    desc_re = re.search(r'\.SH "DESCRIPTION"\n.*?([^\n\s].*?)\n', data)
    shortdesc = ""

    # not empty description
    if desc_re and not desc_re.group(1).startswith(".SH"):
        shortdesc = "- " + desc_re.group(1)

    def dereference(g):
        # Unknown '{{...}}' placeholders are dropped from the output.
        d = dict(name=name, shortdesc=shortdesc)
        return d.get(g.group(1), "")

    data = re.sub("{{(.*?)}}", dereference, data)

    return data
def html2groff(data, name):
    """Convert HTML text from cplusplus.com to Groff-formatted text.

    Args:
        data: HTML source of one page, as text.
        name: Not used by this converter; the class name is read from the
            generated ``NAME`` section instead.

    Returns:
        The Groff-formatted text.

    Raises:
        AttributeError: If the page is typed as a class but no ``NAME``
            section (or no ``INHERITED FROM`` suffix on an inherited
            section) can be found, because the failed ``re.search`` result
            is dereferenced directly.

    Relies on the module-level ``pre_rps`` and ``rps`` sequences of
    ``(pattern, replacement, flags)`` triples, and on ``parse_table`` and
    ``escape_pre_section``.
    """
    # Remove sidebar
    try:
        data = data[data.index('<div class="C_doc">'):]
    except ValueError:
        pass

    # Pre replace all
    for rp in pre_rps:
        data = re.compile(rp[0], rp[2]).sub(rp[1], data)

    # Escape column with '.' as prefix: a 'T{ ... T}' cell starting with a
    # dot gets '\E ' put in front of it.  The backslash is doubled in the
    # replacement template because a bare '\E' is an unknown escape, which
    # re rejects with "bad escape \E" on Python 3.7+.
    dot_cell = re.compile(r'T{\n(\..*?)\nT}', re.S)
    for table in re.findall(r'<table.*?>.*?</table>', data, re.S):
        tbl = parse_table(escape_pre_section(table))
        tbl = dot_cell.sub(r'T{\n\\E \1\nT}', tbl)
        data = data.replace(table, tbl)

    # Replace all
    for rp in rps:
        data = re.compile(rp[0], rp[2]).sub(rp[1], data)

    # Upper case all section headers.  The dot is escaped so that only real
    # '.SH' lines are touched, not any text that merely contains 'SH '.
    data = re.sub(r'\.SH .*\n', lambda m: m.group(0).upper(), data)

    # Add tags to member/inherited member functions
    # e.g. insert -> vector::insert
    #
    # '.SE' is a pseudo macro meaning 'SECTION END'.  re.findall only
    # returns non-overlapping matches, so a pattern that ends a section at
    # the next '.SH' would consume that header and skip every other
    # section.  Ending each section at '.SE' leaves the next '.SH' free to
    # start the following match.
    page_type = re.search(r'\n\.SH "TYPE"\n(.+?)\n', data)
    if page_type and 'class' in page_type.group(1):
        # The name is the text after any 'ns::' qualifier, up to the first
        # space on the line that follows the NAME header.
        class_name = re.search(r'\n\.SH "NAME"\n(?:.*::)?(.+?) ',
                               data).group(1)

        secs = re.findall(r'\n\.SH "(.+?)"(.+?)\.SE', data, re.S)

        for sec, content in secs:
            # Member functions
            if ('MEMBER' in sec and
                    'NON-MEMBER' not in sec and
                    'INHERITED' not in sec and
                    sec != 'MEMBER TYPES'):
                content2 = re.sub(r'\n\.IP "([^:]+?)"',
                                  r'\n.IP "%s::\1"' % class_name,
                                  content)
                # Replace (constructor) (destructor)
                content2 = re.sub(r'\(constructor\)', r'%s' % class_name,
                                  content2)
                content2 = re.sub(r'\(destructor\)', r'~%s' % class_name,
                                  content2)
                data = data.replace(content, content2)
            # Inherited member functions
            elif 'MEMBER' in sec and 'INHERITED' in sec:
                inherit = re.search(r'.+?INHERITED FROM (.+)',
                                    sec).group(1).lower()
                content2 = re.sub(r'\n\.IP "(.+)"',
                                  r'\n.IP "%s::\1"' % inherit,
                                  content)
                data = data.replace(content, content2)

    # Remove pseudo macro '.SE'
    data = data.replace('\n.SE', '')

    return data
def html2groff(data, name):
    """Convert HTML text from cplusplus.com to Groff-formatted text.

    Args:
        data: HTML source of one page, as text.
        name: Not used by this converter; the class name is read from the
            generated ``NAME`` section instead.

    Returns:
        The Groff-formatted text.

    Raises:
        AttributeError: If the page is typed as a class but no ``NAME``
            section (or no ``INHERITED FROM`` suffix on an inherited
            section) can be found, because the failed ``re.search`` result
            is dereferenced directly.

    Relies on the module-level ``pre_rps`` and ``rps`` sequences of
    ``(pattern, replacement, flags)`` triples and on ``parse_table``.
    """
    # Remove sidebar
    try:
        data = data[data.index('<div class="C_doc">'):]
    except ValueError:
        pass

    # Pre replace all
    for rp in pre_rps:
        data = re.compile(rp[0], rp[2]).sub(rp[1], data)

    # Escape column with '.' as prefix: a 'T{ ... T}' cell starting with a
    # dot gets '\E ' put in front of it.  The backslash is doubled in the
    # replacement template because a bare '\E' is an unknown escape, which
    # re rejects with "bad escape \E" on Python 3.7+.
    dot_cell = re.compile(r'T{\n(\..*?)\nT}', re.S)
    for table in re.findall(r'<table.*?>.*?</table>', data, re.S):
        tbl = parse_table(table)
        tbl = dot_cell.sub(r'T{\n\\E \1\nT}', tbl)
        data = data.replace(table, tbl)

    # Replace all
    for rp in rps:
        data = re.compile(rp[0], rp[2]).sub(rp[1], data)

    # Upper case all section headers.  The dot is escaped so that only real
    # '.SH' lines are touched, not any text that merely contains 'SH '.
    data = re.sub(r'\.SH .*\n', lambda m: m.group(0).upper(), data)

    # Add tags to member/inherited member functions
    # e.g. insert -> vector::insert
    #
    # '.SE' is a pseudo macro meaning 'SECTION END'.  re.findall only
    # returns non-overlapping matches, so a pattern that ends a section at
    # the next '.SH' would consume that header and skip every other
    # section.  Ending each section at '.SE' leaves the next '.SH' free to
    # start the following match.
    page_type = re.search(r'\n\.SH "TYPE"\n(.+?)\n', data)
    if page_type and 'class' in page_type.group(1):
        # The name is the text after any 'ns::' qualifier, up to the first
        # space on the line that follows the NAME header.
        class_name = re.search(r'\n\.SH "NAME"\n(?:.*::)?(.+?) ',
                               data).group(1)

        secs = re.findall(r'\n\.SH "(.+?)"(.+?)\.SE', data, re.S)

        for sec, content in secs:
            # Member functions.  'NON-MEMBER' sections also contain the
            # word 'MEMBER' and must be skipped, otherwise free functions
            # would be tagged as Class::function.
            if ('MEMBER' in sec and
                    'NON-MEMBER' not in sec and
                    'INHERITED' not in sec and
                    sec != 'MEMBER TYPES'):
                content2 = re.sub(r'\n\.IP "([^:]+?)"',
                                  r'\n.IP "%s::\1"' % class_name,
                                  content)
                # Replace (constructor) (destructor)
                content2 = re.sub(r'\(constructor\)', r'%s' % class_name,
                                  content2)
                content2 = re.sub(r'\(destructor\)', r'~%s' % class_name,
                                  content2)
                data = data.replace(content, content2)
            # Inherited member functions
            elif 'MEMBER' in sec and 'INHERITED' in sec:
                inherit = re.search(r'.+?INHERITED FROM (.+)',
                                    sec).group(1).lower()
                content2 = re.sub(r'\n\.IP "(.+)"',
                                  r'\n.IP "%s::\1"' % inherit,
                                  content)
                data = data.replace(content, content2)

    # Remove pseudo macro '.SE'
    data = data.replace('\n.SE', '')

    return data