Python parse_table示例，cppman.formatter.tableparser.parse_table Python示例

示例#1

0

显示文件

文件： cppreference.py 项目： BufferNihility/cppman

def member_table_def(g):
    """Build a groff ``.IP`` entry followed by a rendered member table.

    ``g`` is a regex match object with at least three groups: group 1 is
    used as the ``.IP`` label, group 2 is emitted on the following line, and
    group 3 is wrapped in ``<table>`` tags and handed to ``parse_table``.

    Returns the assembled text as a string.
    """
    table_markup = '<table>%s</table>' % str(g.group(3))
    rendered = parse_table(table_markup)

    # A text block (T{ ... T}) whose content begins with '.' gets '\E '
    # inserted in front of it so the line no longer starts with a dot.
    dot_cell = re.compile(r'T{\n(\..*?)\nT}', re.S)
    rendered = dot_cell.sub(r'T{\n\\E \1\nT}', rendered)

    label = g.group(1)
    summary = g.group(2)
    return '\n.IP "%s"\n%s\n%s\n' % (label, summary, rendered)

示例#2

0

显示文件

文件： cppreference.py 项目： BufferNihility/cppman

def html2groff(data, name):
    """Convert HTML text from cppreference.com to Groff-formatted text.

    Args:
        data: HTML source of the page as a string.
        name: Name of the documented entity.  It prefixes member entries
            (``insert`` -> ``<name>::insert``) and is the value substituted
            for the ``{{name}}`` placeholder.

    Returns:
        The converted text as a string.
    """
    # Remove header and footer
    try:
        data = data[data.index('<div id="cpp-content-base">'):]
        # 25 == len('<div class="printfooter">'): the marker itself is kept.
        data = data[:data.index('<div class="printfooter">') + 25]
    except ValueError:
        # A marker is missing; carry on with whatever was trimmed so far.
        pass

    # Remove non-printable characters
    data = ''.join([x for x in data if x in string.printable])

    for table in re.findall(
            r'<table class="(?:wikitable|dsctable)"[^>]*>.*?</table>',
            data, re.S):
        tbl = parse_table(table)
        # Escape column with '.' as prefix
        tbl = re.compile(r'T{\n(\..*?)\nT}', re.S).sub(r'T{\n\\E \1\nT}', tbl)
        data = data.replace(table, tbl)

    # Pre replace all; each entry of rps is indexed as
    # (pattern, replacement, regex flags).
    for rp in rps:
        data = re.compile(rp[0], rp[2]).sub(rp[1], data)

    # Remove non-printable characters
    data = ''.join([x for x in data if x in string.printable])

    # Upper case all section headers.  The dot is escaped so that only a
    # literal '.SH' is matched, not any character followed by 'SH '.
    for st in re.findall(r'\.SH .*\n', data):
        data = data.replace(st, st.upper())

    # Add tags to member/inherited member functions
    # e.g. insert -> vector::insert
    #
    # .SE is a pseudo macro I created which means 'SECTION END'
    # The reason I use it is because I need a marker to know where section
    # ends.
    # re.findall find patterns which does not overlap, which means if I do
    # this: secs = re.findall(r'\n\.SH "(.+?)"(.+?)\.SH', data, re.S)
    # re.findall will skip the later .SH tag and thus skip the later section.
    # To fix this, '.SE' is used to mark the end of the section so the next
    # '.SH' can be find by re.findall

    try:
        idx = data.index('.IEND')
    except ValueError:
        idx = None

    def add_header_multi(prefix, g):
        """Prefix every comma-separated name in group 1 with ``prefix::``."""
        if ',' in g.group(1):
            res = ', '.join(['%s::%s' % (prefix, x.strip())
                            for x in g.group(1).split(',')])
        else:
            res = '%s::%s' % (prefix, g.group(1))

        return '\n.IP "%s"' % res

    if idx:
        class_name = name
        if class_name.startswith('std::'):
            normalized_class_name = class_name[len('std::'):]
        else:
            normalized_class_name = class_name
        # Only the text before the first '.IEND' is scanned here.
        class_member_content = data[:idx]
        secs = re.findall(r'\.SH "(.+?)"(.+?)\.SE', class_member_content, re.S)

        for sec, content in secs:
            # Member functions
            if ('MEMBER' in sec and
                'NON-MEMBER' not in sec and
                'INHERITED' not in sec and
                 sec != 'MEMBER TYPES'):
                content2 = re.sub(r'\n\.IP "([^:]+?)"',
                                  partial(add_header_multi, class_name),
                                  content)
                # Replace (constructor) (destructor)
                content2 = re.sub(r'\(constructor\)', r'%s' %
                                  normalized_class_name, content2)
                content2 = re.sub(r'\(destructor\)', r'~%s' %
                                  normalized_class_name, content2)
                data = data.replace(content, content2)

    blocks = re.findall(r'\.IBEGIN\s*(.+?)\s*\n(.+?)\.IEND', data, re.S)

    for inherited_class, content in blocks:
        content2 = re.sub(r'\.SH "(.+?)"', r'\n.SH "\1 INHERITED FROM %s"'
                          % inherited_class.upper(), content)
        data = data.replace(content, content2)

        # Section bodies are looked up in the original block text; they do
        # not include the header line that was just rewritten.
        secs = re.findall(r'\.SH "(.+?)"(.+?)\.SE', content, re.S)

        for sec, sec_content in secs:
            # Inherited member functions
            if 'MEMBER' in sec and \
               sec != 'MEMBER TYPES':
                content2 = re.sub(r'\n\.IP "(.+)"',
                                  partial(add_header_multi, inherited_class),
                                  sec_content)
                data = data.replace(sec_content, content2)

    # Remove unneeded pseudo macro.  Dots are escaped so ordinary text such
    # as a line starting with 'USE' is not stripped by accident.
    data = re.sub(r'(?:\n\.SE|\.IBEGIN.*?\n|\n\.IEND)', '', data)

    # Replace all macros
    desc_re = re.search(r'\.SH "DESCRIPTION"\n.*?([^\n\s].*?)\n', data)
    shortdesc = ''

    # not empty description
    if desc_re and not desc_re.group(1).startswith('.SH'):
        shortdesc = '- ' + desc_re.group(1)

    macros = dict(name=name, shortdesc=shortdesc)

    def dereference(g):
        """Return the value of a ``{{macro}}``; unknown macros become ''."""
        return macros.get(g.group(1), '')

    data = re.sub('{{(.*?)}}', dereference, data)

    return data

示例#3

0

显示文件

文件： cppreference.py 项目： kowr/cppman

def member_table_def(g):
    """Build a groff ``.IP`` entry followed by a rendered member table.

    ``g`` is a regex match object with at least three groups: group 1 is
    used as the ``.IP`` label, group 2 is emitted on the following line, and
    group 3 is wrapped in ``<table>`` tags and handed to ``parse_table``.

    Returns the assembled text as a string.
    """
    tbl = parse_table("<table>%s</table>" % str(g.group(3)))
    # Escape column with '.' as prefix.
    # The backslash must be doubled in the replacement template: a bare
    # '\E' is an unknown escape and raises re.error on Python 3.7+.
    tbl = re.compile(r"T{\n(\..*?)\nT}", re.S).sub(r"T{\n\\E \1\nT}", tbl)
    return '\n.IP "%s"\n%s\n%s\n' % (g.group(1), g.group(2), tbl)

示例#4

0

显示文件

文件： cppreference.py 项目： kowr/cppman

def html2groff(data, name):
    """Convert HTML text from cppreference.com to Groff-formatted text.

    Args:
        data: HTML source of the page as a string.
        name: Name of the documented entity.  It prefixes member entries
            (``insert`` -> ``<name>::insert``) and is the value substituted
            for the ``{{name}}`` placeholder.

    Returns:
        The converted text as a string.
    """
    # Remove header and footer
    try:
        data = data[data.index('<div id="cpp-content-base">') :]
        # 25 == len('<div class="printfooter">'): the marker itself is kept.
        data = data[: data.index('<div class="printfooter">') + 25]
    except ValueError:
        # A marker is missing; carry on with whatever was trimmed so far.
        pass

    # Remove non-printable characters
    data = "".join([x for x in data if x in string.printable])

    for table in re.findall(r'<table class="(?:wikitable|dsctable)"[^>]*>.*?</table>', data, re.S):
        tbl = parse_table(table)
        # Escape column with '.' as prefix.
        # The backslash must be doubled in the replacement template: a bare
        # '\E' is an unknown escape and raises re.error on Python 3.7+.
        tbl = re.compile(r"T{\n(\..*?)\nT}", re.S).sub(r"T{\n\\E \1\nT}", tbl)
        data = data.replace(table, tbl)

    # Pre replace all; each entry of rps is indexed as
    # (pattern, replacement, regex flags).
    for rp in rps:
        data = re.compile(rp[0], rp[2]).sub(rp[1], data)

    # Remove non-printable characters
    data = "".join([x for x in data if x in string.printable])

    # Upper case all section headers.  The dot is escaped so that only a
    # literal '.SH' is matched, not any character followed by 'SH '.
    for st in re.findall(r"\.SH .*\n", data):
        data = data.replace(st, st.upper())

    # Add tags to member/inherited member functions
    # e.g. insert -> vector::insert
    #
    # .SE is a pseudo macro I created which means 'SECTION END'
    # The reason I use it is because I need a marker to know where section
    # ends.
    # re.findall find patterns which does not overlap, which means if I do
    # this: secs = re.findall(r'\n\.SH "(.+?)"(.+?)\.SH', data, re.S)
    # re.findall will skip the later .SH tag and thus skip the later section.
    # To fix this, '.SE' is used to mark the end of the section so the next
    # '.SH' can be find by re.findall

    try:
        idx = data.index(".IEND")
    except ValueError:
        idx = None

    def add_header_multi(prefix, g):
        """Prefix every comma-separated name in group 1 with ``prefix::``."""
        if "," in g.group(1):
            res = ", ".join(["%s::%s" % (prefix, x.strip()) for x in g.group(1).split(",")])
        else:
            res = "%s::%s" % (prefix, g.group(1))

        return '\n.IP "%s"' % res

    if idx:
        class_name = name
        if class_name.startswith("std::"):
            normalized_class_name = class_name[len("std::") :]
        else:
            normalized_class_name = class_name
        # Only the text before the first '.IEND' is scanned here.
        class_member_content = data[:idx]
        secs = re.findall(r'\.SH "(.+?)"(.+?)\.SE', class_member_content, re.S)

        for sec, content in secs:
            # Member functions
            if "MEMBER" in sec and "NON-MEMBER" not in sec and "INHERITED" not in sec and sec != "MEMBER TYPES":
                content2 = re.sub(r'\n\.IP "([^:]+?)"', partial(add_header_multi, class_name), content)
                # Replace (constructor) (destructor)
                content2 = re.sub(r"\(constructor\)", r"%s" % normalized_class_name, content2)
                content2 = re.sub(r"\(destructor\)", r"~%s" % normalized_class_name, content2)
                data = data.replace(content, content2)

    blocks = re.findall(r"\.IBEGIN\s*(.+?)\s*\n(.+?)\.IEND", data, re.S)

    for inherited_class, content in blocks:
        content2 = re.sub(r'\.SH "(.+?)"', r'\n.SH "\1 INHERITED FROM %s"' % inherited_class.upper(), content)
        data = data.replace(content, content2)

        # Section bodies are looked up in the original block text; they do
        # not include the header line that was just rewritten.
        secs = re.findall(r'\.SH "(.+?)"(.+?)\.SE', content, re.S)

        for sec, sec_content in secs:
            # Inherited member functions
            if "MEMBER" in sec and sec != "MEMBER TYPES":
                content2 = re.sub(r'\n\.IP "(.+)"', partial(add_header_multi, inherited_class), sec_content)
                data = data.replace(sec_content, content2)

    # Remove unneeded pseudo macro.  Dots are escaped so ordinary text such
    # as a line starting with 'USE' is not stripped by accident.
    data = re.sub(r"(?:\n\.SE|\.IBEGIN.*?\n|\n\.IEND)", "", data)

    # Replace all macros
    desc_re = re.search(r'\.SH "DESCRIPTION"\n.*?([^\n\s].*?)\n', data)
    shortdesc = ""

    # not empty description
    if desc_re and not desc_re.group(1).startswith(".SH"):
        shortdesc = "- " + desc_re.group(1)

    macros = dict(name=name, shortdesc=shortdesc)

    def dereference(g):
        """Return the value of a ``{{macro}}``; unknown macros become ''."""
        return macros.get(g.group(1), "")

    data = re.sub("{{(.*?)}}", dereference, data)

    return data

示例#5

0

显示文件

文件： cplusplus.py 项目： zlsvn/cppman

def html2groff(data, name):
    """Convert HTML text from cplusplus.com to Groff-formatted text.

    Args:
        data: HTML source of the page as a string.
        name: Not read by this function; the class name is taken from the
            converted ``NAME`` section instead.

    Returns:
        The converted text as a string.
    """
    # Remove sidebar
    try:
        data = data[data.index('<div class="C_doc">'):]
    except ValueError:
        # Marker not found: convert the whole input.
        pass

    # Pre replace all; each entry is indexed as
    # (pattern, replacement, regex flags).
    for rp in pre_rps:
        data = re.compile(rp[0], rp[2]).sub(rp[1], data)

    for table in re.findall(r'<table.*?>.*?</table>', data, re.S):
        tbl = parse_table(escape_pre_section(table))
        # Escape column with '.' as prefix.
        # The backslash must be doubled in the replacement template: a bare
        # '\E' is an unknown escape and raises re.error on Python 3.7+.
        tbl = re.compile(r'T{\n(\..*?)\nT}', re.S).sub(r'T{\n\\E \1\nT}', tbl)
        data = data.replace(table, tbl)

    # Replace all
    for rp in rps:
        data = re.compile(rp[0], rp[2]).sub(rp[1], data)

    # Upper case all section headers.  The dot is escaped so that only a
    # literal '.SH' is matched, not any character followed by 'SH '.
    for st in re.findall(r'\.SH .*\n', data):
        data = data.replace(st, st.upper())

    # Add tags to member/inherited member functions
    # e.g. insert -> vector::insert
    #
    # .SE is a pseudo macro I created which means 'SECTION END'
    # The reason I use it is because I need a marker to know where section
    # ends.
    # re.findall find patterns which does not overlap, which means if I do
    # this: secs = re.findall(r'\n\.SH "(.+?)"(.+?)\.SH', data, re.S)
    # re.findall will skip the later .SH tag and thus skip the later section.
    # To fix this, '.SE' is used to mark the end of the section so the next
    # '.SH' can be find by re.findall

    page_type = re.search(r'\n\.SH "TYPE"\n(.+?)\n', data)
    if page_type and 'class' in page_type.group(1):
        # Unqualified class name: any leading 'xxx::' qualifier is dropped.
        class_name = re.search(r'\n\.SH "NAME"\n(?:.*::)?(.+?) ',
                               data).group(1)

        secs = re.findall(r'\n\.SH "(.+?)"(.+?)\.SE', data, re.S)

        for sec, content in secs:
            # Member functions
            if ('MEMBER' in sec and 'NON-MEMBER' not in sec
                    and 'INHERITED' not in sec and sec != 'MEMBER TYPES'):
                content2 = re.sub(r'\n\.IP "([^:]+?)"',
                                  r'\n.IP "%s::\1"' % class_name, content)
                # Replace (constructor) (destructor)
                content2 = re.sub(r'\(constructor\)', r'%s' % class_name,
                                  content2)
                content2 = re.sub(r'\(destructor\)', r'~%s' % class_name,
                                  content2)
                data = data.replace(content, content2)
            # Inherited member functions
            elif 'MEMBER' in sec and 'INHERITED' in sec:
                # Headers were upper-cased above, so lower-case the base
                # class name taken from the section title.
                inherit = re.search(r'.+?INHERITED FROM (.+)',
                                    sec).group(1).lower()
                content2 = re.sub(r'\n\.IP "(.+)"',
                                  r'\n.IP "%s::\1"' % inherit, content)
                data = data.replace(content, content2)

    # Remove pseudo macro '.SE'
    data = data.replace('\n.SE', '')

    return data

示例#6

0

显示文件

文件： cplusplus.py 项目： cbsmith/cppman

def html2groff(data, name):
    """Convert HTML text from cplusplus.com to Groff-formatted text.

    Args:
        data: HTML source of the page as a string.
        name: Not read by this function; the class name is taken from the
            converted ``NAME`` section instead.

    Returns:
        The converted text as a string.
    """
    # Remove sidebar
    try:
        data = data[data.index('<div class="C_doc">'):]
    except ValueError:
        # Marker not found: convert the whole input.
        pass

    # Pre replace all; each entry is indexed as
    # (pattern, replacement, regex flags).
    for rp in pre_rps:
        data = re.compile(rp[0], rp[2]).sub(rp[1], data)

    for table in re.findall(r'<table.*?>.*?</table>', data, re.S):
        tbl = parse_table(table)
        # Escape column with '.' as prefix.
        # The backslash must be doubled in the replacement template: a bare
        # '\E' is an unknown escape and raises re.error on Python 3.7+.
        tbl = re.compile(r'T{\n(\..*?)\nT}', re.S).sub(r'T{\n\\E \1\nT}', tbl)
        data = data.replace(table, tbl)

    # Replace all
    for rp in rps:
        data = re.compile(rp[0], rp[2]).sub(rp[1], data)

    # Upper case all section headers.  The dot is escaped so that only a
    # literal '.SH' is matched, not any character followed by 'SH '.
    for st in re.findall(r'\.SH .*\n', data):
        data = data.replace(st, st.upper())

    # Add tags to member/inherited member functions
    # e.g. insert -> vector::insert
    #
    # .SE is a pseudo macro I created which means 'SECTION END'
    # The reason I use it is because I need a marker to know where section
    # ends.
    # re.findall find patterns which does not overlap, which means if I do
    # this: secs = re.findall(r'\n\.SH "(.+?)"(.+?)\.SH', data, re.S)
    # re.findall will skip the later .SH tag and thus skip the later section.
    # To fix this, '.SE' is used to mark the end of the section so the next
    # '.SH' can be find by re.findall

    page_type = re.search(r'\n\.SH "TYPE"\n(.+?)\n', data)
    if page_type and 'class' in page_type.group(1):
        # Unqualified class name: any leading 'xxx::' qualifier is dropped.
        class_name = re.search(r'\n\.SH "NAME"\n(?:.*::)?(.+?) ', data).group(1)

        secs = re.findall(r'\n\.SH "(.+?)"(.+?)\.SE', data, re.S)

        for sec, content in secs:
            # Member functions.  Sections whose title contains 'NON-MEMBER'
            # also contain 'MEMBER', so they are excluded explicitly to keep
            # free functions from being tagged as class members.
            if ('MEMBER' in sec and 'NON-MEMBER' not in sec
                    and 'INHERITED' not in sec and sec != 'MEMBER TYPES'):
                content2 = re.sub(r'\n\.IP "([^:]+?)"', r'\n.IP "%s::\1"'
                                  % class_name, content)
                # Replace (constructor) (destructor)
                content2 = re.sub(r'\(constructor\)', r'%s' % class_name,
                                  content2)
                content2 = re.sub(r'\(destructor\)', r'~%s' % class_name,
                                  content2)
                data = data.replace(content, content2)
            # Inherited member functions
            elif 'MEMBER' in sec and 'INHERITED' in sec:
                # Headers were upper-cased above, so lower-case the base
                # class name taken from the section title.
                inherit = re.search(r'.+?INHERITED FROM (.+)',
                                    sec).group(1).lower()
                content2 = re.sub(r'\n\.IP "(.+)"', r'\n.IP "%s::\1"'
                                  % inherit, content)
                data = data.replace(content, content2)

    # Remove pseudo macro '.SE'
    data = data.replace('\n.SE', '')

    return data