Exemplo n.º 1
0
def parse_mediawiki_tree(input_file, output_file):
    """Convert the article bodies of a MediaWiki XML export to plain text.

    Streams ``input_file`` with ``ElementTree.iterparse``, picks every
    ``<text>`` element of the export-0.10 namespace, passes its content to
    ``parse_mediawiki_to_text`` and appends the result to ``output_file``.
    A progress line is printed for every article written.

    :arg input_file: Source accepted by ``ElementTree.iterparse`` holding the
            XML export.
    :arg output_file: Path of the text file to create; it is opened in ``'w'``
            mode with UTF-8 encoding, so existing content is overwritten.
    """
    count = 0
    xml_parser = ElementTree.iterparse(input_file)

    # NOTE(review): neither parser is used below. They are kept because their
    # construction may have side effects that are not visible from here --
    # confirm, then remove.
    mw_preprocessor = mediawiki_parser.preprocessor.make_parser({})
    mw_parser = wikitextParser.make_parser(toolset())

    # The context manager guarantees the output is flushed and closed, even
    # when parsing fails half-way through the dump.
    with open(output_file, 'w', encoding='utf-8') as output:
        for _event, element in xml_parser:
            if element.tag != "{http://www.mediawiki.org/xml/export-0.10/}text":
                continue

            text = element.text
            # The content has been copied out; release it from the tree so
            # the memory held by processed articles can be reclaimed.
            element.clear()

            # An empty <text/> element yields None (or ""): there is nothing
            # to parse and the size ratio below would be undefined.
            if not text:
                continue
            input_size = len(text)

            text = parse_mediawiki_to_text(text)
            if text is None:
                continue

            output.write(text)
            count += 1

            output_size = len(text)
            factor = (output_size / input_size) * 100

            # %M is minutes; %m would print the month number.
            timestamp = datetime.datetime.now().strftime("%H:%M:%S")
            print(
                f"[{timestamp}] Parsed {count} articles, minified to {factor:.01f}% ({input_size} -> {output_size} bytes)"
            )
Exemplo n.º 2
0
def make_parser(interwiki=None, namespaces=None):
    """Constructs the parser for the text backend.

    :arg interwiki: Dict of the allowed interwiki prefixes (en, fr, es, commons, etc.).
            Defaults to a new empty dict.
    :arg namespaces: Dict of the namespaces of the wiki (File, Category, Template, etc.),
            including the localized version of those strings (Modele, Categorie, etc.),
            associated to the corresponding namespace code.
            Defaults to a new empty dict.
    :returns: The value of ``wikitextParser.make_parser`` for the toolset built
            from the two dicts.
    """
    # Create the defaults per call: a literal ``{}`` in the signature would be
    # one shared object handed to every toolset.
    if interwiki is None:
        interwiki = {}
    if namespaces is None:
        namespaces = {}
    tools = toolset(interwiki, namespaces)
    return wikitextParser.make_parser(tools)
Exemplo n.º 3
0
def make_parser(allowed_tags=None,
                allowed_autoclose_tags=None,
                allowed_attributes=None,
                interwiki=None,
                namespaces=None):
    """Constructs the parser for the HTML backend.

    Every argument defaults to a new empty container when omitted.

    :arg allowed_tags: List of the HTML tags that should be allowed in the parsed wikitext.
            Opening tags will be closed. Closing tags with no opening tag will be removed.
            All the tags that are not in the list will be output as <tag>.
    :arg allowed_autoclose_tags: List of the self-closing tags that should be allowed in the
            parsed wikitext. All the other self-closing tags will be output as <tag />
    :arg allowed_attributes: List of the HTML attributes that should be allowed in the parsed
            tags (e.g.: class="", style=""). All the other attributes (e.g.: onclick="") will
            be removed.
    :arg interwiki: Dict of the allowed interwiki prefixes (en, fr, es, commons, etc.)
    :arg namespaces: Dict of the namespaces of the wiki (File, Category, Template, etc.),
            including the localized version of those strings (Modele, Categorie, etc.),
            associated to the corresponding namespace code.
    :returns: The value of ``wikitextParser.make_parser`` for the toolset built
            from the arguments.
    """
    # Create the defaults per call: literal ``[]`` / ``{}`` in the signature
    # would be single shared objects handed to every toolset.
    if allowed_tags is None:
        allowed_tags = []
    if allowed_autoclose_tags is None:
        allowed_autoclose_tags = []
    if allowed_attributes is None:
        allowed_attributes = []
    if interwiki is None:
        interwiki = {}
    if namespaces is None:
        namespaces = {}
    tools = toolset(allowed_tags, allowed_autoclose_tags, allowed_attributes,
                    interwiki, namespaces)
    return wikitextParser.make_parser(tools)
Exemplo n.º 4
0
def make_parser():
    """Build a wikitext parser from a toolset created with no arguments.

    :returns: Whatever ``wikitextParser.make_parser`` returns for that toolset.
    """
    tools = toolset()
    return wikitextParser.make_parser(tools)