Example #1
def extract_snippets(wikitext, minlen, maxlen):
    snippets = [] # [section, [snippets]]

    wikicode = fast_parse(wikitext)
    if wikicode is None:
        # Fall back to full parsing if fast parsing fails
        wikicode = mwparserfromhell.parse(wikitext)
    sections = wikicode.get_sections(
        include_lead = True, include_headings = True, flat = True)

    for i, section in enumerate(sections):
        assert i == 0 or \
            isinstance(section.get(0), mwparserfromhell.nodes.heading.Heading)
        sectitle = unicode(section.get(0).title.strip()) if i != 0 else ''
        secsnippets = []
        snippets.append([sectitle, secsnippets])

        paragraphs = section.split('\n\n')
        for paragraph in paragraphs:
            # Invoking a string method on a Wikicode object returns a string,
            # so we need to parse it again :(
            wikicode = mwparserfromhell.parse(paragraph)

            blacklisted_tag_or_template = itertools.chain(
                (tag.tag in cfg.tags_blacklist
                    for tag in wikicode.filter_tags()),
                (matches_any(tpl, cfg.templates_blacklist)
                    for tpl in wikicode.filter_templates()),
            )
            if any(blacklisted_tag_or_template):
                continue

            snippet = cleanup_snippet(wikicode.strip_code())
            if '\n' in snippet:
                # Lists cause more 'paragraphs' to be generated
                paragraphs.extend(snippet.split('\n'))
                continue

            if CITATION_NEEDED_MARKER not in snippet:
                # marker may have been inside wiki markup
                continue

            usable_len = (
                len(snippet) -
                (len(CITATION_NEEDED_MARKER) *
                    snippet.count(CITATION_NEEDED_MARKER)) -
                (len(REF_MARKER) *
                    snippet.count(REF_MARKER)))
            if usable_len > maxlen or usable_len < minlen:
                continue
            secsnippets.append(snippet)
    return snippets
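The usable-length check near the end of extract_snippets discounts the sentinel markers before comparing against minlen and maxlen, so the marker strings themselves do not count toward a snippet's size. Below is a minimal sketch of that arithmetic, using hypothetical placeholder values for CITATION_NEEDED_MARKER and REF_MARKER (the real markers are module-level constants defined elsewhere):

# Hypothetical sentinel values; the real markers are defined in the module.
CITATION_NEEDED_MARKER = '[CITATION_NEEDED]'
REF_MARKER = '[REF]'

def usable_length(snippet):
    # Same arithmetic as extract_snippets: subtract the total length
    # contributed by every occurrence of the two markers.
    return (len(snippet)
            - len(CITATION_NEEDED_MARKER) * snippet.count(CITATION_NEEDED_MARKER)
            - len(REF_MARKER) * snippet.count(REF_MARKER))

assert usable_length('Some claim.[CITATION_NEEDED]') == len('Some claim.')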
Example #2
def extract_snippets(wikitext, minlen = 80, maxlen = 560):
    snippets = [] # [section, [snippets]]

    sections = mwparserfromhell.parse(wikitext).get_sections(
        include_lead = True, include_headings = True, flat = True)
    assert ''.join(unicode(s) for s in sections) == d(wikitext)

    for i, section in enumerate(sections):
        assert i == 0 or \
            isinstance(section.get(0), mwparserfromhell.nodes.heading.Heading)
        sectitle = unicode(section.get(0).title.strip()) if i != 0 else ''
        secsnippets = []
        snippets.append([sectitle, secsnippets])

        paragraphs = section.split('\n\n')
        for paragraph in paragraphs:
            wikicode = mwparserfromhell.parse(paragraph)

            blacklisted_tag_or_template = itertools.chain(
                (tag.tag in cfg.tags_blacklist
                    for tag in wikicode.filter_tags()),
                (matches_any(tpl, cfg.templates_blacklist)
                    for tpl in wikicode.filter_templates()),
            )
            if any(blacklisted_tag_or_template):
                continue

            snippet = cleanup_snippet(wikicode.strip_code())
            if '\n' in snippet:
                # Lists cause more 'paragraphs' to be generated
                paragraphs.extend(snippet.split('\n'))
                continue

            if CITATION_NEEDED_MARKER not in snippet:
                # marker may have been inside wiki markup
                continue

            usable_len = (
                len(snippet) -
                (len(CITATION_NEEDED_MARKER) *
                    snippet.count(CITATION_NEEDED_MARKER)) -
                (len(REF_MARKER) *
                    snippet.count(REF_MARKER)))
            if usable_len > maxlen or usable_len < minlen:
                continue
            secsnippets.append(snippet)
    return snippets
Example #3
def extract_snippets(wikitext, minlen=80, maxlen=560):
    snippets = []  # [section, [snippets]]

    sections = mwparserfromhell.parse(wikitext).get_sections(
        include_lead=True, include_headings=True, flat=True)
    assert ''.join(unicode(s) for s in sections) == d(wikitext)

    for i, section in enumerate(sections):
        assert i == 0 or \
            isinstance(section.get(0), mwparserfromhell.nodes.heading.Heading)
        sectitle = unicode(section.get(0).title.strip()) if i != 0 else ''
        secsnippets = []
        snippets.append([sectitle, secsnippets])

        paragraphs = section.split('\n\n')
        for paragraph in paragraphs:
            wikicode = mwparserfromhell.parse(paragraph)

            blacklisted_tag_or_template = itertools.chain(
                (tag.tag in cfg.tags_blacklist
                 for tag in wikicode.filter_tags()),
                (matches_any(tpl, cfg.templates_blacklist)
                 for tpl in wikicode.filter_templates()),
            )
            if any(blacklisted_tag_or_template):
                continue

            snippet = cleanup_snippet(wikicode.strip_code())
            if '\n' in snippet:
                # Lists cause more 'paragraphs' to be generated
                paragraphs.extend(snippet.split('\n'))
                continue

            if CITATION_NEEDED_MARKER not in snippet:
                # marker may have been inside wiki markup
                continue

            usable_len = (len(snippet) -
                          (len(CITATION_NEEDED_MARKER) *
                           snippet.count(CITATION_NEEDED_MARKER)) -
                          (len(REF_MARKER) * snippet.count(REF_MARKER)))
            if usable_len > maxlen or usable_len < minlen:
                continue
            secsnippets.append(snippet)
    return snippets
Example #4
def extract_snippets(wikitext, minlen, maxlen):
    """Extracts snippets lacking citations.

    This function searches the `wikitext` passed as a parameter for snippets
    of the article that are marked with any of the templates in
    `cfg.citation_needed_templates`, and returns those that are greater than
    `minlen` but smaller than `maxlen`.

    The return value is a list of lists of the form:
        [
            [<section1>, [<snippet1>, <snippet2>, ...]],
            [<section2>, [<snippet1>, ...]],
            ...
        ]
    """

    snippets = []  # [section, [snippets]]

    wikicode = fast_parse(wikitext)
    if wikicode is None:
        # Fall back to full parsing if fast parsing fails
        wikicode = mwparserfromhell.parse(wikitext)
    sections = wikicode.get_sections(include_lead=True,
                                     include_headings=True,
                                     flat=True)

    for i, section in enumerate(sections):
        assert i == 0 or \
            isinstance(section.get(0), mwparserfromhell.nodes.heading.Heading)
        sectitle = unicode(section.get(0).title.strip()) if i != 0 else ''
        secsnippets = []
        snippets.append([sectitle, secsnippets])

        paragraphs = section.split('\n\n')
        for paragraph in paragraphs:
            # Invoking a string method on a Wikicode object returns a string,
            # so we need to parse it again :(
            wikicode = mwparserfromhell.parse(paragraph)

            blacklisted_tag_or_template = itertools.chain(
                (tag.tag in cfg.tags_blacklist
                 for tag in wikicode.filter_tags()),
                (matches_any(tpl, cfg.templates_blacklist)
                 for tpl in wikicode.filter_templates()),
            )
            if any(blacklisted_tag_or_template):
                continue

            snippet = cleanup_snippet(wikicode.strip_code())
            if '\n' in snippet:
                # Lists cause more 'paragraphs' to be generated
                paragraphs.extend(snippet.split('\n'))
                continue

            if CITATION_NEEDED_MARKER not in snippet:
                # marker may have been inside wiki markup
                continue

            usable_len = (len(snippet) -
                          (len(CITATION_NEEDED_MARKER) *
                           snippet.count(CITATION_NEEDED_MARKER)) -
                          (len(REF_MARKER) * snippet.count(REF_MARKER)))
            if usable_len > maxlen or usable_len < minlen:
                continue
            secsnippets.append(snippet)
    return snippets
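The docstring above describes the return value as a list of [section, [snippets]] pairs. A hedged usage sketch, assuming page_wikitext is a variable holding the raw wikitext of one article and that the surrounding module (cfg, markers, helpers) is already set up:

# Hypothetical caller walking the documented [section, [snippets]] structure.
for section_title, section_snippets in extract_snippets(page_wikitext, 80, 560):
    if not section_snippets:
        continue
    heading = section_title or '(lead section)'
    for snippet in section_snippets:
        print('%s: %d characters' % (heading, len(snippet)))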
Example #5
def extract_sections(wikitext, minlen=None, maxlen=None):
    """Extracts sections/subsections lacking citations.

    This function looks for sections of the article that are marked with any of
    the templates in `cfg.citation_needed_templates`. The output is meant to be
    converted into HTML by the Wikimedia API.

    The return value is a list of lists of the form:
        [
            [<section1>, [<subsection1>, <subsection2>, ...]],
            [<section2>, [<subsection1>, ...]],
            ...
        ]
    """

    snippets = []  # [section, [snippets]]
    sections = mwparserfromhell.parse(wikitext).get_sections(
        include_lead=True, include_headings=True, flat=True)

    i = 0
    while i < len(sections):
        section = sections[i]
        assert i == 0 or \
            isinstance(section.get(0), mwparserfromhell.nodes.heading.Heading)
        sectitle = unicode(section.get(0).title.strip()) if i != 0 else ''
        seclevel = section.get(0).level if i != 0 else float('inf')
        secsnippets = []
        snippets.append([sectitle, secsnippets])
        i += 1

        for tpl in section.filter_templates():
            if matches_any(tpl, cfg.citation_needed_templates):
                break
        else:
            # This section doesn't need references, move on to the next one
            continue

        # Consume the following sections until we find another one at the
        # same level (or the end of the wikicode). All of that needs references.
        nodes = section.nodes
        while i < len(sections):
            subsection = sections[i]
            if subsection.get(0).level <= seclevel:
                # not really a subsection
                break
            nodes.extend(subsection.nodes)
            i += 1

        if not nodes:
            # weird, looks like this section was really empty!
            continue

        wikicode = mwparserfromhell.parse(
            mwparserfromhell.wikicode.Wikicode(nodes).strip_code())

        # skip the templates that remained at the beginning and end
        empty_or_template = (lambda node: node == '' or isinstance(
            node, mwparserfromhell.nodes.template.Template) or re.match(
                '^\n+$', e(node)))
        nodes = list(itertools.dropwhile(empty_or_template, wikicode.nodes))
        wikicode.nodes = reversed(
            list(itertools.dropwhile(empty_or_template, nodes[::-1])))
        snippet = cleanup_snippet(unicode(wikicode))

        # Chop off some paragraphs at the end until we're at a reasonable
        # size, since we don't actually display the whole thing in the UI
        snippet = '\n\n'.join(p.strip(' ') for p in snippet.split('\n\n')[:10])
        if snippet:
            # We'll often end up with just a section header here, so hopefully
            # it will be smaller than the minimum size when converted to HTML.
            # FIXME: Maybe this can be detected?
            secsnippets.append(snippet)
    return snippets
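extract_sections returns the same [section, [snippets]] shape, except each entry is a whole section or subsection body meant to be rendered to HTML later. A small sketch of how a caller might count the flagged sections (page_wikitext is again an assumed input):

# Hypothetical caller: list the sections that still need references.
flagged = [title for title, bodies in extract_sections(page_wikitext) if bodies]
print('%d section(s) flagged as needing citations' % len(flagged))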