Пример #1
0
 def _localized_template(self, template, lang="English"):
     """
     Return the localized variant of ``template`` if it exists on the wiki.

     :param template: name of the (English) template; it must be present in
                      ``self._alltemplates`` after canonicalization
     :param lang: language name passed to ``format_title`` to build the
                  localized title
     :returns: the localized template name when its canonical form is listed
               in ``self._alltemplates``, otherwise ``template`` unchanged
     """
     assert canonicalize(template) in self._alltemplates
     candidate = format_title(template, lang)
     # the original (English) name is the fallback for missing localizations
     return candidate if canonicalize(candidate) in self._alltemplates else template
 def _localized_template(self, template, lang="English"):
     """
     Return the localized variant of ``template`` if it exists on the wiki.

     :param template: name of the (English) template; it must be present in
                      ``self._templates_list`` after canonicalization
     :param lang: language name; for anything but ``"English"`` the candidate
                  name is ``"<template> (<lang>)"``
     :returns: the candidate name when its canonical form is listed in
               ``self._templates_list``, otherwise ``template`` unchanged
     """
     assert canonicalize(template) in self._templates_list
     if lang == "English":
         candidate = template
     else:
         candidate = "{} ({})".format(template, lang)
     if canonicalize(candidate) not in self._templates_list:
         # no localized template available, use the English one
         return template
     return candidate
    def update_page(self, title, text):
        """
        Update the package templates ({{Aur}}, {{AUR}}, {{Grp}}, {{Pkg}}) on a page.

        The wikitext is parsed and every package template is passed to
        ``self.update_package_template``. When that call returns a hint, the
        package link is considered broken:

            - a warning is logged
            - the problem is recorded with ``self.add_report_line``
            - the template is flagged with {{Broken package link}} in the wikicode

        An existing flag is removed when no hint is returned anymore.

        :param title: title of the wiki page
        :param text: content of the wiki page
        :returns: a :py:class:`mwparserfromhell.wikicode.Wikicode` object with the updated
                  content of the page
        """
        def _is_broken_flag(node):
            # True for a template node whose canonical name starts with the flag name
            if not isinstance(node, mwparserfromhell.nodes.Template):
                return False
            return canonicalize(node.name).startswith("Broken package link")

        logger.info("Parsing page [[{}]]...".format(title))
        lang = detect_language(title)[1]
        wikicode = mwparserfromhell.parse(text)
        package_templates = ("Aur", "AUR", "Grp", "Pkg")

        for template in wikicode.ifilter_templates():
            # only package templates are of interest
            if not any(template.name.matches(name) for name in package_templates):
                continue

            # ifilter still yields templates nested under a parent template that
            # has been removed in the meantime; those are no longer in wikicode
            try:
                wikicode.index(template, True)
            except ValueError:
                continue

            # whitespace around the parameter would end up in the link and
            # be rendered incorrectly
            self.strip_whitespace(wikicode, template)

            hint = self.update_package_template(template, lang)

            # the flag, if any, is the node right after the template
            parent = get_parent_wikicode(wikicode, template)
            adjacent = get_adjacent_node(parent, template, ignore_whitespace=True)

            if hint is None:
                # package has been found (again), drop a stale flag
                if _is_broken_flag(adjacent):
                    wikicode.remove(adjacent)
                continue

            logger.warning("broken package link: {}: {}".format(template, hint))
            self.add_report_line(title, template, hint)
            broken_flag = "{{%s|%s}}" % (self._localized_template("Broken package link", lang), hint)
            if _is_broken_flag(adjacent):
                # the hint might have changed, so replace the old flag
                wikicode.replace(adjacent, broken_flag)
            else:
                wikicode.insert_after(template, broken_flag)

        return wikicode
Пример #4
0
    def check_trivial(self, wikilink):
        """
        Simplify trivial piped links: `[[Foo|foo]]` becomes `[[foo]]`.

        The link is modified in place when its title and text are equal after
        canonicalization; links without a text part are left untouched.

        :param wikilink: instance of `mwparserfromhell.nodes.wikilink.Wikilink`
                         representing the link to be checked
        """
        label = wikilink.text
        if label is None:
            return
        # Wikicode.matches() is not suitable here: it ignores even the '#'
        # character of relative links, so [[#foo|foo]] would become [[foo]].
        # canonicalize() keeps that distinction.
        if canonicalize(wikilink.title) == canonicalize(label):
            # the title is mandatory, so the text takes its place
            wikilink.title = label
            wikilink.text = None
    def update_page(self, title, text):
        """
        Update the package templates ({{Aur}}, {{AUR}}, {{Grp}}, {{Pkg}}) on a page.

        The wikitext is parsed and every package template is passed to
        ``self.update_package_template``. When that call returns a hint, the
        package link is considered broken:

            - a warning is logged
            - the problem is recorded with ``self.add_report_line``
            - the template is flagged with {{Broken package link}} in the wikicode

        An existing flag is removed when no hint is returned anymore.

        :param title: title of the wiki page
        :param text: content of the wiki page
        :returns: a :py:class:`mwparserfromhell.wikicode.Wikicode` object with the updated
                  content of the page
        """
        def _is_broken_flag(node):
            # True for a template node whose canonical name starts with the flag name
            if not isinstance(node, mwparserfromhell.nodes.Template):
                return False
            return canonicalize(node.name).startswith("Broken package link")

        logger.info("Parsing '%s'..." % title)
        lang = detect_language(title)[1]
        wikicode = mwparserfromhell.parse(text)
        package_templates = ("Aur", "AUR", "Grp", "Pkg")

        for template in wikicode.ifilter_templates():
            # only package templates are of interest
            if not any(template.name.matches(name) for name in package_templates):
                continue

            # ifilter still yields templates nested under a parent template that
            # has been removed in the meantime; those are no longer in wikicode
            try:
                wikicode.index(template, True)
            except ValueError:
                continue

            hint = self.update_package_template(template, lang)

            # the flag, if any, is the node right after the template
            parent = get_parent_wikicode(wikicode, template)
            adjacent = get_adjacent_node(parent, template, ignore_whitespace=True)

            if hint is None:
                # package has been found (again), drop a stale flag
                if _is_broken_flag(adjacent):
                    wikicode.remove(adjacent)
                continue

            logger.warning("broken package link: {}: {}".format(template, hint))
            self.add_report_line(title, template, hint)
            broken_flag = "{{%s|%s}}" % (self._localized_template("Broken package link", lang), hint)
            if _is_broken_flag(adjacent):
                # the hint might have changed, so replace the old flag
                wikicode.replace(adjacent, broken_flag)
            else:
                wikicode.insert_after(template, broken_flag)

        return wikicode
Пример #6
0
    def check_relative(self, src_title, wikilink, title):
        """
        Turn section links pointing to the current page into relative links.

        A link such as `[[Foo#Bar]]` found on the page `src_title` is rewritten
        to `[[#Bar]]` when `Foo` is `src_title` itself or is listed in the
        redirects map with `src_title` as its target. Interwiki links and links
        without a section are left untouched.

        :param str src_title: the title of the page being checked
        :param wikilink: the link to be checked (modified in place)
        :type wikilink: :py:class:`mwparserfromhell.nodes.wikilink.Wikilink`
        :param title: the parsed :py:attr:`wikilink.title`; it is re-parsed
                      when the link is changed
        :type title: :py:class:`mw.parser_helpers.title.Title`
        """
        if title.iwprefix or not title.sectionname:
            return

        # follow a redirect, if there is one, but keep the original section
        effective = title
        redirect_target = self.api.redirects.map.get(title.fullpagename)
        if redirect_target:
            effective = self.api.Title(redirect_target)
            effective.sectionname = title.sectionname

        if canonicalize(src_title) == effective.fullpagename:
            wikilink.title = "#" + effective.sectionname
            # keep the parsed title in sync with the modified link
            title.parse(wikilink.title)
Пример #7
0
    def check_relative(self, src_title, wikilink, title):
        """
        Turn section links pointing to the current page into relative links.

        A link such as `[[Foo#Bar]]` found on the page `src_title` is rewritten
        to `[[#Bar]]` when `Foo` is `src_title` itself or is listed in the
        redirects map with `src_title` as its target. Interwiki links and links
        without a section are left untouched.

        :param str src_title: the title of the page being checked
        :param wikilink: the link to be checked (modified in place)
        :type wikilink: :py:class:`mwparserfromhell.nodes.wikilink.Wikilink`
        :param title: the parsed :py:attr:`wikilink.title`; it is re-parsed
                      when the link is changed
        :type title: :py:class:`mw.parser_helpers.title.Title`
        """
        if title.iwprefix or not title.sectionname:
            return

        # follow a redirect, if there is one, but keep the original section
        effective = title
        redirect_target = self.api.redirects.map.get(title.fullpagename)
        if redirect_target:
            effective = self.api.Title(redirect_target)
            effective.sectionname = title.sectionname

        if canonicalize(src_title) == effective.fullpagename:
            wikilink.title = "#" + effective.sectionname
            # keep the parsed title in sync with the modified link
            title.parse(wikilink.title)
Пример #8
0
def localize_flag(wikicode, node, template_name):
    """
    Rename the flag template following ``node`` to ``template_name``.

    The node adjacent to ``node`` (ignoring whitespace) is renamed only when
    it is a template whose base name, i.e. the first item returned by
    ``lang.detect_language``, equals the base name of ``template_name`` after
    canonicalization. Otherwise nothing is changed.

    :param wikicode: a :py:class:`mwparserfromhell.wikicode.Wikicode` object
    :param node: a :py:class:`mwparserfromhell.nodes.Node` object
    :param str template_name: the name of the template flag, potentially
                              including a language name
    """
    container = get_parent_wikicode(wikicode, node)
    follower = get_adjacent_node(container, node, ignore_whitespace=True)
    if not isinstance(follower, mwparserfromhell.nodes.Template):
        return

    # compare the names with the language part stripped
    follower_base = lang.detect_language(str(follower.name))[0]
    wanted_base = lang.detect_language(template_name)[0]
    if canonicalize(follower_base) == canonicalize(wanted_base):
        follower.name = template_name
Пример #9
0
 def _title_from_langlink(self, langlink):
     """
     Build the page title that a langlink entry points to.

     :param langlink: mapping with the language tag under ``"lang"`` and the
                      linked title under ``"*"``
     :returns: the title formatted with ``lang.format_title``; for internal
               language tags it is additionally canonicalized and, when
               ``self.api.redirects.resolve`` returns a target, replaced by
               that target without its ``#section`` part
     """
     tag = langlink["lang"]
     page = lang.format_title(langlink["*"], lang.langname_for_tag(tag))
     if not lang.is_internal_tag(tag):
         return page

     page = canonicalize(page)
     # follow redirects, dropping the section part of the target
     target = self.api.redirects.resolve(page)
     if target is None:
         return page
     return target.split("#", maxsplit=1)[0]
Пример #10
0
 def _title_from_langlink(self, langlink):
     """
     Build the page title that a langlink entry points to.

     :param langlink: mapping with the language tag under ``"lang"`` and the
                      linked title under ``"*"``
     :returns: the title formatted with ``lang.format_title``; for internal
               language tags it is additionally canonicalized and, when
               ``self.api.redirects.resolve`` returns a target, replaced by
               that target without its ``#section`` part
     """
     tag = langlink["lang"]
     page = lang.format_title(langlink["*"], lang.langname_for_tag(tag))
     if not lang.is_internal_tag(tag):
         return page

     page = canonicalize(page)
     # follow redirects, dropping the section part of the target
     target = self.api.redirects.resolve(page)
     if target is None:
         return page
     return target.split("#", maxsplit=1)[0]
Пример #11
0
 def gen_nodes():
     """
     Yield ``(checker, node)`` pairs for all nodes that should be checked.

     For every node type registered in ``self.checkers``, the enclosing
     ``wikicode`` is filtered recursively for nodes of that type and each
     node is paired with every checker registered for the type. Templates
     whose canonical name starts with any of ``self.skip_templates`` are
     skipped.
     """
     template_type = mwparserfromhell.nodes.Template
     for node_type, checkers in self.checkers.items():
         for node in wikicode.ifilter(recursive=True, forcetype=node_type):
             # skip templates that may be added or removed
             if node_type is template_type:
                 # canonicalize the name once and test all prefixes in a
                 # single call; startswith() accepts a tuple of prefixes
                 if canonicalize(node.name).startswith(tuple(self.skip_templates)):
                     continue
             # handle the node with all registered checkers
             for checker in checkers:
                 yield checker, node
 def _title_from_langlink(self, langlink):
     """
     Build the page title that a langlink entry points to.

     :param langlink: mapping with the language tag under ``"lang"`` and the
                      linked title under ``"*"``
     :returns: the linked title, suffixed with ``" (<language name>)"`` unless
               the language is English; for internal language tags it is
               additionally canonicalized and, when listed in
               ``self.redirects``, replaced by the redirect target without
               its ``#section`` part
     """
     tag = langlink["lang"]
     langname = lang.langname_for_tag(tag)
     page = langlink["*"] if langname == "English" else "{} ({})".format(langlink["*"], langname)
     if not lang.is_internal_tag(tag):
         return page

     page = canonicalize(page)
     # follow redirects, dropping the section part of the target
     if page in self.redirects:
         page = self.redirects[page].split("#", maxsplit=1)[0]
     return page
Пример #13
0
    def __init__(self, api, cliargs):
        """
        Store the API object and normalize the command line arguments.

        ``cliargs`` is modified in place:

            - ``print`` is enabled when neither ``save`` nor ``print`` is set
            - ``toc_languages == ["all"]`` is expanded to all internal tags
            - ``toc_page`` is canonicalized and stripped of its language suffix

        Afterwards ``self.titles`` holds one page title per language tag in
        ``toc_languages`` (sorted): the bare ``toc_page`` for the local
        language and ``"<toc_page> (<language name>)"`` for the others.

        :param api: the API object, stored as ``self.api``
        :param cliargs: parsed command line arguments providing ``save``,
                        ``print``, ``toc_languages`` and ``toc_page``
        """
        self.api = api
        self.cliargs = cliargs

        # with nothing to do, default to printing the result
        if self.cliargs.save is False and self.cliargs.print is False:
            self.cliargs.print = True

        if len(self.cliargs.toc_languages) == 1 and self.cliargs.toc_languages[0] == "all":
            self.cliargs.toc_languages = lang.get_internal_tags()
        # strip "(Language)" suffix
        self.cliargs.toc_page = lang.detect_language(canonicalize(self.cliargs.toc_page))[0]

        # the tag of the local language does not depend on the loop variable,
        # so look it up only once
        local_tag = lang.tag_for_langname(lang.get_local_language())

        # detect page titles
        self.titles = []
        for ln in sorted(self.cliargs.toc_languages):
            if ln == local_tag:
                self.titles.append(self.cliargs.toc_page)
            else:
                self.titles.append("{} ({})".format(self.cliargs.toc_page, lang.langname_for_tag(ln)))
Пример #14
0
    def __init__(self, api, cliargs):
        """
        Store the API object and normalize the command line arguments.

        ``cliargs`` is modified in place:

            - ``print`` is enabled when neither ``save`` nor ``print`` is set
            - ``toc_languages == ["all"]`` is expanded to all internal tags
            - ``toc_page`` is canonicalized and stripped of its language suffix

        Afterwards ``self.titles`` holds one page title per language tag in
        ``toc_languages`` (sorted): the bare ``toc_page`` for the local
        language and ``"<toc_page> (<language name>)"`` for the others.

        :param api: the API object, stored as ``self.api``
        :param cliargs: parsed command line arguments providing ``save``,
                        ``print``, ``toc_languages`` and ``toc_page``
        """
        self.api = api
        self.cliargs = cliargs

        # with nothing to do, default to printing the result
        if self.cliargs.save is False and self.cliargs.print is False:
            self.cliargs.print = True

        if len(self.cliargs.toc_languages) == 1 and self.cliargs.toc_languages[0] == "all":
            self.cliargs.toc_languages = lang.get_internal_tags()
        # strip "(Language)" suffix
        self.cliargs.toc_page = lang.detect_language(canonicalize(self.cliargs.toc_page))[0]

        # the tag of the local language does not depend on the loop variable,
        # so look it up only once
        local_tag = lang.tag_for_langname(lang.get_local_language())

        # detect page titles
        self.titles = []
        for ln in sorted(self.cliargs.toc_languages):
            if ln == local_tag:
                self.titles.append(self.cliargs.toc_page)
            else:
                self.titles.append("{} ({})".format(self.cliargs.toc_page, lang.langname_for_tag(ln)))
Пример #15
0
def get_header_parts(wikicode, magics=None, cats=None, langlinks=None, remove_from_parent=False):
    """
    According to Help:Style, the layout of the page should be as follows:

     1. Magic words (optional)
        (includes only {{DISPLAYTITLE:...}} and {{Lowercase title}})
     2. Categories
     3. Interlanguage links (if any)
     4. Article status templates (optional)
     5. Related articles box (optional)
     6. Preface or introduction
     7. Table of contents (automatic)
     8. Article-specific sections

    Only 1-3 are safe to be updated automatically. This function will extract
    the header parts from the wikicode and return them as tuple
    ``(parent, magics, cats, langlinks)``, where ``parent`` is an instance of
    :py:class:`mwparserfromhell.wikicode.Wikicode` containing all extracted
    elements. It is assumed that all header elements are children of the same
    parent node, otherwise :py:exc:`HeaderError` is raised.

    If ``remove_from_parent`` is ``True``, the extracted header elements are
    also removed from the parent node and :py:func:`build_header` should be
    called to insert them back.

    The parameters ``magics``, ``cats`` and ``langlinks`` can be lists of
    objects (either string, wikicode or node) to be added to the header if not
    already present. These deduplication rules are applied:

      - supplied magic words take precedence over those present in wikicode
      - category links are considered duplicate when they point to the same
        category (e.g. [[Category:Foo]] is equivalent to [[category:foo]])
      - interlanguage links are considered duplicate when they have the same
        language tag (i.e. there can be only one interlanguage link for each
        language)

    Links whose prefix is a known language tag that does not work as an
    interlanguage link are ordinary links; they are neither extracted nor
    removed from the parent.

    The lists of magics and langlinks are sorted, the order of catlinks is
    preserved.

    :param wikicode: the :py:class:`mwparserfromhell.wikicode.Wikicode`
                     object to extract the header from
    :param magics: optional list of magic words to be added to the header
    :param cats: optional list of category links to be added to the header
    :param langlinks: optional list of interlanguage links to be added
    :param remove_from_parent: whether the extracted elements are removed
                               from the parent node
    :returns: the tuple ``(parent, magics, cats, langlinks)``
    :raises HeaderError: when the header elements do not share one parent, or
                         when elements were matched but no parent was recorded
    """
    if magics is None:
        magics = []
    if cats is None:
        cats = []
    if langlinks is None:
        langlinks = []

    # make sure that we work with `Wikicode` objects
    magics = [mwparserfromhell.utils.parse_anything(item) for item in magics]
    cats = [mwparserfromhell.utils.parse_anything(item) for item in cats]
    langlinks = [mwparserfromhell.utils.parse_anything(item) for item in langlinks]

    parent = None

    # return the part of the title before the first colon (or "" without colon)
    def _prefix(title):
        if ":" not in title:
            return ""
        return title.split(":", 1)[0].strip()

    # check the parent wikicode object and remove node from it
    def _remove(node):
        nonlocal parent
        if parent is None:
            parent = get_parent_wikicode(wikicode, node)
        else:
            p = get_parent_wikicode(wikicode, node)
            if parent is not p:
                raise HeaderError
        if remove_from_parent is True:
            remove_and_squash(parent, node)

    def _add_to_magics(template):
        _remove(template)
        # supplied magic words take precedence over those found in wikicode
        if not any(magic.get(0).name.matches(template.name) for magic in magics):
            magics.append(mwparserfromhell.utils.parse_anything(template))

    def _add_to_cats(catlink):
        # TODO: non-duplicate "typos" are still ignored -- is this important enough to handle it?
        if not any(cat.get(0).title.matches(catlink.title) for cat in cats):
            # only remove from wikicode if we actually append to cats (duplicate category
            # links are considered typos, e.g. [[Category:foo]] instead of [[:Category:foo]],
            # which are quite common)
            _remove(catlink)
            cats.append(mwparserfromhell.utils.parse_anything(catlink))

    def _add_to_langlinks(langlink):
        # always remove langlinks to handle renaming of pages
        # (typos such as [[en:Main page]] in text are quite rare)
        _remove(langlink)
        # only one interlanguage link per language tag is kept
        if not any(_prefix(link.get(0).title).lower() == _prefix(langlink.title).lower() for link in langlinks):
            langlinks.append(mwparserfromhell.utils.parse_anything(langlink))

    # count extracted header elements
    _extracted_count = 0

    for template in wikicode.filter_templates():
        _pure, _ = lang.detect_language(str(template.name))
        if canonicalize(template.name) == "Lowercase title" or _prefix(template.name) == "DISPLAYTITLE" or _pure in ["Template", "Template:Template"]:
            _add_to_magics(template)
            _extracted_count += 1

    for link in wikicode.filter_wikilinks():
        prefix = _prefix(link.title).lower()
        if prefix == "category":
            _add_to_cats(link)
            _extracted_count += 1
        # not all tags work as interlanguage links; the check has to be done
        # before the link is handed over, because _add_to_langlinks() always
        # removes the link from the parent and a link that is not returned in
        # `langlinks` would be lost when the header is built again
        elif prefix in lang.get_language_tags() and lang.is_interlanguage_tag(prefix):
            _add_to_langlinks(link)
            _extracted_count += 1

    magics.sort()
    langlinks.sort()

    if parent is None:
        if _extracted_count > 0:
            # this indicates parser error (e.g. unclosed <div> tags)
            raise HeaderError("no parent Wikicode object")
        else:
            # for pages without any header elements
            parent = wikicode

    return parent, magics, cats, langlinks
Пример #16
0
 def _page_exists(self, title):
     """
     Check whether a page with the given title is listed in ``self.allpages``.

     :param title: the page title; it is canonicalized before the comparison
     :returns: ``True`` if the ``"title"`` of some entry in ``self.allpages``
               equals the canonical title, ``False`` otherwise
     """
     # self.allpages does not include redirects, but that's fine...
     wanted = canonicalize(title)
     # a single lookup does not justify building a set of all titles;
     # any() stops at the first match
     return any(page["title"] == wanted for page in self.allpages)
Пример #17
0
 def _page_exists(self, title):
     """
     Check whether a page with the given title is listed in ``self.allpages``.

     :param title: the page title; it is canonicalized before the comparison
     :returns: ``True`` if the ``"title"`` of some entry in ``self.allpages``
               equals the canonical title, ``False`` otherwise
     """
     # self.allpages does not include redirects, but that's fine...
     wanted = canonicalize(title)
     # a single lookup does not justify building a set of all titles;
     # any() stops at the first match
     return any(page["title"] == wanted for page in self.allpages)
Пример #18
0
def get_header_parts(wikicode,
                     magics=None,
                     cats=None,
                     langlinks=None,
                     remove_from_parent=False):
    """
    According to Help:Style, the layout of the page should be as follows:

     1. Magic words (optional)
        (includes only {{DISPLAYTITLE:...}} and {{Lowercase title}})
     2. Categories
     3. Interlanguage links (if any)
     4. Article status templates (optional)
     5. Related articles box (optional)
     6. Preface or introduction
     7. Table of contents (automatic)
     8. Article-specific sections

    Only 1-3 are safe to be updated automatically. This function will extract
    the header parts from the wikicode and return them as tuple
    ``(parent, magics, cats, langlinks)``, where ``parent`` is an instance of
    :py:class:`mwparserfromhell.wikicode.Wikicode` containing all extracted
    elements. It is assumed that all header elements are children of the same
    parent node, otherwise :py:exc:`HeaderError` is raised.

    If ``remove_from_parent`` is ``True``, the extracted header elements are
    also removed from the parent node and :py:func:`build_header` should be
    called to insert them back.

    The parameters ``magics``, ``cats`` and ``langlinks`` can be lists of
    objects (either string, wikicode or node) to be added to the header if not
    already present. These deduplication rules are applied:

      - supplied magic words take precedence over those present in wikicode
      - category links are considered duplicate when they point to the same
        category (e.g. [[Category:Foo]] is equivalent to [[category:foo]])
      - interlanguage links are considered duplicate when they have the same
        language tag (i.e. there can be only one interlanguage link for each
        language)

    Templates and links nested in an ``<includeonly>`` tag are ignored.

    The lists of magics and langlinks are sorted, the order of catlinks is
    preserved.

    :param wikicode: the :py:class:`mwparserfromhell.wikicode.Wikicode`
                     object to extract the header from
    :param magics: optional list of magic words to be added to the header
    :param cats: optional list of category links to be added to the header
    :param langlinks: optional list of interlanguage links to be added
    :param remove_from_parent: whether the extracted elements are removed
                               from the parent node
    :returns: the tuple ``(parent, magics, cats, langlinks)``
    :raises HeaderError: when the header elements do not share one parent, or
                         when elements were matched but no parent was recorded
    """
    if magics is None:
        magics = []
    if cats is None:
        cats = []
    if langlinks is None:
        langlinks = []

    # make sure that we work with `Wikicode` objects
    magics = [mwparserfromhell.utils.parse_anything(item) for item in magics]
    cats = [mwparserfromhell.utils.parse_anything(item) for item in cats]
    langlinks = [
        mwparserfromhell.utils.parse_anything(item) for item in langlinks
    ]

    # common parent of all extracted elements, set by the first _remove() call
    parent = None

    # return the part of the title before the first colon, stripped of
    # surrounding whitespace, or an empty string if there is no colon
    def _prefix(title):
        if ":" not in title:
            return ""
        return title.split(":", 1)[0].strip()

    # check the parent wikicode object and remove node from it
    def _remove(node):
        nonlocal parent
        if parent is None:
            # the first extracted element determines the expected parent
            parent = get_parent_wikicode(wikicode, node)
        else:
            p = get_parent_wikicode(wikicode, node)
            # all header elements have to be children of the same object
            if parent is not p:
                raise HeaderError
        if remove_from_parent is True:
            remove_and_squash(parent, node)

    # register a magic word found in wikicode; the template is always passed
    # to _remove(), but appended only if no magic with a matching name is
    # present yet (so the supplied ones take precedence)
    def _add_to_magics(template):
        _remove(template)
        if not any(
                magic.get(0).name.matches(template.name) for magic in magics):
            magics.append(mwparserfromhell.utils.parse_anything(template))

    # register a category link found in wikicode unless a link with
    # a matching title is present already
    def _add_to_cats(catlink):
        # TODO: non-duplicate "typos" are still ignored -- is this important enough to handle it?
        if not any(cat.get(0).title.matches(catlink.title) for cat in cats):
            # only remove from wikicode if we actually append to cats (duplicate category
            # links are considered typos, e.g. [[Category:foo]] instead of [[:Category:foo]],
            # which are quite common)
            _remove(catlink)
            cats.append(mwparserfromhell.utils.parse_anything(catlink))

    # register an interlanguage link found in wikicode; it is appended only if
    # there is no link with the same prefix (compared case-insensitively) yet
    def _add_to_langlinks(langlink):
        # always remove langlinks to handle renaming of pages
        # (typos such as [[en:Main page]] in text are quite rare)
        _remove(langlink)
        if not any(
                _prefix(link.get(0).title).lower() == _prefix(
                    langlink.title).lower() for link in langlinks):
            langlinks.append(mwparserfromhell.utils.parse_anything(langlink))

    # check if any ancestor of the node is an <includeonly> tag
    def _is_in_includeonly(node):
        ancestors = wikicode.get_ancestors(node)
        for a in ancestors:
            if isinstance(a, mwparserfromhell.nodes.tag.Tag) and a.tag.matches(
                    "includeonly"):
                return True
        return False

    # count extracted header elements
    _extracted_count = 0

    for template in wikicode.filter_templates():
        if _is_in_includeonly(template):
            continue
        # template name with the language part split off
        _pure, _ = lang.detect_language(str(template.name))
        # besides {{Lowercase title}} and {{DISPLAYTITLE:...}}, templates with
        # the base name "Template" or "Template:Template" are treated as magics
        if canonicalize(template.name) == "Lowercase title" or _prefix(
                template.name) == "DISPLAYTITLE" or _pure in [
                    "Template", "Template:Template"
                ]:
            _add_to_magics(template)
            _extracted_count += 1

    for link in wikicode.filter_wikilinks():
        if _is_in_includeonly(link):
            continue
        # the link type is determined by the lowercased prefix of its title
        prefix = _prefix(link.title).lower()
        if prefix == "category":
            _add_to_cats(link)
            _extracted_count += 1
        # GOTCHA: not all tags work as interlanguage links
        elif lang.is_interlanguage_tag(prefix):
            _add_to_langlinks(link)
            _extracted_count += 1

    # the order of cats is deliberately left as it is
    magics.sort()
    langlinks.sort()

    if parent is None:
        if _extracted_count > 0:
            # this indicates parser error (e.g. unclosed <div> tags)
            raise HeaderError("no parent Wikicode object")
        else:
            # for pages without any header elements
            parent = wikicode

    return parent, magics, cats, langlinks