예제 #1
0
 def _remove(node):
     nonlocal parent
     if parent is None:
         parent = get_parent_wikicode(wikicode, node)
     else:
         p = get_parent_wikicode(wikicode, node)
         if parent is not p:
             raise HeaderError
     if remove_from_parent is True:
         remove_and_squash(parent, node)
예제 #2
0
 def _remove(node):
     nonlocal parent
     if parent is None:
         parent = get_parent_wikicode(wikicode, node)
     else:
         p = get_parent_wikicode(wikicode, node)
         if parent is not p:
             raise HeaderError
     if remove_from_parent is True:
         remove_and_squash(parent, node)
    def update_page(self, title, text):
        """
        Update all package templates found in the text of one wiki page.

        The text is parsed and every ``Aur``/``AUR``/``Grp``/``Pkg`` template is
        passed to :py:meth:`update_package_template`. When that call returns a
        hint, the package link is considered broken and:

            - a warning is logged
            - a line is recorded with :py:meth:`add_report_line`
            - a (localized) {{Broken package link}} flag is inserted after the
              template, or an already present flag is replaced

        When no hint is returned, an existing flag following the template is
        removed.

        :param title: title of the wiki page
        :param text: content of the wiki page
        :returns: a :py:class:`mwparserfromhell.wikicode.Wikicode` object with the updated
                  content of the page
        """
        logger.info("Parsing page [[{}]]...".format(title))
        lang = detect_language(title)[1]
        wikicode = mwparserfromhell.parse(text)
        package_templates = ["Aur", "AUR", "Grp", "Pkg"]

        def is_broken_flag(node):
            # True if ``node`` is a template whose canonical name starts with the flag name
            return isinstance(node, mwparserfromhell.nodes.Template) and canonicalize(node.name).startswith("Broken package link")

        for template in wikicode.ifilter_templates():
            # only package templates are of interest
            if not any(template.name.matches(name) for name in package_templates):
                continue

            # ifilter still yields templates nested under a parent template that
            # has been removed in the meantime; those are no longer under wikicode
            try:
                wikicode.index(template, True)
            except ValueError:
                continue

            # whitespace around the parameter would end up in the link and be
            # rendered incorrectly, so strip it first
            self.strip_whitespace(wikicode, template)

            hint = self.update_package_template(template, lang)

            # the node following the template may be an existing flag
            parent = get_parent_wikicode(wikicode, template)
            adjacent = get_adjacent_node(parent, template, ignore_whitespace=True)

            if hint is None:
                # package has been found (again): drop a stale flag, if any
                if is_broken_flag(adjacent):
                    wikicode.remove(adjacent)
                continue

            logger.warning("broken package link: {}: {}".format(template, hint))
            self.add_report_line(title, template, hint)
            broken_flag = "{{%s|%s}}" % (self._localized_template("Broken package link", lang), hint)
            if is_broken_flag(adjacent):
                # replace rather than keep, the hint might be different
                wikicode.replace(adjacent, broken_flag)
            else:
                wikicode.insert_after(template, broken_flag)

        return wikicode
    def update_page(self, title, text):
        """
        Update all package templates found in the text of one wiki page.

        The text is parsed and every ``Aur``/``AUR``/``Grp``/``Pkg`` template is
        passed to :py:meth:`update_package_template`. When that call returns a
        hint, the package link is considered broken and:

            - a warning is logged
            - a line is recorded with :py:meth:`add_report_line`
            - a (localized) {{Broken package link}} flag is inserted after the
              template, or an already present flag is replaced

        When no hint is returned, an existing flag following the template is
        removed.

        :param title: title of the wiki page
        :param text: content of the wiki page
        :returns: a :py:class:`mwparserfromhell.wikicode.Wikicode` object with the updated
                  content of the page
        """
        logger.info("Parsing '%s'..." % title)
        lang = detect_language(title)[1]
        wikicode = mwparserfromhell.parse(text)
        package_templates = ["Aur", "AUR", "Grp", "Pkg"]

        def is_broken_flag(node):
            # True if ``node`` is a template whose canonical name starts with the flag name
            return isinstance(node, mwparserfromhell.nodes.Template) and canonicalize(node.name).startswith("Broken package link")

        for template in wikicode.ifilter_templates():
            # only package templates are of interest
            if not any(template.name.matches(name) for name in package_templates):
                continue

            # ifilter still yields templates nested under a parent template that
            # has been removed in the meantime; those are no longer under wikicode
            try:
                wikicode.index(template, True)
            except ValueError:
                continue

            hint = self.update_package_template(template, lang)

            # the node following the template may be an existing flag
            parent = get_parent_wikicode(wikicode, template)
            adjacent = get_adjacent_node(parent, template, ignore_whitespace=True)

            if hint is None:
                # package has been found (again): drop a stale flag, if any
                if is_broken_flag(adjacent):
                    wikicode.remove(adjacent)
                continue

            logger.warning("broken package link: {}: {}".format(template, hint))
            self.add_report_line(title, template, hint)
            broken_flag = "{{%s|%s}}" % (self._localized_template("Broken package link", lang), hint)
            if is_broken_flag(adjacent):
                # replace rather than keep, the hint might be different
                wikicode.replace(adjacent, broken_flag)
            else:
                wikicode.insert_after(template, broken_flag)

        return wikicode
예제 #5
0
def localize_flag(wikicode, node, template_name):
    """
    Rename the flag template following ``node`` to ``template_name``.

    The node adjacent to ``node`` (ignoring whitespace) is inspected. If it is
    a template whose base name, i.e. the name with the language part split off
    by ``lang.detect_language``, canonically equals the base name of
    ``template_name``, its name is set to ``template_name``. Otherwise nothing
    is changed.

    :param wikicode: a :py:class:`mwparserfromhell.wikicode.Wikicode` object
    :param node: a :py:class:`mwparserfromhell.nodes.Node` object
    :param str template_name: the name of the template flag, potentially
                              including a language name
    """
    container = get_parent_wikicode(wikicode, node)
    follower = get_adjacent_node(container, node, ignore_whitespace=True)

    # nothing to localize unless a template directly follows the node
    if not isinstance(follower, mwparserfromhell.nodes.Template):
        return

    current_base = lang.detect_language(str(follower.name))[0]
    wanted_base = lang.detect_language(template_name)[0]
    if canonicalize(current_base) == canonicalize(wanted_base):
        follower.name = template_name
예제 #6
0
    def strip_whitespace(self, wikicode, template):
        """
        Strip whitespace around the first template parameter. If the template is
        surrounded by text, it is ensured that there is a space around the
        template `in the text` instead.

        The template name is stripped of surrounding whitespace as well.

        :param wikicode:
            The root :py:class:`mwparserfromhell.wikicode.Wikicode` object
            containing ``template``.
        :param template:
            A :py:class:`mwparserfromhell.nodes.Template` object; a `simple
            inline` template assumed to take exactly one parameter, which does
            not `disappear` in the rendered wikitext.
        :raises TemplateParametersError:
            if the template does not have the parameter ``1``
        """
        try:
            param = template.get(1)
        except ValueError:
            raise TemplateParametersError(template)

        parent = get_parent_wikicode(wikicode, template)
        index = parent.index(template)

        if param.value.startswith(" "):
            # Look left only when a preceding node exists. For index == 0 the
            # lookup would use index -1, which does not raise IndexError but
            # wraps around to the *last* node of the parent, and a space would
            # be appended to an unrelated text node.
            prev = parent.get(index - 1) if index > 0 else None
            if isinstance(prev, mwparserfromhell.nodes.text.Text):
                if not prev.endswith("\n") and not prev.endswith(" "):
                    prev.value += " "

        if param.value.endswith(" "):
            # index + 1 past the end raises IndexError, i.e. the template is
            # the last node and there is no following text to adjust
            try:
                next_ = parent.get(index + 1)
            except IndexError:
                next_ = None
            if isinstance(next_, mwparserfromhell.nodes.text.Text):
                if not next_.startswith("\n") and not next_.startswith(" "):
                    next_.value = " " + next_.value

        template.name = str(template.name).strip()
        param.value = param.value.strip()
예제 #7
0
    def prepare_url(self, wikicode, extlink):
        """
        Extract a checkable URL from an external link.

        If the parsed URL of ``extlink`` swallowed a directly following
        template (e.g. ``http://domain.tld/{{Dead link|2020|02|20}}``), the
        template and everything after it is moved out of the link into the
        parent wikicode; the text of ``wikicode`` is asserted to stay the same.

        :param wikicode: the wikicode object containing ``extlink``
        :param extlink: the external link node whose ``url`` is examined
        :returns:
            the URL as parsed by ``urllib3.util.url.parse_url``, or ``None``
            when the URL is skipped (contains a template argument, cannot be
            parsed, has a scheme other than http/https, has an empty host, a
            host without a dot, a localhost host or an address in 127.0.0.0/8)
        """
        # work on a copy of the URL object (the skip_style_flags parameter is
        # False, so URLs terminated by a wiki markup are parsed properly, too)
        url_code = mwparserfromhell.parse(str(extlink.url))

        # a free URL immediately followed by a template argument
        # (e.g. http://domain.tld/{{{1}}}) is parsed completely as one URL,
        # which is used here to skip partial URLs inside templates
        if url_code.filter_arguments(recursive=True):
            return None

        # a free URL immediately followed by a template is parsed completely
        # as one URL as well, so it has to be split manually
        url_text = str(url_code)
        if "{{" in url_text:
            # remember the original text to verify the split afterwards
            text_old = str(wikicode)

            head, _, tail = url_text.partition("{{")
            rest = "{{" + tail
            url_code = mwparserfromhell.parse(head)

            def suffix_text(start):
                # text of all URL nodes from position ``start`` on
                return "".join(str(n) for n in extlink.url.nodes[start:])

            # find the index of the template in extlink.url.nodes
            # (it may be greater than 1, e.g. when there are HTML entities)
            for idx in range(len(extlink.url.nodes)):
                if suffix_text(idx) == rest:
                    break
            assert suffix_text(idx) == rest

            # move the template and everything after it out of the extlink;
            # the node list shrinks while removing, hence the copy
            trailing_nodes = list(extlink.url.nodes[idx:])
            for node in trailing_nodes:
                extlink.url.remove(node)
            parent = get_parent_wikicode(wikicode, extlink)
            parent.insert_after(extlink, rest)

            # the text of the page must not have changed
            text_new = str(wikicode)
            diff = diff_highlighted(text_old, text_new, "old", "new",
                                    "<utcnow>", "<utcnow>")
            assert text_old == text_new, "failed to fix parsing of templates after URL. The diff is:\n{}".format(
                diff)

        # replace HTML entities like "&#61" or "&Sigma;" with their unicode equivalents
        for entity in url_code.ifilter_html_entities(recursive=True):
            url_code.replace(entity, entity.normalize())

        try:
            # parsing fails e.g. if port is not a number
            # reference: https://urllib3.readthedocs.io/en/latest/reference/urllib3.util.html#urllib3.util.parse_url
            url = urllib3.util.url.parse_url(str(url_code))
        except urllib3.exceptions.LocationParseError:
            logger.debug("skipped invalid URL: {}".format(url_code))
            return None

        # pick the first applicable reason for skipping the URL, if any
        skip_message = None
        if url.scheme not in ["http", "https"]:
            skip_message = "skipped URL with unsupported scheme: {}"
        elif not url.host:
            # e.g. "http://" or "http://git@" or "http:///var/run"
            # (partial workaround for https://github.com/earwig/mwparserfromhell/issues/196 )
            skip_message = "skipped URL with empty host: {}"
        elif "." not in url.host:
            # top-level domain only: in practice resolved relative to the local
            # domain, on the wiki mostly used as a pseudo-variable like
            # http://server/path or http://mydomain/path
            skip_message = "skipped URL with only top-level domain host: {}"
        elif url.host == "localhost" or url.host.endswith(".localhost"):
            skip_message = "skipped URL to localhost: {}"
        else:
            try:
                addr = ipaddress.ip_address(url.host)
            except ValueError:
                # the host is not an IP address literal
                pass
            else:
                if addr in ipaddress.ip_network("127.0.0.0/8"):
                    skip_message = "skipped URL to local IP address: {}"

        if skip_message is not None:
            logger.debug(skip_message.format(url))
            return None

        return url
예제 #8
0
    def check_extlink_status(self, wikicode, extlink):
        """
        Check the URL of an external link and update its {{Dead link}} flag.

        URLs that cannot be parsed, use a scheme other than http/https, have an
        empty host, a host without a dot, a localhost host or an address in
        127.0.0.0/8 are skipped silently (only a debug message is logged).

        Depending on the result of :py:meth:`check_url`:

            - ``True``: the link is ensured to be unflagged
            - ``False``: the link is ensured to be flagged; status and date of
              the flag are overwritten unless the flag already carries the
              same status
            - anything else: a warning is logged

        :param wikicode: the wikicode object containing ``extlink``
        :param extlink: the external link node to check
        """
        # make a copy of the URL object (the skip_style_flags parameter is False,
        # so we will also properly parse URLs terminated by a wiki markup)
        url = mwparserfromhell.parse(str(extlink.url))

        # mwparserfromhell parses free URLs immediately followed by a template
        # (e.g. http://domain.tld/{{Dead link|2020|02|20}}) completely as one URL,
        # so we need to split it manually
        if "{{" in str(url):
            url, rest = str(url).split("{{", maxsplit=1)
            rest = "{{" + rest
            url = mwparserfromhell.parse(url)
            # find the node index where the trailing part starts
            # (it is not necessarily 1, e.g. when the URL contains HTML entities)
            url_nodes = extlink.url.nodes
            split_at = next(
                (idx for idx in range(len(url_nodes))
                 if "".join(str(n) for n in url_nodes[idx:]) == rest),
                None)
            # if the split point does not fall on a node boundary, the link is
            # left untouched rather than removing the wrong nodes
            if split_at is not None:
                # remove everything after the real URL from the extlink...
                # GOTCHA: the list shrinks during iteration, so we need to create a copy
                for node in list(url_nodes[split_at:]):
                    extlink.url.remove(node)
                # ...and insert it into the parent wikicode after the link
                parent = get_parent_wikicode(wikicode, extlink)
                parent.insert_after(extlink, rest)

        # replace HTML entities like "&#61" or "&Sigma;" with their unicode equivalents
        for entity in url.ifilter_html_entities(recursive=True):
            url.replace(entity, entity.normalize())

        try:
            # try to parse the URL - fails e.g. if port is not a number
            # reference: https://urllib3.readthedocs.io/en/latest/reference/urllib3.util.html#urllib3.util.parse_url
            url = urllib3.util.url.parse_url(str(url))
        except urllib3.exceptions.LocationParseError:
            logger.debug("skipped invalid URL: {}".format(url))
            return

        # skip unsupported schemes
        if url.scheme not in ["http", "https"]:
            logger.debug("skipped URL with unsupported scheme: {}".format(url))
            return
        # skip URLs with empty host, e.g. "http://" or "http://git@" or "http:///var/run"
        # (partial workaround for https://github.com/earwig/mwparserfromhell/issues/196 )
        if not url.host:
            logger.debug("skipped URL with empty host: {}".format(url))
            return
        # skip links with top-level domains only
        # (in practice they would be resolved relative to the local domain, on the wiki they are used
        # mostly as a pseudo-variable like http://server/path or http://mydomain/path)
        if "." not in url.host:
            logger.debug(
                "skipped URL with only top-level domain host: {}".format(url))
            return
        # skip links to localhost
        if url.host == "localhost" or url.host.endswith(".localhost"):
            logger.debug("skipped URL to localhost: {}".format(url))
            return
        # skip links to 127.*.*.*
        try:
            addr = ipaddress.ip_address(url.host)
            local_network = ipaddress.ip_network("127.0.0.0/8")
            if addr in local_network:
                logger.debug("skipped URL to local IP address: {}".format(url))
                return
        except ValueError:
            # the host is not an IP address literal
            pass
        # drop the fragment from the URL (to optimize caching)
        if url.fragment:
            url = urllib3.util.url.parse_url(
                url.url.rsplit("#", maxsplit=1)[0])

        logger.info("Checking link {} ...".format(extlink))

        status = self.check_url(url)
        if status is True:
            # TODO: the link might still be flagged for a reason (e.g. when the server redirects to some dummy page without giving a proper status code)
            ensure_unflagged_by_template(wikicode, extlink, "Dead link")
        elif status is False:
            # TODO: handle bbs.archlinux.org (some links may require login)
            # TODO: handle links inside {{man|url=...}} properly
            # flag the link, but don't overwrite date and don't set status yet
            flag = ensure_flagged_by_template(wikicode,
                                              extlink,
                                              "Dead link",
                                              *self.deadlink_params,
                                              overwrite_parameters=False)
            new_status = self.cache_invalid_urls[url]
            # overwrite by default, but skip overwriting date when the status matches
            overwrite = True
            if flag.has("status"):
                flagged_status = flag.get("status").value
                if str(flagged_status) == str(new_status):
                    overwrite = False
            if overwrite is True:
                # overwrite status as well as date
                flag.add("status", new_status, showkey=True)
                flag.add("1", self.deadlink_params[0], showkey=False)
                flag.add("2", self.deadlink_params[1], showkey=False)
                flag.add("3", self.deadlink_params[2], showkey=False)
        else:
            # TODO: ask the user for manual check (good/bad/skip) and move the URL from self.cache_indeterminate_urls to self.cache_valid_urls or self.cache_invalid_urls
            logger.warning(
                "status check indeterminate for external link {}".format(
                    extlink))