def _remove(node):
    # Nested helper: relies on `parent`, `wikicode` and `remove_from_parent` from the
    # enclosing function's scope (hence the `nonlocal` declaration).
    nonlocal parent
    if parent is None:
        parent = get_parent_wikicode(wikicode, node)
    else:
        p = get_parent_wikicode(wikicode, node)
        if parent is not p:
            raise HeaderError
    if remove_from_parent is True:
        remove_and_squash(parent, node)
def update_page(self, title, text):
    """
    Update package templates on given page. Parse wikitext, try to update
    all package templates, handle broken package links:

    - print warning to console
    - append message to self.log
    - mark it with {{Broken package link}} in the wikicode

    :param title: title of the wiki page
    :param text: content of the wiki page
    :returns: a :py:class:`mwparserfromhell.wikicode.Wikicode` object with the
              updated content of the page
    """
    logger.info("Parsing page [[{}]]...".format(title))
    lang = detect_language(title)[1]
    wikicode = mwparserfromhell.parse(text)

    for template in wikicode.ifilter_templates():
        # skip unrelated templates
        if not any(template.name.matches(tmp) for tmp in ["Aur", "AUR", "Grp", "Pkg"]):
            continue

        # skip templates no longer under wikicode (templates nested under previously
        # removed parent template are still detected by ifilter)
        try:
            wikicode.index(template, True)
        except ValueError:
            continue

        # strip whitespace around the parameter, otherwise it is added to
        # the link and rendered incorrectly
        self.strip_whitespace(wikicode, template)

        hint = self.update_package_template(template, lang)

        # add/remove/update {{Broken package link}} flag
        parent = get_parent_wikicode(wikicode, template)
        adjacent = get_adjacent_node(parent, template, ignore_whitespace=True)

        if hint is not None:
            logger.warning("broken package link: {}: {}".format(template, hint))
            self.add_report_line(title, template, hint)
            broken_flag = "{{%s|%s}}" % (self._localized_template("Broken package link", lang), hint)
            if isinstance(adjacent, mwparserfromhell.nodes.Template) and canonicalize(adjacent.name).startswith("Broken package link"):
                # replace since the hint might be different
                wikicode.replace(adjacent, broken_flag)
            else:
                wikicode.insert_after(template, broken_flag)
        else:
            if isinstance(adjacent, mwparserfromhell.nodes.Template) and canonicalize(adjacent.name).startswith("Broken package link"):
                # package has been found again, remove existing flag
                wikicode.remove(adjacent)

    return wikicode
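# Standalone sketch (not part of the original code): illustrates why update_page() re-checks
# wikicode.index(template, True) before processing a template - templates nested inside a
# previously removed parent are still yielded by ifilter_templates(). The helper name and the
# sample wikitext below are made up for demonstration only.
def _demo_skip_nested_templates():
    import mwparserfromhell

    demo = mwparserfromhell.parse("{{Broken package link|{{Pkg|foo}}}}")
    outer, inner = demo.filter_templates(recursive=True)
    demo.remove(outer)
    try:
        demo.index(inner, True)
    except ValueError:
        # inner was removed together with its parent, so update_page() skips it
        pass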
def localize_flag(wikicode, node, template_name):
    """
    If a ``node`` in ``wikicode`` is followed by a template with the same base
    name as ``template_name``, this function changes the adjacent template's
    name to ``template_name``.

    :param wikicode: a :py:class:`mwparserfromhell.wikicode.Wikicode` object
    :param node: a :py:class:`mwparserfromhell.nodes.Node` object
    :param str template_name: the name of the template flag, potentially
                              including a language name
    """
    parent = get_parent_wikicode(wikicode, node)
    adjacent = get_adjacent_node(parent, node, ignore_whitespace=True)

    if isinstance(adjacent, mwparserfromhell.nodes.Template):
        adjname = lang.detect_language(str(adjacent.name))[0]
        basename = lang.detect_language(template_name)[0]
        if canonicalize(adjname) == canonicalize(basename):
            adjacent.name = template_name
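# Standalone sketch (not part of the original code): the effect localize_flag() has on an
# adjacent flag template, reduced to plain mwparserfromhell calls. The sample wikitext and
# the localized template name are assumptions made up for this illustration.
def _demo_localize_flag():
    import mwparserfromhell

    demo = mwparserfromhell.parse("{{Pkg|foo}} {{Broken package link}}")
    pkg, flag = demo.filter_templates()
    # localize_flag() would rename the adjacent flag to the localized variant
    flag.name = "Broken package link (Česky)"
    assert str(demo) == "{{Pkg|foo}} {{Broken package link (Česky)}}"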
def strip_whitespace(self, wikicode, template):
    """
    Strip whitespace around the first template parameter. If the template is
    surrounded by text, it is ensured that there is a space around the
    template `in the text` instead.

    :param :py:class:`mwparserfromhell.wikicode.Wikicode` wikicode:
        The root object containing ``template``.
    :param :py:class:`mwparserfromhell.nodes.Template` template:
        A `simple inline` template assumed to take exactly one parameter,
        which does not `disappear` in the rendered wikitext.
    """
    try:
        param = template.get(1)
    except ValueError:
        raise TemplateParametersError(template)

    parent = get_parent_wikicode(wikicode, template)
    index = parent.index(template)

    if param.value.startswith(" "):
        try:
            prev = parent.get(index - 1)
        except IndexError:
            prev = None
        if isinstance(prev, mwparserfromhell.nodes.text.Text):
            if not prev.endswith("\n") and not prev.endswith(" "):
                prev.value += " "
    if param.value.endswith(" "):
        try:
            next_ = parent.get(index + 1)
        except IndexError:
            next_ = None
        if isinstance(next_, mwparserfromhell.nodes.text.Text):
            if not next_.startswith("\n") and not next_.startswith(" "):
                next_.value = " " + next_.value

    template.name = str(template.name).strip()
    param.value = param.value.strip()
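# Standalone sketch (not part of the original code): the transformation strip_whitespace()
# performs when a template with a padded parameter value sits directly between two text nodes.
# Node indices are hard-coded here for brevity; the real method looks them up via
# get_parent_wikicode() and parent.index(). The sample wikitext is made up.
def _demo_strip_whitespace():
    import mwparserfromhell

    demo = mwparserfromhell.parse("Install{{Pkg| foo }}from the repos.")
    template = demo.filter_templates()[0]
    param = template.get(1)
    prev, next_ = demo.nodes[0], demo.nodes[2]
    # move the whitespace out of the parameter and into the surrounding text
    if param.value.startswith(" ") and not str(prev).endswith((" ", "\n")):
        prev.value += " "
    if param.value.endswith(" ") and not str(next_).startswith((" ", "\n")):
        next_.value = " " + str(next_)
    param.value = str(param.value).strip()
    assert str(demo) == "Install {{Pkg|foo}} from the repos."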
def prepare_url(self, wikicode, extlink):
    # make a copy of the URL object (the skip_style_flags parameter is False,
    # so we will also properly parse URLs terminated by a wiki markup)
    url = mwparserfromhell.parse(str(extlink.url))

    # mwparserfromhell parses free URLs immediately followed by a template argument
    # (e.g. http://domain.tld/{{{1}}}) completely as one URL, so we can use this
    # to skip partial URLs inside templates
    if url.filter_arguments(recursive=True):
        return

    # mwparserfromhell parses free URLs immediately followed by a template
    # (e.g. http://domain.tld/{{Dead link|2020|02|20}}) completely as one URL,
    # so we need to split it manually
    if "{{" in str(url):
        # back up original wikicode
        text_old = str(wikicode)

        url, rest = str(url).split("{{", maxsplit=1)
        rest = "{{" + rest
        url = mwparserfromhell.parse(url)

        # find the index of the template in extlink.url.nodes
        # (note that it may be greater than 1, e.g. when there are HTML entities)
        for idx in range(len(extlink.url.nodes)):
            if "".join(str(n) for n in extlink.url.nodes[idx:]) == rest:
                break
        assert "".join(str(n) for n in extlink.url.nodes[idx:]) == str(rest)

        # remove the template and everything after it from the extlink...
        # GOTCHA: the list shrinks during iteration, so we need to create a copy
        for node in list(extlink.url.nodes[idx:]):
            extlink.url.remove(node)

        # ...and insert it into the parent wikicode after the link
        parent = get_parent_wikicode(wikicode, extlink)
        parent.insert_after(extlink, rest)

        # make sure that this was a no-op
        text_new = str(wikicode)
        diff = diff_highlighted(text_old, text_new, "old", "new", "<utcnow>", "<utcnow>")
        assert text_old == text_new, "failed to fix parsing of templates after URL. The diff is:\n{}".format(diff)

    # replace HTML entities like "&#61;" or "&Sigma;" with their unicode equivalents ("=", "Σ")
    for entity in url.ifilter_html_entities(recursive=True):
        url.replace(entity, entity.normalize())

    try:
        # try to parse the URL - fails e.g. if port is not a number
        # reference: https://urllib3.readthedocs.io/en/latest/reference/urllib3.util.html#urllib3.util.parse_url
        url = urllib3.util.url.parse_url(str(url))
    except urllib3.exceptions.LocationParseError:
        logger.debug("skipped invalid URL: {}".format(url))
        return

    # skip unsupported schemes
    if url.scheme not in ["http", "https"]:
        logger.debug("skipped URL with unsupported scheme: {}".format(url))
        return
    # skip URLs with empty host, e.g. "http://" or "http://git@" or "http:///var/run"
    # (partial workaround for https://github.com/earwig/mwparserfromhell/issues/196 )
    if not url.host:
        logger.debug("skipped URL with empty host: {}".format(url))
        return
    # skip links with top-level domains only
    # (in practice they would be resolved relative to the local domain, on the wiki they are
    # used mostly as a pseudo-variable like http://server/path or http://mydomain/path)
    if "." not in url.host:
        logger.debug("skipped URL with only top-level domain host: {}".format(url))
        return
    # skip links to localhost
    if url.host == "localhost" or url.host.endswith(".localhost"):
        logger.debug("skipped URL to localhost: {}".format(url))
        return
    # skip links to 127.*.*.* and ::1
    try:
        addr = ipaddress.ip_address(url.host)
        local_network = ipaddress.ip_network("127.0.0.0/8")
        if addr in local_network:
            logger.debug("skipped URL to local IP address: {}".format(url))
            return
    except ValueError:
        pass

    return url
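# Standalone sketch (not part of the original code): the URL filtering rules from prepare_url()
# above, collected into a single self-contained predicate. Only urllib3 and ipaddress are
# needed; the helper name and the example URLs in the comments are made up.
def _is_checkable_url(raw_url):
    import ipaddress
    import urllib3

    try:
        url = urllib3.util.url.parse_url(raw_url)
    except urllib3.exceptions.LocationParseError:
        return False
    if url.scheme not in ["http", "https"]:
        return False
    if not url.host:
        return False
    # hosts without a dot behave like pseudo-variables (http://server/path)
    if "." not in url.host:
        return False
    if url.host == "localhost" or url.host.endswith(".localhost"):
        return False
    try:
        if ipaddress.ip_address(url.host) in ipaddress.ip_network("127.0.0.0/8"):
            return False
    except ValueError:
        pass
    return True

# e.g. _is_checkable_url("https://wiki.archlinux.org/title/Main_page") -> True
# e.g. _is_checkable_url("http://localhost:8080/") -> False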
def check_extlink_status(self, wikicode, extlink):
    # make a copy of the URL object (the skip_style_flags parameter is False,
    # so we will also properly parse URLs terminated by a wiki markup)
    url = mwparserfromhell.parse(str(extlink.url))

    # mwparserfromhell parses free URLs immediately followed by a template
    # (e.g. http://domain.tld/{{Dead link|2020|02|20}}) completely as one URL,
    # so we need to split it manually
    if "{{" in str(url):
        url, rest = str(url).split("{{", maxsplit=1)
        rest = "{{" + rest
        url = mwparserfromhell.parse(url)

        # remove everything after the real URL from the extlink...
        # (iterate over a copy - the node list shrinks while we remove from it)
        for node in list(extlink.url.nodes[1:]):
            extlink.url.remove(node)

        # ...and insert it into the parent wikicode after the link
        parent = get_parent_wikicode(wikicode, extlink)
        parent.insert_after(extlink, rest)

    # replace HTML entities like "&#61;" or "&Sigma;" with their unicode equivalents ("=", "Σ")
    for entity in url.ifilter_html_entities(recursive=True):
        url.replace(entity, entity.normalize())

    try:
        # try to parse the URL - fails e.g. if port is not a number
        # reference: https://urllib3.readthedocs.io/en/latest/reference/urllib3.util.html#urllib3.util.parse_url
        url = urllib3.util.url.parse_url(str(url))
    except urllib3.exceptions.LocationParseError:
        logger.debug("skipped invalid URL: {}".format(url))
        return

    # skip unsupported schemes
    if url.scheme not in ["http", "https"]:
        logger.debug("skipped URL with unsupported scheme: {}".format(url))
        return
    # skip URLs with empty host, e.g. "http://" or "http://git@" or "http:///var/run"
    # (partial workaround for https://github.com/earwig/mwparserfromhell/issues/196 )
    if not url.host:
        logger.debug("skipped URL with empty host: {}".format(url))
        return
    # skip links with top-level domains only
    # (in practice they would be resolved relative to the local domain, on the wiki they are
    # used mostly as a pseudo-variable like http://server/path or http://mydomain/path)
    if "." not in url.host:
        logger.debug("skipped URL with only top-level domain host: {}".format(url))
        return
    # skip links to localhost
    if url.host == "localhost" or url.host.endswith(".localhost"):
        logger.debug("skipped URL to localhost: {}".format(url))
        return
    # skip links to 127.*.*.* and ::1
    try:
        addr = ipaddress.ip_address(url.host)
        local_network = ipaddress.ip_network("127.0.0.0/8")
        if addr in local_network:
            logger.debug("skipped URL to local IP address: {}".format(url))
            return
    except ValueError:
        pass

    # drop the fragment from the URL (to optimize caching)
    if url.fragment:
        url = urllib3.util.url.parse_url(url.url.rsplit("#", maxsplit=1)[0])

    logger.info("Checking link {} ...".format(extlink))
    status = self.check_url(url)

    if status is True:
        # TODO: the link might still be flagged for a reason (e.g. when the server
        # redirects to some dummy page without giving a proper status code)
        ensure_unflagged_by_template(wikicode, extlink, "Dead link")
    elif status is False:
        # TODO: handle bbs.archlinux.org (some links may require login)
        # TODO: handle links inside {{man|url=...}} properly
        # flag the link, but don't overwrite date and don't set status yet
        flag = ensure_flagged_by_template(wikicode, extlink, "Dead link", *self.deadlink_params, overwrite_parameters=False)
        # overwrite by default, but skip overwriting date when the status matches
        overwrite = True
        if flag.has("status"):
            status = flag.get("status").value
            if str(status) == str(self.cache_invalid_urls[url]):
                overwrite = False
        if overwrite is True:
            # overwrite status as well as date
            flag.add("status", self.cache_invalid_urls[url], showkey=True)
            flag.add("1", self.deadlink_params[0], showkey=False)
            flag.add("2", self.deadlink_params[1], showkey=False)
            flag.add("3", self.deadlink_params[2], showkey=False)
    else:
        # TODO: ask the user for manual check (good/bad/skip) and move the URL from
        # self.cache_indeterminate_urls to self.cache_valid_urls or self.cache_invalid_urls
        logger.warning("status check indeterminate for external link {}".format(extlink))