def update_wikilink(self, wikicode, wikilink, src_title, summary_parts):
    """
    Apply the whole sequence of wikilink checks to a single link.

    The link is modified in place. Every group of checks runs inside a
    context manager obtained from ``get_edit_checker(wikicode, summary_parts)``
    and labelled with the edit summary describing that group.

    :param wikicode: parsed content of the page which contains ``wikilink``
    :param wikilink: the wikilink node to check and update
    :param src_title: title of the page on which the link is placed
    :param summary_parts: edit summary parts passed to ``get_edit_checker``;
        the link is remembered in the void-update cache only while this is
        still empty at the end of the checks
    """
    simplification = "simplification and beautification of wikilinks"
    broken_flag = "Broken section link"

    def on_english_page():
        return lang.detect_language(src_title)[1] == "English"

    if str(wikilink) in self.void_update_cache:
        logger.debug("Skipping wikilink {} due to void-update cache.".format(wikilink))
        return

    target = self.api.Title(wikilink.title)
    # interlanguage links are left alone here (interlanguage.py handles them)
    if target.iwprefix in self.api.site.interlanguagemap.keys():
        return

    edit_group = get_edit_checker(wikicode, summary_parts)

    with edit_group(simplification):
        # beautify percent-encoded titles
        # FIXME: make it implicit - it does not always propagate from the Title class
        looks_encoded = re.search("%[0-9a-f]{2}", str(wikilink.title), re.IGNORECASE)
        if not target.iwprefix and looks_encoded:
            # keep the leading colon of the link, if there is one
            wikilink.title = target.leading_colon + str(target)
            # FIXME: should be done in the Title class
            # the anchor is dot-encoded, but percent-encoding works for links
            # too and is even rendered nicely
            escaped = str(wikilink.title)
            for char, code in (("[", "%5B"), ("|", "%7C"), ("]", "%5D")):
                escaped = escaped.replace(char, code)
            wikilink.title = escaped

        self.collapse_whitespace_pipe(wikilink)
        self.check_trivial(wikilink, target)
        self.check_relative(src_title, wikilink, target)
        if on_english_page():
            self.check_redirect_exact(src_title, wikilink, target)
        self.check_redirect_capitalization(wikilink, target)

        # the redirect checks might have changed the title non-equivalently,
        # so it has to be parsed again
        target = self.api.Title(wikilink.title)

        self.check_displaytitle(wikilink, target)

    with edit_group("fixed section fragments"):
        anchor_result = self.check_anchor(src_title, wikilink, target)

    if anchor_result is False:
        with edit_group("flagged broken section links"):
            ensure_flagged_by_template(wikicode, wikilink, broken_flag)
    else:
        with edit_group("unflagged working section links"):
            ensure_unflagged_by_template(wikicode, wikilink, broken_flag)

    with edit_group(simplification):
        # partial second pass
        self.check_trivial(wikilink, target)
        if on_english_page():
            self.check_redirect_exact(src_title, wikilink, target)

        # collapse whitespace around the link, e.g. 'foo [[ bar]]' -> 'foo [[bar]]'
        self.collapse_whitespace(wikicode, wikilink)

    # only context-less, correct wikilinks that needed no update are cached
    if not target.pagename or len(summary_parts) != 0 or anchor_result is not True:
        return
    self.void_update_cache.add(str(wikilink))
def update_page(self, title, text):
    """
    Refresh all package templates on the given page.

    The wikitext is parsed and every ``Aur``/``AUR``/``Grp``/``Pkg`` template
    is passed to :py:meth:`update_package_template`. When that call returns
    a hint, the package link is considered broken:

    - a warning is logged
    - the problem is recorded with :py:meth:`add_report_line`
    - the template is flagged with the localized {{Broken package link}}

    Templates without a hint get any existing flag removed.

    :param title: title of the wiki page
    :param text: content of the wiki page
    :returns: a :py:class:`mwparserfromhell.wikicode.Wikicode` object with
        the updated content of the page
    """
    package_templates = ("Aur", "AUR", "Grp", "Pkg")
    flag_name = "Broken package link"

    logger.info("Parsing page [[{}]]...".format(title))
    page_lang = detect_language(title)[1]
    wikicode = mwparserfromhell.parse(text)

    for template in wikicode.ifilter_templates():
        # only package templates are of interest
        if not any(template.name.matches(name) for name in package_templates):
            continue

        # ifilter still yields templates nested under a parent template that
        # was removed earlier; those are no longer part of wikicode
        try:
            wikicode.index(template, True)
        except ValueError:
            continue

        # whitespace around the parameter would be added to the link and
        # rendered incorrectly
        self.strip_whitespace(wikicode, template)

        hint = self.update_package_template(template, page_lang)

        if hint is None:
            # nothing wrong with the package, drop a stale flag if present
            ensure_unflagged_by_template(wikicode, template, flag_name, match_only_prefix=True)
            continue

        logger.warning("broken package link: {}: {}".format(template, hint))
        self.add_report_line(title, template, hint)
        # unflag first, because the localized template might change
        ensure_unflagged_by_template(wikicode, template, flag_name, match_only_prefix=True)
        # then flag again with the localized template and the hint
        localized_flag = self.get_localized_template(flag_name, page_lang)
        ensure_flagged_by_template(wikicode, template, localized_flag, hint, overwrite_parameters=True)

    return wikicode
def check_extlink_status(self, wikicode, extlink, src_title):
    """
    Check an external link and update its {{Dead link}} flag accordingly.

    ``self.lock_wikicode`` is held while the URL is prepared and while the
    wikicode is modified, but not during the call to ``self.check_url``.

    :param wikicode: parsed content of the page which contains ``extlink``
    :param extlink: the external link node to check
    :param src_title: title of the page; its language selects the localized
        flag template
    """
    with self.lock_wikicode:
        url = self.prepare_url(wikicode, extlink)
    if url is None:
        return

    logger.info("Checking link {} ...".format(extlink))
    status = self.check_url(url)

    with self.lock_wikicode:
        if status is True:
            # TODO: the link might still be flagged for a reason (e.g. when the server redirects to some dummy page without giving a proper status code)
            ensure_unflagged_by_template(wikicode, extlink, "Dead link", match_only_prefix=True)
            return

        if status is not False:
            # TODO: ask the user for manual check (good/bad/skip) and move the URL from self.cache_indeterminate_urls to self.cache_valid_urls or self.cache_invalid_urls
            logger.warning("status check indeterminate for external link {}".format(extlink))
            return

        # TODO: handle bbs.archlinux.org (some links may require login)
        # TODO: handle links inside {{man|url=...}} properly

        # replace the existing template (if any) with a translated version
        page_language = lang.detect_language(src_title)[1]
        flag_name = self.get_localized_template("Dead link", page_language)
        localize_flag(wikicode, extlink, flag_name)

        # flag the link, but neither overwrite the date nor set the status yet
        flag = ensure_flagged_by_template(
            wikicode, extlink, flag_name, *self.deadlink_params, overwrite_parameters=False
        )

        # the cache is looked up without the fragment
        if url.fragment:
            without_fragment = url.url.rsplit("#", maxsplit=1)[0]
            url = urllib3.util.url.parse_url(without_fragment)

        # the date is kept only when the flag already carries the same status
        status_matches = (
            flag.has("status")
            and str(flag.get("status").value) == str(self.cache_invalid_urls[url])
        )
        if status_matches:
            return

        # overwrite the status as well as the date
        flag.add("status", self.cache_invalid_urls[url], showkey=True)
        for index in range(3):
            flag.add(str(index + 1), self.deadlink_params[index], showkey=False)
def update_man_template(self, wikicode, template):
    """
    Validate a {{man}} template and flag or unflag it with {{Dead link}}.

    Templates whose name is not ``man`` are ignored. A template without
    parameter 1 or with a missing/empty parameter 2 is flagged right away.
    Otherwise a URL is built from ``self.url_template`` (or
    ``self.url_template_nosection`` when the section is empty) and checked;
    when it does not work, the explicit ``url=`` parameter is checked instead.

    :param wikicode: parsed content of the page which contains ``template``
    :param template: the template node to check (may be modified in place)
    :raises NotImplementedError: when the server answers with a status code
        that is neither 200 nor >= 400
    """
    if template.name.lower() != "man":
        return

    # datetime.utcnow() is deprecated since Python 3.12; an aware UTC
    # timestamp yields the same calendar date
    now = datetime.datetime.now(datetime.timezone.utc)
    deadlink_params = ["{:02d}".format(part) for part in (now.year, now.month, now.day)]

    def flag_dead():
        ensure_flagged_by_template(wikicode, template, "Dead link", *deadlink_params, overwrite_parameters=False)

    if not template.has(1) or not template.has(2, ignore_empty=True):
        flag_dead()
        return

    section = template.get(1).value.strip()
    pagename = queryencode(template.get(2).value.strip())
    if section:
        url = self.url_template.format(section=section, pagename=pagename)
    else:
        url = self.url_template_nosection.format(pagename=pagename)
    if template.has(3):
        url += "#{}".format(queryencode(template.get(3).value.strip()))

    if template.has("url"):
        explicit_url = template.get("url").value.strip()
    else:
        explicit_url = None

    def check_url(url, infer_section=False):
        """
        Return True when ``url`` works and False when it does not.

        With ``infer_section=True`` a missing section may be filled into
        the template from a redirect (some_page -> some_page.1).
        """
        if url.startswith("ftp://"):
            logger.error("The FTP protocol is not supported by the requests module. URL: {}".format(url))
            return True
        if url in self.cache_valid_urls:
            return True
        if url in self.cache_invalid_urls:
            return False

        response = self.session.get(url, timeout=self.timeout)
        if response.status_code == 200:
            # heuristics to get the missing section (redirect from some_page to some_page.1)
            # WARNING: if the manual exists in multiple sections, the first one might not be the best
            # The heuristics apply only to the URL derived from the template
            # parameters and only while parameter 1= is empty; an explicit
            # url= redirecting to e.g. "page.html" must not set the section.
            section_found = (
                infer_section
                and len(response.history) == 1
                and response.url.startswith(url + ".")
                and not template.has(1, ignore_empty=True)
            )
            if section_found:
                template.add(1, response.url[len(url) + 1:])
                self.cache_valid_urls.add(response.url)
            else:
                self.cache_valid_urls.add(url)
            return True
        if response.status_code >= 400:
            self.cache_invalid_urls.add(url)
            return False
        raise NotImplementedError("Unexpected status code {} for man page URL: {}".format(response.status_code, url))

    # check if the template parameters form a valid URL
    if check_url(url, infer_section=True):
        ensure_unflagged_by_template(wikicode, template, "Dead link")
        # remove explicit url= parameter - not necessary
        if explicit_url is not None:
            template.remove("url")
    elif explicit_url is None:
        flag_dead()
    elif explicit_url != "":
        if check_url(explicit_url):
            ensure_unflagged_by_template(wikicode, template, "Dead link")
        else:
            flag_dead()
def update_man_template(self, wikicode, template):
    """
    Validate a {{man}} template and flag or unflag it with {{Dead link}}.

    Templates whose name is not ``man`` are ignored. A template without
    parameter 1 or with a missing/empty parameter 2 is flagged right away.
    Otherwise a URL is built from ``self.url_prefix``, the optional ``pkg=``
    parameter, the page name, the section and the optional anchor, and then
    checked; when it does not work, the explicit ``url=`` parameter is
    checked instead.

    :param wikicode: parsed content of the page which contains ``template``
    :param template: the template node to check (may be modified in place)
    :raises NotImplementedError: when the server answers with a status code
        that is neither 200 nor >= 400
    """
    if template.name.lower() != "man":
        return

    # datetime.utcnow() is deprecated since Python 3.12; an aware UTC
    # timestamp yields the same calendar date
    now = datetime.datetime.now(datetime.timezone.utc)
    deadlink_params = ["{:02d}".format(part) for part in (now.year, now.month, now.day)]

    def flag_dead():
        ensure_flagged_by_template(wikicode, template, "Dead link", *deadlink_params, overwrite_parameters=False)

    if not template.has(1) or not template.has(2, ignore_empty=True):
        flag_dead()
        return

    url = self.url_prefix
    if template.has("pkg"):
        url += template.get("pkg").value.strip() + "/"
    url += queryencode(template.get(2).value.strip())
    section = template.get(1).value.strip()
    if section:
        url += "." + section
    if template.has(3):
        url += "#{}".format(queryencode(template.get(3).value.strip()))

    if template.has("url"):
        explicit_url = template.get("url").value.strip()
    else:
        explicit_url = None

    def check_url(url, infer_section=False):
        """
        Return True when ``url`` works and False when it does not.

        With ``infer_section=True`` a missing section may be filled into
        the template from a redirect (some_page -> some_page.1).
        """
        if url.startswith("ftp://"):
            logger.error(
                "The FTP protocol is not supported by the requests module. URL: {}"
                .format(url))
            return True
        if url in self.cache_valid_urls:
            return True
        if url in self.cache_invalid_urls:
            return False

        response = self.session.get(url, timeout=self.timeout)
        if response.status_code == 200:
            # heuristics to get the missing section (redirect from some_page to some_page.1)
            # WARNING: if the manual exists in multiple sections, the first one might not be the best
            # The heuristics apply only to the URL derived from the template
            # parameters and only while parameter 1= is empty; an explicit
            # url= redirecting to e.g. "page.html" must not set the section.
            section_found = (
                infer_section
                and len(response.history) == 1
                and response.url.startswith(url + ".")
                and not template.has(1, ignore_empty=True)
            )
            if section_found:
                template.add(1, response.url[len(url) + 1:])
                self.cache_valid_urls.add(response.url)
            else:
                self.cache_valid_urls.add(url)
            return True
        if response.status_code >= 400:
            self.cache_invalid_urls.add(url)
            return False
        raise NotImplementedError(
            "Unexpected status code {} for man page URL: {}".format(
                response.status_code, url))

    # check if the template parameters form a valid URL
    if check_url(url, infer_section=True):
        ensure_unflagged_by_template(wikicode, template, "Dead link")
        # remove explicit url= parameter - not necessary
        if explicit_url is not None:
            template.remove("url")
    elif explicit_url is None:
        flag_dead()
    elif explicit_url != "":
        if check_url(explicit_url):
            ensure_unflagged_by_template(wikicode, template, "Dead link")
        else:
            flag_dead()
def check_extlink_status(self, wikicode, extlink):
    """
    Check an external link and update its {{Dead link}} flag accordingly.

    Links that cannot or should not be checked are skipped with a debug
    message: unparsable URLs, schemes other than http/https, URLs with an
    empty host, hosts without a dot, localhost and loopback IP addresses.

    The result of ``self.check_url`` decides what happens: ``True`` removes
    the flag, ``False`` adds or updates it, anything else only logs a warning.

    :param wikicode: parsed content of the page which contains ``extlink``
    :param extlink: the external link node to check
    """
    # make a copy of the URL object (the skip_style_flags parameter is False,
    # so we will also properly parse URLs terminated by a wiki markup)
    url_code = mwparserfromhell.parse(str(extlink.url))

    # mwparserfromhell parses free URLs immediately followed by a template
    # (e.g. http://domain.tld/{{Dead link|2020|02|20}}) completely as one URL,
    # so we need to split it manually
    url_text = str(url_code)
    if "{{" in url_text:
        head, _, tail = url_text.partition("{{")
        rest = "{{" + tail
        url_code = mwparserfromhell.parse(head)
        # remove everything after the real URL from the extlink...
        # (iterate over a snapshot: the node list is modified by remove(),
        # which must not disturb the iteration)
        for node in list(extlink.url.nodes[1:]):
            extlink.url.remove(node)
        # ...and insert it into the parent wikicode after the link
        parent = get_parent_wikicode(wikicode, extlink)
        parent.insert_after(extlink, rest)

    # replace HTML entities like "&#61;" or "&Sigma;" with their unicode equivalents
    for entity in url_code.ifilter_html_entities(recursive=True):
        url_code.replace(entity, entity.normalize())

    try:
        # try to parse the URL - fails e.g. if port is not a number
        # reference: https://urllib3.readthedocs.io/en/latest/reference/urllib3.util.html#urllib3.util.parse_url
        url = urllib3.util.url.parse_url(str(url_code))
    except urllib3.exceptions.LocationParseError:
        logger.debug("skipped invalid URL: {}".format(url_code))
        return

    # skip unsupported schemes
    if url.scheme not in ("http", "https"):
        logger.debug("skipped URL with unsupported scheme: {}".format(url))
        return
    # skip URLs with empty host, e.g. "http://" or "http://git@" or "http:///var/run"
    # (partial workaround for https://github.com/earwig/mwparserfromhell/issues/196 )
    if not url.host:
        logger.debug("skipped URL with empty host: {}".format(url))
        return
    # skip links with top-level domains only
    # (in practice they would be resolved relative to the local domain, on the wiki they are used
    # mostly as a pseudo-variable like http://server/path or http://mydomain/path)
    if "." not in url.host:
        logger.debug(
            "skipped URL with only top-level domain host: {}".format(url))
        return
    # skip links to localhost
    if url.host == "localhost" or url.host.endswith(".localhost"):
        logger.debug("skipped URL to localhost: {}".format(url))
        return
    # skip links to loopback addresses (127.*.*.* and ::1)
    try:
        addr = ipaddress.ip_address(url.host)
    except ValueError:
        # the host is a name, not an IP address
        addr = None
    if addr is not None and addr.is_loopback:
        logger.debug("skipped URL to local IP address: {}".format(url))
        return

    # drop the fragment from the URL (to optimize caching)
    if url.fragment:
        url = urllib3.util.url.parse_url(
            url.url.rsplit("#", maxsplit=1)[0])

    logger.info("Checking link {} ...".format(extlink))
    status = self.check_url(url)

    if status is True:
        # TODO: the link might still be flagged for a reason (e.g. when the server redirects to some dummy page without giving a proper status code)
        ensure_unflagged_by_template(wikicode, extlink, "Dead link")
    elif status is False:
        # TODO: handle bbs.archlinux.org (some links may require login)
        # TODO: handle links inside {{man|url=...}} properly
        # flag the link, but don't overwrite date and don't set status yet
        flag = ensure_flagged_by_template(wikicode, extlink, "Dead link",
                                          *self.deadlink_params,
                                          overwrite_parameters=False)
        # overwrite by default, but skip overwriting date when the status matches
        overwrite = True
        if flag.has("status"):
            flagged_status = flag.get("status").value
            if str(flagged_status) == str(self.cache_invalid_urls[url]):
                overwrite = False
        if overwrite:
            # overwrite status as well as date
            flag.add("status", self.cache_invalid_urls[url], showkey=True)
            flag.add("1", self.deadlink_params[0], showkey=False)
            flag.add("2", self.deadlink_params[1], showkey=False)
            flag.add("3", self.deadlink_params[2], showkey=False)
    else:
        # TODO: ask the user for manual check (good/bad/skip) and move the URL from self.cache_indeterminate_urls to self.cache_valid_urls or self.cache_invalid_urls
        logger.warning(
            "status check indeterminate for external link {}".format(
                extlink))
def update_man_template(self, wikicode, template, src_title):
    """
    Validate a {{man}} template and flag or unflag it with {{Dead link}}.

    Templates whose name is not ``man`` are ignored. A template without
    parameter 1 or with a missing/empty parameter 2 is flagged right away.
    Otherwise a URL is built from ``self.man_url_prefix``, the optional
    ``pkg=`` parameter, the page name, the section and the optional anchor.
    When the section is empty, a HEAD request is used to guess it from a
    redirect. The resulting URL is checked with ``self.check_url``; when it
    does not work, the explicit ``url=`` parameter is checked instead.

    :param wikicode: parsed content of the page which contains ``template``
    :param template: the template node to check (may be modified in place)
    :param src_title: title of the page; its language selects the localized
        flag template
    """
    if template.name.lower() != "man":
        return
    src_lang = lang.detect_language(src_title)[1]

    # datetime.utcnow() is deprecated since Python 3.12; an aware UTC
    # timestamp yields the same calendar date
    now = datetime.datetime.now(datetime.timezone.utc)
    deadlink_params = ["{:02d}".format(part) for part in (now.year, now.month, now.day)]

    def flag_dead():
        # first replace the existing template (if any) with a translated version
        flag = self.get_localized_template("Dead link", src_lang)
        localize_flag(wikicode, template, flag)
        # flag with the correct translated template
        ensure_flagged_by_template(wikicode, template, flag, *deadlink_params, overwrite_parameters=False)

    def unflag():
        ensure_unflagged_by_template(wikicode, template, "Dead link", match_only_prefix=True)

    if not template.has(1) or not template.has(2, ignore_empty=True):
        flag_dead()
        return

    url = self.man_url_prefix
    if template.has("pkg"):
        url += template.get("pkg").value.strip() + "/"
    url += urlencode(template.get(2).value.strip())

    # the section is guessed only when template parameter 1= is empty
    if not template.has(1, ignore_empty=True):
        response = self.session.head(url, timeout=self.timeout, allow_redirects=True)
        # heuristics to get the missing section (redirect from some_page to some_page.1)
        # WARNING: if the manual exists in multiple sections, the first one might not be the best
        redirected_to_section = (
            response.status_code == 200
            and len(response.history) == 1
            and response.url.startswith(url + ".")
        )
        if redirected_to_section:
            template.add(1, response.url[len(url) + 1:])

    section = template.get(1).value.strip()
    if section:
        url += "." + section
    if template.has(3):
        url += "#{}".format(
            urlencode(anchorencode(template.get(3).value.strip())))

    if template.has("url"):
        explicit_url = template.get("url").value.strip()
    else:
        explicit_url = None

    # check if the template parameters form a valid URL
    if self.check_url(url):
        unflag()
        # remove explicit url= parameter - not necessary
        if explicit_url is not None:
            template.remove("url")
    elif explicit_url is None:
        flag_dead()
    elif explicit_url != "":
        if self.check_url(explicit_url):
            unflag()
        else:
            flag_dead()