def valid_sectionname(db, title):
    """
    Tell whether the section fragment (``sectionname``) of a parsed wikilink
    title matches an anchor of a section on the target page.

    Interwiki titles and titles without a section fragment are accepted
    without querying the database. For everything else the section anchors of
    ``title.fullpagename`` are fetched from the database and the dot-encoded
    fragment is looked up among them.

    .. note::
        No namespace restriction is applied here; pages from any namespace
        are looked up in the database.

    :param ws.db.database.Database db: database object
    :param title: parsed title of the wikilink to be checked
    :type title: ws.parser_helpers.title.Title
    :returns:
        ``True`` if the link is an interwiki link, has no section fragment,
        or its fragment corresponds to an existing section; ``False``
        otherwise
    """
    # interwiki targets can't be inspected and an empty fragment has nothing
    # to validate (the interwiki prefix is tested first, as before)
    if title.iwprefix or title.sectionname == "":
        return True

    # only the first page yielded by the query is examined
    pages = db.query(titles=title.fullpagename, prop="sections", secprop={"anchor"})
    first_page = next(pages)

    # a page without a "sections" entry has no valid anchors
    sections = first_page.get("sections", [])
    known_anchors = [entry["anchor"] for entry in sections]

    # anchors are compared in their dot-encoded form
    encoded = dotencode(title.sectionname)
    return encoded in known_anchors
def valid_sectionname(db, title):
    """
    Check that the ``sectionname`` of the given title refers to a section
    which exists on the page the title points to.

    The anchors of the target page are obtained from the database with a
    ``prop="sections"`` query; the fragment is dot-encoded before it is
    compared with them.

    .. note::
        This variant does not restrict the check to any namespace.

    :param ws.db.database.Database db: database object
    :param title: parsed title of the wikilink to be checked
    :type title: ws.parser_helpers.title.Title
    :returns:
        ``True`` for interwiki links, for links without a section fragment
        and for fragments matching an existing section; ``False`` otherwise
    """
    # links leading to other wikis can't be checked
    if title.iwprefix:
        return True

    # nothing to validate when there is no fragment
    fragment = title.sectionname
    if fragment == "":
        return True

    # ask the database for the section anchors of the target page and take
    # the first page of the result
    query = db.query(titles=title.fullpagename,
                     prop="sections",
                     secprop={"anchor"})
    page = next(query)

    # collect the anchors; the "sections" key may be absent
    anchors = []
    for section in page.get("sections", []):
        anchors.append(section["anchor"])

    # compare in the encoded form
    return dotencode(fragment) in anchors
def valid_sectionname(title, pages, wrapped_titles):
    """
    Check that the ``sectionname`` of the given title refers to a section
    which exists in the cached text of the target page.

    .. note::
        Only titles in the Main namespace (``title.namespace == ""``) are
        actually validated; interwiki links, links to other namespaces and
        links without a section fragment are always reported as valid.

    :param title: parsed title of the wikilink to be checked
    :type title: ws.parser_helpers.title.Title
    :param pages:
        collection of page records searched with ``ws.utils.bisect_find``;
        the found record must provide its text under
        ``["revisions"][0]["*"]``
    :param wrapped_titles:
        passed to ``ws.utils.bisect_find`` as ``index_list`` (presumably the
        titles of ``pages`` in the same order -- verify against the caller)
    :returns: ``True`` if the anchor corresponds to an existing section
    """
    # Cases which are not (or can't be) checked, tested in this order:
    #   - interwiki links
    #   - TODO: limitation of the cache, only the main namespace is checked
    #   - links without a section fragment
    if title.iwprefix or title.namespace != "" or title.sectionname == "":
        return True

    # find the target page in the cache and take the text of its revision
    record = ws.utils.bisect_find(pages, title.fullpagename, index_list=wrapped_titles)
    wikitext = record["revisions"][0]["*"]

    # derive the valid anchors from the section headings found in the text
    headings = get_section_headings(wikitext)
    valid_anchors = get_anchors(headings)

    # anchors are compared in their dot-encoded form
    encoded = dotencode(title.sectionname)
    return encoded in valid_anchors
예제 #4
0
    def check_anchor(self, src_title, wikilink, title):
        """
        Validate the section fragment of a wikilink against the sections of
        its target page and, when possible, rewrite the fragment to the
        pretty form of the matching heading.

        :param src_title:
            title of the page containing the wikilink; handed to
            ``title.make_absolute`` to determine the target page
        :param wikilink:
            the wikilink being checked; its ``title`` may be modified in place
        :param title:
            parsed title of the wikilink; re-parsed whenever
            ``wikilink.title`` is modified
        :returns:
            ``True`` if the anchor is correct or has been corrected, ``False``
            if it is definitely broken, ``None`` if it can't be checked at all
            or the check was indecisive and a warning/error has been printed to
            the log.
        """
        # TODO: beware of https://phabricator.wikimedia.org/T20431

        # interwiki links can't be checked and an empty sectionname is always
        # valid (the interwiki prefix is tested first)
        if title.iwprefix or title.sectionname == "":
            return None

        # determine target page
        target = title.make_absolute(src_title)

        # skip links to special pages (e.g. [[Special:Preferences#mw-prefsection-rc]])
        if target.namespacenumber < 0:
            return None

        # resolve redirects; remember when the redirect itself points to a section
        redirected_to_section = False
        if target.fullpagename in self.api.redirects.map:
            resolved = self.api.redirects.resolve(target.fullpagename)
            target = self.api.Title(resolved)
            if target.sectionname:
                message = "warning: section fragment placed on a redirect to possibly different section: {}"
                logger.warning(message.format(wikilink))
                redirected_to_section = True

        # fetch section titles and anchors of the target page
        pages = list(
            self.db.query(titles=target.fullpagename,
                          prop="sections",
                          secprop={"title", "anchor"}))
        assert len(pages) == 1
        page = pages[0]
        if "missing" in page:
            logger.error("could not find content of page: '{}' (wikilink {})".format(target.fullpagename, wikilink))
            return None
        headings = [section["title"] for section in page.get("sections", [])]
        anchors = [section["anchor"] for section in page.get("sections", [])]

        # a page without sections can't satisfy any fragment
        if not headings:
            logger.warning("wikilink with broken section fragment: {}".format(wikilink))
            return False

        anchor = dotencode(title.sectionname)

        # handle double-anchor redirects first: only an exact match counts
        # and the link is never rewritten
        if redirected_to_section:
            return anchor in anchors

        # try exact match first
        link_works = anchor in anchors
        if not link_works:
            if self.interactive is not True:
                # FIXME: anchors with encoded characters like '[' or ']' are not handled properly in non-interactive mode - links get flagged as broken, although they work
                # (e.g. [[Systemd-networkd#%5BNetDev%5D section|systemd-networkd]] - linked from [[systemd-timesyncd]])
                logger.warning("wikilink with broken section fragment: {}".format(wikilink))
                return False

            # fall back to a fuzzy match among the existing anchors
            # FIXME: first detect section renaming properly, fuzzy search should be only the last resort to deal with typos and such
            # NOTE(review): assumes get_ranks yields (anchor, ratio) items ordered
            # from the best ratio to the worst -- confirm against get_ranks
            candidates = [rank for rank in get_ranks(anchor, anchors) if rank[1] >= 0.8]
            if not candidates:
                logger.warning("wikilink with broken section fragment: {}".format(wikilink))
                return False
            if len(candidates) >= 2 and not candidates[0][1] - candidates[1][1] > 0.2:
                # the best candidate does not stand out enough from the second one
                logger.debug("skipping {}: multiple feasible anchors per similarity ratio: {}".format(wikilink, candidates))
                return False
            logger.debug(
                "wikilink {}: replacing anchor '{}' with '{}' on similarity level {}".format(
                    wikilink, anchor, candidates[0][0], candidates[0][1]))
            anchor = candidates[0][0]

        # only links which did not match exactly have to be rewritten
        needs_fix = not link_works

        # assemble new section fragment
        # try to preserve the character separating base anchor and numeric suffix
        suffix_match = re.match(r"(.+)([_ ])(\d+)$", str(wikilink.title))
        suffix_sep = suffix_match.group(2) if suffix_match else " "
        # get_anchors makes sure to strip markup and handle duplicate section names
        pretty_anchors = get_anchors(headings, pretty=True, suffix_sep=suffix_sep)
        new_fragment = pretty_anchors[anchors.index(anchor)]

        # Avoid beautification if there is alternative text and the link
        # actually works.
        if wikilink.text is None or needs_fix:
            # preserve title set in check_displaytitle()
            # TODO: simplify (see #25)
            page_part, _ = wikilink.title.split("#", maxsplit=1)
            wikilink.title = page_part + "#" + new_fragment
            title.parse(wikilink.title)

        return True
예제 #5
0
    def check_anchor(self, src_title, wikilink, title):
        """
        Check the section fragment of a wikilink against the sections of the
        target page and, when possible, rewrite it to the pretty form of the
        matching section heading.

        :param src_title:
            title of the page containing the wikilink; passed to
            ``title.make_absolute`` to determine the target page
        :param wikilink:
            the wikilink being checked; its ``title`` may be modified in place
        :param title:
            parsed title of the wikilink; re-parsed whenever
            ``wikilink.title`` is modified
        :returns:
            ``True`` if the anchor is correct or has been corrected, ``False``
            if it is definitely broken, ``None`` if it can't be checked at all
            or the check was indecisive and a warning/error has been printed to
            the log.
        """
        # TODO: beware of https://phabricator.wikimedia.org/T20431

        # we can't check interwiki links
        if title.iwprefix:
            return None

        # empty sectionname is always valid
        if title.sectionname == "":
            return None

        # determine target page
        _target_title = title.make_absolute(src_title)

        # skip links to special pages (e.g. [[Special:Preferences#mw-prefsection-rc]])
        if _target_title.namespacenumber < 0:
            return None

        # resolve redirects
        anchor_on_redirect_to_section = False
        if _target_title.fullpagename in self.api.redirects.map:
            _target_title = self.api.Title(self.api.redirects.resolve(_target_title.fullpagename))
            if _target_title.sectionname:
                logger.warning("warning: section fragment placed on a redirect to possibly different section: {}".format(wikilink))
                anchor_on_redirect_to_section = True

        # get lists of section headings and anchors
        _result = self.db.query(titles=_target_title.fullpagename, prop="sections", secprop={"title", "anchor"})
        _result = list(_result)
        # exactly one page is expected for a single queried title
        assert len(_result) == 1
        if "missing" in _result[0]:
            logger.error("could not find content of page: '{}' (wikilink {})".format(_target_title.fullpagename, wikilink))
            return None
        headings = [section["title"] for section in _result[0].get("sections", [])]
        anchors = [section["anchor"] for section in _result[0].get("sections", [])]

        # a page without sections can't satisfy any fragment
        if not headings:
            logger.warning("wikilink with broken section fragment: {}".format(wikilink))
            return False

        anchor = dotencode(title.sectionname)
        needs_fix = True

        # handle double-anchor redirects first: only an exact match is
        # accepted and the link is never rewritten
        if anchor_on_redirect_to_section:
            return anchor in anchors

        # try exact match first
        if anchor in anchors:
            needs_fix = False
        # otherwise try a fuzzy match among the existing anchors (interactive mode only)
        elif self.interactive is True:
            # FIXME: first detect section renaming properly, fuzzy search should be only the last resort to deal with typos and such
            # NOTE(review): assumes get_ranks returns (anchor, ratio) items ordered
            # from the best ratio to the worst -- confirm against get_ranks
            ranks = get_ranks(anchor, anchors)
            ranks = [rank for rank in ranks if rank[1] >= 0.8]
            # accept the best candidate only if it is the single one or it
            # stands out from the second best by more than 0.2
            if len(ranks) == 1 or (len(ranks) >= 2 and ranks[0][1] - ranks[1][1] > 0.2):
                logger.debug("wikilink {}: replacing anchor '{}' with '{}' on similarity level {}".format(wikilink, anchor, ranks[0][0], ranks[0][1]))
                anchor = ranks[0][0]
            elif len(ranks) > 1:
                logger.debug("skipping {}: multiple feasible anchors per similarity ratio: {}".format(wikilink, ranks))
                return False
            else:
                logger.warning("wikilink with broken section fragment: {}".format(wikilink))
                return False
        else:
            logger.warning("wikilink with broken section fragment: {}".format(wikilink))
            return False

        # assemble new section fragment
        # try to preserve the character separating base anchor and numeric suffix
        # (raw string: "\d" is an invalid escape sequence in a plain string literal)
        dupl_match = re.match(r"(.+)([_ ])(\d+)$", str(wikilink.title))
        if dupl_match:
            suffix_sep = dupl_match.group(2)
        else:
            suffix_sep = " "
        # get_anchors makes sure to strip markup and handle duplicate section names
        new_fragment = get_anchors(headings, pretty=True, suffix_sep=suffix_sep)[anchors.index(anchor)]

        # Avoid beautification if there is alternative text and the link
        # actually works.
        if wikilink.text is None or needs_fix is True:
            # preserve title set in check_displaytitle()
            # TODO: simplify (see #25)
            t, _ = wikilink.title.split("#", maxsplit=1)
            wikilink.title = t + "#" + new_fragment
            title.parse(wikilink.title)

        return True
예제 #6
0
    def check_anchor(self, wikilink, title, srcpage):
        """
        Check the section fragment of a wikilink against the section headings
        found in the cached text of the target page and, when possible,
        rewrite the fragment according to the matching heading.

        Problems (missing page content, broken or ambiguous fragments,
        fragments placed on redirects to sections) are only reported to the
        log.

        :param wikilink:
            the wikilink being checked; its ``title`` may be modified in place
        :param title:
            parsed title of the wikilink; re-parsed whenever
            ``wikilink.title`` is modified
        :param srcpage:
            title of the page containing the wikilink; used as the target
            page when the link has no page name (``title.fullpagename`` is
            empty)
        :returns:
            ``True`` for interwiki links and for links without a section
            fragment; ``None`` in every other case, i.e. both when a problem
            was logged and when the link was checked (and possibly rewritten)
        """
        # TODO: beware of https://phabricator.wikimedia.org/T20431
        #   - mark with {{Broken fragment}} instead of reporting?
        #   - someday maybe: check N older revisions, section might have been renamed (must be interactive!) or moved to other page (just report)

        # we can't check interwiki links
        if title.iwprefix:
            return True

        # empty sectionname is always valid
        if title.sectionname == "":
            return True

        # lookup target page content
        # TODO: pulling revisions from cache does not expand templates
        #       (transclusions like on List of applications)
        if title.fullpagename:
            _target_ns = title.namespacenumber
            _target_title = title.fullpagename
        else:
            # link without a page name points to a section of the source page
            src_title = Title(self.api, srcpage)
            _target_ns = src_title.namespacenumber
            _target_title = src_title.fullpagename
        # skip links to special pages (e.g. [[Special:Preferences#mw-prefsection-rc]])
        if _target_ns < 0:
            return
        if _target_title in self.redirects:
            _new = self.redirects.get(_target_title)
            if "#" not in _new:
                _target_title = _new
            else:
                logger.warning("skipping {} (section fragment placed on a redirect to possibly different section)".format(wikilink))
                return
        # the cache is indexed by the namespace number converted to string
        pages = self.db_copy[str(_target_ns)]
        wrapped_titles = ws.utils.ListOfDictsAttrWrapper(pages, "title")
        try:
            page = ws.utils.bisect_find(pages, _target_title, index_list=wrapped_titles)
        except IndexError:
            logger.error("could not find content of page: '{}' (wikilink {})".format(_target_title, wikilink))
            return
        text = page["revisions"][0]["*"]

        # get lists of section headings and anchors
        headings = get_section_headings(text)
        # a page without sections can't satisfy any fragment
        if not headings:
            logger.warning("wikilink with broken section fragment: {}".format(wikilink))
            return
        anchors = get_anchors(headings)

        anchor = dotencode(title.sectionname)
        needs_fix = True

        # try exact match first
        if anchor in anchors:
            needs_fix = False
        # otherwise try a fuzzy match among the existing anchors (interactive mode only)
        elif self.interactive is True:
            # FIXME: first detect section renaming properly, fuzzy search should be only the last resort to deal with typos and such
            # NOTE(review): assumes get_ranks returns (anchor, ratio) items ordered
            # from the best ratio to the worst -- confirm against get_ranks
            ranks = get_ranks(anchor, anchors)
            ranks = [rank for rank in ranks if rank[1] >= 0.8]
            # accept the best candidate only if it is the single one or it
            # stands out from the second best by more than 0.2
            if len(ranks) == 1 or (len(ranks) >= 2 and ranks[0][1] - ranks[1][1] > 0.2):
                logger.debug("wikilink {}: replacing anchor '{}' with '{}' on similarity level {}".format(wikilink, anchor, ranks[0][0], ranks[0][1]))
                anchor = ranks[0][0]
            elif len(ranks) > 1:
                logger.debug("skipping {}: multiple feasible anchors per similarity ratio: {}".format(wikilink, ranks))
                return
            else:
                logger.warning("wikilink with broken section fragment: {}".format(wikilink))
                return
        else:
            logger.warning("wikilink with broken section fragment: {}".format(wikilink))
            return

        # assemble new section fragment
        new_fragment = strip_markup(headings[anchors.index(anchor)])
        # anchors can't contain '[', '|' and ']', encode them manually
        new_fragment = new_fragment.replace("[", ".5B").replace("|", ".7C").replace("]", ".5D")

        # point to the right duplicated sectionname
        # NOTE: the dupl. section number might get changed in fuzzy match
        # (raw string: "\d" is an invalid escape sequence in a plain string literal)
        dupl_match = re.match(r"(.+)_(\d+)$", anchor)
        if dupl_match:
            base = dupl_match.group(1)
            suffix = dupl_match.group(2)
            if base in anchors:
                # keep the separator style used in the original link
                if wikilink.title.endswith("_" + suffix):
                    new_fragment += "_" + suffix
                else:
                    new_fragment += " " + suffix

        # Avoid beautification if there is alternative text and the link
        # actually works.
        if wikilink.text is None or needs_fix is True:
            # preserve title set in check_displaytitle()
            # TODO: simplify (see #25)
            t, _ = wikilink.title.split("#", maxsplit=1)
            wikilink.title = t + "#" + new_fragment
            title.parse(wikilink.title)