Пример #1
0
    def find_broken(self):
        """
        Find pages categorized under a category of a different language.

        All non-redirect pages in ``self.content_namespaces`` are fetched
        together with their non-hidden categories. The language detected from
        each category title is compared with the language detected from the
        page title.

        :returns: a list of page IDs; a page entry is listed at most once,
                  even when several of its categories do not match
        """
        def pages_in_namespace(ns):
            return self.api.generator(generator="allpages",
                                      gapfilterredir="nonredirects",
                                      gapnamespace=ns,
                                      gaplimit="max",
                                      prop="categories",
                                      cllimit="max",
                                      clshow="!hidden")

        pages = itertools.chain.from_iterable(
            pages_in_namespace(ns) for ns in self.content_namespaces)

        needs_fixing = []

        for page in pages:
            title = page["title"]
            langname = lang.detect_language(title)[1]
            # the root category of a language ("Category:<language name>")
            # is allowed to be in Category:Languages
            is_language_root = title == "Category:{}".format(langname)

            if "categories" not in page:
                continue

            for cat in page["categories"]:
                # skip root categories for non-English languages
                if is_language_root and cat["title"] == "Category:Languages":
                    continue

                # check language
                if lang.detect_language(cat["title"])[1] != langname:
                    needs_fixing.append(page["pageid"])
                    # one mismatch is enough to flag the page; continuing
                    # would append the same page ID again for every other
                    # mismatched category
                    break

        return needs_fixing
Пример #2
0
 def cmp_tuples(left, right):
     """
     Three-way comparison of two tuples, where ``None`` sorts last.

     Two non-``None`` items are ordered first by the length of their third
     element (longer first) and then by the first value returned by
     ``lang.detect_language`` for their first element.

     :returns: ``-1``, ``0`` or ``1`` when ``left`` sorts before, equal to or
               after ``right``
     """
     if left is None and right is None:
         return 0
     elif left is None:
         return 1
     elif right is None:
         return -1
     left_key = (-len(left[2]), lang.detect_language(left[0])[0])
     right_key = (-len(right[2]), lang.detect_language(right[0])[0])
     # cmp() is not a builtin in Python 3; this expression yields the same
     # -1/0/1 result
     return (left_key > right_key) - (left_key < right_key)
Пример #3
0
 def cmp_tuples(left, right):
     """
     Compare two tuples for sorting; ``None`` values are placed at the end.

     Items that are not ``None`` are compared by the key
     ``(-len(item[2]), lang.detect_language(item[0])[0])``, i.e. items with a
     longer third element come first.

     :returns: a negative number, zero or a positive number (``-1``, ``0``,
               ``1``) in the manner of the Python 2 ``cmp`` function
     """
     if left is None and right is None:
         return 0
     elif left is None:
         return 1
     elif right is None:
         return -1

     def sort_key(item):
         return (-len(item[2]), lang.detect_language(item[0])[0])

     key_left = sort_key(left)
     key_right = sort_key(right)
     # equivalent of cmp(key_left, key_right), which Python 3 does not provide
     return (key_left > key_right) - (key_left < key_right)
Пример #4
0
    def update_wikilink(self, wikicode, wikilink, src_title, summary_parts):
        """
        Apply all wikilink checks to a single link.

        The checks modify ``wikilink`` (and ``wikicode`` around it) in place.
        Each group of checks runs inside a context manager obtained from
        ``get_edit_checker(wikicode, summary_parts)`` and labelled with the
        edit summary describing that group.

        Links found in ``self.void_update_cache`` and interlanguage links are
        left untouched.

        :param wikicode: the wikicode object containing ``wikilink``
        :param wikilink: the wikilink node to be updated
        :param src_title: title of the page containing the link
        :param summary_parts:
            collection passed to ``get_edit_checker``; presumably it receives
            the summaries of the changes that were made (TODO confirm)
        """
        if str(wikilink) in self.void_update_cache:
            logger.debug("Skipping wikilink {} due to void-update cache.".format(wikilink))
            return

        title = self.api.Title(wikilink.title)
        # interlanguage links are handled by interlanguage.py
        if title.iwprefix in self.api.site.interlanguagemap.keys():
            return

        def source_is_english():
            return lang.detect_language(src_title)[1] == "English"

        summary = get_edit_checker(wikicode, summary_parts)
        beautification = "simplification and beautification of wikilinks"
        flag_template = "Broken section link"

        with summary(beautification):
            # beautify if urldecoded
            # FIXME: make it implicit - it does not always propagate from the Title class
            if not title.iwprefix and re.search("%[0-9a-f]{2}", str(wikilink.title), re.IGNORECASE):
                # handle links with leading colon properly
                wikilink.title = title.leading_colon + str(title)
                # FIXME: should be done in the Title class
                # the anchor is dot-encoded, but percent-encoding works for
                # links too and is even rendered nicely
                escaped = str(wikilink.title)
                for char, code in (("[", "%5B"), ("|", "%7C"), ("]", "%5D")):
                    escaped = escaped.replace(char, code)
                wikilink.title = escaped

            self.collapse_whitespace_pipe(wikilink)
            self.check_trivial(wikilink, title)
            self.check_relative(src_title, wikilink, title)
            if source_is_english():
                self.check_redirect_exact(src_title, wikilink, title)
            self.check_redirect_capitalization(wikilink, title)

            # reparse the title, the redirect checks might change it non-equivalently
            title = self.api.Title(wikilink.title)

            self.check_displaytitle(wikilink, title)

        with summary("fixed section fragments"):
            anchor_result = self.check_anchor(src_title, wikilink, title)

        if anchor_result is not False:
            with summary("unflagged working section links"):
                ensure_unflagged_by_template(wikicode, wikilink, flag_template)
        else:
            with summary("flagged broken section links"):
                ensure_flagged_by_template(wikicode, wikilink, flag_template)

        with summary(beautification):
            # partial second pass
            self.check_trivial(wikilink, title)
            if source_is_english():
                self.check_redirect_exact(src_title, wikilink, title)

            # collapse whitespace around the link, e.g. 'foo [[ bar]]' -> 'foo [[bar]]'
            self.collapse_whitespace(wikicode, wikilink)

        # cache context-less, correct wikilinks that don't need any update
        if title.pagename and not len(summary_parts) and anchor_result is True:
            self.void_update_cache.add(str(wikilink))
Пример #5
0
    def update_wikilink(self, wikicode, wikilink, src_title, summary_parts):
        """
        Check and update one wikilink in place.

        The link goes through a first pass of simplification checks, then the
        section fragment check (after which the "Broken section link" flag is
        added or removed) and finally a partial second pass. Every stage is
        wrapped in a context manager created by
        ``get_edit_checker(wikicode, summary_parts)`` with the edit summary
        for that stage.

        The method returns early, without any change, when the link is in
        ``self.void_update_cache`` or when it is an interlanguage link.

        :param wikicode: the wikicode object which contains ``wikilink``
        :param wikilink: the wikilink node to process
        :param src_title: title of the page on which the link is placed
        :param summary_parts:
            collection handed over to ``get_edit_checker``; when it is still
            empty at the end and the anchor check returned ``True``, the link
            is added to ``self.void_update_cache``
        """
        link_text = str(wikilink)
        if link_text in self.void_update_cache:
            logger.debug("Skipping wikilink {} due to void-update cache.".format(wikilink))
            return

        title = self.api.Title(wikilink.title)
        # skip interlanguage links (handled by interlanguage.py)
        interlanguage_prefixes = self.api.site.interlanguagemap.keys()
        if title.iwprefix in interlanguage_prefixes:
            return

        summary = get_edit_checker(wikicode, summary_parts)

        with summary("simplification and beautification of wikilinks"):
            # beautify if urldecoded
            # FIXME: make it implicit - it does not always propagate from the Title class
            if not title.iwprefix:
                if re.search("%[0-9a-f]{2}", str(wikilink.title), re.IGNORECASE):
                    # handle links with leading colon properly
                    wikilink.title = title.leading_colon + str(title)
                    # FIXME: should be done in the Title class
                    # the anchor is dot-encoded, but percent-encoding works
                    # for links too and is even rendered nicely
                    escapes = str.maketrans({"[": "%5B", "|": "%7C", "]": "%5D"})
                    wikilink.title = str(wikilink.title).translate(escapes)

            self.collapse_whitespace_pipe(wikilink)
            self.check_trivial(wikilink, title)
            self.check_relative(src_title, wikilink, title)
            src_language = lang.detect_language(src_title)[1]
            if src_language == "English":
                self.check_redirect_exact(src_title, wikilink, title)
            self.check_redirect_capitalization(wikilink, title)

            # reparse the title, the redirect checks might change it non-equivalently
            title = self.api.Title(wikilink.title)

            self.check_displaytitle(wikilink, title)

        with summary("fixed section fragments"):
            anchor_result = self.check_anchor(src_title, wikilink, title)

        # only an explicit False means that the section link is broken
        anchor_is_broken = anchor_result is False
        if anchor_is_broken:
            with summary("flagged broken section links"):
                ensure_flagged_by_template(wikicode, wikilink, "Broken section link")
        else:
            with summary("unflagged working section links"):
                ensure_unflagged_by_template(wikicode, wikilink, "Broken section link")

        with summary("simplification and beautification of wikilinks"):
            # partial second pass
            self.check_trivial(wikilink, title)
            src_language = lang.detect_language(src_title)[1]
            if src_language == "English":
                self.check_redirect_exact(src_title, wikilink, title)

            # collapse whitespace around the link, e.g. 'foo [[ bar]]' -> 'foo [[bar]]'
            self.collapse_whitespace(wikicode, wikilink)

        # cache context-less, correct wikilinks that don't need any update
        if title.pagename:
            if len(summary_parts) == 0 and anchor_result is True:
                self.void_update_cache.add(str(wikilink))
Пример #6
0
    def fix_page(title, text_old):
        """
        Make the categories in the page header match the language of the page.

        The text is parsed, the header parts are taken out with
        ``get_header_parts`` and every category whose detected language
        differs from the language detected from ``title`` gets a new title
        built by ``lang.format_title``. The header is then put back with
        ``build_header``.

        :param title: title of the page
        :param text_old: current text of the page
        :returns: the parsed and updated wikicode object
        """
        page_language = lang.detect_language(title)[1]
        wikicode = mwparserfromhell.parse(text_old)
        parent, magics, cats, langlinks = get_header_parts(wikicode, remove_from_parent=True)

        # get_header_parts returns list of wikicode objects, each with one node
        for category in (wrapper.nodes[0] for wrapper in cats):
            base, category_language = lang.detect_language(str(category.title))
            if category_language != page_language:
                category.title = lang.format_title(base, page_language)

        build_header(wikicode, parent, magics, cats, langlinks)
        return wikicode
Пример #7
0
    def fix_page(title, text_old):
        """
        Fix the language of the categories listed in the header of a page.

        :param title:
            title of the page; the language detected from it is the one every
            category is expected to have
        :param text_old: the text of the page before the fix
        :returns:
            wikicode object parsed from ``text_old`` with the header rebuilt
            by ``build_header`` after the category titles were corrected
        """
        expected = lang.detect_language(title)[1]
        wikicode = mwparserfromhell.parse(text_old)
        header = get_header_parts(wikicode, remove_from_parent=True)
        parent, magics, cats, langlinks = header

        for item in cats:
            # get_header_parts returns list of wikicode objects, each with one node
            node = item.nodes[0]

            pure, detected = lang.detect_language(str(node.title))
            if detected == expected:
                continue
            node.title = lang.format_title(pure, expected)

        build_header(wikicode, parent, magics, cats, langlinks)
        return wikicode
Пример #8
0
    def handle_node(self, src_title, wikicode, node, summary_parts):
        """
        Update the link represented by ``node``, if there is any.

        Wikilinks are passed directly to :py:meth:`update_wikilink`. Templates
        whose base name is "related" or "related2" are temporarily replaced
        with a wikilink built from their first parameter, so that the same
        update code can be reused. Afterwards the template is put back and
        its first parameter is set to the updated link title.

        Nodes whose parent is a template listed in ``self.skip_templates``
        are skipped. ``InvalidTitleCharError`` raised by the update is
        ignored and the node is left as it was.

        :param src_title: title of the page containing ``node``
        :param wikicode: the wikicode object containing ``node``
        :param node: the node to be checked
        :param summary_parts: passed on to :py:meth:`update_wikilink`
        """
        # skip links inside article status templates
        parent = wikicode.get(wikicode.index(node, recursive=True))
        if isinstance(parent, mwparserfromhell.nodes.template.Template
                      ) and parent.name.lower() in self.skip_templates:
            return

        if isinstance(node, mwparserfromhell.nodes.Wikilink):
            try:
                self.update_wikilink(wikicode, node, src_title, summary_parts)
            # this can happen, e.g. due to [[{{TALKPAGENAME}}]]
            except InvalidTitleCharError:
                pass
        elif isinstance(node, mwparserfromhell.nodes.Template):
            _pure_template = lang.detect_language(str(node.name))[0]
            if _pure_template.lower() in {"related", "related2"}:
                param = node.get(1)
                # temporarily convert the {{Related}} to wikilink to reuse the update code
                wl = mwparserfromhell.nodes.wikilink.Wikilink(param.value)
                wikicode.replace(node, wl)
                # update
                try:
                    self.update_wikilink(wikicode, wl, src_title,
                                         summary_parts)
                # this can happen, e.g. due to [[{{TALKPAGENAME}}]]
                except InvalidTitleCharError:
                    # the template has to be restored even when the update
                    # fails, otherwise the temporary wikilink would stay in
                    # the page instead of the template
                    wikicode.replace(wl, node)
                    return
                # replace back; the new title is assigned through the
                # parameter itself, because its value object is not the place
                # where the template stores the parameter
                # NOTE(review): confirm that setting an attribute on the
                # value object had no effect on the template before
                param.value = str(wl.title)
                wikicode.replace(wl, node)
def list_redirects_wrong_capitalization(api):
    """
    Print redirects from sentence-case titles to title-case titles.

    Only redirects pointing to the main namespace are fetched. A redirect is
    printed as a wiki list item when its source and target (without the
    section fragment) differ only in capitalization, the source title without
    the language name consists of multiple words, and fewer words start with
    an uppercase letter in the source than in the target.

    :param api: object providing ``redirects.fetch``
    """
    def count_uppercase(text):
        # the number of words starting with an uppercase letter
        return sum(1 for word in text.split() if word[0].isupper())

    # limit to redirects pointing to the main namespace, others deserve special treatment
    redirects = api.redirects.fetch(source_namespaces=[0, 4, 12], target_namespaces=[0])

    for source in sorted(redirects.keys()):
        # drop the section fragment of the target
        target = redirects[source].partition("#")[0]

        # limit to redirects whose source and target title differ only in capitalization
        if source.lower() == target.lower():
            pure = lang.detect_language(source)[0]
            # limit to multiple-word titles and to sentence-case titles
            # redirecting to title-case
            if len(pure.split()) != 1 and count_uppercase(source) < count_uppercase(target):
                print("* [[{}]] --> [[{}]]".format(source, target))
Пример #10
0
    def get_langlinks(self, full_title):
        """
        Get the interlanguage links for the page ``full_title``.

        The titles of all pages in the family are obtained from
        :py:meth:`self.titles_in_family`. Links to "ArchWiki:Archive" are
        dropped unless the page itself is "ArchWiki:Archive", and the link to
        the passed title is removed.

        :param full_title: title of the page, including the language name
        :returns: a list of ``(tag, title)`` tuples sorted by the language tag
        :raises KeyError: when the family does not contain the passed title
        """
        # get all titles in the family
        tags, titles = self.titles_in_family(full_title)
        langlinks = set(zip(tags, titles))

        title, langname = lang.detect_language(full_title)
        tag = lang.tag_for_langname(langname)

        # remove links to ArchWiki:Archive and translations
        if title != "ArchWiki:Archive":
            archived = [link for link in langlinks if link[1] == "ArchWiki:Archive"]
            langlinks.difference_update(archived)

        # remove title of the page to be updated
        langlinks.remove((tag, title))

        # transform to list, sort by the language tag
        return sorted(langlinks, key=lambda link: link[0])
Пример #11
0
    def parse_toc_table(self, title, wikicode):
        """
        Find the table of contents in ``wikicode`` and parse its settings.

        The table is the first ``table`` tag whose ``id`` attribute is
        "wiki-scripts-toc-table". Its ``data-toc-languages`` attribute holds a
        comma-separated list of columns; when the attribute is missing, it is
        added with the default value, which is the language tag of ``title``.

        :param title: title of the page
        :param wikicode: the parsed content of the page
        :returns:
            a ``(toc_table, columns, dictionary)`` tuple, where ``toc_table``
            is ``None`` when no table was found and ``dictionary`` is either
            the result of :py:meth:`extract_translations` or an empty
            ``LowercaseDict``
        """
        # default format is one column in the title's language
        columns = [lang.tag_for_langname(lang.detect_language(title)[1])]
        dictionary = LowercaseDict()

        tables = wikicode.ifilter_tags(matches=lambda node: node.tag == "table")
        toc_table = next(
            (table for table in tables
             if table.has("id") and table.get("id").value == "wiki-scripts-toc-table"),
            None)

        if toc_table is None:
            return toc_table, columns, dictionary

        # parse data-toc-languages attribute
        try:
            columns = str(toc_table.get("data-toc-languages").value).split(",")
        except ValueError:
            # the attribute is missing, add it with the default value
            toc_table.add("data-toc-languages", value=",".join(columns))

        # extract localized category names (useful even for PlainFormatter)
        dictionary = self.extract_translations(toc_table.contents)

        return toc_table, columns, dictionary
Пример #12
0
    def process_allpages(self, apfrom=None, langnames=None):
        """
        Update all non-redirect pages in the namespaces 0, 4, 12 and 14.

        For every page the new text and the edit summary are obtained from
        the coroutine :py:meth:`update_page` and passed to ``self._edit``.

        :param apfrom:
            title of the first page to be processed (the ``--first`` option);
            the namespaces listed before the namespace of this title are
            skipped. An error is logged and nothing is processed when the
            title is not in one of the processed namespaces.
        :param langnames:
            when not empty, only pages whose detected language name is in
            this container are processed
        """
        namespaces = [0, 4, 12, 14]

        # rewind to the right namespace (the API throws BadTitle error if the
        # namespace of apfrom does not match apnamespace)
        if apfrom is not None:
            _title = self.api.Title(apfrom)
            if _title.namespacenumber not in namespaces:
                logger.error("Valid namespaces for the --first option are {}.".format([self.api.site.namespaces[ns] for ns in namespaces]))
                return
            namespaces = namespaces[namespaces.index(_title.namespacenumber):]
            # apfrom must be without namespace prefix
            apfrom = _title.pagename

        for ns in namespaces:
            for page in self.db.query(generator="allpages", gaplimit="max", gapfilterredir="nonredirects", gapnamespace=ns, gapfrom=apfrom,
                                      prop="latestrevisions", rvprop={"timestamp", "content"}):
                title = page["title"]
                if langnames and lang.detect_language(title)[1] not in langnames:
                    continue
                revision = page["revisions"][0]
                timestamp = revision["timestamp"]
                text_old = revision["*"]
                text_new, edit_summary = asyncio.run(self.update_page(title, text_old))
                self._edit(title, page["pageid"], text_new, text_old, timestamp, edit_summary)
            # the apfrom parameter is valid only for the first namespace
            apfrom = ""
Пример #13
0
    def get_langlinks(self, full_title):
        """
        Uses :py:meth:`self.titles_in_family` to get the titles of all pages
        in the family of ``full_title`` and returns them without the passed
        title itself. Links to "ArchWiki:Archive" are left out, except when
        the passed title is "ArchWiki:Archive".

        :param full_title: page title including the language name
        :returns: a list of ``(tag, title)`` tuples sorted by the tag
        :raises KeyError: if the passed title is not found in its family
        """
        family_tags, family_titles = self.titles_in_family(full_title)
        langlinks = set(zip(family_tags, family_titles))

        title, langname = lang.detect_language(full_title)
        own_link = (lang.tag_for_langname(langname), title)

        # remove links to ArchWiki:Archive and translations
        if title != "ArchWiki:Archive":
            # iterate over a copy, the set is modified in the loop
            for link in tuple(langlinks):
                if link[1] == "ArchWiki:Archive":
                    langlinks.remove(link)

        # remove title of the page to be updated
        langlinks.remove(own_link)

        # transform to list, sort by the language tag
        result = list(langlinks)
        result.sort(key=lambda link: link[0])
        return result
Пример #14
0
    def rename_non_english(self):
        """
        Interactively move localized category pages so that their titles
        match the title of the English page.

        For every English page in the "Category:" namespace, the titles
        returned by :py:meth:`get_langlinks` are checked. When the tag is
        internal and the localized title differs from the English title, the
        user is asked whether the page should be moved, unless the target
        page already exists, in which case a warning is logged.

        ``self.allpages`` is deleted first; presumably it is a cached
        attribute which is fetched again on the next access (TODO confirm).
        """
        del self.allpages

        # FIXME: starting with English pages is not very good:
        # - some pages are omitted (e.g. when two pages link to the same English page, at least warning should be printed)
        # - it suggests to move e.g. Xfce (Česky) to Xfwm (Česky) because multiple English pages link to it
        # Therefore we limit it only to categories...
        for page in self.allpages:
            title = page["title"]
            is_english = lang.detect_language(title)[1] == "English"
            if not (is_english and title.startswith("Category:")):
                continue

            for tag, localized_title in self.get_langlinks(title):
                logger.info("Checking [[{}:{}]] for renaming...".format(
                    tag, localized_title))
                if not lang.is_internal_tag(tag) or localized_title == title:
                    continue

                langname = lang.langname_for_tag(tag)
                source = "{} ({})".format(localized_title, langname)
                target = "{} ({})".format(title, langname)

                if self._page_exists(target):
                    logger.warning(
                        "Cannot move page [[{}]] to [[{}]]: target page already exists"
                        .format(source, target))
                    continue

                # interactive mode is necessary because this assumes that all English pages are named correctly
                ans = ask_yesno(
                    "Move page [[{}]] to [[{}]]?".format(source, target))
                if ans is True:
                    summary = "comply with [[Help:I18n#Page titles]] and match the title of the English page"
                    self.api.move(source, target, summary)
def list_redirects_wrong_capitalization(api):
    """
    Print a wiki list of redirects whose capitalization looks wrong.

    The redirects are fetched with ``api.redirects.fetch`` for the source
    namespaces 0, 4 and 12 and the target namespace 0. A redirect is listed
    when all of the following holds:

    - the source and the target (without the section fragment) are equal
      when compared case-insensitively,
    - the source title without the language name has more than one word,
    - the number of words starting with an uppercase letter is lower in the
      source than in the target.

    :param api: object providing ``redirects.fetch``
    """
    # we will count the number of uppercase letters starting each word
    def count_uppercase(text):
        initials = (word[0] for word in text.split())
        return len([letter for letter in initials if letter.isupper()])

    # limit to redirects pointing to the main namespace, others deserve special treatment
    redirects = api.redirects.fetch(source_namespaces=[0, 4, 12],
                                    target_namespaces=[0])

    for source in sorted(redirects.keys()):
        full_target = redirects[source]
        target = full_target.split("#", maxsplit=1)[0]

        # limit to redirects whose source and target title differ only in capitalization
        differs_only_in_case = source.lower() == target.lower()
        if not differs_only_in_case:
            continue

        # limit to multiple-word titles
        words = lang.detect_language(source)[0].split()
        if len(words) == 1:
            continue

        # limit to sentence-case titles redirecting to title-case
        if count_uppercase(source) < count_uppercase(target):
            print("* [[{}]] → [[{}]]".format(source, target))
Пример #16
0
    def process_allpages(self, apfrom=None, langnames=None):
        """
        Update all non-redirect pages in the processed namespaces.

        The namespaces 0, 4 and 14 are always processed; the namespace 12 is
        added as the last one when ``self.interactive`` is ``True``. For every
        page the new text and the edit summary are obtained from
        :py:meth:`update_page` and passed to ``self._edit``.

        :param apfrom:
            title of the first page to be processed (the ``--first`` option);
            the namespaces listed before the namespace of this title are
            skipped. An error is logged and nothing is processed when the
            title is not in one of the processed namespaces.
        :param langnames:
            when not empty, only pages whose detected language name is in
            this container are processed
        """
        namespaces = [0, 4, 14]
        if self.interactive is True:
            namespaces.append(12)

        # rewind to the right namespace (the API throws BadTitle error if the
        # namespace of apfrom does not match apnamespace)
        if apfrom is not None:
            _title = self.api.Title(apfrom)
            if _title.namespacenumber not in namespaces:
                logger.error("Valid namespaces for the --first option are {}.".format([self.api.site.namespaces[ns] for ns in namespaces]))
                return
            namespaces = namespaces[namespaces.index(_title.namespacenumber):]
            # apfrom must be without namespace prefix
            apfrom = _title.pagename

        for ns in namespaces:
            for page in self.db.query(generator="allpages", gaplimit="max", gapfilterredir="nonredirects", gapnamespace=ns, gapfrom=apfrom,
                                      prop="latestrevisions", rvprop={"timestamp", "content"}):
                title = page["title"]
                if langnames and lang.detect_language(title)[1] not in langnames:
                    continue
                revision = page["revisions"][0]
                timestamp = revision["timestamp"]
                text_old = revision["*"]
                text_new, edit_summary = self.update_page(title, text_old)
                self._edit(title, page["pageid"], text_new, text_old, timestamp, edit_summary)
            # the apfrom parameter is valid only for the first namespace
            apfrom = ""
Пример #17
0
    def parse_toc_table(self, title, toc_table):
        """
        Parse the settings stored in the attributes of the table of contents.

        - ``data-toc-languages`` is a comma-separated list of columns; when
          the attribute is missing, it is added with the default value, which
          is the language tag of ``title``.
        - ``data-toc-alsoin`` is parsed with :py:meth:`parse_alsoin`; when it
          is missing and the columns are not just ``["en"]``, a warning is
          logged.

        :param title: title of the page
        :param toc_table: the table tag, or ``None`` if the page has none
        :returns:
            a ``(columns, category_names, alsoin)`` tuple; for ``toc_table``
            equal to ``None`` it holds the default column, an empty
            ``LowercaseDict`` and an empty dict
        """
        # default format is one column in the title's language
        columns = [lang.tag_for_langname(lang.detect_language(title)[1])]
        category_names = LowercaseDict()
        alsoin = {}

        if toc_table is None:
            return columns, category_names, alsoin

        # parse data-toc-languages attribute
        try:
            languages_attr = toc_table.get("data-toc-languages")
            columns = str(languages_attr.value).split(",")
        except ValueError:
            toc_table.add("data-toc-languages", value=",".join(columns))

        # parse data-toc-alsoin attribute
        if toc_table.has("data-toc-alsoin"):
            alsoin_value = str(toc_table.get("data-toc-alsoin").value)
            alsoin = self.parse_alsoin(title, alsoin_value)
        elif columns != ["en"]:
            logger.warning(
                "Page [[{}]]: missing 'also in' translations".format(title))

        # extract localized category names (useful even for PlainFormatter)
        category_names = self.extract_translations(toc_table.contents)

        return columns, category_names, alsoin
Пример #18
0
 def localized_category(cat, langname):
     """
     Get the title of the category ``cat`` localized for ``langname``.

     "Category:Languages" is returned as it is. A category named after its
     own detected language is mapped to "Category:<langname>". Any other
     title is built by ``lang.format_title``.

     :param cat: title of the category
     :param langname: name of the target language
     :returns: title of the localized category
     """
     pure, lgn = lang.detect_language(cat)
     if pure == "Category:Languages":
         # this terminates the recursive creation
         return pure
     if pure.lower() != "category:" + lgn.lower():
         return lang.format_title(pure, langname)
     return "Category:{}".format(langname)
Пример #19
0
 def localized_category(cat, langname):
     """
     Translate a category title into the language ``langname``.

     :param cat: title of the category, possibly with a language name
     :param langname: name of the language of the returned title
     :returns:
         "Category:Languages" if that is the title without the language
         name; "Category:<langname>" if ``cat`` is the category named after
         its own detected language; otherwise the result of
         ``lang.format_title``
     """
     base_title, detected_language = lang.detect_language(cat)

     # this terminates the recursive creation
     if base_title == "Category:Languages":
         return base_title

     language_root = "category:" + detected_language.lower()
     is_language_root = base_title.lower() == language_root
     return ("Category:{}".format(langname) if is_language_root
             else lang.format_title(base_title, langname))
Пример #20
0
    def _group_into_families(pages, case_sensitive=False):
        """
        Takes list of pages and groups them based on their title. Returns a
        mapping of `family_key` to `family_pages`, where `family_key` is the
        base title without the language suffix (e.g. "Some title" for
        "Some title (Česky)") and `family_pages` is a list of pages belonging
        to the family (have the same `family_key`).

        Unless ``case_sensitive`` is ``True``, the keys are lowercased. A
        family in which several pages share a language tag is grouped again
        with case-sensitive keys.

        :param pages: an iterable of pages, each with the ``"title"`` key
        :param case_sensitive: whether the titles are matched case-sensitively
        :returns: a dict mapping `family_key` to `family_pages`
        :raises Exception:
            when several pages of a family share a language tag even with
            the case-sensitive matching
        """
        def _langtag(page):
            langname = lang.detect_language(page["title"])[1]
            return lang.tag_for_langname(langname)

        def _family_key(page):
            base = lang.detect_language(page["title"])[0]
            return base if case_sensitive is True else base.lower()

        # itertools.groupby groups only consecutive items, so the pages have
        # to be sorted by the grouping key first
        sorted_pages = sorted(pages, key=_family_key)

        # interlanguage links are not valid for all languages, the invalid
        # need to be dropped now
        valid_pages = (page for page in sorted_pages
                       if lang.is_interlanguage_tag(_langtag(page)))

        families = {}
        for family, group in itertools.groupby(valid_pages, key=_family_key):
            family_pages = list(group)
            tags = {_langtag(page) for page in family_pages}
            if len(tags) == len(family_pages):
                families[family] = family_pages
            elif case_sensitive is False:
                # sometimes case-insensitive matching is not enough, e.g. [[fish]] is
                # not [[FiSH]] (and neither is redirect)
                families.update(
                    InterlanguageLinks._group_into_families(
                        family_pages, case_sensitive=True))
            else:
                # this should never happen
                raise Exception(
                    "Family {!r} contains multiple pages with the same language tag: {}"
                    .format(family, [page["title"] for page in family_pages]))
        return families
Пример #21
0
 def add_report_line(self, title, template, message):
     """
     Add a message about ``template`` on the page ``title`` to ``self.log``.

     The log is a mapping of language names (detected from the page title)
     to mappings of page titles to lists of messages.

     :param title: title of the page
     :param template: the template the message is about
     :param message: description of the problem
     """
     entry = "<nowiki>{}</nowiki> ({})".format(template, message)
     langname = detect_language(title)[1]
     self.log.setdefault(langname, {}).setdefault(title, []).append(entry)
 def add_report_line(self, title, template, message):
     """
     Record a report line for the given page.

     The line has the form ``<nowiki>template</nowiki> (message)`` and is
     appended to ``self.log[langname][title]``, where ``langname`` is the
     language name detected from ``title``. Missing levels of the log are
     created.

     :param title: title of the page
     :param template: the reported template
     :param message: text put into the parentheses after the template
     """
     line = "<nowiki>{}</nowiki> ({})".format(template, message)
     langname = detect_language(title)[1]

     if langname not in self.log:
         self.log[langname] = {}
     pages = self.log[langname]

     if title not in pages:
         pages[title] = [line]
     else:
         pages[title].append(line)
Пример #23
0
def localize_flag(wikicode, node, template_name):
    """
    Rename the template flag adjacent to ``node`` to ``template_name``.

    The node adjacent to ``node`` (whitespace is ignored) is renamed only
    when it is a template whose base name, i.e. the first item returned by
    ``lang.detect_language``, canonically equals the base name of
    ``template_name``.

    :param wikicode: a :py:class:`mwparserfromhell.wikicode.Wikicode` object
    :param node: a :py:class:`mwparserfromhell.nodes.Node` object
    :param str template_name: the name of the template flag, potentially
                              including a language name
    """
    parent = get_parent_wikicode(wikicode, node)
    neighbour = get_adjacent_node(parent, node, ignore_whitespace=True)

    # nothing to localize unless the adjacent node is a template
    if not isinstance(neighbour, mwparserfromhell.nodes.Template):
        return

    current_base = lang.detect_language(str(neighbour.name))[0]
    wanted_base = lang.detect_language(template_name)[0]
    if canonicalize(current_base) == canonicalize(wanted_base):
        neighbour.name = template_name
Пример #24
0
 def format_cell(self, title, parent, levels):
     """
     Render one cell for the category ``title`` using ``self.cell_format``.

     :param title: title of the category to render
     :param parent: the parent under which the category is being rendered;
                    it is excluded from the "also in" suffix
     :param levels: sequence of numbers forming the position of the cell;
                    each is incremented by one for display and the length
                    of the sequence determines the margin
     :returns: the formatted cell
     """
     lang_tag = lang.tag_for_langname(lang.detect_language(title)[1])
     numbering = ".".join(str(level + 1) for level in levels) + "."
     details = "({})".format(self.info[title]["pages"])
     # "also in" suffix: all parents except the one being rendered
     other_parents = set(self.parents[title]) - {parent}
     if other_parents:
         links = [self.catlink(cat) for cat in other_parents]
         details += self.format_also_in(links, lang_tag)
     return self.cell_format.format(
         margin=1.6 * len(levels),
         levels=numbering,
         catlink=self.catlink(title),
         info=details,
     )
Пример #25
0
 def format_cell(self, title, parent, levels):
     """
     Format the cell of the category ``title`` with ``self.cell_format``.

     The cell consists of a margin proportional to the depth
     (``len(levels)``), a dotted one-based numbering built from ``levels``,
     a link to the category and an info part holding the value of
     ``self.info[title]["pages"]``, optionally followed by an "also in"
     suffix listing the parents other than ``parent``.

     :returns: the formatted cell
     """
     lang_tag = lang.tag_for_langname(lang.detect_language(title)[1])
     depth = len(levels)
     position = "{}.".format(".".join(str(index + 1) for index in levels))
     info_parts = ["({})".format(self.info[title]["pages"])]
     # "also in" suffix
     extra_parents = set(self.parents[title]) - {parent}
     if extra_parents:
         extra_links = [self.catlink(cat) for cat in extra_parents]
         info_parts.append(self.format_also_in(extra_links, lang_tag))
     return self.cell_format.format(margin=1.6 * depth,
                                    levels=position,
                                    catlink=self.catlink(title),
                                    info="".join(info_parts))
Пример #26
0
    def find_broken(self):
        """
        Find pages categorized under a category of a different language.

        All non-redirect pages from ``self.content_namespaces`` are fetched
        along with their non-hidden categories. A page needs fixing when the
        language detected from the title of any of its categories differs
        from the language detected from the title of the page itself. The
        root category of a language (``Category:<Language>``) being in
        ``Category:Languages`` is not considered a mismatch.

        :returns: a list of page IDs; every page is listed at most once
        """
        def pages_in_namespace(ns):
            return self.api.generator(generator="allpages",
                                      gapfilterredir="nonredirects",
                                      gapnamespace=ns,
                                      gaplimit="max",
                                      prop="categories",
                                      cllimit="max",
                                      clshow="!hidden")

        pages = itertools.chain.from_iterable(
            pages_in_namespace(ns) for ns in self.content_namespaces)

        needs_fixing = []

        for page in pages:
            langname = lang.detect_language(page["title"])[1]
            root_category = "Category:{}".format(langname)
            # the "categories" key is missing for pages without categories
            for cat in page.get("categories", ()):
                # skip root categories for non-English languages
                if page["title"] == root_category and cat["title"] == "Category:Languages":
                    continue

                # check language
                if lang.detect_language(cat["title"])[1] != langname:
                    needs_fixing.append(page["pageid"])
                    # one mismatch is enough to mark the page; checking the
                    # remaining categories would only add the same page ID
                    # to the result again
                    break

        return needs_fixing
Пример #27
0
        def _pull_from_page(page, condition=lambda tag, title: True):
            """
            Collect the langlinks of ``page`` into the enclosing ``tags``
            and ``titles`` lists.

            A link is recorded only when its tag has not been collected yet
            and ``condition(tag, title)`` holds.
            """
            # pages without the "langlinks" key are treated as having none
            for link in page.get("langlinks", ()):
                tag = link["lang"]
                # conversion back and forth is necessary to resolve redirect
                title, _ = lang.detect_language(self._title_from_langlink(link))
                # TODO: check if the tag for the detected language is equal
                #       to the original?
                if tag in tags or not condition(tag, title):
                    continue
                tags.append(tag)
                titles.append(title)
Пример #28
0
 def _pull_from_page(page, condition=lambda tag, title: True):
     """
     Append the not yet known langlinks of ``page`` to the ``tags`` and
     ``titles`` lists of the enclosing scope.

     :param page: a mapping which may contain a "langlinks" list
     :param condition: a callable taking ``(tag, title)``; links for which
                       it returns a false value are skipped
     """
     # default to empty tuple
     langlinks = page.get("langlinks", ())
     for langlink in langlinks:
         tag = langlink["lang"]
         # conversion back and forth is necessary to resolve redirect
         resolved = self._title_from_langlink(langlink)
         title = lang.detect_language(resolved)[0]
         # TODO: check if the tag for the detected language is equal to
         #       the original?
         already_known = tag in tags
         if not already_known and condition(tag, title):
             tags.append(tag)
             titles.append(title)
    def find_orphans(self):
        """
        Print a wikitext list item for every non-English page which has no
        langlinks.

        ``self.build_graph()`` is called first when ``self.allpages`` is
        ``None``. Pages for which ``self._is_valid_interlanguage`` returns a
        false value are skipped.
        """
        if self.allpages is None:
            self.build_graph()

        for page in self.allpages:
            title = page["title"]
            # unsupported languages need to be skipped now
            if self._is_valid_interlanguage(title):
                langlinks = self._get_langlinks(title)
                is_english = lang.detect_language(title)[1] == "English"
                if not is_english and not langlinks:
                    print("* [[{}]]".format(title))
    def update_page(self, title, text):
        """
        Update package templates on given page.

        The wikitext is parsed and every package template (Aur, AUR, Grp,
        Pkg) is updated. When a template is reported as broken:

            - a warning is logged
            - a report line is added with ``self.add_report_line``
            - the template is marked with {{Broken package link}} in the
              wikicode

        A stale {{Broken package link}} flag is removed when the template is
        no longer reported as broken.

        :param title: title of the wiki page
        :param text: content of the wiki page
        :returns: a :py:class:`mwparserfromhell.wikicode.Wikicode` object with the updated
                  content of the page
        """
        logger.info("Parsing page [[{}]]...".format(title))
        langname = detect_language(title)[1]
        wikicode = mwparserfromhell.parse(text)
        package_templates = ("Aur", "AUR", "Grp", "Pkg")

        def is_broken_flag(node):
            # true for a template whose canonical name starts with the flag name
            return (isinstance(node, mwparserfromhell.nodes.Template)
                    and canonicalize(node.name).startswith("Broken package link"))

        for template in wikicode.ifilter_templates():
            # skip unrelated templates
            if not any(template.name.matches(name) for name in package_templates):
                continue

            # skip templates no longer under wikicode (templates nested under previously
            # removed parent template are still detected by ifilter)
            try:
                wikicode.index(template, True)
            except ValueError:
                continue

            # strip whitespace around the parameter, otherwise it is added to
            # the link and rendered incorrectly
            self.strip_whitespace(wikicode, template)

            hint = self.update_package_template(template, langname)

            # add/remove/update {{Broken package link}} flag
            parent = get_parent_wikicode(wikicode, template)
            adjacent = get_adjacent_node(parent, template, ignore_whitespace=True)

            if hint is None:
                if is_broken_flag(adjacent):
                    # package has been found again, remove existing flag
                    wikicode.remove(adjacent)
                continue

            logger.warning("broken package link: {}: {}".format(template, hint))
            self.add_report_line(title, template, hint)
            broken_flag = "{{%s|%s}}" % (self._localized_template("Broken package link", langname), hint)
            if is_broken_flag(adjacent):
                # replace since the hint might be different
                wikicode.replace(adjacent, broken_flag)
            else:
                wikicode.insert_after(template, broken_flag)

        return wikicode
Пример #31
0
    def check_extlink_status(self, wikicode, extlink, src_title):
        """
        Check the URL of ``extlink`` and update its "Dead link" flag.

        Depending on the result of ``self.check_url``:

        - ``True``: the "Dead link" flag (if any) is removed,
        - ``False``: the link is flagged with the localized "Dead link"
          template and its status and date parameters are updated,
        - anything else: only a warning is logged.

        All operations on ``wikicode`` are done while holding
        ``self.lock_wikicode``; the URL check itself runs outside of it.

        :param wikicode: the wikicode object containing ``extlink``
        :param extlink: the external link node to check
        :param src_title: title of the page containing the link, used to
                          detect the language for the localized flag
        """
        with self.lock_wikicode:
            url = self.prepare_url(wikicode, extlink)
        if url is None:
            return

        logger.info("Checking link {} ...".format(extlink))
        status = self.check_url(url)

        with self.lock_wikicode:
            if status is True:
                # TODO: the link might still be flagged for a reason (e.g. when the server redirects to some dummy page without giving a proper status code)
                ensure_unflagged_by_template(wikicode, extlink, "Dead link", match_only_prefix=True)
                return

            if status is not False:
                # TODO: ask the user for manual check (good/bad/skip) and move the URL from self.cache_indeterminate_urls to self.cache_valid_urls or self.cache_invalid_urls
                logger.warning("status check indeterminate for external link {}".format(extlink))
                return

            # TODO: handle bbs.archlinux.org (some links may require login)
            # TODO: handle links inside {{man|url=...}} properly
            # first replace the existing template (if any) with a translated version
            flag = self.get_localized_template("Dead link", lang.detect_language(src_title)[1])
            localize_flag(wikicode, extlink, flag)
            # flag the link, but don't overwrite date and don't set status yet
            flag = ensure_flagged_by_template(wikicode, extlink, flag, *self.deadlink_params, overwrite_parameters=False)
            # drop the fragment from the URL before looking into the cache
            if url.fragment:
                url = urllib3.util.url.parse_url(url.url.rsplit("#", maxsplit=1)[0])
            # overwrite by default, but skip overwriting date when the status matches
            status_matches = (flag.has("status")
                              and str(flag.get("status").value) == str(self.cache_invalid_urls[url]))
            if not status_matches:
                # overwrite status as well as date
                flag.add("status", self.cache_invalid_urls[url], showkey=True)
                for position in range(3):
                    flag.add(str(position + 1), self.deadlink_params[position], showkey=False)
Пример #32
0
 def extract_translations(self, wikicode):
     """
     Extract custom labels of category links from ``wikicode``.

     Only wikilinks with a leading colon, pointing to the Category
     namespace and having a link text are considered. The text is
     recorded unless it equals (case-insensitively) the page name of the
     category or that page name stripped of the language suffix.

     :returns: a ``LowercaseDict`` mapping category titles to labels
     """
     translations = LowercaseDict()
     for link in wikicode.ifilter_wikilinks(recursive=True):
         # skip catlinks without leading colon
         if not link.title.startswith(":"):
             continue
         target = self.api.Title(link.title)
         if target.namespace != "Category" or not link.text:
             continue
         # skip trivial cases to apply our defaults
         base, _ = lang.detect_language(target.pagename)
         label = link.text.lower()
         if label != target.pagename.lower() and label != base.lower():
             translations[str(target)] = str(link.text).strip()
     return translations
Пример #33
0
 def process_allpages(self, apfrom=None):
     """
     Update all English non-redirect pages in the processed namespaces.

     Namespaces 0 and 14 are always processed, namespace 12 only when
     ``self.interactive`` is ``True``. Every page is passed to
     ``self.update_page`` and the result is handed over to ``self._edit``.

     :param apfrom: title at which the enumeration starts, passed as the
                    ``gapfrom`` parameter. It is applied only to the first
                    namespace; the following namespaces are processed from
                    their beginning.
     """
     namespaces = [0, 14]
     if self.interactive is True:
         namespaces.append(12)
     for ns in namespaces:
         for page in self.api.generator(generator="allpages", gaplimit="100", gapfilterredir="nonredirects", gapnamespace=ns, gapfrom=apfrom, prop="revisions", rvprop="content|timestamp"):
             title = page["title"]
             if lang.detect_language(title)[1] != "English":
                 continue
             revision = page["revisions"][0]
             timestamp = revision["timestamp"]
             text_old = revision["*"]
             text_new, edit_summary = self.update_page(title, text_old)
             self._edit(title, page["pageid"], text_new, text_old, timestamp, edit_summary)
         # the starting title is valid only for the first namespace;
         # keeping it would skip the pages sorted before it in all the
         # following namespaces
         apfrom = None
Пример #34
0
    def get_langlinks(self, full_title):
        """
        Build the list of langlinks for the page ``full_title``.

        Uses :py:meth:`self.titles_in_family` to get the titles of all pages
        in the family, drops links to "ArchWiki:Archive" (unless the page
        itself is the archive), removes the link to the passed title and
        sorts the rest by the language tag.

        :returns: a list of ``(tag, title)`` tuples
        :raises KeyError: if the page itself is not among the titles
                          returned for its family
        """
        # get all titles in the family
        tags, titles = self.titles_in_family(full_title)
        family = set(zip(tags, titles))

        own_title, langname = lang.detect_language(full_title)
        own_tag = lang.tag_for_langname(langname)

        # remove links to ArchWiki:Archive and translations
        if own_title != "ArchWiki:Archive":
            family = {link for link in family if link[1] != "ArchWiki:Archive"}

        # remove title of the page to be updated
        family.remove((own_tag, own_title))

        # conversion back-and-forth is necessary to add the "(Language)" suffix into all subpage parts
        result = []
        for link_tag, link_title in sorted(family, key=lambda link: link[0]):
            suffixed = lang.format_title(link_title, lang.langname_for_tag(link_tag))
            # do it only when the suffixed title exists, otherwise the title without "(Language)" in
            # all subpage parts is still valid as per Help:i18n
            if self._page_exists(suffixed):
                link_title = lang.detect_language(suffixed, strip_all_subpage_parts=False)[0]
            result.append((link_tag, link_title))

        return result
Пример #35
0
 def extract_translations(self, wikicode):
     """
     Collect non-trivial labels of category links found in ``wikicode``.

     A wikilink contributes an entry when its title starts with a colon,
     it points to the Category namespace, it has a text, and that text
     differs (ignoring case) from both the category page name and the
     page name without the language suffix.

     :returns: a ``LowercaseDict`` mapping the category title to the
               stripped link text
     """
     def is_trivial(text, pagename):
         # trivial labels are skipped to apply our defaults
         pure, _ = lang.detect_language(pagename)
         return text.lower() == pagename.lower() or text.lower() == pure.lower()

     dictionary = LowercaseDict()
     for wikilink in wikicode.ifilter_wikilinks(recursive=True):
         # only catlinks with the leading colon are processed
         if wikilink.title.startswith(":"):
             title = self.api.Title(wikilink.title)
             if title.namespace == "Category" and wikilink.text:
                 if not is_trivial(wikilink.text, title.pagename):
                     dictionary[str(title)] = str(wikilink.text).strip()
     return dictionary
Пример #36
0
    def create_category(self, category):
        """
        Create the localized ``category`` as a counterpart of the category
        in the local language.

        Nothing is created (only a warning is logged) when ``category`` is
        itself in the local language or when its local-language counterpart
        is not in ``self.info``. Existing categories are skipped silently.
        The new page contains links to the localized parents of the
        counterpart, which are then created recursively.

        :param category: name of the category to create
        :raises ValueError: if ``category`` has an interwiki prefix or is
                            not in the Category namespace
        """
        title = self.api.Title(category)
        if title.iwprefix or title.namespace != "Category":
            raise ValueError("Invalid category name: [[{}]]".format(category))
        # normalize name
        category = title.fullpagename

        # skip existing categories
        if category in self.info:
            return

        pure, langname = lang.detect_language(category)
        local_language = lang.get_local_language()
        if langname == local_language:
            logger.warning("Cannot automatically create {} category: [[{}]]".format(local_language, category))
            return

        local = lang.format_title(pure, local_language)
        if local not in self.info:
            logger.warning("Cannot create category [[{}]]: {} category [[{}]] does not exist.".format(category, local_language, local))
            return

        def localized_category(cat, target_langname):
            base, detected = lang.detect_language(cat)
            if base == "Category:Languages":
                # this terminates the recursive creation
                return base
            if base.lower() == "category:" + detected.lower():
                return "Category:{}".format(target_langname)
            return lang.format_title(base, target_langname)

        parents = None
        content = ""
        if local in self.parents:
            parents = [localized_category(p, langname) for p in self.parents[local]]
            content = "\n".join("[[{}]]".format(p) for p in parents)

        self.api.create(title=category, text=content, summary="init wanted category")
        self.update()

        # make sure that the parents exist as well
        for p in parents or ():
            self.create_category(p)
Пример #37
0
    def _group_into_families(pages, case_sensitive=False):
        """
        Group ``pages`` into families based on their titles.

        Returns a mapping of `family_key` to `family_pages`, where
        `family_key` is the base title without the language suffix (e.g.
        "Some title" for "Some title (Česky)"), lowercased unless
        ``case_sensitive`` is ``True``, and `family_pages` is the list of
        pages sharing that key.

        When a case-insensitive family contains multiple pages with the same
        language tag, it is split again with case-sensitive keys.

        :raises Exception: if even a case-sensitive family contains multiple
                           pages with the same language tag
        """
        def _langtag(page):
            return lang.tag_for_langname(lang.detect_language(page["title"])[1])

        def _family_key(page):
            base = lang.detect_language(page["title"])[0]
            return base if case_sensitive is True else base.lower()

        ordered = sorted(pages, key=_family_key)
        # interlanguage links are not valid for all languages, the invalid
        # pages are dropped lazily while the groups are being consumed
        valid = (page for page in ordered if lang.is_interlanguage_tag(_langtag(page)))

        families = {}
        for family, members in itertools.groupby(valid, key=_family_key):
            members = list(members)
            distinct_tags = {_langtag(page) for page in members}
            if len(distinct_tags) == len(members):
                families[family] = members
            elif case_sensitive is False:
                # sometimes case-insensitive matching is not enough, e.g. [[fish]] is
                # not [[FiSH]] (and neither is redirect)
                families.update(InterlanguageLinks._group_into_families(members, case_sensitive=True))
            else:
                # this should never happen
                raise Exception
        return families
Пример #38
0
 def format_cell(self, title, parent, levels):
     """
     Format the category ``title`` as one plain-text line.

     The line consists of an indent of four spaces per level, the dotted
     one-based numbering built from ``levels``, the localized title, the
     value of ``self.info[title]["pages"]`` in parentheses and optionally
     an "also in" suffix listing the parents other than ``parent``.

     :returns: the formatted line
     """
     lang_tag = lang.tag_for_langname(lang.detect_language(title)[1])
     indent = " " * len(levels) * 4
     numbering = ".".join(str(level + 1) for level in levels)
     # indent, level, title and the number of subpages
     cell = "{}{} {} ({})".format(indent, numbering, self.localize(title), self.info[title]["pages"])
     # "also in" suffix
     other_parents = set(self.parents[title]) - {parent}
     if other_parents:
         localized = [self.localize(cat) for cat in other_parents]
         cell += self.format_also_in(localized, lang_tag)
     return cell
Пример #39
0
 def find_orphans(self):
     """
     Returns list of titles of pages that are alone in their families.

     Pages in the local language and pages for which
     ``self._is_valid_interlanguage`` returns a false value are never
     reported.
     """
     def _is_orphan(title):
         # unsupported languages need to be skipped now
         if not self._is_valid_interlanguage(title):
             return False
         langlinks = self.get_langlinks(title)
         is_local = lang.detect_language(title)[1] == lang.get_local_language()
         return not is_local and len(langlinks) == 0

     return [page["title"] for page in self.allpages if _is_orphan(page["title"])]
Пример #40
0
 def format_cell(self, title, parent, levels):
     """
     Build the plain-text line describing the category ``title``.

     :param title: title of the category
     :param parent: the parent under which the category is listed; it is
                    left out of the "also in" suffix
     :param levels: sequence of numbers forming the position of the line;
                    its length determines the indent and its items
                    (incremented by one) form the numbering
     :returns: the formatted line
     """
     lang_tag = lang.tag_for_langname(lang.detect_language(title)[1])
     pieces = [
         # indent
         " " * len(levels) * 4,
         # level
         ".".join(str(index + 1) for index in levels),
         # title, number of subpages
         " {} ({})".format(self.localize(title), self.info[title]["pages"]),
     ]
     # "also in" suffix
     remaining = set(self.parents[title]) - {parent}
     if remaining:
         pieces.append(self.format_also_in([self.localize(cat) for cat in remaining], lang_tag))
     return "".join(pieces)
    def update_page(self, title, text):
        """
        Update package templates on given page.

        The wikitext is parsed and every package template (Aur, AUR, Grp,
        Pkg) is updated. When a template is reported as broken:

            - a warning is logged
            - a report line is added with ``self.add_report_line``
            - the template is marked with {{Broken package link}} in the
              wikicode

        A stale {{Broken package link}} flag is removed when the template is
        no longer reported as broken.

        :param title: title of the wiki page
        :param text: content of the wiki page
        :returns: a :py:class:`mwparserfromhell.wikicode.Wikicode` object with the updated
                  content of the page
        """
        logger.info("Parsing '%s'..." % title)
        langname = detect_language(title)[1]
        wikicode = mwparserfromhell.parse(text)
        package_templates = ("Aur", "AUR", "Grp", "Pkg")

        def has_broken_flag(node):
            # true for a template whose canonical name starts with the flag name
            return (isinstance(node, mwparserfromhell.nodes.Template)
                    and canonicalize(node.name).startswith("Broken package link"))

        for template in wikicode.ifilter_templates():
            # skip unrelated templates
            if not any(template.name.matches(name) for name in package_templates):
                continue

            # skip templates no longer under wikicode (templates nested under previously
            # removed parent template are still detected by ifilter)
            try:
                wikicode.index(template, True)
            except ValueError:
                continue

            hint = self.update_package_template(template, langname)

            # add/remove/update {{Broken package link}} flag
            parent = get_parent_wikicode(wikicode, template)
            adjacent = get_adjacent_node(parent, template, ignore_whitespace=True)

            if hint is None:
                if has_broken_flag(adjacent):
                    # package has been found again, remove existing flag
                    wikicode.remove(adjacent)
                continue

            logger.warning("broken package link: {}: {}".format(template, hint))
            self.add_report_line(title, template, hint)
            broken_flag = "{{%s|%s}}" % (self._localized_template("Broken package link", langname), hint)
            if has_broken_flag(adjacent):
                # replace since the hint might be different
                wikicode.replace(adjacent, broken_flag)
            else:
                wikicode.insert_after(template, broken_flag)

        return wikicode
Пример #42
0
    def generate_pages(self):
        """
        Generate the pages to be processed, along with their latest revision.

        When ``self.title`` is set, only that single page is fetched and
        yielded. Otherwise all pages from ``self.namespaces`` are enumerated,
        starting at ``self.first`` (if set) and filtered by
        ``self.apfilterredir`` and ``self.langnames`` (if non-empty).

        An error is logged and nothing is yielded when the namespace of
        ``self.first`` is not among ``self.namespaces``.
        """
        # handle the trivial case first
        if self.title is not None:
            result = self.api.call_api(action="query",
                                       prop="revisions",
                                       rvprop="content|timestamp",
                                       rvslots="main",
                                       titles=self.title)
            yield list(result["pages"].values())[0]
            return

        # clone the list of namespaces so that we can modify it for this method
        namespaces = self.namespaces.copy()

        # rewind to the right namespace (the API throws BadTitle error if the
        # namespace of apfrom does not match apnamespace)
        apfrom = self.first
        if apfrom is not None:
            first_title = self.api.Title(apfrom)
            if first_title.namespacenumber not in namespaces:
                logger.error("Valid namespaces for the --first option are {}.".format([self.api.site.namespaces[ns] for ns in namespaces]))
                return
            # drop all namespaces preceding the namespace of the first title
            namespaces = namespaces[namespaces.index(first_title.namespacenumber):]
            # apfrom must be without namespace prefix
            apfrom = first_title.pagename

        for ns in namespaces:
            pages = self.api.generator(generator="allpages",
                                       gaplimit="100",
                                       gapnamespace=ns,
                                       gapfrom=apfrom,
                                       gapfilterredir=self.apfilterredir,
                                       prop="revisions",
                                       rvprop="content|timestamp",
                                       rvslots="main")
            for page in pages:
                # if the user is not logged in, the limit for revisions may be lower than gaplimit,
                # in which case the generator will yield some pages multiple times without revisions
                # before the query-continuation kicks in
                if "revisions" not in page:
                    continue
                # filter by the language detected from the title
                if self.langnames and lang.detect_language(page["title"])[1] not in self.langnames:
                    continue
                yield page
            # the apfrom parameter is valid only for the first namespace
            apfrom = ""
Пример #43
0
 def find_orphans(self):
     """
     Returns list of titles of pages that are alone in their families.

     A page is reported when it is not in the local language and
     ``self.get_langlinks`` returns no links for it.
     """
     orphans = []
     for page in self.allpages:
         title = page["title"]
         # unsupported languages need to be skipped now
         if self._is_valid_interlanguage(title):
             langlinks = self.get_langlinks(title)
             is_local = lang.detect_language(title)[1] == lang.get_local_language()
             if not is_local and not langlinks:
                 orphans.append(title)
     return orphans
    def update_page(self, title, text):
        """
        Update package templates on given page.

        The wikitext is parsed and every package template (Aur, AUR, Grp,
        Pkg) is updated. When a template is reported as broken:

            - a warning is logged
            - a report line is added with ``self.add_report_line``
            - the template is flagged with the localized
              {{Broken package link}} template carrying the hint

        Otherwise an existing {{Broken package link}} flag is removed.

        :param title: title of the wiki page
        :param text: content of the wiki page
        :returns: a :py:class:`mwparserfromhell.wikicode.Wikicode` object with the updated
                  content of the page
        """
        logger.info("Parsing page [[{}]]...".format(title))
        langname = detect_language(title)[1]
        wikicode = mwparserfromhell.parse(text)
        package_templates = ("Aur", "AUR", "Grp", "Pkg")

        for template in wikicode.ifilter_templates():
            # skip unrelated templates
            if not any(template.name.matches(name) for name in package_templates):
                continue

            # skip templates no longer under wikicode (templates nested under previously
            # removed parent template are still detected by ifilter)
            try:
                wikicode.index(template, True)
            except ValueError:
                continue

            # strip whitespace around the parameter, otherwise it is added to
            # the link and rendered incorrectly
            self.strip_whitespace(wikicode, template)

            hint = self.update_package_template(template, langname)

            # add/remove/update {{Broken package link}} flag
            if hint is None:
                ensure_unflagged_by_template(wikicode, template, "Broken package link", match_only_prefix=True)
                continue

            logger.warning("broken package link: {}: {}".format(template, hint))
            self.add_report_line(title, template, hint)
            # first unflag since the localized template might change
            ensure_unflagged_by_template(wikicode, template, "Broken package link", match_only_prefix=True)
            # flag with a localized template and hint
            localized_flag = self.get_localized_template("Broken package link", langname)
            ensure_flagged_by_template(wikicode, template, localized_flag, hint, overwrite_parameters=True)

        return wikicode
Пример #45
0
 def parse_alsoin(self, title, value):
     """
     Parse the comma-separated ``value`` into a mapping of language tags
     to translations.

     Each item has the form ``tag: translation``. An item without a colon,
     or with a prefix which is not a language tag, is taken as a whole and
     stored under the tag of the language detected from ``title``.

     :param title: title of the page the value comes from
     :param value: the string to parse
     :returns: a dictionary mapping language tags to translations
     """
     alsoin = {}
     for raw_item in value.split(","):
         entry = raw_item.strip()
         try:
             # unpacking raises ValueError when there is no colon
             prefix, text = (part.strip() for part in entry.split(":", maxsplit=1))
             if not lang.is_language_tag(prefix):
                 raise ValueError
         except ValueError:
             # fall back to the language of the page and the whole item
             prefix = lang.tag_for_langname(lang.detect_language(title)[1])
             text = entry
         alsoin[prefix] = text
     logger.debug("Page [[{}]]: parsed data-toc-alsoin = {}".format(title, alsoin))
     return alsoin
Пример #46
0
 def parse_alsoin(self, title, value):
     """
     Convert the comma-separated ``value`` into a dictionary of
     translations keyed by language tags.

     Items are expected as ``tag: translation``. When an item cannot be
     split on a colon or its prefix is not a language tag, the whole item
     becomes the translation for the language detected from ``title``.

     :param title: title of the page the value comes from
     :param value: the string to parse
     :returns: a dictionary mapping language tags to translations
     """
     def parse_item(item):
         try:
             # a missing colon makes the unpacking raise ValueError
             tag, translation = item.split(":", maxsplit=1)
             tag = tag.strip()
             translation = translation.strip()
             if not lang.is_language_tag(tag):
                 raise ValueError
         except ValueError:
             return lang.tag_for_langname(lang.detect_language(title)[1]), item
         return tag, translation

     alsoin = {}
     for item in value.split(","):
         tag, translation = parse_item(item.strip())
         alsoin[tag] = translation
     logger.debug("Page [[{}]]: parsed data-toc-alsoin = {}".format(title, alsoin))
     return alsoin
    def _get_langlinks(self, full_title):
        """
        Build the list of langlinks for the page ``full_title``.

        Uses :py:meth:`self._titles_in_family` to get the titles of all pages
        in the family, removes the link to the passed title and sorts the
        rest by the language tag.

        :returns: a list of ``(tag, title)`` tuples
        :raises KeyError: if the page itself is not among the titles
                          returned for its family
        """
        # get all titles in the family
        tags, titles = self._titles_in_family(full_title)
        family = set(zip(tags, titles))
        # remove title of the page to be updated
        own_title, langname = lang.detect_language(full_title)
        own_link = (lang.tag_for_langname(langname), own_title)
        family.remove(own_link)
        # transform to list, sort by the language tag
        return sorted(family, key=lambda link: link[0])
Пример #48
0
    def __init__(self, api, cliargs):
        """
        Store the API object and normalize the command line arguments.

        - printing is enabled when neither saving nor printing was requested,
        - the single language ``"all"`` is expanded with
          ``lang.get_internal_tags()``,
        - ``cliargs.toc_page`` is canonicalized and stripped of the
          "(Language)" suffix,
        - ``self.titles`` is filled with the title of the page for every
          requested language, sorted by the language tag.

        :param api: the API object
        :param cliargs: the parsed command line arguments
        """
        self.api = api
        self.cliargs = cliargs

        if self.cliargs.save is False and self.cliargs.print is False:
            self.cliargs.print = True

        requested = self.cliargs.toc_languages
        if len(requested) == 1 and requested[0] == "all":
            self.cliargs.toc_languages = lang.get_internal_tags()
        # strip "(Language)" suffix
        base_title = lang.detect_language(canonicalize(self.cliargs.toc_page))[0]
        self.cliargs.toc_page = base_title

        # detect page titles
        self.titles = []
        for tag in sorted(self.cliargs.toc_languages):
            if tag == lang.tag_for_langname(lang.get_local_language()):
                # pages in the local language have no suffix
                page_title = base_title
            else:
                page_title = "{} ({})".format(base_title, lang.langname_for_tag(tag))
            self.titles.append(page_title)
Пример #49
0
    def __init__(self, api, cliargs):
        """
        Initialize the object from the API object and command line arguments.

        The arguments are adjusted in place: ``print`` is switched on when
        both ``save`` and ``print`` are ``False``, the ``toc_languages``
        value ``["all"]`` is replaced with ``lang.get_internal_tags()`` and
        ``toc_page`` is canonicalized and stripped of the "(Language)"
        suffix. ``self.titles`` then holds the page title for each language
        tag in sorted order.

        :param api: the API object
        :param cliargs: the parsed command line arguments
        """
        self.api = api
        self.cliargs = cliargs

        nothing_selected = self.cliargs.save is False and self.cliargs.print is False
        if nothing_selected:
            self.cliargs.print = True

        if len(self.cliargs.toc_languages) == 1 and self.cliargs.toc_languages[0] == "all":
            self.cliargs.toc_languages = lang.get_internal_tags()
        # strip "(Language)" suffix
        self.cliargs.toc_page = lang.detect_language(canonicalize(self.cliargs.toc_page))[0]

        def title_for(tag):
            # the page in the local language has no "(Language)" suffix
            if tag == lang.tag_for_langname(lang.get_local_language()):
                return self.cliargs.toc_page
            return "{} ({})".format(self.cliargs.toc_page, lang.langname_for_tag(tag))

        # detect page titles
        self.titles = [title_for(tag) for tag in sorted(self.cliargs.toc_languages)]
Пример #50
0
def update_page_language(api):
    """
    Set the page language of all pages according to their titles.

    Pages from namespaces 0, 4, 10, 12 and 14 are enumerated and the page
    language is updated whenever it differs from the tag of the language
    detected from the page title.

    :param api: the API object; a logged-in session is required
    """
    # ensure that we are authenticated
    require_login(api)

    for ns in (0, 4, 10, 12, 14):
        pages = api.generator(generator="allpages",
                              gapnamespace=ns,
                              gaplimit="max",
                              prop="info")
        for page in pages:
            title = page["title"]
            current = page["pagelanguage"]
            expected = lang.tag_for_langname(lang.detect_language(title)[1])
            if current != expected:
                api.set_page_language(title, expected, "update language based on the page title")
Пример #51
0
def main(api):
    """
    Print a sorted wikitext list of the pages that embed any of the
    "Article summary" templates and whose title is in a whitelisted language.

    :param api: API object providing the ``generator`` method
    """
    templates = [
        "Template:Article summary start",
        "Template:Article summary heading",
        "Template:Article summary link",
        "Template:Article summary text",
        "Template:Article summary wiki",
        "Template:Article summary end"
    ]
    # a page may embed several of the templates, the set removes duplicates
    titles = {
        page["title"]
        for template in templates
        for page in api.generator(generator="embeddedin", geilimit="max", geititle=template)
    }

    # print only languages for which "Template:Related articles start (<lang>)" exists
    langs_whitelist = ["English", "Español", "Italiano", "Português", "Česky", "Ελληνικά", "Русский", "正體中文", "简体中文", "한국어"]

    for title in sorted(titles):
        # detect language, check whitelist
        if detect_language(title)[1] in langs_whitelist:
            print("* [[%s]]" % title)
def main(api):
    """
    Print the unwatched pages of namespace 0 as wikitext, one section per
    language detected from the page title.

    Exits with status 1 when the logged-in user lacks the 'unwatchedpages'
    right.

    :param api: API object providing ``user.rights`` and ``query_continue``
    """
    require_login(api)

    # check for necessary rights
    if "unwatchedpages" not in api.user.rights:
        print(
            "The current user does not have the 'unwatchedpages' right, which is necessary to use this script. Sorry."
        )
        sys.exit(1)

    # get list of unwatched pages
    query_unwatched = {
        "action": "query",
        "list": "querypage",
        "qppage": "Unwatchedpages",
        "qplimit": "max",
        "continue": "",
    }

    # split into sections by language, limit to the Main namespace
    by_language = {}
    for snippet in api.query_continue(query_unwatched):
        for page in snippet["querypage"]["results"]:
            if page["ns"] != 0:
                continue
            title = page["title"]
            by_language.setdefault(detect_language(title)[1], []).append(title)

    # print wikitext
    for language in sorted(by_language):
        print("== %s ==" % language)
        print()
        for title in by_language[language]:
            print("* %s" % title)
        print()
Пример #53
0
    def check_page(self, title):
        """
        Move the page to the title that uses the new name of its language.

        The language is detected from ``title`` and looked up in
        ``self.lang_map``; pages whose language has no (non-empty) entry in the
        map are left untouched. Subpages are not moved along. A failed move is
        logged as a warning and otherwise ignored, so that processing of other
        pages can continue.

        :param str title: title of the page to check
        """
        # check the language
        base, lang = detect_language(title)
        new_lang = self.lang_map.get(lang)
        if not new_lang:
            return

        # format_title does not work when the script is run before updating the
        # interwiki table and the ws.ArchWiki.lang module
        #new_title = format_title(base, new_lang)
        if title == f"Category:{lang}":
            # root category of the language, it has no "(Language)" suffix
            new_title = f"Category:{new_lang}"
        else:
            new_title = title.replace(f"({lang})", f"({new_lang})")

        summary = self.edit_summary.format(old_lang=lang, new_lang=new_lang)
        logger.info(f"Move [[{title}]] to [[{new_title}]] ({summary})")
        try:
            self.api.move(title, new_title, summary, movesubpages=False)
        except APIError as e:
            # best effort: report the failure instead of hiding it, then carry on
            logger.warning(f"Failed to move [[{title}]] to [[{new_title}]]: {e}")
Пример #54
0
    def create_category(self, category):
        """
        Create a missing localized category and, recursively, its parents.

        The content of the new page consists of links to the localized
        counterparts of the parents of the corresponding local-language
        category. Nothing is created when the category already exists in
        ``self.info``, when it is itself in the local language, or when the
        local-language counterpart does not exist.

        :param str category: name of the category to create
        :raises ValueError: if ``category`` has an interwiki prefix or is not
            in the "Category" namespace
        """
        parsed = self.api.Title(category)
        if parsed.iwprefix or parsed.namespace != "Category":
            raise ValueError("Invalid category name: [[{}]]".format(category))
        # normalize name
        category = parsed.fullpagename

        # skip existing categories
        if category in self.info:
            return

        pure, langname = lang.detect_language(category)
        if langname == lang.get_local_language():
            logger.warning("Cannot automatically create {} category: [[{}]]".format(lang.get_local_language(), category))
            return

        local = lang.format_title(pure, lang.get_local_language())
        if local not in self.info:
            logger.warning("Cannot create category [[{}]]: {} category [[{}]] does not exist.".format(category, lang.get_local_language(), local))
            return

        def localize(parent):
            # map a parent of the local category to its name in `langname`
            parent_pure, parent_langname = lang.detect_language(parent)
            if parent_pure == "Category:Languages":
                # this terminates the recursive creation
                return parent_pure
            if parent_pure.lower() == "category:" + parent_langname.lower():
                # root category of a language maps to the root of `langname`
                return "Category:{}".format(langname)
            return lang.format_title(parent_pure, langname)

        parents = []
        for parent in self.parents[local]:
            parents.append(localize(parent))
        content = "\n".join(map("[[{}]]".format, parents))

        self.api.create(title=category, text=content, summary="init wanted category")
        self.update()

        # make sure that the parents of the new category exist as well
        for parent in parents:
            self.create_category(parent)
Пример #55
0
    def rename_non_english(self):
        """
        Interactively rename localized categories so that their titles match
        the title of the English category linking to them.

        For every English category page, each language link with an internal
        tag and a differing title yields a move proposal, which is carried out
        only after the user confirms it and only if the target does not exist.
        """
        # NOTE(review): presumably this invalidates cached data so that the
        # loop below sees fresh pages -- confirm against the allpages attribute
        del self.allpages

        # FIXME: starting with English pages is not very good:
        # - some pages are omitted (e.g. when two pages link to the same English page, at least warning should be printed)
        # - it suggests to move e.g. Xfce (Česky) to Xfwm (Česky) because multiple English pages link to it
        # Therefore we limit it only to categories...
        for page in self.allpages:
            title = page["title"]
            if not (lang.detect_language(title)[1] == "English" and title.startswith("Category:")):
                continue

            for tag, localized_title in self.get_langlinks(title):
                logger.info("Checking [[{}:{}]] for renaming...".format(tag, localized_title))
                if not lang.is_internal_tag(tag) or localized_title == title:
                    continue

                source = "{} ({})".format(localized_title, lang.langname_for_tag(tag))
                target = "{} ({})".format(title, lang.langname_for_tag(tag))
                if self._page_exists(target):
                    logger.warning("Cannot move page [[{}]] to [[{}]]: target page already exists".format(source, target))
                    continue

                # interactive mode is necessary because this assumes that all English pages are named correctly
                if ask_yesno("Move page [[{}]] to [[{}]]?".format(source, target)) is True:
                    self.api.move(source, target, "comply with [[Help:I18n#Page titles]] and match the title of the English page")
Пример #56
0
    def parse_toc_table(self, title, toc_table):
        """
        Extract the table of contents settings stored on ``toc_table``.

        :param str title: title of the page holding the table
        :param toc_table: the table element, or ``None`` when the page has
            none (presumably a mwparserfromhell tag -- confirm with the caller)
        :returns: a ``(columns, category_names, alsoin)`` tuple: the list of
            language tags for the columns, the localized category names and
            the parsed "also in" translations
        """
        # default format is one column in the title's language
        title_langname = lang.detect_language(title)[1]
        columns = [lang.tag_for_langname(title_langname)]
        category_names = LowercaseDict()
        alsoin = {}

        # without a table there is nothing to parse, return the defaults
        if toc_table is None:
            return columns, category_names, alsoin

        # parse data-toc-languages attribute
        try:
            columns = str(toc_table.get("data-toc-languages").value).split(",")
        except ValueError:
            # on ValueError the default columns are written back to the table
            toc_table.add("data-toc-languages", value=",".join(columns))

        # parse data-toc-alsoin attribute
        if toc_table.has("data-toc-alsoin"):
            raw_alsoin = str(toc_table.get("data-toc-alsoin").value)
            alsoin = self.parse_alsoin(title, raw_alsoin)
        elif columns != ["en"]:
            logger.warning("Page [[{}]]: missing 'also in' translations".format(title))

        # extract localized category names (useful even for PlainFormatter)
        category_names = self.extract_translations(toc_table.contents)

        return columns, category_names, alsoin
Пример #57
0
def main(api):
    """
    List the unwatched pages of namespace 0 as wikitext, grouped into one
    section per language detected from the page title.

    The script terminates with exit status 1 when the current user does not
    have the 'unwatchedpages' right.

    :param api: API object providing ``user.rights`` and ``query_continue``
    """
    require_login(api)

    # check for necessary rights
    user_rights = api.user.rights
    if "unwatchedpages" not in user_rights:
        print("The current user does not have the 'unwatchedpages' right, which is necessary to use this script. Sorry.")
        sys.exit(1)

    # get list of unwatched pages
    query_unwatched = {
        "action": "query",
        "list": "querypage",
        "qppage": "Unwatchedpages",
        "qplimit": "max",
        "continue": "",
    }

    # split into sections by language, limit to the Main namespace
    by_language = {}
    for snippet in api.query_continue(query_unwatched):
        for page in snippet["querypage"]["results"]:
            if page["ns"] != 0:
                continue
            title = page["title"]
            language = detect_language(title)[1]
            if language in by_language:
                by_language[language].append(title)
            else:
                by_language[language] = [title]

    # print wikitext; the keys are unique, so sorting the items sorts by language
    for language, titles in sorted(by_language.items()):
        print("== %s ==" % language)
        print()
        for title in titles:
            print("* %s" % title)
        print()
Пример #58
0
    def update_page(self, src_title, text):
        """
        Parse the content of the page and call various methods to update the links.

        External links, wikilinks and templates are processed in this order.
        Pages listed in ``self.skip_pages`` (compared without the language
        suffix) and pages whose title starts with "DeveloperWiki:" are returned
        unchanged with an empty edit summary.

        :param str src_title: title of the page
        :param str text: content of the page
        :returns: a (text, edit_summary) tuple, where text is the updated content
            and edit_summary is the description of performed changes
        """
        # FIXME: ideally "DeveloperWiki:" would be a proper namespace
        if lang.detect_language(src_title)[0] in self.skip_pages or src_title.startswith("DeveloperWiki:"):
            logger.info("Skipping blacklisted page [[{}]]".format(src_title))
            return text, ""

        logger.info("Parsing page [[{}]] ...".format(src_title))
        # FIXME: skip_style_tags=True is a partial workaround for https://github.com/earwig/mwparserfromhell/issues/40
        wikicode = mwparserfromhell.parse(text, skip_style_tags=True)
        # descriptions of the performed changes, joined into the edit summary below
        summary_parts = []

        # NOTE(review): presumably ``summary(msg)`` is a context manager that
        # records msg in summary_parts when wikicode changes inside the
        # ``with`` block -- confirm against get_edit_checker
        summary = get_edit_checker(wikicode, summary_parts)

        for extlink in wikicode.ifilter_external_links(recursive=True):
            # skip links inside article status templates
            # (parent is the node of wikicode found at the index reported for the link)
            parent = wikicode.get(wikicode.index(extlink, recursive=True))
            if isinstance(parent, mwparserfromhell.nodes.template.Template) and parent.name.lower() in self.skip_templates:
                continue
            with summary("replaced external links"):
                self.update_extlink(wikicode, extlink)

        for wikilink in wikicode.ifilter_wikilinks(recursive=True):
            # skip links inside article status templates
            parent = wikicode.get(wikicode.index(wikilink, recursive=True))
            if isinstance(parent, mwparserfromhell.nodes.template.Template) and parent.name.lower() in self.skip_templates:
                continue
            # update_wikilink receives summary_parts directly instead of using the checker
            self.update_wikilink(wikicode, wikilink, src_title, summary_parts)

        for template in wikicode.ifilter_templates(recursive=True):
            # skip templates that may be added or removed
            if str(template.name) in {"Broken section link", "Dead link"}:
                continue
            # skip links inside article status templates
            parent = wikicode.get(wikicode.index(template, recursive=True))
            if isinstance(parent, mwparserfromhell.nodes.template.Template) and parent.name.lower() in self.skip_templates:
                continue
            # template name without the language suffix, so that localized
            # variants of the templates are matched too
            _pure_template = lang.detect_language(str(template.name))[0]
            if _pure_template.lower() in {"related", "related2"}:
                target = template.get(1).value
                # temporarily convert the {{Related}} to wikilink to reuse the update code
                wl = mwparserfromhell.nodes.wikilink.Wikilink(target)
                wikicode.replace(template, wl)
                # update
                self.update_wikilink(wikicode, wl, src_title, summary_parts)
                # replace back
                # NOTE(review): ``target`` is already the parameter's value, so
                # this assigns an attribute on that value object rather than on
                # the template parameter -- confirm that the updated title
                # really ends up in the template
                target.value = str(wl.title)
                wikicode.replace(wl, template)
            elif template.name.lower() == "man":
                with summary("updated man page links"):
                    self.update_man_template(wikicode, template)

        # deduplicate and keep order
        # (set.add() returns None, so the ``or`` records a part the first time
        # it is seen while the condition stays true only for that first time)
        parts = set()
        parts_add = parts.add
        summary_parts = [part for part in summary_parts if not (part in parts or parts_add(part))]

        edit_summary = ", ".join(summary_parts)
        if self.interactive is True:
            edit_summary += " (interactive)"

        return str(wikicode), edit_summary
Пример #59
0
 def print_titles(titles):
     """
     Print the titles detected as English as a sorted wikitext list of links,
     followed by an empty line.

     :param titles: iterable of page titles
     """
     english = (t for t in sorted(titles) if lang.detect_language(t)[1] == "English")
     for title in english:
         print("* [[%s]]" % title)
     print()
#! /usr/bin/env python3

import os.path
from collections import namedtuple
import itertools

from ws.client import API
import ws.ArchWiki.lang as lang

api_url = "https://wiki.archlinux.org/api.php"
cookie_path = os.path.expanduser("~/.cache/ArchWiki.cookie")

api = API(api_url, cookie_file=cookie_path, ssl_verify=True)

# one record per page: full title, detected language name, title without the language suffix
Page = namedtuple("Page", ["title", "langname", "pure"])

# collect all non-redirect pages returned by the "allpages" generator
pages = []
for info in api.generator(generator="allpages", gaplimit="max", gapfilterredir="nonredirects"):
    full_title = info["title"]
    pure, langname = lang.detect_language(full_title)
    pages.append(Page(title=full_title, langname=langname, pure=pure))

# groupby() merges only adjacent items, so the pages have to be sorted by
# language first (and by the pure title within each language)
pages = sorted(pages, key=lambda p: (p.langname, p.pure))

# print one wikitext section per language
for langname, group in itertools.groupby(pages, key=lambda p: p.langname):
    print("== {} ==\n".format(langname))
    for entry in group:
        print("* [[:{}|{}]]".format(entry.title, entry.pure))
    print()