def find_broken(self):
    """
    Collect IDs of pages categorized under a category of a different language.

    All non-redirect pages from ``self.content_namespaces`` are fetched
    together with their non-hidden categories. A page needs fixing when the
    language detected from one of its category titles differs from the
    language detected from the page title. The link from a
    ``Category:<language>`` page to ``Category:Languages`` is exempt.

    :returns: a list of page IDs; each page is listed at most once
    """
    def pages_in_namespace(ns):
        return self.api.generator(generator="allpages", gapfilterredir="nonredirects", gapnamespace=ns, gaplimit="max", prop="categories", cllimit="max", clshow="!hidden")

    pages = itertools.chain.from_iterable(pages_in_namespace(ns) for ns in self.content_namespaces)

    needs_fixing = []
    for page in pages:
        # pages without any category carry no "categories" key
        if "categories" not in page:
            continue
        langname = lang.detect_language(page["title"])[1]
        for cat in page["categories"]:
            # skip root categories for non-English languages
            if page["title"] == "Category:{}".format(langname) and cat["title"] == "Category:Languages":
                continue
            # check language
            if lang.detect_language(cat["title"])[1] != langname:
                needs_fixing.append(page["pageid"])
                # one mismatch is enough; continuing would add the same
                # page ID again for every further wrong category
                break
    return needs_fixing
def cmp_tuples(left, right):
    """
    Three-way comparison of two tuples, with ``None`` sorting last.

    Non-``None`` arguments are ordered first by the length of their third
    item (longer first) and then by the pure title that
    ``lang.detect_language`` extracts from their first item.

    :param left: a tuple or ``None``
    :param right: a tuple or ``None``
    :returns: ``-1``, ``0`` or ``1`` in the usual ``cmp`` convention
    """
    if left is None and right is None:
        return 0
    elif left is None:
        return 1
    elif right is None:
        return -1

    left_key = (-len(left[2]), lang.detect_language(left[0])[0])
    right_key = (-len(right[2]), lang.detect_language(right[0])[0])
    # Python 3 has no builtin cmp(); this is its documented replacement
    return (left_key > right_key) - (left_key < right_key)
def cmp_tuples(left, right):
    """
    Three-way comparison of two tuples, with ``None`` sorting last.

    Non-``None`` arguments are ordered first by the length of their third
    item (longer first) and then by the pure title that
    ``lang.detect_language`` extracts from their first item.

    :param left: a tuple or ``None``
    :param right: a tuple or ``None``
    :returns: ``-1``, ``0`` or ``1`` in the usual ``cmp`` convention
    """
    if left is None and right is None:
        return 0
    if left is None:
        return 1
    if right is None:
        return -1

    key_left = (-len(left[2]), lang.detect_language(left[0])[0])
    key_right = (-len(right[2]), lang.detect_language(right[0])[0])
    # cmp() was removed in Python 3; compute the sign of the comparison directly
    return (key_left > key_right) - (key_left < key_right)
def update_wikilink(self, wikicode, wikilink, src_title, summary_parts):
    """
    Run all wikilink checks on a single link and flag or unflag broken
    section links.

    Links whose string form is in ``self.void_update_cache`` and links with
    an interlanguage prefix are left untouched.

    :param wikicode: the wikicode object containing ``wikilink``
    :param wikilink: the wikilink node to update (modified in place)
    :param src_title: title of the page on which the link is located
    :param summary_parts: collection passed to ``get_edit_checker``
        (presumably receives the edit summaries of applied changes -- confirm)
    """
    if str(wikilink) in self.void_update_cache:
        logger.debug("Skipping wikilink {} due to void-update cache.".format(wikilink))
        return

    title = self.api.Title(wikilink.title)
    # skip interlanguage links (handled by interlanguage.py)
    if title.iwprefix in self.api.site.interlanguagemap.keys():
        return

    # each "with summary(...)" block groups the changes made under one edit summary
    summary = get_edit_checker(wikicode, summary_parts)

    with summary("simplification and beautification of wikilinks"):
        # beautify if urldecoded
        # FIXME: make it implicit - it does not always propagate from the Title class
        if not title.iwprefix and re.search("%[0-9a-f]{2}", str(wikilink.title), re.IGNORECASE):
            # handle links with leading colon properly
            wikilink.title = title.leading_colon + str(title)

        # FIXME: should be done in the Title class
        # the anchor is dot-encoded, but percent-encoding works for links too
        # and is even rendered nicely
        wikilink.title = str(wikilink.title).replace("[", "%5B").replace("|", "%7C").replace("]", "%5D")

        self.collapse_whitespace_pipe(wikilink)
        self.check_trivial(wikilink, title)
        self.check_relative(src_title, wikilink, title)
        # exact redirect replacement is applied only on English pages
        if lang.detect_language(src_title)[1] == "English":
            self.check_redirect_exact(src_title, wikilink, title)
        self.check_redirect_capitalization(wikilink, title)

        # reparse the title, the redirect checks might change it non-equivalently
        title = self.api.Title(wikilink.title)

        self.check_displaytitle(wikilink, title)

    with summary("fixed section fragments"):
        anchor_result = self.check_anchor(src_title, wikilink, title)
    # only an explicit False flags the link; any other result removes the flag
    if anchor_result is False:
        with summary("flagged broken section links"):
            ensure_flagged_by_template(wikicode, wikilink, "Broken section link")
    else:
        with summary("unflagged working section links"):
            ensure_unflagged_by_template(wikicode, wikilink, "Broken section link")

    with summary("simplification and beautification of wikilinks"):
        # partial second pass
        self.check_trivial(wikilink, title)
        if lang.detect_language(src_title)[1] == "English":
            self.check_redirect_exact(src_title, wikilink, title)

        # collapse whitespace around the link, e.g. 'foo [[ bar]]' -> 'foo [[bar]]'
        self.collapse_whitespace(wikicode, wikilink)

    # cache context-less, correct wikilinks that don't need any update
    if title.pagename and len(summary_parts) == 0 and anchor_result is True:
        self.void_update_cache.add(str(wikilink))
def update_wikilink(self, wikicode, wikilink, src_title, summary_parts):
    """
    Run all wikilink checks on a single link and flag or unflag broken
    section links.

    Links whose string form is in ``self.void_update_cache`` and links with
    an interlanguage prefix are left untouched.

    :param wikicode: the wikicode object containing ``wikilink``
    :param wikilink: the wikilink node to update (modified in place)
    :param src_title: title of the page on which the link is located
    :param summary_parts: collection passed to ``get_edit_checker``
        (presumably receives the edit summaries of applied changes -- confirm)
    """
    if str(wikilink) in self.void_update_cache:
        logger.debug("Skipping wikilink {} due to void-update cache.".format(wikilink))
        return

    title = self.api.Title(wikilink.title)
    # skip interlanguage links (handled by interlanguage.py)
    if title.iwprefix in self.api.site.interlanguagemap.keys():
        return

    # each "with summary(...)" block groups the changes made under one edit summary
    summary = get_edit_checker(wikicode, summary_parts)

    with summary("simplification and beautification of wikilinks"):
        # beautify if urldecoded
        # FIXME: make it implicit - it does not always propagate from the Title class
        if not title.iwprefix and re.search("%[0-9a-f]{2}", str(wikilink.title), re.IGNORECASE):
            # handle links with leading colon properly
            wikilink.title = title.leading_colon + str(title)

        # FIXME: should be done in the Title class
        # the anchor is dot-encoded, but percent-encoding works for links too
        # and is even rendered nicely
        wikilink.title = str(wikilink.title).replace("[", "%5B").replace("|", "%7C").replace("]", "%5D")

        self.collapse_whitespace_pipe(wikilink)
        self.check_trivial(wikilink, title)
        self.check_relative(src_title, wikilink, title)
        # exact redirect replacement is applied only on English pages
        if lang.detect_language(src_title)[1] == "English":
            self.check_redirect_exact(src_title, wikilink, title)
        self.check_redirect_capitalization(wikilink, title)

        # reparse the title, the redirect checks might change it non-equivalently
        title = self.api.Title(wikilink.title)

        self.check_displaytitle(wikilink, title)

    with summary("fixed section fragments"):
        anchor_result = self.check_anchor(src_title, wikilink, title)
    # only an explicit False flags the link; any other result removes the flag
    if anchor_result is False:
        with summary("flagged broken section links"):
            ensure_flagged_by_template(wikicode, wikilink, "Broken section link")
    else:
        with summary("unflagged working section links"):
            ensure_unflagged_by_template(wikicode, wikilink, "Broken section link")

    with summary("simplification and beautification of wikilinks"):
        # partial second pass
        self.check_trivial(wikilink, title)
        if lang.detect_language(src_title)[1] == "English":
            self.check_redirect_exact(src_title, wikilink, title)

        # collapse whitespace around the link, e.g. 'foo [[ bar]]' -> 'foo [[bar]]'
        self.collapse_whitespace(wikicode, wikilink)

    # cache context-less, correct wikilinks that don't need any update
    if title.pagename and len(summary_parts) == 0 and anchor_result is True:
        self.void_update_cache.add(str(wikilink))
def fix_page(title, text_old):
    """
    Re-target the category links of a page to the page's own language.

    :param title: title of the page; its language is taken as the reference
    :param text_old: current wikitext of the page
    :returns: the parsed wikicode with a rebuilt header
    """
    page_langname = lang.detect_language(title)[1]
    wikicode = mwparserfromhell.parse(text_old)
    parent, magics, cats, langlinks = get_header_parts(wikicode, remove_from_parent=True)

    for cat_code in cats:
        # get_header_parts returns list of wikicode objects, each with one node
        catlink = cat_code.nodes[0]
        pure, cat_langname = lang.detect_language(str(catlink.title))
        if cat_langname == page_langname:
            continue
        catlink.title = lang.format_title(pure, page_langname)

    build_header(wikicode, parent, magics, cats, langlinks)
    return wikicode
def fix_page(title, text_old):
    """
    Re-target the category links of a page to the page's own language.

    :param title: title of the page; its language is taken as the reference
    :param text_old: current wikitext of the page
    :returns: the parsed wikicode with a rebuilt header
    """
    expected_langname = lang.detect_language(title)[1]
    wikicode = mwparserfromhell.parse(text_old)
    parent, magics, cats, langlinks = get_header_parts(wikicode, remove_from_parent=True)

    # get_header_parts returns list of wikicode objects, each with one node
    for catlink in (code.nodes[0] for code in cats):
        pure, found_langname = lang.detect_language(str(catlink.title))
        if found_langname != expected_langname:
            catlink.title = lang.format_title(pure, expected_langname)

    build_header(wikicode, parent, magics, cats, langlinks)
    return wikicode
def handle_node(self, src_title, wikicode, node, summary_parts):
    """
    Update a wikilink node, or the target of a ``{{Related}}``/``{{Related2}}``
    template, using :py:meth:`update_wikilink`.

    Nodes whose parent is a template listed in ``self.skip_templates`` are
    ignored, and so are links whose title raises ``InvalidTitleCharError``.

    :param src_title: title of the page on which the node is located
    :param wikicode: the wikicode object containing ``node``
    :param node: the node to process
    :param summary_parts: passed through to :py:meth:`update_wikilink`
    """
    # skip links inside article status templates
    parent = wikicode.get(wikicode.index(node, recursive=True))
    if isinstance(parent, mwparserfromhell.nodes.template.Template) and parent.name.lower() in self.skip_templates:
        return

    if isinstance(node, mwparserfromhell.nodes.Wikilink):
        try:
            self.update_wikilink(wikicode, node, src_title, summary_parts)
        # this can happen, e.g. due to [[{{TALKPAGENAME}}]]
        except InvalidTitleCharError:
            pass
    elif isinstance(node, mwparserfromhell.nodes.Template):
        _pure_template = lang.detect_language(str(node.name))[0]
        if _pure_template.lower() in {"related", "related2"}:
            target = node.get(1).value
            # temporarily convert the {{Related}} to wikilink to reuse the update code
            wl = mwparserfromhell.nodes.wikilink.Wikilink(target)
            wikicode.replace(node, wl)
            # update
            try:
                self.update_wikilink(wikicode, wl, src_title, summary_parts)
            # this can happen, e.g. due to [[{{TALKPAGENAME}}]]
            except InvalidTitleCharError:
                # restore the template, otherwise the temporary wikilink
                # would stay in the page in place of {{Related}}
                wikicode.replace(wl, node)
                return
            # replace back
            # NOTE(review): this sets ``value`` on the object taken from the
            # template parameter -- confirm it really updates the template
            target.value = str(wl.title)
            wikicode.replace(wl, node)
def list_redirects_wrong_capitalization(api):
    """
    Print redirects whose source and target differ only in capitalization
    and where the target has more capitalized words than the source.

    Only multiple-word titles are listed.

    :param api: object providing ``api.redirects.fetch``
    """
    def capitalized_words(text):
        # number of words starting with an uppercase letter
        return sum(1 for word in text.split() if word[0].isupper())

    # limit to redirects pointing to the main namespace, others deserve special treatment
    redirects = api.redirects.fetch(source_namespaces=[0, 4, 12], target_namespaces=[0])

    for source in sorted(redirects):
        # drop the section fragment of the target
        target = redirects[source].partition("#")[0]

        # limit to redirects whose source and target title differ only in capitalization
        if source.lower() != target.lower():
            continue

        # limit to multiple-word titles
        pure = lang.detect_language(source)[0]
        if len(pure.split()) == 1:
            continue

        # limit to sentence-case titles redirecting to title-case
        if capitalized_words(source) < capitalized_words(target):
            print("* [[{}]] --> [[{}]]".format(source, target))
def get_langlinks(self, full_title):
    """
    Build the list of language links for the page ``full_title``.

    The titles come from :py:meth:`self.titles_in_family`; the entry for
    the passed page itself is removed and the result is sorted by the
    language tag.

    :returns: a list of ``(tag, title)`` tuples
    """
    # get all titles in the family
    family_tags, family_titles = self.titles_in_family(full_title)
    pairs = set(zip(family_tags, family_titles))

    title, langname = lang.detect_language(full_title)
    own_tag = lang.tag_for_langname(langname)

    # remove links to ArchWiki:Archive and translations
    if title != "ArchWiki:Archive":
        pairs = {(t, name) for t, name in pairs if name != "ArchWiki:Archive"}

    # remove title of the page to be updated
    pairs.remove((own_tag, title))

    # transform to list, sort by the language tag
    return sorted(pairs, key=lambda pair: pair[0])
def parse_toc_table(self, title, wikicode):
    """
    Find the table with id ``wiki-scripts-toc-table`` and read its settings.

    :param title: title of the page; its language gives the default column
    :param wikicode: parsed content of the page
    :returns: a ``(toc_table, columns, dictionary)`` tuple, where
        ``toc_table`` is ``None`` if the page has no such table
    """
    # default format is one column in the title's language
    columns = [lang.tag_for_langname(lang.detect_language(title)[1])]
    dictionary = LowercaseDict()

    def _is_toc_table(table):
        return table.has("id") and table.get("id").value == "wiki-scripts-toc-table"

    tables = wikicode.ifilter_tags(matches=lambda node: node.tag == "table")
    toc_table = next((table for table in tables if _is_toc_table(table)), None)
    if toc_table is None:
        return toc_table, columns, dictionary

    # parse data-toc-languages attribute
    try:
        _languages = str(toc_table.get("data-toc-languages").value)
    except ValueError:
        # attribute is missing: store the default in the table
        toc_table.add("data-toc-languages", value=",".join(columns))
    else:
        columns = _languages.split(",")

    # extract localized category names (useful even for PlainFormatter)
    dictionary = self.extract_translations(toc_table.contents)

    return toc_table, columns, dictionary
def process_allpages(self, apfrom=None, langnames=None):
    """
    Update and edit all non-redirect pages in namespaces 0, 4, 12 and 14.

    :param apfrom: full title of the page to start from; namespaces listed
        before its namespace are skipped
    :param langnames: if given, only pages whose detected language is in
        this collection are processed
    """
    namespaces = [0, 4, 12, 14]

    # rewind to the right namespace (the API throws BadTitle error if the
    # namespace of apfrom does not match apnamespace)
    if apfrom is not None:
        _title = self.api.Title(apfrom)
        if _title.namespacenumber not in namespaces:
            logger.error("Valid namespaces for the --first option are {}.".format([self.api.site.namespaces[ns] for ns in namespaces]))
            return
        while namespaces[0] != _title.namespacenumber:
            del namespaces[0]
        # apfrom must be without namespace prefix
        apfrom = _title.pagename

    for ns in namespaces:
        for page in self.db.query(generator="allpages", gaplimit="max", gapfilterredir="nonredirects", gapnamespace=ns, gapfrom=apfrom, prop="latestrevisions", rvprop={"timestamp", "content"}):
            title = page["title"]
            if langnames and lang.detect_language(title)[1] not in langnames:
                continue
            revision = page["revisions"][0]
            timestamp = revision["timestamp"]
            text_old = revision["*"]
            text_new, edit_summary = asyncio.run(self.update_page(title, text_old))
            self._edit(title, page["pageid"], text_new, text_old, timestamp, edit_summary)
        # the apfrom parameter is valid only for the first namespace
        apfrom = ""
def get_langlinks(self, full_title):
    """
    Build the list of language links for the page ``full_title``.

    The titles come from :py:meth:`self.titles_in_family`; the entry for
    the passed page itself is removed and the result is sorted by the
    language tag.

    :returns: a list of ``(tag, title)`` tuples
    """
    # get all titles in the family
    tags, titles = self.titles_in_family(full_title)
    links = set(zip(tags, titles))

    title, langname = lang.detect_language(full_title)
    tag = lang.tag_for_langname(langname)

    # remove links to ArchWiki:Archive and translations
    if title != "ArchWiki:Archive":
        archived = [link for link in links if link[1] == "ArchWiki:Archive"]
        links.difference_update(archived)

    # remove title of the page to be updated
    links.remove((tag, title))

    # transform to list, sort by the language tag
    ordered = list(links)
    ordered.sort(key=lambda link: link[0])
    return ordered
def rename_non_english(self):
    """
    Interactively move localized category pages so that their title
    matches the title of the English page they are linked with.
    """
    # NOTE(review): presumably drops a cached page list so that the loop
    # below works with fresh data -- confirm against the attribute definition
    del self.allpages

    # FIXME: starting with English pages is not very good:
    # - some pages are omitted (e.g. when two pages link to the same English page, at least warning should be printed)
    # - it suggests to move e.g. Xfce (Česky) to Xfwm (Česky) because multiple English pages link to it
    # Therefore we limit it only to categories...
    for page in self.allpages:
        title = page["title"]
        if lang.detect_language(title)[1] != "English" or not title.startswith("Category:"):
            continue

        for tag, localized_title in self.get_langlinks(title):
            logger.info("Checking [[{}:{}]] for renaming...".format(tag, localized_title))
            if not lang.is_internal_tag(tag) or localized_title == title:
                continue

            langname = lang.langname_for_tag(tag)
            source = "{} ({})".format(localized_title, langname)
            target = "{} ({})".format(title, langname)
            if self._page_exists(target):
                logger.warning("Cannot move page [[{}]] to [[{}]]: target page already exists".format(source, target))
                continue

            # interactive mode is necessary because this assumes that all English pages are named correctly
            if ask_yesno("Move page [[{}]] to [[{}]]?".format(source, target)) is True:
                summary = "comply with [[Help:I18n#Page titles]] and match the title of the English page"
                self.api.move(source, target, summary)
def list_redirects_wrong_capitalization(api):
    """
    Print redirects whose source and target differ only in capitalization
    and where the target has more capitalized words than the source.

    Only multiple-word titles are listed.

    :param api: object providing ``api.redirects.fetch``
    """
    def capitalized_words(text):
        # number of words starting with an uppercase letter
        return sum(1 for word in text.split() if word[0].isupper())

    # limit to redirects pointing to the main namespace, others deserve special treatment
    redirects = api.redirects.fetch(source_namespaces=[0, 4, 12], target_namespaces=[0])

    for source in sorted(redirects):
        # drop the section fragment of the target
        target = redirects[source].partition("#")[0]

        # limit to redirects whose source and target title differ only in capitalization
        if source.lower() != target.lower():
            continue

        # limit to multiple-word titles
        pure = lang.detect_language(source)[0]
        if len(pure.split()) == 1:
            continue

        # limit to sentence-case titles redirecting to title-case
        if capitalized_words(source) < capitalized_words(target):
            print("* [[{}]] → [[{}]]".format(source, target))
def process_allpages(self, apfrom=None, langnames=None):
    """
    Update and edit all non-redirect pages in namespaces 0, 4 and 14, plus
    namespace 12 when ``self.interactive`` is ``True``.

    :param apfrom: full title of the page to start from; namespaces listed
        before its namespace are skipped
    :param langnames: if given, only pages whose detected language is in
        this collection are processed
    """
    namespaces = [0, 4, 14]
    if self.interactive is True:
        namespaces.append(12)

    # rewind to the right namespace (the API throws BadTitle error if the
    # namespace of apfrom does not match apnamespace)
    if apfrom is not None:
        _title = self.api.Title(apfrom)
        if _title.namespacenumber not in namespaces:
            logger.error("Valid namespaces for the --first option are {}.".format([self.api.site.namespaces[ns] for ns in namespaces]))
            return
        while namespaces[0] != _title.namespacenumber:
            del namespaces[0]
        # apfrom must be without namespace prefix
        apfrom = _title.pagename

    for ns in namespaces:
        for page in self.db.query(generator="allpages", gaplimit="max", gapfilterredir="nonredirects", gapnamespace=ns, gapfrom=apfrom, prop="latestrevisions", rvprop={"timestamp", "content"}):
            title = page["title"]
            if langnames and lang.detect_language(title)[1] not in langnames:
                continue
            revision = page["revisions"][0]
            timestamp = revision["timestamp"]
            text_old = revision["*"]
            text_new, edit_summary = self.update_page(title, text_old)
            self._edit(title, page["pageid"], text_new, text_old, timestamp, edit_summary)
        # the apfrom parameter is valid only for the first namespace
        apfrom = ""
def parse_toc_table(self, title, toc_table):
    """
    Read the settings stored in the attributes and content of a ToC table.

    :param title: title of the page; its language gives the default column
    :param toc_table: the table tag, or ``None`` if the page has none
    :returns: a ``(columns, category_names, alsoin)`` tuple
    """
    # default format is one column in the title's language
    columns = [lang.tag_for_langname(lang.detect_language(title)[1])]
    category_names = LowercaseDict()
    alsoin = {}

    if toc_table is None:
        return columns, category_names, alsoin

    # parse data-toc-languages attribute
    try:
        _languages = str(toc_table.get("data-toc-languages").value)
    except ValueError:
        # attribute is missing: store the default in the table
        toc_table.add("data-toc-languages", value=",".join(columns))
    else:
        columns = _languages.split(",")

    # parse data-toc-alsoin attribute
    if toc_table.has("data-toc-alsoin"):
        raw_alsoin = str(toc_table.get("data-toc-alsoin").value)
        alsoin = self.parse_alsoin(title, raw_alsoin)
    elif columns != ["en"]:
        logger.warning("Page [[{}]]: missing 'also in' translations".format(title))

    # extract localized category names (useful even for PlainFormatter)
    category_names = self.extract_translations(toc_table.contents)

    return columns, category_names, alsoin
def localized_category(cat, langname):
    """
    Return the title of the category ``cat`` localized to ``langname``.

    ``Category:Languages`` is returned unchanged, and the category named
    after its own detected language maps to ``Category:<langname>``.
    """
    pure, detected_langname = lang.detect_language(cat)

    # this terminates the recursive creation
    if pure == "Category:Languages":
        return pure

    own_language_category = "category:" + detected_langname.lower()
    if pure.lower() == own_language_category:
        return "Category:{}".format(langname)

    return lang.format_title(pure, langname)
def localized_category(cat, langname):
    """
    Return the title of the category ``cat`` localized to ``langname``.

    ``Category:Languages`` is returned unchanged, and the category named
    after its own detected language maps to ``Category:<langname>``.
    """
    pure, cat_langname = lang.detect_language(cat)
    if pure == "Category:Languages":
        # this terminates the recursive creation
        localized = pure
    elif pure.lower() == "category:" + cat_langname.lower():
        localized = "Category:{}".format(langname)
    else:
        localized = lang.format_title(pure, langname)
    return localized
def _group_into_families(pages, case_sensitive=False):
    """
    Takes list of pages and groups them based on their title. Returns a
    mapping of `family_key` to `family_pages`, where `family_key` is the
    base title without the language suffix (e.g. "Some title" for
    "Some title (Česky)") and `family_pages` is a list of pages belonging
    to the family (have the same `family_key`).
    """
    def _langtag(page):
        return lang.tag_for_langname(lang.detect_language(page["title"])[1])

    def _family_key(page):
        base = lang.detect_language(page["title"])[0]
        return base if case_sensitive is True else base.lower()

    ordered = sorted(pages, key=_family_key)
    # interlanguage links are not valid for all languages, the invalid
    # need to be dropped now
    valid = (page for page in ordered if lang.is_interlanguage_tag(_langtag(page)))

    families = {}
    for family, members in itertools.groupby(valid, key=_family_key):
        members = list(members)
        tags = {_langtag(page) for page in members}
        if len(tags) == len(members):
            # every language occurs at most once in the family
            families[family] = members
        elif case_sensitive is False:
            # sometimes case-insensitive matching is not enough, e.g. [[fish]] is
            # not [[FiSH]] (and neither is redirect)
            families.update(InterlanguageLinks._group_into_families(members, case_sensitive=True))
        else:
            # this should never happen
            raise Exception
    return families
def add_report_line(self, title, template, message):
    """
    Record a report line in ``self.log``, grouped by language and page title.

    :param title: title of the page the report line belongs to
    :param template: the template to show (wrapped in ``<nowiki>``)
    :param message: explanation appended in parentheses
    """
    line = "<nowiki>{}</nowiki> ({})".format(template, message)
    langname = detect_language(title)[1]
    self.log.setdefault(langname, {}).setdefault(title, []).append(line)
def add_report_line(self, title, template, message):
    """
    Record a report line in ``self.log``, grouped by language and page title.

    :param title: title of the page the report line belongs to
    :param template: the template to show (wrapped in ``<nowiki>``)
    :param message: explanation appended in parentheses
    """
    entry = "<nowiki>{}</nowiki> ({})".format(template, message)
    langname = detect_language(title)[1]
    per_language = self.log.setdefault(langname, {})
    per_language.setdefault(title, []).append(entry)
def localize_flag(wikicode, node, template_name):
    """
    If a ``node`` in ``wikicode`` is followed by a template with the same base
    name as ``template_name``, this function changes the adjacent template's
    name to ``template_name``.

    :param wikicode: a :py:class:`mwparserfromhell.wikicode.Wikicode` object
    :param node: a :py:class:`mwparserfromhell.nodes.Node` object
    :param str template_name: the name of the template flag, potentially
                              including a language name
    """
    parent = get_parent_wikicode(wikicode, node)
    adjacent = get_adjacent_node(parent, node, ignore_whitespace=True)
    if not isinstance(adjacent, mwparserfromhell.nodes.Template):
        return

    # compare the names without their language suffixes
    adjname = lang.detect_language(str(adjacent.name))[0]
    basename = lang.detect_language(template_name)[0]
    if canonicalize(adjname) == canonicalize(basename):
        adjacent.name = template_name
def format_cell(self, title, parent, levels):
    """
    Format one table cell for the category ``title`` using ``self.cell_format``.

    :param title: title of the category
    :param parent: the parent under which the category is being listed
    :param levels: sequence of zero-based indices giving the position in the tree
    :returns: the formatted cell
    """
    lang_tag = lang.tag_for_langname(lang.detect_language(title)[1])
    numbering = ".".join(str(x + 1) for x in levels) + "."
    info = "({})".format(self.info[title]["pages"])

    # "also in" suffix
    other_parents = set(self.parents[title]) - {parent}
    if other_parents:
        links = [self.catlink(cat) for cat in other_parents]
        info += self.format_also_in(links, lang_tag)

    return self.cell_format.format(
        margin=1.6 * len(levels),
        levels=numbering,
        catlink=self.catlink(title),
        info=info,
    )
def format_cell(self, title, parent, levels):
    """
    Format one table cell for the category ``title`` using ``self.cell_format``.

    :param title: title of the category
    :param parent: the parent under which the category is being listed
    :param levels: sequence of zero-based indices giving the position in the tree
    :returns: the formatted cell
    """
    langname = lang.detect_language(title)[1]
    lang_tag = lang.tag_for_langname(langname)
    margin = 1.6 * len(levels)
    lev = ".".join(str(index + 1) for index in levels) + "."
    info = "({})".format(self.info[title]["pages"])

    # "also in" suffix
    also_in = set(self.parents[title]) - {parent}
    if also_in:
        also_in = [self.catlink(cat) for cat in also_in]
        info = info + self.format_also_in(also_in, lang_tag)

    return self.cell_format.format(margin=margin, levels=lev, catlink=self.catlink(title), info=info)
def find_broken(self):
    """
    Collect IDs of pages categorized under a category of a different language.

    All non-redirect pages from ``self.content_namespaces`` are fetched
    together with their non-hidden categories. A page needs fixing when the
    language detected from one of its category titles differs from the
    language detected from the page title. The link from a
    ``Category:<language>`` page to ``Category:Languages`` is exempt.

    :returns: a list of page IDs; each page is listed at most once
    """
    def pages_in_namespace(ns):
        return self.api.generator(generator="allpages", gapfilterredir="nonredirects", gapnamespace=ns, gaplimit="max", prop="categories", cllimit="max", clshow="!hidden")

    pages = itertools.chain.from_iterable(pages_in_namespace(ns) for ns in self.content_namespaces)

    needs_fixing = []
    for page in pages:
        # pages without any category carry no "categories" key
        if "categories" not in page:
            continue
        langname = lang.detect_language(page["title"])[1]
        for cat in page["categories"]:
            # skip root categories for non-English languages
            if page["title"] == "Category:{}".format(langname) and cat["title"] == "Category:Languages":
                continue
            # check language
            if lang.detect_language(cat["title"])[1] != langname:
                needs_fixing.append(page["pageid"])
                # one mismatch is enough; continuing would add the same
                # page ID again for every further wrong category
                break
    return needs_fixing
def _pull_from_page(page, condition=lambda tag, title: True):
    """
    Append the language links of ``page`` to the enclosing ``tags`` and
    ``titles`` lists.

    A link is added only if its tag is not collected yet and
    ``condition(tag, title)`` is true.
    """
    # default to empty tuple
    for langlink in page.get("langlinks", ()):
        tag = langlink["lang"]
        # conversion back and forth is necessary to resolve redirect
        full_title = self._title_from_langlink(langlink)
        title, langname = lang.detect_language(full_title)
        # TODO: check if the resulting tag is equal to the original?
#        tag = lang.tag_for_langname(langname)
        if tag in tags or not condition(tag, title):
            continue
        tags.append(tag)
        titles.append(title)
def _pull_from_page(page, condition=lambda tag, title: True):
    """
    Append the language links of ``page`` to the enclosing ``tags`` and
    ``titles`` lists.

    A link is added only if its tag is not collected yet and
    ``condition(tag, title)`` is true.
    """
    # default to empty tuple
    langlinks = page.get("langlinks", ())
    for langlink in langlinks:
        tag = langlink["lang"]
        # conversion back and forth is necessary to resolve redirect
        resolved_title = self._title_from_langlink(langlink)
        title, langname = lang.detect_language(resolved_title)
        # TODO: check if the resulting tag is equal to the original?
#        tag = lang.tag_for_langname(langname)
        is_new = tag not in tags
        if is_new and condition(tag, title):
            tags.append(tag)
            titles.append(title)
def find_orphans(self):
    """
    Print non-English pages that have no language links.

    The graph is built first if ``self.allpages`` is not available yet.
    """
    if self.allpages is None:
        self.build_graph()

    for title in (page["title"] for page in self.allpages):
        # unsupported languages need to be skipped now
        if not self._is_valid_interlanguage(title):
            continue
        langlinks = self._get_langlinks(title)
        if lang.detect_language(title)[1] == "English" or len(langlinks) != 0:
            continue
        print("* [[{}]]".format(title))
def update_page(self, title, text):
    """
    Update package templates on given page.

    Parse wikitext, try to update all package templates, handle broken package links:
        - print warning to console
        - append message to self.log
        - mark it with {{Broken package link}} in the wikicode

    :param title: title of the wiki page
    :param text: content of the wiki page
    :returns: a :py:class:`mwparserfromhell.wikicode.Wikicode` object with the updated
              content of the page
    """
    def _is_broken_flag(node):
        return isinstance(node, mwparserfromhell.nodes.Template) and canonicalize(node.name).startswith("Broken package link")

    logger.info("Parsing page [[{}]]...".format(title))
    langname = detect_language(title)[1]
    wikicode = mwparserfromhell.parse(text)

    package_templates = ["Aur", "AUR", "Grp", "Pkg"]
    for template in wikicode.ifilter_templates():
        # skip unrelated templates
        if not any(template.name.matches(tmp) for tmp in package_templates):
            continue

        # skip templates no longer under wikicode (templates nested under previously
        # removed parent template are still detected by ifilter)
        try:
            wikicode.index(template, True)
        except ValueError:
            continue

        # strip whitespace around the parameter, otherwise it is added to
        # the link and rendered incorrectly
        self.strip_whitespace(wikicode, template)

        hint = self.update_package_template(template, langname)

        # add/remove/update {{Broken package link}} flag
        parent = get_parent_wikicode(wikicode, template)
        adjacent = get_adjacent_node(parent, template, ignore_whitespace=True)

        if hint is None:
            if _is_broken_flag(adjacent):
                # package has been found again, remove existing flag
                wikicode.remove(adjacent)
            continue

        logger.warning("broken package link: {}: {}".format(template, hint))
        self.add_report_line(title, template, hint)
        broken_flag = "{{%s|%s}}" % (self._localized_template("Broken package link", langname), hint)
        if _is_broken_flag(adjacent):
            # replace since the hint might be different
            wikicode.replace(adjacent, broken_flag)
        else:
            wikicode.insert_after(template, broken_flag)

    return wikicode
def check_extlink_status(self, wikicode, extlink, src_title):
    """
    Check the URL of an external link and update its "Dead link" flag.

    The wikicode is only touched while ``self.lock_wikicode`` is held;
    the URL check itself runs outside of the lock.

    :param wikicode: the wikicode object containing ``extlink``
    :param extlink: the external link node to check
    :param src_title: title of the page on which the link is located,
        used to select the localized flag template
    """
    with self.lock_wikicode:
        url = self.prepare_url(wikicode, extlink)
    if url is None:
        return

    logger.info("Checking link {} ...".format(extlink))
    status = self.check_url(url)

    with self.lock_wikicode:
        if status is True:
            # TODO: the link might still be flagged for a reason (e.g. when the server redirects to some dummy page without giving a proper status code)
            ensure_unflagged_by_template(wikicode, extlink, "Dead link", match_only_prefix=True)
        elif status is False:
            # TODO: handle bbs.archlinux.org (some links may require login)
            # TODO: handle links inside {{man|url=...}} properly
            # first replace the existing template (if any) with a translated version
            flag = self.get_localized_template(
                "Dead link",
                lang.detect_language(src_title)[1])
            localize_flag(wikicode, extlink, flag)
            # flag the link, but don't overwrite date and don't set status yet
            flag = ensure_flagged_by_template(wikicode, extlink, flag, *self.deadlink_params, overwrite_parameters=False)
            # drop the fragment from the URL before looking into the cache
            if url.fragment:
                url = urllib3.util.url.parse_url(
                    url.url.rsplit("#", maxsplit=1)[0])
            # overwrite by default, but skip overwriting date when the status matches
            overwrite = True
            if flag.has("status"):
                # from here on "status" holds the value stored in the flag,
                # not the result of check_url
                status = flag.get("status").value
                if str(status) == str(self.cache_invalid_urls[url]):
                    overwrite = False
            if overwrite is True:
                # overwrite status as well as date
                flag.add("status", self.cache_invalid_urls[url], showkey=True)
                flag.add("1", self.deadlink_params[0], showkey=False)
                flag.add("2", self.deadlink_params[1], showkey=False)
                flag.add("3", self.deadlink_params[2], showkey=False)
        else:
            # TODO: ask the user for manual check (good/bad/skip) and move the URL from self.cache_indeterminate_urls to self.cache_valid_urls or self.cache_invalid_urls
            logger.warning(
                "status check indeterminate for external link {}".format(
                    extlink))
def extract_translations(self, wikicode):
    """
    Collect custom labels of category links found in ``wikicode``.

    Only links written with a leading colon that point to the Category
    namespace and have a label are considered.

    :returns: a ``LowercaseDict`` mapping the category title to its label
    """
    translations = LowercaseDict()
    for link in wikicode.ifilter_wikilinks(recursive=True):
        # skip catlinks without leading colon
        if not link.title.startswith(":"):
            continue
        title = self.api.Title(link.title)
        if title.namespace != "Category" or not link.text:
            continue
        # skip trivial cases to apply our defaults
        pure, _ = lang.detect_language(title.pagename)
        if link.text.lower() != title.pagename.lower() and link.text.lower() != pure.lower():
            translations[str(title)] = str(link.text).strip()
    return translations
def process_allpages(self, apfrom=None):
    """
    Update and edit all non-redirect English pages in namespaces 0 and 14,
    plus namespace 12 when ``self.interactive`` is ``True``.

    :param apfrom: title to start the enumeration from; it applies only to
        the first processed namespace
    """
    namespaces = [0, 14]
    if self.interactive is True:
        namespaces.append(12)

    for ns in namespaces:
        for page in self.api.generator(generator="allpages", gaplimit="100", gapfilterredir="nonredirects", gapnamespace=ns, gapfrom=apfrom, prop="revisions", rvprop="content|timestamp"):
            title = page["title"]
            if lang.detect_language(title)[1] != "English":
                continue
            revision = page["revisions"][0]
            timestamp = revision["timestamp"]
            text_old = revision["*"]
            text_new, edit_summary = self.update_page(title, text_old)
            self._edit(title, page["pageid"], text_new, text_old, timestamp, edit_summary)
        # the starting title is valid only for the first namespace; keeping
        # it would skip the pages sorted before it in the following ones
        apfrom = None
def get_langlinks(self, full_title):
    """
    Build the list of language links for the page ``full_title``.

    The titles come from :py:meth:`self.titles_in_family`; the entry for
    the passed page itself is removed and the result is sorted by the
    language tag.

    :returns: a list of ``(tag, title)`` tuples
    """
    # get all titles in the family
    family_tags, family_titles = self.titles_in_family(full_title)
    pairs = set(zip(family_tags, family_titles))

    own_title, own_langname = lang.detect_language(full_title)
    own_tag = lang.tag_for_langname(own_langname)

    # remove links to ArchWiki:Archive and translations
    if own_title != "ArchWiki:Archive":
        pairs = {(t, name) for t, name in pairs if name != "ArchWiki:Archive"}

    # remove title of the page to be updated
    pairs.remove((own_tag, own_title))

    # conversion back-and-forth is necessary to add the "(Language)" suffix into all subpage parts
    result = []
    # iterate sorted by the language tag
    for tag, title in sorted(pairs, key=lambda pair: pair[0]):
        new_title = lang.format_title(title, lang.langname_for_tag(tag))
        # do it only when the new_title exists, otherwise the title without "(Language)" in
        # all subpage parts is still valid as per Help:i18n
        if self._page_exists(new_title):
            title = lang.detect_language(new_title, strip_all_subpage_parts=False)[0]
        result.append((tag, title))

    return result
def extract_translations(self, wikicode):
    """
    Collect custom labels of category links found in ``wikicode``.

    Only links written with a leading colon that point to the Category
    namespace and have a label are considered.

    :returns: a ``LowercaseDict`` mapping the category title to its label
    """
    result = LowercaseDict()
    for wikilink in wikicode.ifilter_wikilinks(recursive=True):
        # skip catlinks without leading colon
        has_leading_colon = wikilink.title.startswith(":")
        if not has_leading_colon:
            continue

        title = self.api.Title(wikilink.title)
        if not (title.namespace == "Category" and wikilink.text):
            continue

        # skip trivial cases to apply our defaults
        pure = lang.detect_language(title.pagename)[0]
        if wikilink.text.lower() != title.pagename.lower() and wikilink.text.lower() != pure.lower():
            result[str(title)] = str(wikilink.text).strip()
    return result
def create_category(self, category):
    """
    Create a missing localized category and, recursively, its parents.

    The parents are derived from the parents of the corresponding category
    in the local language. Nothing is created when the category already
    exists, is in the local language, or has no local-language counterpart.

    :param category: title of the category to create
    :raises ValueError: if ``category`` has an interwiki prefix or is not
        in the Category namespace
    """
    title = self.api.Title(category)
    if title.iwprefix or title.namespace != "Category":
        raise ValueError("Invalid category name: [[{}]]".format(category))
    # normalize name
    category = title.fullpagename

    # skip existing categories
    if category in self.info:
        return

    pure, langname = lang.detect_language(category)
    local_language = lang.get_local_language()
    if langname == local_language:
        logger.warning("Cannot automatically create {} category: [[{}]]".format(local_language, category))
        return

    local = lang.format_title(pure, local_language)
    if local not in self.info:
        logger.warning("Cannot create category [[{}]]: {} category [[{}]] does not exist.".format(category, local_language, local))
        return

    def localized_category(cat, langname):
        pure, lgn = lang.detect_language(cat)
        # this terminates the recursive creation
        if pure == "Category:Languages":
            return pure
        if pure.lower() == "category:" + lgn.lower():
            return "Category:{}".format(langname)
        return lang.format_title(pure, langname)

    # a category without known parents is created empty
    parents = []
    if local in self.parents.keys():
        parents = [localized_category(p, langname) for p in self.parents[local]]
    content = "\n".join("[[{}]]".format(p) for p in parents)

    self.api.create(title=category, text=content, summary="init wanted category")
    self.update()

    for p in parents:
        self.create_category(p)
def _group_into_families(pages, case_sensitive=False):
    """
    Takes list of pages and groups them based on their title. Returns a
    mapping of `family_key` to `family_pages`, where `family_key` is the
    base title without the language suffix (e.g. "Some title" for
    "Some title (Česky)") and `family_pages` is a list of pages belonging
    to the family (have the same `family_key`).
    """
    def _tag_of(page):
        langname = lang.detect_language(page["title"])[1]
        return lang.tag_for_langname(langname)

    if case_sensitive is True:
        def _family_key(page):
            return lang.detect_language(page["title"])[0]
    else:
        def _family_key(page):
            return lang.detect_language(page["title"])[0].lower()

    # interlanguage links are not valid for all languages, the invalid
    # need to be dropped now
    def _valid_interlanguage_pages(candidates):
        return (page for page in candidates if lang.is_interlanguage_tag(_tag_of(page)))

    sorted_pages = sorted(pages, key=_family_key)
    groups = itertools.groupby(_valid_interlanguage_pages(sorted_pages), key=_family_key)

    families = {}
    for family, group in groups:
        family_pages = list(group)
        tags = set(_tag_of(page) for page in family_pages)
        if len(tags) == len(family_pages):
            families[family] = family_pages
            continue
        if case_sensitive is False:
            # sometimes case-insensitive matching is not enough, e.g. [[fish]] is
            # not [[FiSH]] (and neither is redirect)
            families.update(InterlanguageLinks._group_into_families(family_pages, case_sensitive=True))
            continue
        # this should never happen
        raise Exception
    return families
def format_cell(self, title, parent, levels):
    """
    Format one table-of-contents entry for a category.

    :param title: title of the category
    :param parent: the parent under which the category is being listed; it
        is left out of the "also in" suffix
    :param levels: sequence of zero-based indices, one per nesting level
    :returns: the formatted entry as a string
    """
    lang_tag = lang.tag_for_langname(lang.detect_language(title)[1])

    indent = " " * len(levels) * 4
    numbering = ".".join(str(x + 1) for x in levels)
    # indent, level, title, number of subpages
    output = "{}{} {} ({})".format(indent, numbering, self.localize(title), self.info[title]["pages"])

    # "also in" suffix listing the remaining parents
    other_parents = set(self.parents[title]) - {parent}
    if other_parents:
        localized = [self.localize(cat) for cat in other_parents]
        output += self.format_also_in(localized, lang_tag)
    return output
def find_orphans(self):
    """
    Collect the titles of pages that have no language links.

    Pages rejected by :py:meth:`_is_valid_interlanguage` and pages in the
    local language are never reported.

    :returns: a list of page titles
    """
    orphans = []
    for page in self.allpages:
        title = page["title"]
        # unsupported languages need to be skipped now
        if self._is_valid_interlanguage(title):
            langlinks = self.get_langlinks(title)
            is_local = lang.detect_language(title)[1] == lang.get_local_language()
            if not is_local and len(langlinks) == 0:
                orphans.append(title)
    return orphans
def format_cell(self, title, parent, levels):
    """
    Format one table-of-contents entry for a category.

    :param title: title of the category
    :param parent: the parent under which the category is being listed; it
        is left out of the "also in" suffix
    :param levels: sequence of zero-based indices, one per nesting level
    :returns: the formatted entry as a string
    """
    lang_tag = lang.tag_for_langname(lang.detect_language(title)[1])

    indent = " " * len(levels) * 4
    numbering = ".".join(str(x + 1) for x in levels)
    # indent, level, title, number of subpages
    output = "{}{} {} ({})".format(indent, numbering, self.localize(title), self.info[title]["pages"])

    # "also in" suffix listing the remaining parents
    other_parents = set(self.parents[title]) - {parent}
    if other_parents:
        localized = [self.localize(cat) for cat in other_parents]
        output += self.format_also_in(localized, lang_tag)
    return output
def update_page(self, title, text):
    """
    Refresh the package templates found in the wikitext of a page.

    Every ``Aur``, ``AUR``, ``Grp`` or ``Pkg`` template is passed to
    :py:meth:`update_package_template`. When that returns a hint, the
    problem is logged, recorded with :py:meth:`add_report_line` and the
    template is followed by a localized {{Broken package link}} flag carrying
    the hint. When it returns ``None``, an adjacent flag is removed.

    :param title: title of the wiki page
    :param text: content of the wiki page
    :returns: a :py:class:`mwparserfromhell.wikicode.Wikicode` object with
        the updated content of the page
    """
    package_templates = ("Aur", "AUR", "Grp", "Pkg")

    def _is_broken_flag(node):
        return (isinstance(node, mwparserfromhell.nodes.Template)
                and canonicalize(node.name).startswith("Broken package link"))

    logger.info("Parsing '%s'..." % title)
    langname = detect_language(title)[1]
    wikicode = mwparserfromhell.parse(text)

    for template in wikicode.ifilter_templates():
        # only package templates are of interest
        if not any(template.name.matches(name) for name in package_templates):
            continue

        # templates nested under a previously removed parent template are
        # still yielded by ifilter, but are no longer part of the wikicode
        try:
            wikicode.index(template, True)
        except ValueError:
            continue

        hint = self.update_package_template(template, langname)

        # the node next to the template might be an existing flag
        parent = get_parent_wikicode(wikicode, template)
        adjacent = get_adjacent_node(parent, template, ignore_whitespace=True)

        if hint is None:
            # package has been found again, remove existing flag
            if _is_broken_flag(adjacent):
                wikicode.remove(adjacent)
            continue

        logger.warning("broken package link: {}: {}".format(template, hint))
        self.add_report_line(title, template, hint)
        broken_flag = "{{%s|%s}}" % (self._localized_template("Broken package link", langname), hint)
        if _is_broken_flag(adjacent):
            # replace since the hint might be different
            wikicode.replace(adjacent, broken_flag)
        else:
            wikicode.insert_after(template, broken_flag)

    return wikicode
def generate_pages(self):
    """
    Yield the pages to be processed, including their latest revision.

    When ``self.title`` is set, only that page is yielded. Otherwise all
    pages of ``self.namespaces`` are listed, starting at ``self.first`` if it
    is set, and filtered by ``self.langnames`` if it is not empty.
    """
    revision_params = {
        "prop": "revisions",
        "rvprop": "content|timestamp",
        "rvslots": "main",
    }

    # handle the trivial case first
    if self.title is not None:
        result = self.api.call_api(action="query", **revision_params, titles=self.title)
        yield list(result["pages"].values())[0]
        return

    # work on a copy so that the list can be shortened for this method only
    namespaces = self.namespaces.copy()

    apfrom = self.first
    if apfrom is not None:
        _title = self.api.Title(apfrom)
        if _title.namespacenumber not in namespaces:
            logger.error("Valid namespaces for the --first option are {}.".format(
                [self.api.site.namespaces[ns] for ns in namespaces]))
            return
        # rewind to the right namespace (the API throws BadTitle error if the
        # namespace of apfrom does not match apnamespace)
        namespaces = namespaces[namespaces.index(_title.namespacenumber):]
        # apfrom must be without namespace prefix
        apfrom = _title.pagename

    for ns in namespaces:
        listing = self.api.generator(generator="allpages",
                                     gaplimit="100",
                                     gapnamespace=ns,
                                     gapfrom=apfrom,
                                     gapfilterredir=self.apfilterredir,
                                     **revision_params)
        for page in listing:
            # if the user is not logged in, the limit for revisions may be
            # lower than gaplimit, in which case the generator will yield
            # some pages multiple times without revisions before the
            # query-continuation kicks in
            if "revisions" not in page:
                continue
            if not self.langnames or lang.detect_language(page["title"])[1] in self.langnames:
                yield page
        # the apfrom parameter is valid only for the first namespace
        apfrom = ""
def find_orphans(self):
    """
    Collect the titles of pages that have no language links.

    Pages rejected by :py:meth:`_is_valid_interlanguage` and pages in the
    local language are never reported.

    :returns: a list of page titles
    """
    orphans = []
    for page in self.allpages:
        title = page["title"]
        # unsupported languages need to be skipped now
        if self._is_valid_interlanguage(title):
            langlinks = self.get_langlinks(title)
            is_local = lang.detect_language(title)[1] == lang.get_local_language()
            if not is_local and len(langlinks) == 0:
                orphans.append(title)
    return orphans
def update_page(self, title, text):
    """
    Refresh the package templates found in the wikitext of a page.

    Every ``Aur``, ``AUR``, ``Grp`` or ``Pkg`` template has the whitespace
    around its parameter stripped and is passed to
    :py:meth:`update_package_template`. When that returns a hint, the
    problem is logged, recorded with :py:meth:`add_report_line` and the
    template is flagged with a localized {{Broken package link}} carrying the
    hint. When it returns ``None``, any such flag is removed.

    :param title: title of the wiki page
    :param text: content of the wiki page
    :returns: a :py:class:`mwparserfromhell.wikicode.Wikicode` object with
        the updated content of the page
    """
    package_templates = ("Aur", "AUR", "Grp", "Pkg")

    logger.info("Parsing page [[{}]]...".format(title))
    langname = detect_language(title)[1]
    wikicode = mwparserfromhell.parse(text)

    for template in wikicode.ifilter_templates():
        # only package templates are of interest
        if not any(template.name.matches(name) for name in package_templates):
            continue

        # templates nested under a previously removed parent template are
        # still yielded by ifilter, but are no longer part of the wikicode
        try:
            wikicode.index(template, True)
        except ValueError:
            continue

        # whitespace around the parameter would otherwise be added to the
        # link and rendered incorrectly
        self.strip_whitespace(wikicode, template)

        hint = self.update_package_template(template, langname)

        if hint is None:
            # the package link works, make sure it is not flagged
            ensure_unflagged_by_template(wikicode, template, "Broken package link", match_only_prefix=True)
            continue

        logger.warning("broken package link: {}: {}".format(template, hint))
        self.add_report_line(title, template, hint)
        # first unflag since the localized template might change
        ensure_unflagged_by_template(wikicode, template, "Broken package link", match_only_prefix=True)
        # flag with a localized template and hint
        flag = self.get_localized_template("Broken package link", langname)
        ensure_flagged_by_template(wikicode, template, flag, hint, overwrite_parameters=True)

    return wikicode
def parse_alsoin(self, title, value):
    """
    Parse the value of the ``data-toc-alsoin`` attribute.

    The value is a comma-separated list of ``tag: translation`` items. An
    item without a valid language tag prefix is taken as a whole as the
    translation for the language of ``title``.

    :param title: title of the page holding the attribute
    :param value: the raw attribute value
    :returns: a dict mapping language tags to translations
    """
    alsoin = {}
    for raw in value.split(","):
        item = raw.strip()
        try:
            prefix, text = item.split(":", maxsplit=1)
            tag, translation = prefix.strip(), text.strip()
            if not lang.is_language_tag(tag):
                raise ValueError
        except ValueError:
            # no colon or not a language tag: fall back to the page's language
            tag = lang.tag_for_langname(lang.detect_language(title)[1])
            translation = item
        alsoin[tag] = translation
    logger.debug("Page [[{}]]: parsed data-toc-alsoin = {}".format(title, alsoin))
    return alsoin
def parse_alsoin(self, title, value):
    """
    Parse the value of the ``data-toc-alsoin`` attribute.

    The value is a comma-separated list of ``tag: translation`` items. An
    item without a valid language tag prefix is taken as a whole as the
    translation for the language of ``title``.

    :param title: title of the page holding the attribute
    :param value: the raw attribute value
    :returns: a dict mapping language tags to translations
    """
    alsoin = {}
    for raw in value.split(","):
        item = raw.strip()
        try:
            prefix, text = item.split(":", maxsplit=1)
            tag, translation = prefix.strip(), text.strip()
            if not lang.is_language_tag(tag):
                raise ValueError
        except ValueError:
            # no colon or not a language tag: fall back to the page's language
            tag = lang.tag_for_langname(lang.detect_language(title)[1])
            translation = item
        alsoin[tag] = translation
    logger.debug("Page [[{}]]: parsed data-toc-alsoin = {}".format(title, alsoin))
    return alsoin
def _get_langlinks(self, full_title):
    """
    Build the language links for a page from the other pages in its family.

    The family is obtained from :py:meth:`self._titles_in_family`; the entry
    of the page itself is removed and the rest is sorted by language tag.

    :param full_title: title of the page, including the language suffix
    :returns: a list of ``(tag, title)`` tuples
    """
    # all (tag, title) pairs in the family
    tags, titles = self._titles_in_family(full_title)
    family = set(zip(tags, titles))

    # the page must not link to itself
    title, langname = lang.detect_language(full_title)
    own_entry = (lang.tag_for_langname(langname), title)
    family.remove(own_entry)

    # sort by the language tag
    return sorted(family, key=lambda entry: entry[0])
def __init__(self, api, cliargs):
    """
    Store the API object and command line arguments and derive the titles
    of the table-of-contents pages, one per requested language.

    :param api: the API object, stored as ``self.api``
    :param cliargs: parsed command line arguments; ``print``,
        ``toc_languages`` and ``toc_page`` may be adjusted in place
    """
    self.api = api
    self.cliargs = cliargs

    # printing is the default when neither saving nor printing was requested
    if self.cliargs.save is False and self.cliargs.print is False:
        self.cliargs.print = True

    # the single value "all" stands for all internal language tags
    requested = self.cliargs.toc_languages
    if len(requested) == 1 and requested[0] == "all":
        self.cliargs.toc_languages = lang.get_internal_tags()

    # strip "(Language)" suffix
    self.cliargs.toc_page = lang.detect_language(canonicalize(self.cliargs.toc_page))[0]

    def _title_for(tag):
        # pages in the local language have no language suffix
        if tag == lang.tag_for_langname(lang.get_local_language()):
            return self.cliargs.toc_page
        return "{} ({})".format(self.cliargs.toc_page, lang.langname_for_tag(tag))

    # detect page titles
    self.titles = [_title_for(ln) for ln in sorted(self.cliargs.toc_languages)]
def __init__(self, api, cliargs):
    """
    Store the API object and command line arguments and derive the titles
    of the table-of-contents pages, one per requested language.

    :param api: the API object, stored as ``self.api``
    :param cliargs: parsed command line arguments; ``print``,
        ``toc_languages`` and ``toc_page`` may be adjusted in place
    """
    self.api = api
    self.cliargs = cliargs

    # printing is the default when neither saving nor printing was requested
    if self.cliargs.save is False and self.cliargs.print is False:
        self.cliargs.print = True

    # the single value "all" stands for all internal language tags
    requested = self.cliargs.toc_languages
    if len(requested) == 1 and requested[0] == "all":
        self.cliargs.toc_languages = lang.get_internal_tags()

    # strip "(Language)" suffix
    self.cliargs.toc_page = lang.detect_language(canonicalize(self.cliargs.toc_page))[0]

    def _title_for(tag):
        # pages in the local language have no language suffix
        if tag == lang.tag_for_langname(lang.get_local_language()):
            return self.cliargs.toc_page
        return "{} ({})".format(self.cliargs.toc_page, lang.langname_for_tag(tag))

    # detect page titles
    self.titles = [_title_for(ln) for ln in sorted(self.cliargs.toc_languages)]
def update_page_language(api):
    """
    Set the page language of every page to the language detected from its
    title, for pages in namespaces 0, 4, 10, 12 and 14.

    :param api: the API object used for listing and updating the pages
    """
    # ensure that we are authenticated
    require_login(api)

    for ns in (0, 4, 10, 12, 14):
        listing = api.generator(generator="allpages", gapnamespace=ns, gaplimit="max", prop="info")
        for page in listing:
            title = page["title"]
            current = page["pagelanguage"]
            expected = lang.tag_for_langname(lang.detect_language(title)[1])
            # only touch pages whose language differs from the expected one
            if current != expected:
                api.set_page_language(title, expected, "update language based on the page title")
def main(api):
    """
    Print a wikitext list of pages that embed any of the "Article summary"
    templates, limited to the whitelisted languages.

    :param api: the API object used for the queries
    """
    templates = [
        "Template:Article summary start",
        "Template:Article summary heading",
        "Template:Article summary link",
        "Template:Article summary text",
        "Template:Article summary wiki",
        "Template:Article summary end",
    ]

    # a page may embed several of the templates, hence the set
    titles = set()
    for template in templates:
        for page in api.generator(generator="embeddedin", geilimit="max", geititle=template):
            titles.add(page["title"])

    # print only languages for which "Template:Related articles start (<lang>)" exists
    langs_whitelist = ["English", "Español", "Italiano", "Português", "Česky", "Ελληνικά", "Русский", "正體中文", "简体中文", "한국어"]

    for title in sorted(titles):
        # detect language, check whitelist
        langname = detect_language(title)[1]
        if langname in langs_whitelist:
            print("* [[%s]]" % title)
def main(api):
    """
    Print the unwatched pages of the Main namespace as wikitext, with one
    section per language.

    Exits with status 1 when the current user lacks the 'unwatchedpages'
    right.

    :param api: the API object used for the queries
    """
    require_login(api)

    # check for necessary rights
    if "unwatchedpages" not in api.user.rights:
        print("The current user does not have the 'unwatchedpages' right, which is necessary to use this script. Sorry.")
        sys.exit(1)

    # get list of unwatched pages
    query_unwatched = {
        "action": "query",
        "list": "querypage",
        "qppage": "Unwatchedpages",
        "qplimit": "max",
        "continue": "",
    }

    # split into sections by language, limited to the Main namespace
    by_language = {}
    for snippet in api.query_continue(query_unwatched):
        for page in snippet["querypage"]["results"]:
            if page["ns"] != 0:
                continue
            title = page["title"]
            by_language.setdefault(detect_language(title)[1], []).append(title)

    # print wikitext
    for langname in sorted(by_language):
        print("== %s ==" % langname)
        print()
        for title in by_language[langname]:
            print("* %s" % title)
        print()
def check_page(self, title):
    """
    Move a page to the title that uses the new name of its language.

    The language is detected from ``title`` and looked up in
    ``self.lang_map``; pages whose language has no truthy entry there are
    left alone. A failed move is logged as a warning and does not raise.

    :param title: title of the page to check
    """
    # check the language
    _, lang = detect_language(title)
    new_lang = self.lang_map.get(lang)
    if not new_lang:
        return

    # The new title is built by hand: format_title does not work when the
    # script is run before updating the interwiki table and the
    # ws.ArchWiki.lang module.
    if title == f"Category:{lang}":
        new_title = f"Category:{new_lang}"
    else:
        new_title = title.replace(f"({lang})", f"({new_lang})")

    summary = self.edit_summary.format(old_lang=lang, new_lang=new_lang)
    logger.info(f"Move [[{title}]] to [[{new_title}]] ({summary})")

    try:
        self.api.move(title, new_title, summary, movesubpages=False)
    except APIError as e:
        # errors are skipped, but reported so that failed moves are not lost
        logger.warning(f"Failed to move [[{title}]] to [[{new_title}]]: {e}")
def create_category(self, category):
    """
    Create a wanted localized category and, recursively, its parents.

    The new page consists of links to the localized counterparts of the
    parents of the corresponding local-language category. When that category
    has no entry in ``self.parents``, the page is created empty. Nothing is
    created when the category already exists, when it is in the local
    language, or when its local-language counterpart does not exist.

    :param category: title of the category to create
    :raises ValueError: if the title has an interwiki prefix or is not in
        the "Category" namespace
    """
    def localized_category(cat, langname):
        pure, lgn = lang.detect_language(cat)
        if pure == "Category:Languages":
            # this terminates the recursive creation
            return pure
        elif pure.lower() == "category:" + lgn.lower():
            return "Category:{}".format(langname)
        return lang.format_title(pure, langname)

    title = self.api.Title(category)
    if title.iwprefix or title.namespace != "Category":
        raise ValueError("Invalid category name: [[{}]]".format(category))

    # normalize name
    category = title.fullpagename

    # skip existing categories
    if category in self.info:
        return

    pure, langname = lang.detect_language(category)
    if langname == lang.get_local_language():
        logger.warning("Cannot automatically create {} category: [[{}]]".format(lang.get_local_language(), category))
        return

    local = lang.format_title(pure, lang.get_local_language())
    if local not in self.info:
        logger.warning("Cannot create category [[{}]]: {} category [[{}]] does not exist.".format(category, lang.get_local_language(), local))
        return

    # a local category without an entry in self.parents has no parents to
    # link to; indexing unconditionally would raise KeyError for it
    local_parents = self.parents[local] if local in self.parents else []
    parents = [localized_category(p, langname) for p in local_parents]
    content = "\n".join("[[{}]]".format(p) for p in parents)

    self.api.create(title=category, text=content, summary="init wanted category")
    self.update()

    # make sure the parents that were just linked exist as well
    for p in parents:
        self.create_category(p)
def rename_non_english(self):
    """
    Interactively rename localized categories to match the title of the
    English category they are linked from.

    For every English page in the "Category:" namespace, each language link
    with an internal tag and a title different from the English one is
    offered for moving, unless the target page already exists.
    """
    # NOTE(review): presumably this drops a cached page list so that the loop
    # below works with fresh data - confirm against the allpages property
    del self.allpages

    # FIXME: starting with English pages is not very good:
    # - some pages are omitted (e.g. when two pages link to the same English
    #   page, at least warning should be printed)
    # - it suggests to move e.g. Xfce (Česky) to Xfwm (Česky) because
    #   multiple English pages link to it
    # Therefore we limit it only to categories...
    for page in self.allpages:
        title = page["title"]
        is_english = lang.detect_language(title)[1] == "English"
        if not (is_english and title.startswith("Category:")):
            continue

        for tag, localized_title in self.get_langlinks(title):
            logger.info("Checking [[{}:{}]] for renaming...".format(tag, localized_title))
            if not lang.is_internal_tag(tag) or localized_title == title:
                continue

            source = "{} ({})".format(localized_title, lang.langname_for_tag(tag))
            target = "{} ({})".format(title, lang.langname_for_tag(tag))
            if self._page_exists(target):
                logger.warning("Cannot move page [[{}]] to [[{}]]: target page already exists".format(source, target))
                continue

            # interactive mode is necessary because this assumes that all
            # English pages are named correctly
            if ask_yesno("Move page [[{}]] to [[{}]]?".format(source, target)) is True:
                summary = "comply with [[Help:I18n#Page titles]] and match the title of the English page"
                self.api.move(source, target, summary)
def parse_toc_table(self, title, toc_table):
    """
    Read the table-of-contents settings from the table on a page.

    :param title: title of the page
    :param toc_table: the table tag, or ``None`` if the page has none
    :returns: a ``(columns, category_names, alsoin)`` tuple: the list of
        language tags of the columns, the localized category names extracted
        from the table and the parsed "also in" translations. Without a
        table, these are a single column in the language of ``title``, an
        empty ``LowercaseDict`` and an empty dict.
    """
    # default format is one column in the title's language
    columns = [lang.tag_for_langname(lang.detect_language(title)[1])]
    category_names = LowercaseDict()
    alsoin = {}

    if toc_table is None:
        return columns, category_names, alsoin

    # parse data-toc-languages attribute, adding it with the default if missing
    try:
        columns = str(toc_table.get("data-toc-languages").value).split(",")
    except ValueError:
        toc_table.add("data-toc-languages", value=",".join(columns))

    # parse data-toc-alsoin attribute
    if toc_table.has("data-toc-alsoin"):
        raw_alsoin = str(toc_table.get("data-toc-alsoin").value)
        alsoin = self.parse_alsoin(title, raw_alsoin)
    elif columns != ["en"]:
        logger.warning("Page [[{}]]: missing 'also in' translations".format(title))

    # extract localized category names (useful even for PlainFormatter)
    category_names = self.extract_translations(toc_table.contents)

    return columns, category_names, alsoin
def main(api):
    """
    Print the unwatched pages of the Main namespace as wikitext, with one
    section per language.

    Exits with status 1 when the current user lacks the 'unwatchedpages'
    right.

    :param api: the API object used for the queries
    """
    require_login(api)

    # check for necessary rights
    if "unwatchedpages" not in api.user.rights:
        print("The current user does not have the 'unwatchedpages' right, which is necessary to use this script. Sorry.")
        sys.exit(1)

    # get list of unwatched pages
    query_unwatched = {
        "action": "query",
        "list": "querypage",
        "qppage": "Unwatchedpages",
        "qplimit": "max",
        "continue": "",
    }

    # split into sections by language, limited to the Main namespace
    by_language = {}
    for snippet in api.query_continue(query_unwatched):
        for page in snippet["querypage"]["results"]:
            if page["ns"] != 0:
                continue
            title = page["title"]
            by_language.setdefault(detect_language(title)[1], []).append(title)

    # print wikitext
    for langname in sorted(by_language):
        print("== %s ==" % langname)
        print()
        for title in by_language[langname]:
            print("* %s" % title)
        print()
def update_page(self, src_title, text):
    """
    Update the external links, wikilinks and link-like templates on a page.

    Pages listed in ``self.skip_pages`` and pages whose title starts with
    "DeveloperWiki:" are returned unchanged with an empty summary. Nodes
    directly inside a template listed in ``self.skip_templates`` are skipped.

    :param str src_title: title of the page
    :param str text: content of the page
    :returns: a (text, edit_summary) tuple, where text is the updated content
        and edit_summary is the description of performed changes
    """
    # FIXME: ideally "DeveloperWiki:" would be a proper namespace
    if lang.detect_language(src_title)[0] in self.skip_pages or src_title.startswith("DeveloperWiki:"):
        logger.info("Skipping blacklisted page [[{}]]".format(src_title))
        return text, ""

    logger.info("Parsing page [[{}]] ...".format(src_title))
    # FIXME: skip_style_tags=True is a partial workaround for https://github.com/earwig/mwparserfromhell/issues/40
    wikicode = mwparserfromhell.parse(text, skip_style_tags=True)

    summary_parts = []
    summary = get_edit_checker(wikicode, summary_parts)

    def _in_skipped_template(node):
        # links inside article status templates must not be touched
        parent = wikicode.get(wikicode.index(node, recursive=True))
        return (isinstance(parent, mwparserfromhell.nodes.template.Template)
                and parent.name.lower() in self.skip_templates)

    for extlink in wikicode.ifilter_external_links(recursive=True):
        if _in_skipped_template(extlink):
            continue
        with summary("replaced external links"):
            self.update_extlink(wikicode, extlink)

    for wikilink in wikicode.ifilter_wikilinks(recursive=True):
        if _in_skipped_template(wikilink):
            continue
        self.update_wikilink(wikicode, wikilink, src_title, summary_parts)

    for template in wikicode.ifilter_templates(recursive=True):
        # skip templates that may be added or removed
        if str(template.name) in {"Broken section link", "Dead link"}:
            continue
        if _in_skipped_template(template):
            continue

        _pure_template = lang.detect_language(str(template.name))[0]
        if _pure_template.lower() in {"related", "related2"}:
            target = template.get(1).value
            # temporarily convert the {{Related}} to wikilink to reuse the update code
            wl = mwparserfromhell.nodes.wikilink.Wikilink(target)
            wikicode.replace(template, wl)
            # update
            self.update_wikilink(wikicode, wl, src_title, summary_parts)
            # replace back
            # NOTE(review): this assigns ``value`` on the object returned by
            # ``template.get(1).value``, not on the parameter itself - confirm
            # that the updated title is written back as intended
            target.value = str(wl.title)
            wikicode.replace(wl, template)
        elif template.name.lower() == "man":
            with summary("updated man page links"):
                self.update_man_template(wikicode, template)

    # deduplicate and keep order
    summary_parts = list(dict.fromkeys(summary_parts))

    edit_summary = ", ".join(summary_parts)
    if self.interactive is True:
        edit_summary += " (interactive)"

    return str(wikicode), edit_summary
def print_titles(titles):
    """
    Print the English titles as a sorted wikitext list followed by an empty
    line.

    :param titles: iterable of page titles
    """
    english = (title for title in sorted(titles)
               if lang.detect_language(title)[1] == "English")
    for title in english:
        print("* [[%s]]" % title)
    print()
#! /usr/bin/env python3

# Print a wikitext list of all non-redirect pages, with one section per
# language and the pages of each section sorted by their title without the
# language suffix.

import os.path
from collections import namedtuple
import itertools

from ws.client import API
import ws.ArchWiki.lang as lang

api_url = "https://wiki.archlinux.org/api.php"
cookie_path = os.path.expanduser("~/.cache/ArchWiki.cookie")

api = API(api_url, cookie_file=cookie_path, ssl_verify=True)

Page = namedtuple("Page", ["title", "langname", "pure"])


def _as_record(raw_page):
    # split the full title into the pure title and the language name
    pure, langname = lang.detect_language(raw_page["title"])
    return Page(raw_page["title"], langname, pure)


pages = [
    _as_record(raw_page)
    for raw_page in api.generator(generator="allpages", gaplimit="max", gapfilterredir="nonredirects")
]

# groupby needs the pages of one language next to each other
pages.sort(key=lambda record: (record.langname, record.pure))

groups = itertools.groupby(pages, key=lambda record: record.langname)
for langname, members in groups:
    print("== {} ==\n".format(langname))
    for member in members:
        print("* [[:{}|{}]]".format(member.title, member.pure))
    print()