def _add_page_from_template(self, namespace, title, template_name,
                            template_parameters):
    """Add a page by substituting a template.

    Parameters
    ----------
    namespace : str
        Namespace of the page. If None, the default namespace will be used.
    title : str
        The title of the page.
    template_name : str
        The name of the template to substitute to create the subpage.
    template_parameters : list or OrderedDict
        Parameters to pass to the template.

    """
    if namespace is None:
        page = Page(self._site, title)
    else:
        page = Page(self._site, title, namespace)
    if page.exists() and not self._overwrite:
        logging.warning(
            "Page '{}' already exists. It will not be created.".format(
                page.title()))
    else:
        template = Template(template_name, True, template_parameters)
        page.text = template.multiline_string()
        logging.info("Writing to page '{}'.".format(page.title()))
        logging.debug(page.text)
        self._write_page(page)
def userPut(
        self,
        page: pywikibot.Page,
        oldtext: str,
        newtext: str,
        summary: Optional[str] = None,
        minor: bool = True,
        botflag: Optional[bool] = None,
) -> None:
    if oldtext == newtext:
        pywikibot.output("No changes were needed on %s"
                         % page.title(as_link=True))
        return
    pywikibot.output("\n\n>>> \03{lightpurple}%s\03{default} <<<"
                     % page.title(as_link=True))
    pywikibot.showDiff(oldtext, newtext)
    if summary:
        pywikibot.output("Summary: %s" % summary)
    page.text = newtext
    try:
        page.save(summary=summary, minor=minor, botflag=botflag)
    except pywikibot.EditConflict:
        raise
    except pywikibot.Error as e:
        pywikibot.output("Failed to save %s: %r: %s"
                         % (page.title(as_link=True), e, e))
def task(self):
    list_platzhalter = []
    list_protected = []
    lemma_list = self.get_list()
    for idx, item in enumerate(lemma_list):
        lemma = Page(self.wiki, item["title"])
        if self.is_protected(lemma):
            list_protected.append(lemma.title())
            lemma.protect(protections={"edit": "autoconfirmed",
                                       "move": "autoconfirmed"},
                          reason="is now common")
        categories = [item.title() for item in lemma.categories()]
        if "Kategorie:RE:Platzhalter" in categories:
            list_platzhalter.append(lemma.title())
        self.logger.info(
            f"{idx}/{len(lemma_list)} prot: {len(list_protected)}, "
            f"plat: {len(list_platzhalter)} {lemma.title()}"
        )
    page_protected = Page(self.wiki, "Benutzer:THE IT/RE/Arthur Stein/protected")
    page_protected.text = self.join_lists(list_protected)
    page_protected.save()
    page_platzhalter = Page(self.wiki, "Benutzer:THE IT/RE/Arthur Stein/platzhalter")
    page_platzhalter.text = self.join_lists(list_platzhalter)
    page_platzhalter.save()
    return True
def process(day):
    """
    one day bot processing

    arguments:
    day -- python date format

    """
    if params.verbose:
        print("processing Journal des recréations ({day})".format(
            day=format_date(day)))
    start = to_date(day)
    end = to_date(day + ONE_DAY)
    result = "\n\n== {} ==\n".format(format_date(day))
    comment = []
    for i, page in enumerate(creation_log(start, end), 1):
        gras = ''
        date = ''
        if params.verbose:
            print(i, page["timestamp"])
        dl = deletelog(page["title"])
        if dl:
            page_pas = Page(Site(), "Discussion:" + page["title"] + "/Suppression")
            if page_pas.isRedirectPage():
                page_pas = page_pas.getRedirectTarget()
            if page_pas.exists() and re.search(r'article supprimé',
                                               page_pas.get(), re.I):
                if re.search(r'\{\{ ?article supprimé[^\}]*\d{1,2} (\S* \d{4}) à',
                             page_pas.get(), re.I):
                    date = u' de %s' % re.search(
                        r'\{\{ ?article supprimé[^\}]*\d{1,2} (\S* \d{4}) à',
                        page_pas.get(), re.I).group(1)
                comment.append(u'[[%s]] (malgré [[%s|PàS]]%s)'
                               % (page["title"], page_pas.title(), date))
                gras = "'''"
            r = (u"* {g}{{{{a-court|{title}}}}} <small>([[{pas}|PàS]])</small> supprimé le {date} puis recréé par {{{{u|{user}}}}}{g} \n"
                 .format(title=wiki_param(page["title"]),
                         pas=page_pas.title(),
                         user=wiki_param(page["user"]),
                         date=format_date(from_date(dl["timestamp"])),
                         g=gras))
            if params.verbose:
                print(r)
            result += r
    page = Page(Site(), params.prefix + u'/' + format_date(day, skip_day=True))
    try:
        result = page.get() + result
    except NoPage:
        result = u'{{mise à jour bot|Zérobot}}' + result
    if comment:
        comment.insert(0, '')
    page.put(result,
             comment="Journal des recréations ({day}) ".format(
                 day=format_date(day)) + ' - '.join(comment))
def _add_category_page(self, title, categories):
    """Add a page with categories.

    Parameters
    ----------
    title : str
        Title of the page.
    categories : list
        The categories to add to the page.

    """
    page = Page(self._site, title, "Category")
    if page.exists() and not self._overwrite:
        logging.warning(
            "Category page '{}' already exists. It will not be created.".format(page.title())  # noqa: E501
        )
    else:
        page.text = ""
        for category in categories:
            if category != title:
                page.text += "[[Kategori:{}]]\n".format(category)
        logging.info("Writing to category page '{}'".format(page.title()))
        logging.debug(page.text)
        self._write_page(page)
def get_topic_articles(page):
    # construct the memory datastore:
    topic = {}
    topic['title'] = page.title()
    # I don't want no stubs
    if len(page.text) < 500:
        return
    topic['content'] = page.text
    try:
        edition_links = [link for link in
                         pagegenerators.LanguageLinksPageGenerator(page)]
    except Exception:
        return
    if edition_links == []:
        return
    topic['language'] = {}
    for link in edition_links:
        lang_code = str(link.site)[-2:]
        if lang_code in langs.keys():
            # I don't want no stubs
            if len(link.text) < 500:
                continue
            page = Page(link)
            topic['language'][lang_code] = {
                'title': page.title(),
                'orig_content': page.text,
                'translated_content': translate(page.text)
            }
            microsoft_char_counter(len(page.text))
    if topic['language'] == {}:
        return
    return topic
def __init__(self, page: pywikibot.Page):
    # general
    self.shorttitle = page.title(without_brackets=True)
    self.norefstext = self._refremove(page.text)
    self.test = False  # set to true for test outputs
    # first paragraph (lead) info
    self.firstpar = self._firstpar(self.norefstext)
    self.leadname = self._leadname(self.firstpar) if self.firstpar else None
    self.leadbday = re.sub(self.cleandayR, '', self._leadbday()) if self._leadbday() else None
    self.leadbyear = self._leadbyear()
    self.leaddday = re.sub(self.cleandayR, '', self._leaddday()) if self._leaddday() else None
    self.leaddyear = self._leaddyear()
    # categories info
    self.catbyear = self._catbyear(self.norefstext)
    self.catdyear = self._catdyear(self.norefstext)
    # infobox info
    self.infoboxtitle, self.infoboxparams = self._listinfoboxes(self.norefstext)
    self.infoboxbday = re.sub(self.cleandayR, '', self._infoboxbday()) if self._infoboxbday() else None
    self.infoboxbyear = self._infoboxbyear() if self.infoboxexists else None
    self.infoboxdday = re.sub(self.cleandayR, '', self._infoboxdday()) if self._infoboxdday() else None
    self.infoboxdyear = self._infoboxdyear() if self.infoboxexists else None
    self.infoboxname = self._infoboxname() if self.infoboxexists else None
    # results
    self.isconflicted = self.nameconflict or self.birthdayconflict or self.deathdayconflict
def _get_img_path(img: Page, img_dir: Path) -> Tuple[str, Path, Path]:
    img_name = unquote(img.title(with_ns=False, as_url=True))
    img_name_valid = hashlib.md5(img_name.encode('utf-8')).hexdigest()
    img_path = img_dir / (img_name_valid + ".jpg")
    img_path_orig = Path(
        str(img_path) + "_" + Path(img_name).suffix + ".ORIGINAL")
    return img_name, img_path, img_path_orig
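# A minimal, self-contained sketch of the naming scheme _get_img_path uses
# (the sample title and directory are made up for illustration): the title is
# MD5-hashed into a filesystem-safe stem, and the original extension is kept
# in a ".ORIGINAL" sidecar path.
import hashlib
from pathlib import Path

img_name = "Example image.png"
img_stem = hashlib.md5(img_name.encode('utf-8')).hexdigest()
img_path = Path("images") / (img_stem + ".jpg")
img_path_orig = Path(str(img_path) + "_" + Path(img_name).suffix + ".ORIGINAL")
print(img_path)       # images/<md5 hex>.jpg
print(img_path_orig)  # images/<md5 hex>.jpg_.png.ORIGINAL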
def filter_function(article: pywikibot.Page):
    def strip_accents(s):
        return ''.join(c for c in unicodedata.normalize('NFD', s)
                       if unicodedata.category(c) != 'Mn')

    word = strip_accents(article.title()).lower().replace('’', ' ')
    return (not word[0].isdigit()
            and word >= 'il ne faut pas dire, fontaine, je ne boirai pas de ton eau')
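# What strip_accents does, in isolation (pure stdlib): NFD-decompose the
# string, then drop the combining marks. The sample strings are made up.
import unicodedata

def strip_accents(s: str) -> str:
    return ''.join(c for c in unicodedata.normalize('NFD', s)
                   if unicodedata.category(c) != 'Mn')

assert strip_accents('Éléphant') == 'Elephant'
assert strip_accents('Fontaine, je ne boirai pas') == 'Fontaine, je ne boirai pas'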
def check_page(self, pagename):
    """Check one page."""
    pywikibot.output("\nChecking %s" % pagename)
    sys.stdout.flush()
    page1 = Page(self.original, pagename)
    txt1 = page1.text
    if self.options.dest_namespace:
        dest_ns = int(self.options.dest_namespace)
    else:
        dest_ns = None
    for site in self.sites:
        if dest_ns is not None:
            page2 = Page(site, page1.title(withNamespace=False), dest_ns)
            pywikibot.output("\nCross namespace, new title: %s"
                             % page2.title())
        else:
            page2 = Page(site, pagename)
        if page2.exists():
            txt2 = page2.text
        else:
            txt2 = ''
        if str(site) in config.replicate_replace:
            txt_new = multiple_replace(txt1,
                                       config.replicate_replace[str(site)])
            if txt1 != txt_new:
                pywikibot.output(
                    'NOTE: text replaced using config.sync_replace')
                pywikibot.output('%s %s %s' % (txt1, txt_new, txt2))
                txt1 = txt_new
        if txt1 != txt2:
            pywikibot.output("\n %s DIFFERS" % site)
            self.differences[site].append(pagename)
    if self.options.replace:
        page2.text = txt1
        page2.save(self.put_message(site))
    else:
        sys.stdout.write('.')
        sys.stdout.flush()
def check_page(self, pagename):
    """Check one page."""
    pywikibot.output('\nChecking ' + pagename)
    sys.stdout.flush()
    page1 = Page(self.original, pagename)
    txt1 = page1.text
    if self.options.dest_namespace:
        dest_ns = int(self.options.dest_namespace)
    else:
        dest_ns = None
    for site in self.sites:
        if dest_ns is not None:
            page2 = Page(site, page1.title(with_ns=False), dest_ns)
            pywikibot.output('\nCross namespace, new title: '
                             + page2.title())
        else:
            page2 = Page(site, pagename)
        if page2.exists():
            txt2 = page2.text
        else:
            txt2 = ''
        if str(site) in config.replicate_replace:
            txt_new = multiple_replace(txt1,
                                       config.replicate_replace[str(site)])
            if txt1 != txt_new:
                pywikibot.output(
                    'NOTE: text replaced using config.sync_replace')
                pywikibot.output('{0} {1} {2}'.format(txt1, txt_new, txt2))
                txt1 = txt_new
        if txt1 != txt2:
            pywikibot.output('\n {0} DIFFERS'.format(site))
            self.differences[site].append(pagename)
    if self.options.replace:
        page2.text = txt1
        page2.save(self.put_message(site))
    else:
        sys.stdout.write('.')
        sys.stdout.flush()
def scrapePage(page: pywikibot.Page) -> None:
    """Scrape a page's infoboxes and redirects, save them in the `state`."""
    pageData: Any = {'infoboxes': [], 'redirects': {}}
    # Iterate over {{infobox journal}}s on `page`.
    for infobox in getInfoboxJournals(page):
        print('I', end='', flush=True)
        pageData['infoboxes'].append(infobox)
        if 'title' in infobox and infobox['title'] != '':
            state.saveTitleToAbbrev(infobox['title'])
        checkDBAbbrevs(page.title(), infobox)
    # Iterate over pages that are redirects to `page`.
    for r in getRedirectsToPage(page.title(), namespaces=0,
                                total=100, content=True):
        print('R', end='', flush=True)
        pageData['redirects'][r.title()] = r.text
        # r.getRedirectTarget().title()
    state.savePageData(page.title(), pageData)
    state.saveTitleToAbbrev(abbrevUtils.stripTitle(page.title()))
    print('', flush=True)
def process(day):
    """
    one day bot processing

    arguments:
    day -- python date format

    """
    if params.verbose:
        print("processing Journal des recréations ({day})".format(
            day=format_date(day)))
    start = to_date(day)
    end = to_date(day + ONE_DAY)
    result = "\n== {} ==\n".format(format_date(day))
    comment = ''
    for i, page in enumerate(creation_log(start, end), 1):
        gras = ''
        if params.verbose:
            print(i, page["timestamp"])
        dl = deletelog(page["title"])
        if dl:
            page_pas = Page(Site(), "Discussion:" + page["title"] + "/Suppression")
            if page_pas.exists() and re.search(r'\{\{ ?Article supprimé',
                                               page_pas.get(), re.I):
                comment += u' - %s (malgré [[%s|PàS]])' % (page["title"],
                                                           page_pas.title())
                gras = "'''"
            r = ("* {g}{{{{a-court|{title}}}}} <small>([[{pas}|PàS]])</small> supprimé le {date} recréé par {{{{u|{user}}}}}{g} \n"
                 .format(title=wiki_param(page["title"]),
                         pas=page_pas.title(),
                         user=wiki_param(page["user"]),
                         date=format_date(from_date(dl["timestamp"])),
                         g=gras))
            if params.verbose:
                print(r)
            result += r
    page = Page(Site(), params.prefix + "/" + format_date(day, skip_day=True))
    try:
        result = page.get() + result
    except NoPage:
        pass
    page.put(result,
             comment="Journal des recréations ({day})".format(
                 day=format_date(day)) + comment)
def get_page_from_size(page: pywikibot.Page) -> pywikibot.Page:
    """Return a page based on the current page size."""
    i = 1
    title = page.title()
    while True:
        if not page.exists():
            break
        if len(page.text) < 1e6:
            break
        i += 1
        page = Page(page.site, f"{title} ({i:02d})")
    return page
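# The loop above probes "Title", then "Title (02)", "Title (03)", ... until it
# finds a missing page or one under 1e6 bytes. A sketch of the title sequence
# it generates (the base title is made up):
title = "Lista över insjöar"
for i in range(2, 5):
    print(f"{title} ({i:02d})")  # Lista över insjöar (02), (03), (04)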
def ParseWikiPagePushInfo(page: pywikibot.Page):
    parsed_text = ParsePage(page)
    # If the score of a trivia is higher than this,
    # we'll try to show it only, without leading text.
    triviaSignificance = float(GetConfig("Wiki", "PushedTermsTTL", 180))
    # Distill text
    bareTitle = BareDisambigTitle(page.title())
    distilled = WikiPageDistiller.DistillHtml(parsed_text)
    info = WikiPagePushInfo(page.title(), page.full_url())
    if distilled.trivia is not None:
        # Trivia only
        info.postText = distilled.trivia
        # Leading + trivia
        if distilled.triviaScore < triviaSignificance or bareTitle not in info.postText:
            info.postText = distilled.introduction + info.postText
    else:
        # Leading
        info.postText = distilled.introduction
    # elif len(distilled.introduction) < 50:
    #     info.post
    # Choose cover image
    info.postImageName, info.postImageUrl = GetCoverImage(page)
    return info
def _create_current_projects_template(self):
    """Create a current projects template with the new projects."""
    page_name = self._make_year_title(
        self._config["year_pages"]["current_projects_template"])
    page = Page(self._site, page_name)
    if page.exists() and not self._overwrite:
        logging.warning(
            "Page '{}' already exists. It will not be created.".format(
                page.title()))
        return
    project_format = "[[{ns}:{{proj}}|{{proj}}]]".format(
        ns=self._config["project_namespace"])
    delimiter = "''' · '''"
    template_data = {}
    for program in self._programs:
        projects = set()
        for strategy in program.get('strategies'):
            # projects sorted by id to get thematic grouping
            projects.update(strategy.get("projects"))
        template_data[program.get('name')] = delimiter.join([
            project_format.format(proj=self._projects[project])
            for project in sorted(projects)
        ])
    template = Template("Aktuella projekt/layout")
    template.add_parameter("år", self._year)
    template.add_parameter("access", template_data["Tillgång"])
    template.add_parameter("use", template_data["Användning"])
    template.add_parameter("community", template_data["Gemenskapen"])
    template.add_parameter("enabling", template_data["Möjliggörande"])
    page.text = template.multiline_string() + \
        "\n<noinclude>{{Dokumentation}}</noinclude>"
    logging.info("Writing to page '{}'.".format(page.title()))
    logging.debug(page.text)
    self._write_page(page)
def feed_archive(self, archive: pywikibot.Page, thread: DiscussionThread,
                 max_archive_size: Size, params=None) -> bool:
    """
    Feed the thread to one of the archives.

    Also check for security violations.

    @return: whether the archive is full
    """
    archive_page = self.get_archive_page(
        archive.title(with_ns=True), params)
    return archive_page.feed_thread(thread, max_archive_size)
def get_plain_text(self, page: pywikibot.Page):
    params = {
        'action': 'query',
        'prop': 'extracts',
        'exsentences': 7,
        'explaintext': 1,
        'format': 'json',
        'titles': page.title()
    }
    request = self.site._simple_request(**params)
    response = request.submit()
    try:
        return self.parse_text(
            next(iter(response['query']['pages'].values()), None)['extract'])
    except (KeyError, TypeError):
        pass
def template_title_regex(tpl_page: pywikibot.Page) -> Pattern:
    """
    Return a regex that matches variations of the template title.

    It supports the transcluding variant as well as localized namespaces
    and case-insensitivity depending on the namespace.

    :param tpl_page: The template page
    :type tpl_page: pywikibot.page.Page
    """
    ns = tpl_page.site.namespaces[tpl_page.namespace()]
    marker = '?' if ns.id == 10 else ''
    title = tpl_page.title(with_ns=False)
    title = case_escape(ns.case, title)
    return re.compile(r'(?:(?:%s):)%s%s' % ('|'.join(ns), marker, title))
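# A hedged sketch of the kind of pattern this builds: for a template page
# titled "Cite web" on a wiki whose Template namespace (id 10) is
# first-letter case-insensitive, the namespace prefix becomes optional and
# the first letter matches either case (namespace aliases elided here).
import re

pattern = re.compile(r'(?:(?:Template):)?[Cc]ite web')
assert pattern.match('Template:Cite web')
assert pattern.match('cite web')  # transcluding variant, lowercase first letter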
def delete_page(page: pywikibot.Page, summary: str) -> None:
    """Delete the page and dependent pages."""
    page.delete(reason=summary, prompt=False)
    if page.exists():
        return
    page_link = page.title(as_link=True)
    for redirect in page.backlinks(filter_redirects=True):
        redirect.delete(reason=SUMMARIES['redirect'].format(page_link),
                        prompt=False)
    talk_page = page.toggleTalkPage()
    if talk_page.exists():
        talk_page.delete(reason=SUMMARIES['talk'].format(page_link),
                         prompt=False)
        talk_link = talk_page.title(as_link=True)
        for redirect in talk_page.backlinks(filter_redirects=True):
            redirect.delete(reason=SUMMARIES['redirect'].format(talk_link),
                            prompt=False)
def template_title_regex(tpl_page: pywikibot.Page) -> Pattern:
    """
    Return a regex that matches variations of the template title.

    It supports the transcluding variant as well as localized namespaces
    and case-insensitivity depending on the namespace.

    @param tpl_page: The template page
    @type tpl_page: pywikibot.page.Page
    """
    ns = tpl_page.site.namespaces[tpl_page.namespace()]
    marker = '?' if ns.id == 10 else ''
    title = tpl_page.title(with_ns=False)
    if ns.case != 'case-sensitive':
        title = '[{}{}]{}'.format(re.escape(title[0].upper()),
                                  re.escape(title[0].lower()),
                                  re.escape(title[1:]))
    else:
        title = re.escape(title)
    return re.compile(r'(?:(?:%s):)%s%s' % ('|'.join(ns), marker, title))
def GetCoverImage(page: pywikibot.Page):
    '''
    Gets the cover image name and url for a specific Page.
    Returns (None, None) if no cover image is found.
    '''
    try:
        return page.__lmd_cover_image
    except AttributeError:
        pass
    req = page.site._simple_request(action="query",
                                    titles=page.title(),
                                    prop="pageimages",
                                    piprop="thumbnail|name",
                                    pithumbsize=400)
    data = req.submit()
    assert "query" in data, "API request response lacks 'query' key"
    assert "pages" in data["query"], "API request response lacks 'pages' key"
    _, jpage = data["query"]["pages"].popitem()
    if "thumbnail" in jpage:
        page.__lmd_cover_image = (jpage["pageimage"],
                                  jpage["thumbnail"]["source"])
    else:
        page.__lmd_cover_image = (None, None)
    return page.__lmd_cover_image
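# The try/except at the top of GetCoverImage is an attribute-caching trick:
# compute once, stash the result on the page object, and serve the stash on
# later calls. The same pattern in isolation (all names here are made up):
class _Obj:
    pass

def get_expensive(obj):
    try:
        return obj._cached  # hit: computed on an earlier call
    except AttributeError:
        pass
    obj._cached = ("name.jpg", "https://example.org/name.jpg")  # expensive lookup in the real code
    return obj._cached

o = _Obj()
assert get_expensive(o) is get_expensive(o)  # second call served from the cache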
def handle_maariv_paper_page(paper_page: pw.Page) -> None:
    publish_date = _extract_paper_page_from_title(paper_page.title())
    if publish_date is None:
        return
    parsed_mw_text = mwparserfromhell.parse(paper_page.text)
    paper_template = parsed_mw_text.filter_templates(
        matches=mark_as_paper_template_name)[0]
    if publish_date_param_name in paper_template:
        logger.info(
            f'Page: {paper_page} is already marked with a publish date, skipping this paper'
        )
        return
    paper_template.add(publish_date_param_name, publish_date)
    logger.info(
        f'Added publish date: {publish_date} for page: {paper_page.title()}')
    paper_page.text = str(parsed_mw_text)
    if SHOULD_SAVE:
        paper_page.save(
            summary="MaccabiBotAdd publish dates for maariv papers",
            botflag=True)
def add_project_page(self, phab_id, phab_name, parameters, goals,
                     goal_fulfillments):
    """Add the main project page.

    Parameters
    ----------
    phab_id : int
        The id of the project in Phabricator. Passed to the template as
        parameter "phabricatorId".
    phab_name : str
        The name of the project in Phabricator. Passed to the template as
        parameter "phabricatorName".
    parameters : dict
        The project parameters, indexed by column label. The Swedish name
        is used as title for the page.
    goals : OrderedDict
        The goals of the project, passed on to the goal subpages.
    goal_fulfillments : dict
        The goal fulfillment texts, keyed by goal name.
    """
    name = parameters[self._project_columns["swedish_name"]]
    page = Page(self._site, name, self._config["project_namespace"])
    if page.exists() and not self._overwrite:
        logging.warning(
            "Project page '{}' already exists. It will not be created.".format(page.title())  # noqa: E501
        )
    else:
        template = Template(self._config["project_template"], True)
        project_parameters = self._config["project_parameters"].items()
        for template_parameter, label in project_parameters:
            template.add_parameter(
                template_parameter,
                parameters[self._project_columns[label]])
        template.add_parameter("year", self._year)
        template.add_parameter("phabricatorId", phab_id)
        template.add_parameter("phabricatorName", phab_name)
        template.add_parameter("bot", "ja")
        content = "{}".format(template)
        page.text = content
        logging.info("Writing to project page '{}'".format(page.title()))
        logging.debug(page.text)
        self._write_page(page)
    for subpage in self._config["subpages"]:
        subpage_parameters = {
            "år": self._year  # always pass the year parameter
        }
        if "parameters" in subpage:
            for key, label in subpage["parameters"].items():
                subpage_parameters[key] = parameters[
                    self._project_columns[label]]
        if "add_goals_parameters" in subpage:
            # Special case for goals parameters, as they are not
            # just copied.
            template_key = \
                list(subpage["add_goals_parameters"].keys())[0]
            template_value = self._make_year_title(
                subpage["add_goals_parameters"][template_key])
            subpage_parameters[template_key] = \
                Template(template_value, parameters=goals)
            subpage_parameters["måluppfyllnad"] = \
                self._create_goal_fulfillment_text(
                    goals.keys(),
                    goal_fulfillments
                )  # noqa: E123
        self._add_subpage(name, subpage["title"],
                          subpage["template_name"], subpage_parameters)
def skip_page(self, page: pywikibot.Page) -> bool:
    """Skip the page if it is not an SVG."""
    if not isinstance(page, pywikibot.FilePage) or not page.title(
            with_ns=False).lower().endswith('.svg'):
        return True
    return super().skip_page(page)
version_history = page.fullVersionHistory()[::-1]
size_all_changes = 0
for idx_rev, revision in enumerate(version_history):
    user = revision.user
    if user == 'Pfaerrich':
        if idx_rev > 0:
            size_prev = len(version_history[idx_rev - 1].text)
        else:
            size_prev = 0
        size_all_changes += abs(
            len(version_history[idx_rev].text) - size_prev)
korrigiert_flag = False
if size_all_changes > 0:
    for version in page.getVersionHistory():
        if version.user == 'Pfaerrich':
            if re.search('orrigiert', version.comment):
                korrigiert_flag = True
                break
print(size_all_changes, len(page.text), korrigiert_flag)
if (size_all_changes / len(page.text)) < 0.03 and not korrigiert_flag:
    list_for_pfaerrich.append(
        [page.title(), size_all_changes, len(page.text)])
report_page = Page(wiki, 'Benutzer:THEbotIT/List_for_Pfaerrich')
header = '{|class="wikitable sortable"\n! Lemma\n! Größe\n! geändert von dir'
text = []
for line in list_for_pfaerrich:
    text.append('|-\n|[[{lemma}]]\n|{size}\n|{changes}'.format(
        lemma=line[0], size=line[2], changes=line[1]))
text = '\n'.join(text)
text = '{header}\n{text}\n|}}'.format(header=header, text=text)
report_page.text = text
report_page.save(botflag=True, summary='blub')
def process_page(self, page: Page):
    page_text = page.get(force=True)
    parsed = mwparserfromhell.parse(page_text)
    year = None
    month = None
    day = None
    entry = None
    for template in parsed.filter_templates():
        if (template.name.matches('Dyktalk')
                or template.name.matches('DYK talk')) and (
                not template.has('entry')
                or len(template.get('entry').value) == 0):
            if year is None:
                if (not template.has(1)) or (not template.has(2)):
                    print('Skipping {{DYK talk}} page', page,
                          ', no date found')
                    continue
                print('*', page.title(), template.get(2), template.get(1))
                year = template.get(2).value.strip()
                day, month = template.get(1).value.strip().split(' ')
            if entry is None:
                entry = self.get_entry_for_page(year, month, day, page)
            if entry:
                print('Adding entry', entry, 'to {{DYK talk}}')
                template.add('entry', entry)
        elif (template.name.matches('ArticleHistory')
                or template.name.matches('Article history')) and (
                not template.has('dykentry')
                or len(template.get('dykentry').value) == 0):
            if year is None:
                if not template.has('dykdate'):
                    print('Skipping {{ArticleHistory}} on page', page,
                          ', no date found')
                    continue
                date = template.get('dykdate').value.strip()
                print('*', page.title(), date)
                if ' ' in date:
                    # monthName YYYY
                    if date.count(' ') == 1:
                        date = '1 ' + date
                    day, month, year = date.split(' ')[:3]
                elif '-' in date:
                    year, month, day = date.split('-')[:3]
                    month = datetime.date(1900, int(month), 1).strftime('%B')
                else:
                    print('Skipping {{ArticleHistory}} on page', page,
                          ", can't parse date", date)
                    continue
                print(page.title(), year, month, day)
            if entry is None:
                entry = self.get_entry_for_page(year, month, day, page)
            if entry:
                print('Adding entry', entry, 'to {{ArticleHistory}}')
                template.add('dykentry', entry, before='dykdate')
    if entry:
        new_text = str(parsed)
        if (new_text != page.text and self.should_edit()
                and (not self.is_manual_run or confirm_edit())):
            self.get_mediawiki_api().get_site().login()
            page.text = str(parsed)
            page.save(
                self.get_task_configuration('missing_blurb_edit_summary'),
                botflag=self.should_use_bot_flag(),
            )
            self.record_trial_edit()
            return True
    return False
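# Pure-Python sketch of the dykdate normalization above: "monthName YYYY"
# gains a day, "D monthName YYYY" splits directly, and ISO "YYYY-MM-DD"
# dates get their month converted to a name (the sample dates are made up).
import datetime

def parse_dykdate(date: str):
    if ' ' in date:
        if date.count(' ') == 1:  # "April 2007" -> "1 April 2007"
            date = '1 ' + date
        day, month, year = date.split(' ')[:3]
    elif '-' in date:
        year, month, day = date.split('-')[:3]
        month = datetime.date(1900, int(month), 1).strftime('%B')
    else:
        return None
    return year, month, day

assert parse_dykdate('2007-04-04') == ('2007', 'April', '04')
assert parse_dykdate('April 2007') == ('2007', 'April', '1')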
def process_wikipage(self, wikipage: pywikibot.Page, language: str):
    content = wikipage.get()
    title = wikipage.title()
    return self.process_non_wikipage(title, content, language)
def makeLanguageMismatchPatch(
        page: pywikibot.Page,
        infoboxId: int,
        infoboxAbbrev: str,
        computedAbbrev: str,
        matchingPatterns: str
) -> Optional[Dict[str, Any]]:
    """Make patchset for Stitchpitch: infobox param and redirects rcats."""
    from unicodedata import normalize
    import mwparserfromhell
    startTimeStamp = datetime.now(timezone.utc).isoformat()
    diff = datetimeFromPWB(Site().server_time()) - datetime.now(timezone.utc)
    if diff > timedelta(minutes=2) or -diff > timedelta(minutes=2):
        raise Exception('Local zone misconfigured or server timezone not UTC!')
    latestRevision = page.latest_revision
    mainEdit = {
        'patchtype': 'edit',  # implies 'nocreate': True
        'slug': f'{infoboxAbbrev} → {computedAbbrev}',
        'details': matchingPatterns,
        'title': page.title(),
        'summary': 'Fix ISO-4 abbreviation to use all language rules.',
        'minor': True,
        'basetimestamp': datetimeFromPWB(latestRevision.timestamp).isoformat(),
        'starttimestamp': startTimeStamp,
        'oldtext': latestRevision.text,
        'oldrevid': latestRevision.revid
    }
    if datetime.fromisoformat(mainEdit['basetimestamp']) > \
            datetime.fromisoformat(startTimeStamp) - timedelta(hours=5):
        print(f'Skipping patch for "{page.title()}":'
              f' edited a short while ago.')
        return None
    code = mwparserfromhell.parse(normalize('NFC', latestRevision.text))
    foundInfobox = None  # type: Optional[mwparserfromhell.Template]
    foundId = -1
    for t in code.filter_templates():
        if t.name.matches('infobox journal') or \
                t.name.matches('Infobox Journal'):
            foundId += 1
            if foundId == infoboxId:
                foundInfobox = t
                break
    if not foundInfobox:
        print(f'Skipping patch for "{page.title()}":'
              f' infobox #{infoboxId} not found.')
        return None
    foundAbbrev = str(foundInfobox.get('abbreviation').value)
    if foundAbbrev.strip() != infoboxAbbrev:
        print(f'Skipping patch for "{page.title()}":'
              f' infobox abbrev mismatch (comments?).')
        return None
    foundInfobox.get('abbreviation').value = \
        foundAbbrev.replace(infoboxAbbrev, computedAbbrev, 1)
    mainEdit['text'] = str(code)
    patches = [mainEdit]
    groupDetails = ''
    regex = r' *{{\s*(r|R) from ISO ?4( abbreviation)?\s*}} *\n?'
    abbrevRegex = r'{{\s*(r|R)(edirect)? (from )?(common )?ab[a-z]*\s*}}'
    for rPage in getRedirectsToPage(page.title(), namespaces=0,
                                    total=100, content=True):
        rTitle = rPage.title()
        rRevision = rPage.latest_revision
        cAbbrev = abbrevUtils.stripTitle(computedAbbrev.lower())
        if cAbbrev + ' ' in rTitle.lower() + ' ' or \
                cAbbrev.replace('.', '') + ' ' in rTitle.lower() + ' ':
            newtext = rRevision.text
            if re.search(regex, newtext):
                print(f'Skipping patch for existing page, '
                      f'already marked: {rTitle}')
                groupDetails += 'ok: ' + rTitle + '\n'
                continue
            if not isReplaceableRedirect(rRevision.text, page.title(),
                                         RCatSet.ISO4):
                print(f'Skipping patch for unreplaceable page: {rTitle}')
                groupDetails += 'unrepl: ' + rTitle + '\n'
                continue
            if re.search(abbrevRegex, newtext):
                newtext = re.sub(abbrevRegex, '{{R from ISO 4}}', newtext, 1)
            else:
                newtext += '\n{{R from ISO 4}}'
            markPatch = {
                'patchtype': 'edit',
                'slug': 'mark new?',
                'title': rTitle,
                'summary': 'Fix ISO-4 abbreviation to use all language rules.',
                'minor': True,
                'basetimestamp': datetimeFromPWB(rRevision.timestamp).isoformat(),
                'starttimestamp': startTimeStamp,
                'oldtext': rRevision.text,
                'oldrevid': rRevision.revid,
                'text': newtext
            }
            patches.append(markPatch)
        elif re.search(regex, rRevision.text):
            unmarkPatch = {
                'patchtype': 'edit',
                'slug': 'unmark old',
                'title': rTitle,
                'summary': 'Fix ISO-4 abbreviation to use all language rules.',
                'minor': True,
                'basetimestamp': datetimeFromPWB(rRevision.timestamp).isoformat(),
                'starttimestamp': startTimeStamp,
                'oldtext': rRevision.text,
                'oldrevid': rRevision.revid,
                'text': re.sub(regex, '{{R from abbreviation}}\n',
                               rRevision.text)
            }
            if infoboxAbbrev.lower() in rTitle.lower() or \
                    infoboxAbbrev.replace('.', '').lower() in rTitle.lower():
                patches.append(unmarkPatch)
            else:
                print(f'Skip patch unmark on unrecog ISO-4: {rTitle}')
                groupDetails += 'unrecog ISO-4: ' + rTitle + '\n'
        else:
            groupDetails += '??: ' + rTitle + '\n'
    shouldHave = [computedAbbrev]
    if computedAbbrev.replace('.', '') != computedAbbrev:
        shouldHave.append(computedAbbrev.replace('.', ''))
    for abbrev in shouldHave:
        rPage = pywikibot.Page(Site(), abbrev)
        if not rPage.exists():
            createPatch = {
                'patchtype': 'create',
                'slug': 'create',
                'title': rPage.title(),
                'summary': 'R from ISO-4 abbreviation of journal title.',
                'minor': True,
                'starttimestamp': startTimeStamp,
                'text': '#REDIRECT[[' + page.title() + ']]\n\n'
                        '{{R from ISO 4}}\n'
            }
            patches.append(createPatch)
    return {
        'patchtype': 'group',
        'slug': f'{infoboxAbbrev} → {computedAbbrev}',
        'details': groupDetails,
        'patches': patches
    }
def getRequiredRedirects(page: pywikibot.Page) \
        -> Tuple[Dict[str, RCatSet], bool]:
    """Compute ISO-4 redirects to `page` that we believe should exist.

    Returns `(req, skip)`, where:
        `req[redirectTitle] = redirectCategories`,
        `skip` indicates that we had to skip an infobox, so the result is
        most probably not exhaustive (so we won't report extra existing
        redirects).
    """
    title = page.title()
    pageData = state.getPageData(title)
    result: DefaultDict[str, RCatSet] = defaultdict(lambda: RCatSet(0))
    skip = False
    for infoboxId, infobox in enumerate(pageData['infoboxes']):
        altName = abbrevUtils.stripTitle(title)
        iTitle = abbrevUtils.sanitizeField(infobox.get('title', ''))
        name = iTitle or altName
        # On Wikipedia, we used to remove subtitles/dependent titles.
        # It seems not to change that much, and it seems not doing that
        # is better.
        # name = re.sub(r'(.{6})[-:–(].*', r'\1', name)
        # altName = re.sub(r'(.{6})[-:–(].*', r'\1', altName)
        iAbbrev = abbrevUtils.sanitizeField(infobox.get('abbreviation', ''))
        iAbbrevDotless = iAbbrev.replace('.', '')
        if iAbbrev == '' or iAbbrev == 'no':
            print(f'--Abbrev param empty or "no", ignoring [[{title}]].')
            skip = True
            continue
        if ':' in iAbbrev[:5]:
            print(f'--Abbrev contains early colon, ignoring [[{title}]].')
            reports.reportTitleWithColon(title, iTitle, iAbbrev)
            skip = True
            continue
        hasISO4Redirect = \
            iAbbrev in pageData['redirects'] \
            and isValidISO4Redirect(pageData['redirects'][iAbbrev], title,
                                    RCatSet.ISO4, strict=False)
        # If the abbreviation matches the computed one,
        # there should be a dotted and a dotless redirect.
        cLang = 'all'  # abbrevUtils.getLanguage(infobox)
        cAbbrev = state.tryGetAbbrev(name, cLang)
        cAltAbbrev = state.tryGetAbbrev(altName, cLang)
        if cAbbrev is None or cAltAbbrev is None:
            skip = True
            continue
        if (not abbrevUtils.isSoftMatch(iAbbrev, cAbbrev)
                and not abbrevUtils.isSoftMatch(iAbbrev, cAltAbbrev)):
            print(f'--Abbreviations don\'t match, ignoring [[{title}]].')
            otherAbbrevs = list(state.getAllAbbrevs(name).values())
            otherAbbrevs = [a for a in otherAbbrevs
                            if abbrevUtils.isSoftMatch(iAbbrev, a)]
            if otherAbbrevs:
                reports.reportLanguageMismatch(
                    title, iTitle, iAbbrev, cAbbrev, otherAbbrevs[0],
                    abbrevUtils.sanitizeField(infobox.get('language', '')),
                    abbrevUtils.sanitizeField(infobox.get('country', '')),
                    cLang, state.getMatchingPatterns(name), hasISO4Redirect)
                patch = makeLanguageMismatchPatch(
                    page, infoboxId, infobox.get('abbreviation'), cAbbrev,
                    state.getMatchingPatterns(name)
                )
                if patch is not None:
                    patchset['patches'].append(patch)
                    print(f'ADDED PATCH #{len(patchset["patches"])}!!!')
                    with open('patchset.json', 'wt') as f:
                        json.dump(patchset, f)
            else:
                reports.reportProperMismatch(
                    title, iTitle, iAbbrev, cAbbrev, cLang,
                    state.getMatchingPatterns(name), hasISO4Redirect)
            continue
        if iAbbrevDotless == iAbbrev:
            print(f'--Abbreviation is trivial (has no dots), '
                  f'to avoid confusion we\'re ignoring [[{title}]].')
            skip = True
            reports.reportTrivialAbbrev(
                title, iTitle, iAbbrev, pageData['redirects'])
        else:
            result[iAbbrev] |= RCatSet.ISO4
            result[iAbbrevDotless] |= RCatSet.ISO4
    for infobox in pageData['infoboxes']:
        nlm: Optional[str] = abbrevUtils.sanitizeField(infobox.get('nlm', ''))
        if nlm and re.fullmatch(r'[\w\ \.,\(\)\[\]\:\'/\-]+', nlm):
            result[nlm] |= RCatSet.NLM
        if not nlm:
            if infobox.get('issn'):
                nlm = issnToAbbrev['nlm'].get(infobox['issn'])
            if not nlm and infobox.get('eissn'):
                nlm = issnToAbbrev['nlm'].get(infobox['eissn'])
            # Compare against this infobox's own abbreviation.
            if nlm and nlm == infobox.get('abbreviation', '').replace('.', ''):
                result[nlm] |= RCatSet.NLM
        msn: Optional[str] = \
            abbrevUtils.sanitizeField(infobox.get('mathscinet', ''))
        if msn and re.fullmatch(r'[\w\ \.\(\)\:\'/\-]+', msn):
            result[msn] |= RCatSet.MSN
            result[msn.replace('.', '')] |= RCatSet.MSN
        if not msn:
            if infobox.get('issn'):
                msn = issnToAbbrev['mathscinet'].get(infobox['issn'])
            if not msn and infobox.get('eissn'):
                msn = issnToAbbrev['mathscinet'].get(infobox['eissn'])
            # Compare against this infobox's own abbreviation.
            if msn and msn == infobox.get('abbreviation', '').replace('.', ''):
                result[msn] |= RCatSet.MSN
                result[msn.replace('.', '')] |= RCatSet.MSN
    finalResult: Dict[str, RCatSet] = {}
    for rTitle, rCats in result.items():
        if rCats:
            finalResult[rTitle] = rCats
    return finalResult, skip
def fixPageRedirects(page: pywikibot.Page) -> int:
    """Fix redirects to given page."""
    title = page.title()
    pageData = state.getPageData(title)
    (requiredRedirects, skip) = getRequiredRedirects(page)
    nEditedPages = 0
    for rTitle, rCats in requiredRedirects.items():
        rNewContent = rcatSetToRedirectContent(title, rCats)
        # Attempt to create new redirect.
        if rTitle not in pageData['redirects']:
            try:
                exists = pywikibot.Page(Site(), rTitle).exists()
            except pywikibot.exceptions.InvalidTitle:
                exists = False
            if exists:
                print(f'--Skipping existing page [[{rTitle}]] '
                      f'(not a redirect to [[{title}]]).')
                if title == rTitle:
                    continue
                if title not in pywikibot.Page(Site(), rTitle).text:
                    reports.reportExistingOtherPage(title, rTitle)
            else:
                print(f'--Creating redirect '
                      f'from [[{rTitle}]] to [[{title}]]. '
                      f'Created content:\n{rNewContent}\n-----',
                      flush=True)
                nEditedPages += 1
                rPage = pywikibot.Page(Site(), rTitle)
                trySaving(rPage, rNewContent,
                          'Creating redirect from standard abbreviation. ',
                          overwrite=False)
        else:
            rOldContent = pageData['redirects'][rTitle]
            if isValidISO4Redirect(rOldContent, title, rCats):
                print(f'--Skipping existing valid redirect '
                      f'from [[{rTitle}]] to [[{title}]].')
            elif isReplaceableRedirect(rOldContent, title,
                                       rCats | RCatSet.ISO4):
                # Don't log nor edit redirects that would be replaceable
                # except they have ISO4 and we're not sure it should have.
                if not (rCats & RCatSet.ISO4):
                    continue
                print(f'--Replacing existing redirect '
                      f'from [[{rTitle}]] to [[{title}]].\n'
                      f'RCatSet: {rCats}\n'
                      f'Original content:\n{rOldContent}\n----- '
                      f'New content:\n{rNewContent}\n-----',
                      flush=True)
                nEditedPages += 1
                rPage = pywikibot.Page(Site(), rTitle)
                trySaving(rPage, rNewContent,
                          'Marking standard abbrev rcat. ',
                          overwrite=True)
            elif not skip:
                print(f'--Skipping existing dubious redirect '
                      f'from [[{rTitle}]] to [[{title}]].\n'
                      f'RCatSet: {rCats}\n'
                      f'Original content:\n{rOldContent}\n----- ')
                reports.reportExistingOtherRedirect(title, rTitle, rOldContent)
    # Purge page cache to remove warnings about missing redirects.
    if nEditedPages > 0:
        tryPurging(page)
    # Report redirects that we wouldn't add, but exist and are marked
    # as ISO-4.
    if requiredRedirects and not skip:
        expectedAbbrevs = \
            [r.replace('.', '') for r in requiredRedirects]
        potentialAbbrevs = []
        for rTitle, rContent in pageData['redirects'].items():
            if 'from former name' in rContent or '.' not in rTitle:
                cAbbrevEng = state.tryGetAbbrev(
                    abbrevUtils.stripTitle(rTitle), 'eng') or ''
                cAbbrevAll = state.tryGetAbbrev(
                    abbrevUtils.stripTitle(rTitle), 'all') or ''
                cAbbrevEng = cAbbrevEng.replace('.', '')
                cAbbrevAll = cAbbrevAll.replace('.', '')
                if 'from former name' in rContent:
                    if cAbbrevEng != rTitle.replace('.', ''):
                        expectedAbbrevs.append(cAbbrevEng)
                    if cAbbrevAll != rTitle.replace('.', ''):
                        expectedAbbrevs.append(cAbbrevAll)
                elif '.' not in rTitle:
                    if cAbbrevEng != rTitle.replace('.', ''):
                        potentialAbbrevs.append((cAbbrevEng, rTitle))
                    if cAbbrevAll != rTitle.replace('.', ''):
                        potentialAbbrevs.append((cAbbrevAll, rTitle))
        expectedAbbrevs = [a for a in expectedAbbrevs if a]
        potentialAbbrevs = [(a, t) for (a, t) in potentialAbbrevs if a]
        for rTitle, rContent in pageData['redirects'].items():
            if not re.search(r'R from ISO 4', rContent):
                continue
            # Ignore rTitle that contain a computed abbreviation as a
            # substring, assume that it's some valid variation on a subtitle.
            isExpected = False
            rTitleDotless = rTitle.replace('.', '')
            for computedAbbrev in expectedAbbrevs:
                if re.sub(r'\s*[:(].*', '', computedAbbrev) in rTitleDotless:
                    isExpected = True
                    break
            if not isExpected:
                # Find other titles in existing redirects
                # that would ISO-4 abbreviate to it.
                potentials = [t for (a, t) in potentialAbbrevs
                              if abbrevUtils.isSoftMatch(rTitleDotless, a)]
                potentials = list(sorted(set(potentials)))
                # Find closest computed abbrev.
                bestAbbrev = ''
                bestDist = len(rTitle)
                for computedAbbrev in sorted(requiredRedirects):
                    dist = Levenshtein.distance(rTitle, computedAbbrev)
                    if dist < bestDist:
                        bestDist = dist
                        bestAbbrev = computedAbbrev
                # Skip if closest abbrev. is far (assume it's from a former
                # title, since there's a ton of cases like that).
                if bestDist <= 8:
                    reports.reportSuperfluousRedirect(
                        title, rTitle, rContent, bestAbbrev, potentials)
    return nEditedPages