def find_sub_templates(
        lookingfor: str, page: Page, wholeword: bool, matchcase: bool):
    found_templates = []
    if page.isRedirectPage():
        page = page.getRedirectTarget()
    pagetext = page.text
    if not matchcase:
        pagetext = pagetext.lower()
        lookingfor = lookingfor.lower()
    if wholeword:
        pattern = re.compile(r'\b' + re.escape(lookingfor) + r'\b')
        if pattern.search(pagetext):
            found_templates.append(page)
    elif lookingfor in pagetext:
        found_templates.append(page)
    for sub_template in page.templates(content=True):
        if sub_template.isRedirectPage():
            sub_template = sub_template.getRedirectTarget()
        text = sub_template.text if matchcase else sub_template.text.lower()
        if wholeword:
            # noinspection PyUnboundLocalVariable
            if pattern.search(text):
                found_templates.append(sub_template)
        elif lookingfor in text:
            found_templates.append(sub_template)
    # Remove duplicate templates
    return {f.title(): f for f in found_templates}.values()
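# Usage sketch, not from the original source: assuming pywikibot is configured for
# enwiki and find_sub_templates (above) is in scope, this shows one way to call it.
# The template title and search string below are illustrative only.
def _demo_find_sub_templates():
    from pywikibot import Page, Site
    site = Site('en', 'wikipedia')
    template = Page(site, 'Template:Infobox person')  # hypothetical starting template
    # Look for the whole word 'birth_date' in the template and its sub-templates,
    # ignoring case, and print every matching template title.
    for match in find_sub_templates('birth_date', template,
                                    wholeword=True, matchcase=False):
        print(match.title())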
def process(day):
    """
    one day bot processing

    arguments:
    day -- python date format
    """
    if params.verbose:
        print("processing Journal des recréations ({day})".format(
            day=format_date(day)))
    start = to_date(day)
    end = to_date(day + ONE_DAY)
    result = "\n\n== {} ==\n".format(format_date(day))
    comment = []
    for i, page in enumerate(creation_log(start, end), 1):
        gras = ''
        date = ''
        if params.verbose:
            print(i, page["timestamp"])
        dl = deletelog(page["title"])
        if dl:
            page_pas = Page(Site(),
                            "Discussion:" + page["title"] + "/Suppression")
            if page_pas.isRedirectPage():
                page_pas = page_pas.getRedirectTarget()
            if page_pas.exists() and re.search(r'article supprimé',
                                               page_pas.get(), re.I):
                if re.search(
                        r'\{\{ ?article supprimé[^\}]*\d{1,2} (\S* \d{4}) à',
                        page_pas.get(), re.I):
                    date = u' de %s' % re.search(
                        r'\{\{ ?article supprimé[^\}]*\d{1,2} (\S* \d{4}) à',
                        page_pas.get(), re.I).group(1)
                comment.append(u'[[%s]] (malgré [[%s|PàS]]%s)'
                               % (page["title"], page_pas.title(), date))
                gras = "'''"
            r = (u"* {g}{{{{a-court|{title}}}}} <small>([[{pas}|PàS]])</small> supprimé le {date} puis recréé par {{{{u|{user}}}}}{g} \n"
                 .format(title=wiki_param(page["title"]),
                         pas=page_pas.title(),
                         user=wiki_param(page["user"]),
                         date=format_date(from_date(dl["timestamp"])),
                         g=gras))
            if params.verbose:
                print(r)
            result += r
    page = Page(Site(), params.prefix + u'/' + format_date(day, skip_day=True))
    try:
        result = page.get() + result
    except NoPage:
        result = u'{{mise à jour bot|Zérobot}}' + result
    if comment:
        comment.insert(0, '')
    page.put(
        result,
        comment="Journal des recréations ({day}) ".format(day=format_date(day))
        + ' - '.join(comment))
def isPeople(self, article):
    """Return True if the article looks like a biography, i.e. a "People"
    category appears among its categories or up to two levels of parent
    categories."""
    site = Site("en")
    page = Page(site, article.decode("utf8"))
    #print article
    #print page.get()
    #print page.get(get_redirect = True)
    #print "redirect?", page.isRedirectPage()
    if page.isRedirectPage():
        page = page.getRedirectTarget()
    #print [cat.title() for cat in page.categories()]
    cats = {cat: 1 for cat in page.categories()}
    if any(["People" in tcat.title() for tcat in cats]):
        print(cats)
        return True
    currcats = cats.copy()
    allcats = {}
    depth = 0
    while currcats != {} and depth < 2:
        depth += 1
        newcats = {}
        for cat in currcats:
            if cat in allcats:
                continue
            allcats[cat] = 1
            parentcats = {cat: 1 for cat in cat.categories()}
            if any(["People" in tcat.title() for tcat in parentcats]):
                print(parentcats)
                return True
            for parcat in parentcats:
                if parcat not in allcats and parcat not in newcats:
                    newcats[parcat] = 1
        currcats = newcats
        print(len(currcats), currcats)
    #self.historicCats.update(allcats)
    return False
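# Usage sketch, not from the original source: isPeople is a method, so the calls
# below assume some classifier object that provides it. The article titles are
# illustrative, and they are passed as byte strings because the method calls
# .decode("utf8") on its argument (Python 2-style input).
def _demo_isPeople(classifier):
    print(classifier.isPeople(b'Marie Curie'))    # likely True: biography
    print(classifier.isPeople(b'Eiffel Tower'))   # likely False: not a person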
def skip_page(self, page: pywikibot.Page) -> bool:
    """Skip special/media pages, missing pages and redirects."""
    if page.namespace() < 0:
        return True
    elif not page.exists():
        return True
    elif page.isRedirectPage():
        return True
    return super().skip_page(page)
def load_config(page: pywikibot.Page, **kwargs: Any) -> ConfigJSONObject:
    """Load JSON config from the page."""
    if page.isRedirectPage():
        pywikibot.log(f"{page!r} is a redirect.")
        page = page.getRedirectTarget()
    _empty = jsoncfg.loads_config("{}")
    if not page.exists():
        pywikibot.log(f"{page!r} does not exist.")
        return _empty
    try:
        return jsoncfg.loads_config(page.get(**kwargs).strip())
    except pywikibot.exceptions.PageRelatedError:
        return _empty
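# Usage sketch, not from the original source: load_config expects a pywikibot.Page
# whose text is JSON. The bot config page title below is hypothetical.
def _demo_load_config():
    import pywikibot
    site = pywikibot.Site('en', 'wikipedia')
    config_page = pywikibot.Page(site, 'User:ExampleBot/config.json')  # hypothetical page
    # Returns an empty config if the page is missing or cannot be fetched.
    return load_config(config_page)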
def get_wikidata_id(self, page: pywikibot.Page):
    """Return the title of the Wikidata item linked to the page, or None."""
    if not page.exists():
        return None
    # T256583, T87345
    page.get(get_redirect=True)
    if page.isRedirectPage():
        page = page.getRedirectTarget()
        page.get()
    item = pywikibot.ItemPage.fromPage(page)
    if not item or not item.exists():
        return None
    return item.title()
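# Usage sketch, not from the original source: get_wikidata_id is a method, so the
# call below assumes some helper object `bot` that provides it. The article title
# is illustrative.
def _demo_get_wikidata_id(bot):
    import pywikibot
    page = pywikibot.Page(pywikibot.Site('en', 'wikipedia'), 'Douglas Adams')
    # Returns the item ID as a string (e.g. 'Q42'), or None if no item exists.
    return bot.get_wikidata_id(page)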
def extract_coach_tenures(name):
    """
    Extract a coach's tenures from Wikipedia.

    Arguments:
    - name (name of coach)

    Returns:
    - list(dict)
    """
    logging.info('Looking for coach %s' % name)
    page_name = get_page_name_from_coach_name_wiki(name)

    # If we can't find a wikipedia page, return immediately
    if not page_name:
        return []
    else:
        logging.debug('Looking up %s as http://en.wikipedia.org/wiki/%s'
                      % (name, page_name))

    # Extract page content from wikipedia and narrow it down to the templates
    p = Page(Site('en', 'wikipedia'), page_name)
    if p.isRedirectPage():
        p = p.getRedirectTarget()
    content = p.get()
    parsed = mwparserfromhell.parse(content)
    templates = parsed.filter_templates()

    # Extract teams and years from the template
    teams, years = None, None
    for t in templates:
        for p in t.params:
            if "coach_teams" in p.name:
                teams = parse_coach_teams_and_positions_from_wiki(p)
            if "coach_years" in p.name:
                years = parse_coach_years_from_wiki(p)

    # If we were not able to extract information from the page, log & return empty
    if not teams or not years:
        logging.warning(
            'ISSUE DETECTED: %s is a valid page but no information extracted'
            % name)
        return []

    # Merge the team dict and the year dict for each tenure
    tenures = [dict(list(t[0].items()) + list(t[1].items()))
               for t in zip(teams, years)]
    for d in tenures:
        d['name'] = name
    return tenures
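# Usage sketch, not from the original source: extract_coach_tenures only takes a
# coach name; the name below is illustrative, and the exact keys of each tenure
# dict depend on the parse_coach_* helpers, which are not shown here.
def _demo_extract_coach_tenures():
    tenures = extract_coach_tenures('Nick Saban')  # one dict per tenure, plus 'name'
    for tenure in tenures:
        print(tenure)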
def getCategories(self, article):
    """Return the visible, filtered categories of an article, using a
    per-article file cache."""
    baseDir = "articleCategoriesCache/"
    if not os.path.exists(baseDir):
        os.makedirs(baseDir)
    fname = baseDir + article
    if os.path.isfile(fname):
        lines = []
        try:
            with codecs.open(fname, encoding='utf-8') as f:
                lines = [line.strip() for line in f.readlines()]
            #print "utf8 encoding"
        except:
            with codecs.open(fname) as f:
                lines = [line.strip() for line in f.readlines()]
            #print "ascii encoding"
        lines = self.filterCategories(lines)
        if lines != []:
            #print "get Cat Cache:", lines
            return lines

    site = Site("en")
    page = Page(site, article.decode("utf8"))
    #print article
    #print page.get()
    #print page.get(get_redirect = True)
    #print "redirect?", page.isRedirectPage()
    if page.isRedirectPage():
        page = page.getRedirectTarget()
    #print [cat.title() for cat in page.categories()]
    cats = sorted([
        cat.title() for cat in page.categories()
        if not cat.isHiddenCategory()
    ])
    #print "downloaded cats1: ", cats
    cats = self.filterCategories(cats)
    #print "downloaded cats2: ", cats
    text = ""
    for cat in cats:
        text += cat + "\n"
    try:
        with codecs.open(fname, "a+") as f:
            f.write(text)
    except:
        with codecs.open(fname, "a+") as f:
            f.write(text.encode('utf-8'))
    return cats
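# Usage sketch, not from the original source: getCategories is a method, so the call
# below assumes some helper object that also provides filterCategories. The article
# title is illustrative and passed as a byte string because the method decodes it
# (Python 2-style input); results are cached under articleCategoriesCache/.
def _demo_getCategories(helper):
    for cat in helper.getCategories(b'Python (programming language)'):
        print(cat)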