def process_pages(self):
    for page in self.pages:
        if bot.donenow("User:Theo's Little Bot/disable/empty citations",
                       donenow=self.donenow, donenow_div=5):
            contents = page.edit()
            # Replace bare {{citation}}/{{cite}} tags with a dated {{citation needed}},
            # keeping any trailing period(s) in front of the new template.
            new_contents = re.sub(r"{{(citation|cite)}}(\.*)",
                                  r"\2{{citation needed|{{subst:DATE}}}}",
                                  contents, flags=re.UNICODE)
            page.save(new_contents,
                      summary="Converting empty {{[[Template:Citation|citation]]}} to "
                              "{{[[Template:citation needed|citation needed]]}} "
                              "([[WP:BOT|bot]] - [[User:Theo's Little Bot/disable/empty citations|disable]])")
            self.donenow += 1
        else:
            sys.exit()
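
# NOTE: illustrative sketch only -- the real bot.donenow helper is not shown in
# this file. Judging from how it is called above, it is assumed to re-check the
# on-wiki disable page every `donenow_div` edits and to optionally enforce a
# per-run edit cap via `shutdown`. The "page must still read 'true'" convention
# and the reliance on a module-level `site` object are this sketch's guesses.
def donenow_sketch(disable_page, donenow=0, donenow_div=5, shutdown=None):
    if shutdown is not None and donenow >= shutdown:
        return False  # assumed per-run edit cap reached
    if donenow % donenow_div != 0:
        return True  # only query the wiki every donenow_div edits
    text = site.Pages[disable_page].edit()  # assumes the same global mwclient `site`
    return text.strip().lower() == 'true'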
def process_page(page):
    """Given an image object, gets its uploader and its upload date,
    fills in {{Information}} for it, and saves the new page.
    """
    global donenow
    revision = page.revisions(dir='newer').next()
    user = revision['user']
    date = get_exif_date(page)
    if date is None:
        date = time.strftime("%d %B %Y", revision['timestamp'])
    contents = page.edit()
    if contents != "":
        description = contents.strip()
        desc_code = mwparserfromhell.parse(description)
        for bad_code in desc_code.ifilter_templates():  # remove templates
            description = description.replace(unicode(bad_code), '')
        for bad_code in desc_code.ifilter_headings():  # remove headers
            description = description.replace(unicode(bad_code), '')
        if description.find('<nowiki') != -1:
            return  # skip complex descriptions
        description = description.replace('|', '{{!}}')  # escape pipe symbols
        description = re.sub(r"[ ]{2,}", " ", description, flags=re.U)  # collapse runs of spaces
        description = re.sub(r"\[\[(?:File|Image):(.*?)(?:\|.*?)\]\]", r"[[:File:\1]]",
                             description, flags=re.U)  # turn images into links
        description = re.sub(r"\[\[User:.*?\]\] \(\[\[User talk:.*?\]\]\).*?\(UTC\)", '',
                             description, flags=re.U)  # remove signatures when possible
    else:
        description = ""
    contents = u"""{{Information
| description = """ + description + """
| source = {{own}}
| date = """ + unicode(date) + """
| author = {{subst:usernameexpand|""" + user.replace(" ", "_") + """}}
}}\n""" + contents
    if bot.donenow("User:Theo's Little Bot/disable/selfimages",
                   donenow=donenow, donenow_div=5, shutdown=100):
        # adding the template
        page.save(contents,
                  summary="[[WP:BOT|Bot]]: Automatically adding {{[[Template:Information|Information]]}} "
                          "to self-published work ([[User:Theo's Little Bot/disable/selfimages|disable]])")
        donenow += 1
        # notifying the uploader
        usertalktitle = "User talk:" + user
        if bot.nobots(usertalktitle, user="******", task='selfimages'):
            usertalk = site.Pages[usertalktitle]
            notification = ("\n\n== Notification of automated file description generation ==\n"
                            "{{subst:Un-botfill|file=" + page.page_title + "|sig=~~~~}}")
            usertalk.save(appendtext=notification,
                          summary="[[WP:BOT|Bot]]: Notifying user about autogenerated "
                                  "{{[[Template:Information|Information]]}} addition "
                                  "([[User:Theo's Little Bot/disable/selfimages|disable]])",
                          redirect=True)
    else:
        sys.exit()
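
# NOTE: illustrative sketch only -- get_exif_date() is defined elsewhere in the
# bot. It is assumed to download the file, read the DateTimeOriginal EXIF tag
# (0x9003) if present, and return a formatted date string or None. The use of
# page.imageinfo['url'], requests, and PIL here is this sketch's choice, not
# necessarily what the real helper does.
def get_exif_date_sketch(page):
    from StringIO import StringIO
    from datetime import datetime
    import requests
    from PIL import Image as PILImage
    data = requests.get(page.imageinfo['url']).content  # assumes imageinfo carries the file URL
    try:
        exif = PILImage.open(StringIO(data))._getexif() or {}
        raw = exif.get(0x9003)  # DateTimeOriginal, e.g. "2013:05:12 14:22:01"
        if raw:
            return datetime.strptime(raw, "%Y:%m:%d %H:%M:%S").strftime("%d %B %Y")
    except (AttributeError, IOError, ValueError):
        pass  # not a JPEG, unreadable file, or malformed date
    return None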
def process_page(self, page):
    if bot.donenow("User:Theo's Little Bot/disable/archiveurl",
                   donenow=self.donenow, donenow_div=5):
        print "Processing " + page.encode("ascii", "replace")
        page = site.Pages[page]
        text = page.edit()
        wikicode = mwparserfromhell.parse(text)
        for template in wikicode.filter_templates():
            if "cite web" in template.name and not template.has_param("url"):
                # Only proceed if there is an archive URL to recover the original URL from.
                if not template.has_param("archiveurl"):
                    continue
                if re.search(r"web\.archive\.org", unicode(template), flags=re.U):
                    try:
                        new_url = re.search(
                            r"\|[\s]*archiveurl[\s]*=[\s]*(?:http://|https://)web\.archive\.org/web/\d*/(.*?)(?:\||}})",
                            unicode(template), flags=re.UNICODE | re.M).group(1)
                    except AttributeError:
                        try:
                            new_url = re.search(
                                r"\|[\s]*archiveurl[\s]*=[\s]*(?:http://|https://)pandora\.nla\.gov\.au/nph-wb/\d*/(.*?)(?:\||}})",
                                unicode(template), flags=re.UNICODE | re.M).group(1)
                        except AttributeError:
                            print "I don't recognize the archive structure, sadly. Skipping."
                            continue
                    if re.search(r"(http|https)://", new_url.strip()) is None:
                        new_url = "http://" + new_url
                    template.add("url", new_url.strip())
                    print "Added url parameter to {{cite web}} template."
        text = unicode(wikicode)
        try:
            page.save(
                text,
                summary="Fixing references: adding url parameter ([[WP:BOT|bot]] - [[User:Theo's Little Bot/disable/archiveurl|disable]])",
            )
            self.donenow += 1  # we only count it as "done" when we make a change
        except:
            print "Unable to save page; skipping."
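
# NOTE: standalone demonstration (not part of the original source) of the
# url-recovery step above: a {{cite web}} that only has |archiveurl= gets its
# original URL pulled back out of the Wayback Machine path and added as |url=.
def _demo_recover_url():
    snippet = u"{{cite web |title=Example |archiveurl=http://web.archive.org/web/20130115000000/http://example.com/page}}"
    code = mwparserfromhell.parse(snippet)
    template = code.filter_templates()[0]
    match = re.search(
        r"\|[\s]*archiveurl[\s]*=[\s]*(?:http://|https://)web\.archive\.org/web/\d*/(.*?)(?:\||}})",
        unicode(template), flags=re.UNICODE)
    template.add("url", match.group(1).strip())
    print unicode(code)  # now also contains |url=http://example.com/page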
def main():
    global site
    site = mwclient.Site('en.wikipedia.org')
    site.login(password.username, password.password)
    donenow = 0
    # category = [site.Pages['File:TheoBotTestImage.png']]  # testing only
    category = mwclient.listing.Category(site, 'Category:Non-free images for NFUR review')
    for page in category:
        image = NFURPage(page)
        try:
            if image.final_contents and bot.donenow("User:Theo's Little Bot/disable/nfur",
                                                    donenow=donenow, donenow_div=5, shutdown=40):
                page.save(image.final_contents,
                          summary="[[WP:BOT|Bot]] on trial: Adding autogenerated FUR rationale - "
                                  "feel free to expand! ([[User:Theo's Little Bot/disable/nfur|disable]])")
                donenow += 1
        except AttributeError:
            continue
def __init__(self):
    """We're ready!"""
    self.donenow = 0
    self.get_current_articles()
    associated_wikiprojects = {}
    for article in self.articles:
        associated_wikiprojects[article] = self.process_article(article)
    for article, wikiprojects in associated_wikiprojects.items():
        for project in wikiprojects:
            if bot.donenow("User:Theo's Little Bot/disable/tafi",
                           donenow=self.donenow, donenow_div=5):
                self.notify_wikiproject(project, article)
                self.donenow += 1
            else:
                print "Bot was disabled."
                sys.exit()
def main():
    global site
    site = mwclient.Site('en.wikipedia.org')
    site.login(password.username, password.password)
    print "And we're live."
    connection = MySQLdb.connect(
        host='enwiki.labsdb',
        db='enwiki_p',
        read_default_file='~/replica.my.cnf'
    )
    # The script runs in increments: each run processes and fixes
    # a batch of articles and then stops.
    # !todo figure out how long a run takes vs replag
    # and then optimize crontab
    cursor = connection.cursor()
    query = """\
SELECT page_title
FROM externallinks
JOIN page ON page_id = el_from
WHERE el_to LIKE "%&utm_%=%"
AND page_namespace = 0
LIMIT 5000;
"""
    cursor.execute(query)
    donenow = 0
    for title in cursor.fetchall():
        title = title[0].decode("utf-8")  # rows come back as tuples
        if bot.donenow("User:Theo's Little Bot/disable/tracking",
                       donenow=donenow, donenow_div=5):
            if bot.nobots(page=title, task='tracking'):
                if process(site.Pages[title]):
                    donenow += 1
                else:
                    print "No changes to make."
            else:
                print "Bot was denied, boo hoo."
        else:
            print "Bot was disabled...shutting down..."
            sys.exit()
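
# NOTE: illustrative sketch only -- process() is defined elsewhere in this
# script. Based on the SQL query above, it is assumed to strip utm_* tracking
# parameters from external links in the page text, save the page, and return
# True when it changed something. The regexes and edit summary below are this
# sketch's assumptions, not the original implementation.
def process_sketch(page):
    contents = page.edit()
    # Drop utm_source/utm_medium/... query parameters, keeping the rest of the URL.
    cleaned = re.sub(r"([?&])utm_[a-z]+=[^&\s\]|}]*&?", r"\1", contents, flags=re.UNICODE)
    cleaned = re.sub(r"[?&](?=[\s\]|}])", "", cleaned, flags=re.UNICODE)  # tidy dangling ? or &
    if cleaned == contents:
        return False
    page.save(cleaned,
              summary="Removing utm_ tracking parameters from external links "
                      "([[WP:BOT|bot]] - [[User:Theo's Little Bot/disable/tracking|disable]])")
    return True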
results = mwclient.listing.List(site=site, list_name='exturlusage', prefix='eu',
                                euquery="*." + link, euprop='title')
for item in results:
    full_results.append(item)

print "Generating list of mainspace pages linking to {0}...".format(naughty_links)
for page in full_results:
    # Compare titles, since to_process holds Page objects rather than strings.
    if page[u'ns'] == 0 and page[u'title'] not in [p.name for p in to_process]:
        print("{0}".format(page[u'title']).encode('UTF-8'))
        to_process.append(site.Pages[page[u'title']])

print "Processing mainspace pages linking to {0}...".format(naughty_links)
donenow = 0
for page in to_process:
    if bot.donenow("User:Theo's Little Bot/disable/external links",
                   donenow=donenow, donenow_div=5):
        print "~~~\n~~~\n"
        contents = page.edit()
        for link in naughty_links:
            try:
                refname = re.search(r"<ref(?:\s*name\s*=\s*(.*?))>[^<]*?" + link.replace('.', r'\.') + r".*?</ref>",
                                    contents, flags=re.UNICODE).groups()[0]
                print refname
                # Remove the named reference itself, then any reuses of that name.
                contents = re.sub(r"<ref(?:\s*name\s*=\s*(.*?))>[^<]*?" + link.replace('.', r'\.') + r".*?</ref>",
                                  '', contents, flags=re.UNICODE | re.DOTALL)
                contents = re.sub(r"<ref.*?" + refname.replace('"', '') + r".*?>", "",
                                  contents, flags=re.UNICODE)
            except AttributeError:
                pass
            # Remove bulleted list entries and bare bracketed links to the domain.
            contents = re.sub(r"\*(?!.*?<ref).*?" + link.replace('.', r'\.') + r".*", '',
                              contents, flags=re.UNICODE)
            contents = re.sub(r"\[.*" + link.replace('.', r'\.') + r".*]", '',
                              contents, flags=re.UNICODE)
        page.save(contents, summary=summary)
        donenow += 1
    else:
        sys.exit()  # the bot has been disabled on-wiki
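
# NOTE: standalone demonstration (not part of the original source) of the
# reference-removal step above: a named <ref> pointing at a blacklisted domain
# is removed together with any later reuses of the same ref name.
def _demo_strip_naughty_ref():
    link = "badsite.com"
    contents = 'Fact.<ref name="spam">http://badsite.com/page</ref> More.<ref name="spam" />'
    refname = re.search(r"<ref(?:\s*name\s*=\s*(.*?))>[^<]*?" + link.replace('.', r'\.') + r".*?</ref>",
                        contents, flags=re.UNICODE).groups()[0]
    contents = re.sub(r"<ref(?:\s*name\s*=\s*(.*?))>[^<]*?" + link.replace('.', r'\.') + r".*?</ref>",
                      '', contents, flags=re.UNICODE | re.DOTALL)
    contents = re.sub(r"<ref.*?" + refname.replace('"', '') + r".*?>", "", contents, flags=re.UNICODE)
    print contents  # -> 'Fact. More.'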
def run(self):
    category = mwclient.listing.Category(site, 'Category:All articles with dead external links')
    # category = [site.Pages['10 Hronia Mazi']]  # debugging only
    for page in category:
        dead_refs = []
        print page.page_title
        orig_contents = page.edit()
        contents = page.edit()
        number_done = 0
        all_refs = re.findall(r"<ref[^>]*>.*?</ref>", contents, flags=re.UNICODE | re.IGNORECASE)
        for ref in all_refs:
            ref_lower = ref.lower()
            if any('{{' + name in ref_lower for name in self.deadlink_names):
                dead_refs.append(ref)
        for ref in dead_refs:
            ref_code = mwparserfromhell.parse(ref)
            updated = False
            for template in ref_code.filter_templates():
                if "cite web" in template.name and template.has_param('url') and not template.has_param('archiveurl'):
                    url = unicode(template.get('url').value.strip())
                    try:
                        if url.find('web.archive.org') != -1:
                            okay_to_edit = False
                            print "The url is already an archive link!"
                        elif requests.get(url, timeout=15).status_code != requests.codes.ok:
                            okay_to_edit = True
                        else:
                            okay_to_edit = False
                            print "No need to add an archive, since the citation's URL currently works!"
                    except:  # network errors, timeouts, malformed URLs: treat the link as dead
                        okay_to_edit = True
                    if okay_to_edit:
                        if template.has_param('accessdate'):
                            try:
                                accessdate = parser.parse(str(template.get('accessdate').value))
                                wayback_date = accessdate.strftime("%Y%m%d%H%M%S")
                                r = requests.get("http://web.archive.org/web/{date}/{url}".format(date=wayback_date, url=url))
                            except ValueError:  # in case we can't parse the accessdate
                                r = requests.get("http://web.archive.org/web/form-submit.jsp",
                                                 params={'url': url, 'type': 'replay'})
                        else:
                            r = requests.get("http://web.archive.org/web/form-submit.jsp",
                                             params={'url': url, 'type': 'replay'})
                        print r.url
                        print r.status_code
                        if r.status_code == requests.codes.ok:
                            number_done += 1
                            updated = True
                            wayback_url = r.url
                            try:
                                wayback_date_object = datetime.strptime(wayback_url.split('/')[4], "%Y%m%d%H%M%S")
                                wayback_date = wayback_date_object.strftime('%d %B %Y')
                                template.add('archivedate', wayback_date)
                            except ValueError:
                                print "Unable to fetch date...no worries, we have exception handling!"
                            template.add('archiveurl', wayback_url)
                        else:
                            print "{url} not archived in wayback machine.".format(url=url)
                            continue  # this url was not archived by the wayback machine; nothing we can do here
                    else:
                        print "Not adding archive link, per above."
            for template in ref_code.filter_templates():
                nameoftemp = template.name.lower()
                if updated and any(name in nameoftemp for name in self.deadlink_names):
                    ref_code.remove(template)  # drop the dead-link tag now that an archive was added
            if updated:
                new_ref = unicode(ref_code)
                contents = re.sub(re.escape(ref), new_ref, contents, flags=re.U)
        if not self.DRYRUN and number_done > 0:
            if bot.donenow("User:Theo's Little Bot/disable/deadlinks",
                           donenow=self.donenow, donenow_div=5):
                if bot.nobots(page=page.page_title):
                    try:
                        page.save(contents,
                                  summary="Adding archiveurl for {0} dead link{1} ([[WP:BOT|bot]] - [[User:Theo's Little Bot/disable/deadlinks|disable]])".format(
                                      number_done, 's' if number_done > 1 else ''))
                        print "{0} saved!".format(page.page_title)
                        self.donenow += 1
                    except mwclient.errors.EditError as e:
                        print "ERROR - unable to save page: ", e
                else:
                    print "Could not save page...bot not authorized."
            else:
                print "Bot was disabled."
                sys.exit()
        elif self.DRYRUN and self.VERBOSE:
            diff = difflib.unified_diff(orig_contents.splitlines(), contents.splitlines())
            print '\n'.join(diff)
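
# NOTE: illustrative sketch only -- bot.nobots() is not defined in this file.
# It is assumed to implement {{bots}}/{{nobots}} exclusion compliance: the bot
# skips any page whose wikitext opts out of bot edits (optionally per user).
# The matching rules below are a simplification; `task` is accepted but ignored
# here, and the global `site` object is assumed as elsewhere in these scripts.
def nobots_sketch(page, user=None, task=None):
    text = site.Pages[page].edit()
    if re.search(r"\{\{\s*nobots\s*\}\}", text, flags=re.UNICODE | re.IGNORECASE):
        return False  # page opts out of all bots
    match = re.search(r"\{\{\s*bots\s*\|\s*deny\s*=\s*([^}]*)\}\}", text,
                      flags=re.UNICODE | re.IGNORECASE)
    if match:
        denied = [name.strip().lower() for name in match.group(1).split(",")]
        if "all" in denied or (user is not None and user.lower() in denied):
            return False
    return True  # no exclusion found; editing is allowed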