Example #1
	def process_pages(self):
		"""Replace empty {{citation}}/{{cite}} templates with {{citation needed}}."""
		for page in self.pages:
			# Re-check the on-wiki kill switch every 5 edits.
			if bot.donenow("User:Theo's Little Bot/disable/empty citations", donenow=self.donenow, donenow_div=5):
				contents = page.edit()
				# Any trailing periods (group 2) are moved in front of the inserted tag.
				new_contents = re.sub(r"""{{(citation|cite)}}(\.*)""", r"""\2{{citation needed|{{subst:DATE}}}}""", contents, flags=re.UNICODE)
				page.save(new_contents, summary="Converting empty {{[[Template:Citation|citation]]}} to {{[[Template:citation needed|citation needed]]}} ([[WP:BOT|bot]] - [[User:Theo's Little Bot/disable/empty citations|disable]])")
				self.donenow += 1
			else:
				sys.exit()
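
Every example here gates its edits on bot.donenow(...), a helper from the bot's own utility module that is never shown. Below is a minimal sketch of what it plausibly does, inferred only from the call sites (title of an on-wiki kill-switch page, running edit count, check interval, optional per-run cap); the body, the sentinel value, and the reliance on the module-level site object are all assumptions.

def donenow(disable_page, donenow=0, donenow_div=5, shutdown=None):
	"""Return True if the bot may keep editing (hypothetical reconstruction)."""
	if shutdown is not None and donenow >= shutdown:
		return False  # hit the per-run edit cap
	if donenow % donenow_div != 0:
		return True  # only poll the kill switch every donenow_div edits
	# Assumed convention: the disable page holds "true" while editing is allowed.
	return site.Pages[disable_page].edit().strip().lower() == "true"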
Example #2
def process_page(page):
	"""Given an image object, gets its uploader and
	its upload date, fills in {{Information}} for it,
	and saves the new page.
	"""
	revision = page.revisions(dir='newer').next()  # oldest revision identifies the uploader

	user = revision['user']

	date = get_exif_date(page)
	if date is None:
		date = time.strftime("%d %B %Y", revision['timestamp'])

	contents = page.edit()

	if contents != "":
		description = contents.strip()
		desc_code = mwparserfromhell.parse(description)
		for bad_code in desc_code.ifilter_templates(): # Remove templates
			description = description.replace(unicode(bad_code),'')
		for bad_code in desc_code.ifilter_headings(): # Remove headers
			description = description.replace(unicode(bad_code),'')
		if description.find('<nowiki') != -1:
			return # Skip complex descriptions
		description = description.replace('|','{{!}}') # Escape pipe symbols
		description = re.sub(r"""[ ]{2,}"""," ",description,flags=re.U) # Remove excessive spaces
		description = re.sub(r"""\[\[(?:File|Image):(.*?)(?:\|.*?)\]\]""",r"[[:File:\1]]",description,flags=re.U) # Turn images into links
		description = re.sub(r"""\[\[User:.*?\]\] \(\[\[User talk:J.*?\]\]\).*?\(UTC\)""",'',description,flags=re.U) # Remove signatures when possible
	else:
		description = ""

	contents = u"""{{Information
| description = """+description+"""
| source      = {{own}}
| date        = """ + unicode(date) + """
| author      = {{subst:usernameexpand|""" + user.replace(" ","_") + """}}
}}\n""" + contents

	global donenow
	if bot.donenow("User:Theo's Little Bot/disable/selfimages",donenow=donenow,donenow_div=5,shutdown=100) == True:
		# adding the template
		page.save(contents,"[[WP:BOT|Bot]]: Automatically adding {{[[Template:Information|Information]]}} to self-published work) ([[User:Theo's Little Bot/disable/selfimages|disable]]")
		donenow += 1
		# notifying the uploader
		usertalktitle = "User talk:"+user
		if bot.nobots(usertalktitle, user="******", task='selfimages'):
			usertalk = site.Pages[usertalktitle]
			notification = "\n\n== Notification of automated file description generation ==\n{{subst:Un-botfill|file="+page.page_title+"|sig=~~~~}}"
			usertalk.save(appendtext=notification, summary="[[WP:BOT|Bot]]: Notifying user about autogenerated {{[[Template:Information|Information]]}} addition ([[User:Theo's Little Bot/disable/selfimages|disable]])", redirect=True)
	else:
		sys.exit()
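
bot.nobots (used here and again in Examples 6 and 8) is the bot's exclusion-compliance check; its source is not included. Here is a rough sketch of the standard {{bots}}/{{nobots}} opt-out convention it presumably implements; the helper body is an assumption, and it reuses the module-level site and re of these scripts.

def nobots(page, user=None, task=None):
	"""Return True if the bot may edit `page` (hypothetical {{bots}}/{{nobots}} check)."""
	text = site.Pages[page].edit()
	if re.search(r"\{\{nobots\}\}", text, flags=re.I):
		return False  # blanket opt-out
	denied = re.search(r"\{\{bots\s*\|\s*deny\s*=\s*(.*?)\}\}", text, flags=re.I)
	if denied:
		names = [n.strip().lower() for n in denied.group(1).split(",")]
		if "all" in names:
			return False
		if user is not None and user.lower() in names:
			return False
		if task is not None and task.lower() in names:
			return False
	return True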
Example #3
 def process_page(self, page):
     if bot.donenow("User:Theo's Little Bot/disable/archiveurl", donenow=self.donenow, donenow_div=5):
         print "Processing " + page.encode("ascii", "replace")
         page = site.Pages[page]
         text = page.edit()
         wikicode = mwparserfromhell.parse(text)
         for template in wikicode.filter_templates():
             if "cite web" in template.name and template.has_param("url") == False:
                 archiveurl = None
                 for param in template.params:
                     items = param.strip().split("=")
                     if items[0] == "url":
                         continue
                     if items[0] == "archiveurl":
                         archiveurl = items[0]
                 if archiveurl is not None:
                     if re.search(r"web\.archive\.org", unicode(template), flags=re.U) != None:
                         try:
                             new_url = re.search(
                                 r"\|[\s]*archiveurl[\s]*=[\s]*(?:http://|https://)web.archive.org/web/\d*/(.*?)(?:\||}})",
                                 unicode(template),
                                 flags=re.UNICODE | re.M,
                             ).groups(0)[0]
                         except AttributeError:
                             try:
                                 new_url = re.search(
                                     r"\|[\s]*archiveurl[\s]*=[\s]*(?:http://|https://)pandora.nla.gov.au/nph-wb/\d*/(.*?)(?:\||}})",
                                     unicode(template),
                                     flags=re.UNICODE | re.M,
                                 ).groups(0)[0]
                             except AttributeError:
                                 print "I don't recognize the archive structure, sadly. Skipping."
                                 continue
                         if re.search(r"(http|https)://", new_url.strip()) == None:
                             new_url = "http://" + new_url
                         template.add("url", new_url.strip())
                         print "Added url parameter to {{cite web}} template."
                 else:
                     continue
         text = unicode(wikicode)
         try:
             page.save(
                 text,
                 summary="Fixing references: adding url parameter ([[WP:BOT|bot]] - [[User:Theo's Little Bot/disable/archiveurl|disable]])",
             )
             self.donenow += 1  # we only count it as "done" when we make a change
         except Exception:
             print "Unable to save page; skipping."
Example #4
def main():
	global site
	site = mwclient.Site('en.wikipedia.org')
	site.login(password.username, password.password)
	donenow = 0

	# category = [site.Pages['File:TheoBotTestImage.png']] - testing only
	category = mwclient.listing.Category(site, 'Category:Non-free images for NFUR review')
	for page in category:
		image = NFURPage(page)
		try:
		if image.final_contents and bot.donenow("User:Theo's Little Bot/disable/nfur", donenow=donenow, donenow_div=5, shutdown=40):
			page.save(image.final_contents, summary="[[WP:BOT|Bot]] on trial: Adding autogenerated FUR rationale - feel free to expand! ([[User:Theo's Little Bot/disable/nfur|disable]])")
			donenow += 1
		except AttributeError:
			continue
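
The password module imported by these scripts is the author's credentials file and is not shown anywhere. Judging from the password.username / password.password references, it is presumably a two-line module along these lines (contents hypothetical, obviously):

# password.py -- kept out of version control (hypothetical contents)
username = "Theo's Little Bot"
password = "hunter2"  # placeholder, not a real credential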
Example #5
	def __init__(self):
		"""We're ready!"""
		self.donenow = 0
		self.get_current_articles()
		associated_wikiprojects = {}
		for article in self.articles:
			associated_wikiprojects[article] = self.process_article(article)

		for article, wikiprojects in associated_wikiprojects.items():
			for project in wikiprojects:
				if bot.donenow("User:Theo's Little Bot/disable/tafi", donenow=self.donenow, donenow_div=5):
					self.notify_wikiproject(project,article)
					self.donenow += 1
				else:
					print "Bot was disabled."
					sys.exit()
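
Neither get_current_articles nor notify_wikiproject is shown. Based on the append-and-save pattern in Example 2, the notifier plausibly looks something like the sketch below; the target title, template name, and summary are invented placeholders.

	def notify_wikiproject(self, project, article):
		"""Leave a talk-page note that `article` was picked for improvement (hypothetical)."""
		talk = site.Pages["Wikipedia talk:" + project]
		notice = ("\n\n== Today's articles for improvement ==\n"
		          "{{subst:TAFI notice|article=" + article + "}} ~~~~")
		talk.save(appendtext=notice, summary="[[WP:BOT|Bot]]: Notifying WikiProject about a [[WP:TAFI|TAFI]] selection")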
Example #6
def main():
	global site
	site = mwclient.Site('en.wikipedia.org')
	site.login(password.username, password.password)

	print "And we're live."
	connection = MySQLdb.connect(
		host = 'enwiki.labsdb',
		db = 'enwiki_p',
		read_default_file = '~/replica.my.cnf'
	)

	# The script runs in 5,000-article increments.
	# In other words, in each run, it will process
	# and fix up to 5,000 articles and then stop.
	# !todo figure out how long a run takes vs replag
	# and then optimize crontab
	cursor = connection.cursor()
	query = """\
	SELECT page_title
	FROM externallinks
	JOIN page
	ON page_id = el_from
	WHERE el_to LIKE "%&utm_%=%"
	AND page_namespace = 0
	LIMIT 5000;
	"""
	cursor.execute(query)

	donenow = 0
	for title in cursor.fetchall():
		title = title[0].decode("utf-8") # since tuples are returned
		if bot.donenow("User:Theo's Little Bot/disable/tracking",donenow=donenow,donenow_div=5) == True:
			if bot.nobots(page=title,task='tracking') == True:
				if process(site.Pages[title]) == True:
					donenow += 1
				else:
					print "No changes to make."
			else:
				print "Bot was denied, boo hoo."
			
		else:
			print "Bot was disabled...shutting down..."
			sys.exit()
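
process() itself is not shown. Given the SQL filter (external links containing &utm_…=) and the task name 'tracking', it presumably strips utm_* tracking parameters from a page's external links and reports whether anything changed. A rough sketch under those assumptions:

import re
import urllib
import urlparse  # Python 2 stdlib

def strip_utm(url):
	"""Drop utm_* query parameters from a single URL (sketch)."""
	parts = urlparse.urlsplit(url)
	params = [(k, v) for k, v in urlparse.parse_qsl(parts.query)
	          if not k.lower().startswith("utm_")]
	return urlparse.urlunsplit((parts.scheme, parts.netloc, parts.path,
	                            urllib.urlencode(params), parts.fragment))

def process(page):
	"""Hypothetical reconstruction: clean tracking links, save, report a change."""
	contents = page.edit()
	new_contents = re.sub(r"""https?://[^\s\]<>"]+""",
	                      lambda m: strip_utm(m.group(0)),
	                      contents, flags=re.UNICODE)
	if new_contents == contents:
		return False  # no changes to make
	page.save(new_contents, summary="Removing utm_* tracking parameters ([[WP:BOT|bot]] - [[User:Theo's Little Bot/disable/tracking|disable]])")
	return True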
Example #7
	results = mwclient.listing.List(site=site, list_name='exturlusage', prefix='eu', euquery="*." + link, euprop='title')
	for item in results:
		full_results.append(item)

print "Generating list of mainspace pages linking to {0}...".format(naughty_links)

for page in full_results:
	if page[u'ns'] == 0 and page[u'title'] not in to_process:
		print("{0}".format(page[u'title']).encode('UTF-8'))
		to_process.append(site.Pages[page[u'title']])

print "Processing mainspace pages linking to {0}...".format(naughty_links)

donenow = 0
for page in to_process:
	if bot.donenow("User:Theo's Little Bot/disable/external links",donenow=donenow,donenow_div=5) == True:
		print "~~~\n~~~\n"
		contents = page.edit()
		for link in naughty_links:
			try:
				refname = re.search(r"<ref(?:\s*name\s*=\s*(.*?))>[^<]*?" + link.replace('.',r'\.') + r".*?</ref>",contents,flags=re.UNICODE).groups()[0]
				print refname
				contents = re.sub(r"<ref(?:\s*name\s*=\s*(.*?))>[^<]*?" + link.replace('.',r'\.') + r".*?</ref>", '', contents, flags=re.UNICODE | re.DOTALL)
				contents = re.sub(r"<ref.*?" + refname.replace('"',r'') + r".*?>", "", contents, flags=re.UNICODE)
			except AttributeError:
				pass	
			contents = re.sub(r"\*(?!.*?<ref).*?" + link.replace('\.',r'\.') + r".*", '', contents, flags=re.UNICODE)
			contents = re.sub(r"\[.*" + link.replace('.',r'\.') + r".*]", '', contents, flags=re.UNICODE)
		page.save(contents,summary=summary)
		donenow += 1
	else:
		print "Bot was disabled...shutting down..."
		sys.exit()
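
The two substitutions in the try block above first delete the whole <ref name=...> that cites a blacklisted link, then any reuse of that ref name. A toy demonstration with invented wikitext:

# -*- coding: utf-8 -*-
import re

link = "spam-site.example"  # hypothetical blacklisted domain
contents = (u'Fact one.<ref name="spam">http://spam-site.example/page</ref> '
            u'Fact two.<ref name="spam"/>')
refname = re.search(r"<ref(?:\s*name\s*=\s*(.*?))>[^<]*?" + link.replace('.', r'\.') + r".*?</ref>",
                    contents, flags=re.UNICODE).groups()[0]  # -> '"spam"'
contents = re.sub(r"<ref(?:\s*name\s*=\s*(.*?))>[^<]*?" + link.replace('.', r'\.') + r".*?</ref>",
                  '', contents, flags=re.UNICODE | re.DOTALL)
contents = re.sub(r"<ref.*?" + refname.replace('"', '') + r".*?>", '', contents, flags=re.UNICODE)
print contents  # -> u'Fact one. Fact two.'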
Example #8
	def run(self):
		category = mwclient.listing.Category(site, 'Category:All articles with dead external links')
		# category = [site.Pages['10 Hronia Mazi']] - debugging only
		for page in category:
			dead_refs = []
			print page.page_title
			orig_contents = page.edit()
			contents = orig_contents  # keep a pristine copy for the dry-run diff
			number_done = 0
			all_refs = re.findall(r"""<ref[^>]*>.*?</ref>""",contents,flags=re.UNICODE | re.IGNORECASE)
			for ref in all_refs:
				ref_lower = ref.lower()
				if any('{{' + name in ref_lower for name in self.deadlink_names):
					dead_refs.append(ref)
			for ref in dead_refs:
				ref_code = mwparserfromhell.parse(ref)
				updated = False
				for template in ref_code.filter_templates():
					if "cite web" in template.name and template.has_param('archiveurl') == False:
						url = unicode(template.get('url').value.strip())
						try:
							if url.find('web.archive.org') != -1:
								okay_to_edit = False
								print "The url is already an archive link!"
							elif requests.get(url, timeout=15).status_code != requests.codes.ok:
								okay_to_edit = True  # dead link: safe to add an archive
							else:
								okay_to_edit = False
								print "No need to add an archive, since the citation's URL currently works!"
						except requests.exceptions.RequestException:
							okay_to_edit = True  # request failed outright; treat as dead
						if okay_to_edit:
							if template.has_param('accessdate'):
								try:
									accessdate = parser.parse(str(template.get('accessdate').value))
									wayback_date = accessdate.strftime("%Y%m%d%H%M%S")
									r = requests.get("http://web.archive.org/web/{date}/{url}".format(date=wayback_date,url=url)) 
								except ValueError: # in case we can't parse the accessdate
									r = requests.get("http://web.archive.org/web/form-submit.jsp", params={'url':url, 'type':'replay'})
							else:
								r = requests.get("http://web.archive.org/web/form-submit.jsp", params={'url':url, 'type':'replay'})
							print r.url
							print r.status_code
							if r.status_code == requests.codes.ok:
								number_done += 1
								updated = True
								wayback_url = r.url
								try:
									wayback_date_object = datetime.strptime(wayback_url.split('/')[4],"%Y%m%d%H%M%S")
									wayback_date = wayback_date_object.strftime('%d %B %Y')
									template.add('archivedate',wayback_date)
								except ValueError:
									print "Unable to fetch date...no worries, we have exception handing!"
								template.add('archiveurl',wayback_url)
							else:
								print "{url} not archived in wayback machine.".format(url=url)
								continue # this url was not archived by the wayback machine; nothing we can do here.
						else:
							print "Not adding archive link, per above."
				for template in ref_code.filter_templates():
					nameoftemp = template.name.lower()
					if updated and any(name in nameoftemp for name in self.deadlink_names):
						ref_code.remove(template)
				if updated:
					new_ref = unicode(ref_code)
					# plain str.replace avoids re.sub's escape handling in the replacement text
					contents = contents.replace(ref, new_ref)
			if not self.DRYRUN and number_done > 0:
				if bot.donenow("User:Theo's Little Bot/disable/deadlinks", donenow=self.donenow, donenow_div=5):
					if bot.nobots(page=page.page_title):
						try:
							page.save(contents,summary="Adding archiveurl for {0} dead link{1} ([[WP:BOT|bot]] - [[User:Theo's Little Bot/disable/deadlinks|disable]])".format(number_done,'s' if number_done > 1 else ''))
							print "{0} saved!".format(page.page_title)
							self.donenow += 1
						except mwclient.errors.EditError as e:
							print "ERROR - unable to save page: ", e
					else:
						print "Could not save page...bot not authorized."
				else:
					print "Bot was disabled."
					sys.exit()
			elif self.DRYRUN and self.VERBOSE:
				diff = difflib.unified_diff(orig_contents.splitlines(), contents.splitlines())
				print '\n'.join(diff)
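
The form-submit.jsp endpoint used above is the Wayback Machine's legacy redirect interface. The same lookup can be done today with the Wayback availability API; the sketch below is a suggested modern equivalent, not part of the original bot.

import requests

def closest_snapshot(url, timestamp=None):
	"""Ask https://archive.org/wayback/available for the closest archived copy.

	Returns (archive_url, timestamp) or None. Suggested replacement for the
	legacy form-submit.jsp lookup used in the example above.
	"""
	params = {'url': url}
	if timestamp is not None:  # e.g. "20130615000000", as built from |accessdate=
		params['timestamp'] = timestamp
	r = requests.get("https://archive.org/wayback/available", params=params, timeout=15)
	snap = r.json().get('archived_snapshots', {}).get('closest')
	if snap and snap.get('available'):
		return snap['url'], snap['timestamp']
	return None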