import pywikibot
from urllib.parse import quote

# WikiProjectTools is this codebase's local database helper; the import path
# is assumed here.
from wikiprojecttools import WikiProjectTools


def main():
    wptools = WikiProjectTools()
    bot = pywikibot.Site('en', 'wikipedia')
    q = ('select page_title from page where page_namespace = 0 '
         'and page_is_redirect = 0 and page_id not in '
         '(select page_id from page join page_props on pp_page = page_id '
         'where page_namespace = 0 and pp_propname = "wikibase_item") '
         'order by page_id;')
    no_wikidata = [x[0].decode('utf-8') for x in wptools.query('wiki', q, None)]
    total_count = len(no_wikidata)  # Capturing this before truncating the list
    no_wikidata = no_wikidata[:100]
    page = pywikibot.Page(bot, 'User:Reports_bot/No_Wikidata_item')
    content = "'''Total Articles Missing From Wikidata:''' " + str(total_count) + "\n\n"
    for title in no_wikidata:
        content += ("* [[" + title.replace('_', ' ')
                    + "]] ([https://www.wikidata.org/w/index.php?search="
                    + quote(title)
                    + "&title=Special%3ASearch&fulltext=1 Search on Wikidata])\n")
    page.text = content
    page.save("Updating list", minor=False, quiet=True)
def prepare(self, saveto):
    wptools = WikiProjectTools()
    bot = pywikibot.Site('en', 'wikipedia')

    # Retrieve list of WikiProjects
    projects = []
    for row in wptools.query('index', 'select distinct pi_project from projectindex;', None):
        projects.append(row[0])

    runtime = datetime.datetime.utcnow().strftime('%H:%M, %d %B %Y (UTC)')
    q = ('select distinct page.page_title from page '
         'join categorylinks on page.page_id = categorylinks.cl_from '
         'left join redirect on page.page_id = redirect.rd_from '
         'where page_namespace = 4 '
         'and page_title not like "%/%" '
         'and rd_title is null '
         'and (cl_to in '
         '(select page.page_title from page '
         'where page_namespace = 14 and '
         'page_title like "%\\_WikiProjects" '
         'and page_title not like "%\\_for\\_WikiProjects" '
         'and page_title not like "%\\_of\\_WikiProjects") '
         'or page_title like "WikiProject\\_%");')
    formaldefinition = wptools.query('wiki', q, None)  # http://quarry.wmflabs.org/query/3509
    for row in formaldefinition:
        row = "Wikipedia:" + row[0].decode('utf-8')
        if row not in projects:
            projects.append(row)
    projects.sort()

    # Query the API for watcher counts in batches of 50 titles
    packages = [projects[i:i + 50] for i in range(0, len(projects), 50)]
    report = {}
    for package in packages:
        url = "https://en.wikipedia.org/w/api.php?action=query&format=json&prop=info&inprop=watchers&titles="
        for title in package:
            url += quote(title) + "|"
        url = url[:-1]  # Truncate trailing pipe
        apiquery = requests.get(url)
        apiquery = apiquery.json()
        for pagedata in apiquery['query']['pages'].values():
            if 'watchers' in pagedata:
                if pagedata['watchers'] > 29:  # Only report pages with at least 30 watchers
                    report[pagedata['title']] = pagedata['watchers']

    report = sorted(report.items(), key=operator.itemgetter(1), reverse=True)
    contents = ('List of WikiProjects by number of watchers of their main pages and talk pages. '
                'A WikiProject not appearing on this list has fewer than 30 watchers. '
                'Data as of <onlyinclude>' + runtime + '</onlyinclude>')
    contents += '\n\n{| class="wikitable sortable plainlinks"\n|-\n! No.\n! WikiProject\n! Watchers\n'
    counter = 0
    for pair in report:
        counter += 1
        contents += "|-\n| {0}\n| [[{1}]]\n| {2}\n".format(str(counter), pair[0], pair[1])
    contents += "|}"

    page = pywikibot.Page(bot, saveto)
    page.text = contents
    page.save("Updating report", minor=False, quiet=True)
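# As a design note, the hand-built URL above can be avoided by letting
# requests encode the query parameters. A minimal sketch of the same watcher
# lookup (the function name is illustrative, not part of this codebase):
def watcher_counts(titles):
    r = requests.get(
        'https://en.wikipedia.org/w/api.php',
        params={'action': 'query', 'format': 'json', 'prop': 'info',
                'inprop': 'watchers', 'titles': '|'.join(titles)})
    # Pages without a visible watcher count simply lack the 'watchers' key.
    return {p['title']: p['watchers']
            for p in r.json()['query']['pages'].values() if 'watchers' in p}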
def main(): wptools = WikiProjectTools() bot = pywikibot.Site("en", "wikipedia") q = ( "select page_title from page where page_namespace = 0 " "and page_is_redirect = 0 and page_title not in " "(select page_title from page join page_props on pp_page = page_id " 'where page_namespace = 0 and pp_propname = "wikibase_item") ' "order by page_id;" ) no_wikidata = [x[0].decode("utf-8") for x in wptools.query("wiki", q, None)] total_count = len(no_wikidata) # Capturing this before truncating list no_wikidata = no_wikidata[:100] page = pywikibot.Page(bot, "User:Reports_bot/No_Wikidata_item") content = "'''Total Articles Missing From Wikidata:''' " + str(total_count) + "\n\n" for title in no_wikidata: content += ( "* [[" + title.replace("_", " ") + "]] ([https://www.wikidata.org/w/index.php?search=" + quote(title) + "&title=Special%3ASearch&fulltext=1 Search on Wikidata])\n" ) page.text = content page.save("Updating list", minor=False)
def main(rootpage, saveto):
    wptools = WikiProjectTools()
    bot = pywikibot.Site('en', 'wikipedia')
    output = 'These WikiProjects are not in any WikiProject meta-categories:\n\n'

    # Generating category whitelist
    wpcats = WikiProjectCategories()
    tree = wpcats.generate()
    whitelist = list(treegen(tree))  # Run through a simple generator function (sketched below) to produce a flat list
    whitelist = tuple(set(whitelist))  # De-duplicating and converting to a tuple

    page = pywikibot.Page(bot, rootpage + '/All')
    contents = mwph.parse(page.text)
    contents = contents.filter_templates()
    for t in contents:
        if t.name.strip() == "WikiProject directory entry small":
            project = str(t.get('project').value).strip().replace(' ', '_')
            # Give me a list of all the categories, as long as they are on the whitelist
            query = wptools.query(
                'wiki',
                'select distinct cl_to from categorylinks '
                'join page on categorylinks.cl_from = page.page_id '
                'where page_namespace in (4, 14) '
                'and page_title = {0} and cl_to in {1};'.format('"' + project + '"', whitelist),
                None)
            if len(query) == 0:  # If the page is in none of the whitelisted categories
                output += "# [[Wikipedia:{0}|{0}]]\n".format(project.replace('_', ' '))

    page = pywikibot.Page(bot, saveto)
    page.text = output
    page.save('Updating', minor=False)
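# treegen() is called above but is not defined in this section. A minimal
# sketch consistent with its use here: it flattens the nested dict-of-dicts
# produced by WikiProjectCategories.generate() (see build_cat_tree later in
# this section) into a flat sequence of category names.
def treegen(tree):
    for category, subtree in tree.items():
        yield category
        if subtree:  # Leaf nodes map to None
            yield from treegen(subtree)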
def main(rootpage):
    bot = pywikibot.Site('en', 'wikipedia')
    wptools = WikiProjectTools()
    config = json.loads(wptools.query('index', 'select json from config;', None)[0][0])

    postto = []
    # In this loop, *project* is a dictionary of configurations
    for project in config['projects']:
        if 'suggestbot' in project:  # Is the key even defined?
            if project['suggestbot'] == True and project['type'] == 'Category':
                postto.append(project['name'])
                page = pywikibot.Page(bot, rootpage + '/SuggestFarm/' + project['name'][10:])
                page.text = "{{{{User:SuggestBot/suggest|Category:{0}}}}}".format(project['source'])
                page.save("Requesting latest recommendations from SuggestBot", minor=False)

    print("Sleeping for 30 minutes.")
    time.sleep(1800)  # Sleeping 30 minutes to wait for SuggestBot to do its thing

    # In this loop, *project* is a string (the name of the project)
    for project in postto:
        page = pywikibot.Page(bot, rootpage + '/SuggestFarm/' + project[10:])
        # Isolating the table from the output
        table = page.text.split('{|', 1)[1]
        table = table.split('|}', 1)[0]
        table = '{|\n' + table + '\n|}'
        # Saving table to the WikiProject
        page = pywikibot.Page(bot, project + '/Edit articles')
        page.text = ('===Edit articles===\n{{WPX last updated|' + project
                     + '/Edit articles' + '}}\n\n' + table)
        # *asynchronous* replaces the old *async* keyword argument, which
        # collides with the async keyword in Python 3.7+
        page.save("Updating list", minor=False, asynchronous=True)
def main(rootpage, saveto):
    wptools = WikiProjectTools()
    bot = pywikibot.Site('en', 'wikipedia')
    output = 'These WikiProjects are not in any WikiProject meta-categories:\n\n'

    # Generating category whitelist
    wpcats = WikiProjectCategories()
    tree = wpcats.generate()
    whitelist = list(treegen(tree))  # Run through a simple generator function to produce a flat list
    whitelist = tuple(set(whitelist))  # De-duplicating and converting to a tuple

    page = pywikibot.Page(bot, rootpage + '/All')
    contents = mwph.parse(page.text)
    contents = contents.filter_templates()
    for t in contents:
        if t.name.strip() == "WikiProject directory entry":
            project = str(t.get('project').value).strip().replace(' ', '_')
            # Give me a list of all the categories, as long as they are on the whitelist
            query = wptools.query(
                'wiki',
                'select distinct cl_to from categorylinks '
                'join page on categorylinks.cl_from = page.page_id '
                'where page_namespace in (4, 14) '
                'and page_title = {0} and cl_to in {1};'.format('"' + project + '"', whitelist),
                None)
            if len(query) == 0:  # If the page is in none of the whitelisted categories
                output += "# [[Wikipedia:{0}|{0}]]\n".format(project.replace('_', ' '))

    page = pywikibot.Page(bot, saveto)
    page.text = output
    page.save('Updating', minor=False)
def main(): print("Loading...") wptools = WikiProjectTools() query = wptools.query('index', 'select pi_page, pi_project from projectindex;', None) pages = {} for row in query: pi_page = row[0] pi_project = row[1] try: pages[pi_project].append(pi_page) except KeyError: pages[pi_project] = [pi_page] # Compare! intersect_counts = {} regex = re.compile('/.*') for wikiproject_x in pages.keys(): # lol WikiProject X print("Working on: " + wikiproject_x) intersect_counts[wikiproject_x] = {} for wikiproject_y in pages.keys(): if wikiproject_x == wikiproject_y: continue # Don't compare a project to itself test1 = re.sub(regex, '', wikiproject_x) test2 = re.sub(regex, '', wikiproject_y) if test1 == test2: continue # Filters out comparisons where one is a subpage of another s = set(pages[wikiproject_x]) intersect_counts[wikiproject_x][wikiproject_y] = len( [n for n in pages[wikiproject_y] if n in s]) bot = pywikibot.Site('en', 'wikipedia') print("Sorting and saving...") for project in intersect_counts.keys(): # Sorts from highest to lowest ordered = sorted(intersect_counts[project].items(), key=operator.itemgetter(1), reverse=True) saveto = 'Wikipedia:Related_WikiProjects/' + project[10:] page = pywikibot.Page(bot, saveto) draft = '{{WPX header|color={{{1|#37f}}}|Related WikiProjects<noinclude>: [[' \ + project.replace('_', ' ') + '|]]</noinclude>}}\n' draft += '{{WPX list start|intro={{WPX last updated|' + saveto + '}}}}\n' for x in range(0, 10): if ordered[x][1] > 0: draft += "{{WPX block|color={{{1|#37f}}}|" \ + "largetext='''[[{0}|]]''' ([[Wikipedia:Related WikiProjects/{1}|view related]])|".format(ordered[x][0].replace('_', ' '), ordered[x][0].replace('_', ' ')[10:]) \ + "smalltext={0} articles in common}}}}\n".format(str(ordered[x][1])) draft += '{{WPX list end|more=' + saveto + '}}' if page.text != draft: page.text = draft page.save('Updating', minor=False, async=True)
def build_cat_tree(cat_name, max_depth=5):
    # Recursively builds a nested dictionary of WikiProject category names,
    # stopping after max_depth levels (leaves map to None).
    if max_depth == 0:
        return None
    wptools = WikiProjectTools()
    query = wptools.query(
        'wiki',
        'select distinct page.page_title from categorylinks '
        'join page on categorylinks.cl_from = page.page_id '
        'where page_namespace = 14 and cl_to = "{0}" '
        'and page_title like "%\\_WikiProjects" '
        'and page_title not like "Inactive_%";'.format(cat_name),
        None)
    retval = {}
    for row in query:
        category = row[0].decode('utf-8')
        retval[category] = build_cat_tree(category, max_depth=max_depth - 1)
    return retval
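# Example use: build the WikiProject category tree two levels deep, starting
# from the root category referred to elsewhere in these scripts:
tree = build_cat_tree('WikiProjects_by_area', max_depth=2)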
def __init__(self, viewdump=None):
    print("Initializing the Priority Predictor")
    self.wptools = WikiProjectTools()
    if viewdump is None:  # If a dumped JSON file of pageviews is not specified
        self.dump = getviewdump(self.wptools, 'en', days=30)
    else:
        with open(viewdump, 'r') as f:
            self.dump = json.load(f)  # Load pageviews from a dumped JSON file
def now(self):
    bot = pywikibot.Site('en', 'wikipedia')

    # Exports the contents of the wikiproject.json page.
    page = pywikibot.Page(bot, 'Wikipedia:WikiProject X/wikiproject.json')
    output = page.text

    # We now have the JSON blob, in string format.
    try:
        output = json.loads(output)
    except ValueError as ack:  # If JSON is invalid
        self.stopthepresses(bot, str(ack))

    # At this point, we have valid JSON at our disposal. But does it comply with the schema?
    schema = list(output['schema'].keys())
    for setting in output['defaults']:
        if setting not in schema:
            self.stopthepresses(bot, 'Invalid setting {0} in default configuration.'.format(setting))
    for entry in output['projects']:
        for setting in entry:
            if setting not in schema:
                self.stopthepresses(bot, 'Invalid setting {0} in project entry {1}'.format(setting, entry))

    # If the script hasn't been killed yet, save to the database.
    output = json.dumps(output)
    wptools = WikiProjectTools()
    wptools.query('index', 'create table config_draft (json mediumtext character set utf8 collate utf8_unicode_ci) engine=innodb character set=utf8;', None)
    wptools.query('index', 'insert into config_draft (json) values (%s);', (str(output),))
    wptools.query('index', 'drop table if exists config', None)
    wptools.query('index', 'rename table config_draft to config', None)
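# For illustration, a minimal wikiproject.json blob that would pass the
# validation above. The keys are drawn from how the config is used across
# these scripts; the values are placeholders, not the live configuration:
example_config = {
    "schema": {"name": None, "type": None, "source": None,
               "suggestbot": None, "new_discussions": None},
    "defaults": {"new_discussions": False},
    "projects": [
        {"name": "Wikipedia:WikiProject Women in Technology",
         "type": "Category",
         "source": "Women in technology",
         "suggestbot": True,
         "new_discussions": True}
    ]
}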
def main(): print("Loading...") wptools = WikiProjectTools() query = wptools.query('index', 'select pi_page, pi_project from projectindex;', None) pages = {} for row in query: pi_page = row[0] pi_project = row[1] try: pages[pi_project].append(pi_page) except KeyError: pages[pi_project] = [pi_page] # Compare! intersect_counts = {} regex = re.compile('/.*') for wikiproject_x in pages.keys(): # lol WikiProject X print("Working on: " + wikiproject_x) intersect_counts[wikiproject_x] = {} for wikiproject_y in pages.keys(): if wikiproject_x == wikiproject_y: continue # Don't compare a project to itself test1 = re.sub(regex, '', wikiproject_x) test2 = re.sub(regex, '', wikiproject_y) if test1 == test2: continue # Filters out comparisons where one is a subpage of another s = set(pages[wikiproject_x]) intersect_counts[wikiproject_x][wikiproject_y] = len([n for n in pages[wikiproject_y] if n in s]) bot = pywikibot.Site('en', 'wikipedia') print("Sorting and saving...") for project in intersect_counts.keys(): # Sorts from highest to lowest ordered = sorted(intersect_counts[project].items(), key=operator.itemgetter(1), reverse=True) saveto = 'Wikipedia:Related_WikiProjects/' + project[10:] page = pywikibot.Page(bot, saveto) draft = '{{WPX header|color={{{1|#37f}}}|Related WikiProjects<noinclude>: [[' \ + project.replace('_', ' ') + '|]]</noinclude>}}\n' draft += '{{WPX list start|intro={{WPX last updated|' + saveto + '}}}}\n' for x in range(0, 10): if ordered[x][1] > 0: draft += "{{WPX block|color={{{1|#37f}}}|" \ + "largetext='''[[{0}|]]''' ([[Wikipedia:Related WikiProjects/{1}|view related]])|".format(ordered[x][0].replace('_', ' '), ordered[x][0].replace('_', ' ')[10:]) \ + "smalltext={0} articles in common}}}}\n".format(str(ordered[x][1])) draft += '{{WPX list end|more=' + saveto + '}}' if page.text != draft: page.text = draft page.save('Updating', minor=False, async=True)
def build_cat_tree(cat_name, max_depth=5): if max_depth == 0: return None wptools = WikiProjectTools() query = wptools.query( 'wiki', 'select distinct page.page_title from categorylinks join page on categorylinks.cl_from=page.page_id where page_namespace = 14 and cl_to = "{0}" and page_title like "%\_WikiProjects" and page_title not like "Inactive_%";' .format(cat_name), None) retval = {} for row in query: category = row[0].decode('utf-8') retval[category] = build_cat_tree(category, max_depth=max_depth - 1) return retval
def go(self):
    wptools = WikiProjectTools()

    # Get list of WikiProjects that also have a self-named category
    output = ('This report highlights discrepancies in WikiProject categorization '
              'between WikiProjects and their self-named categories.\n\n')
    query = ('select page_title from page '
             'left join redirect on page.page_id = redirect.rd_from '
             'where page_title like "WikiProject\\_%" and page_namespace = 4 '
             'and page_title in (select page_title from page '
             'where page_title like "WikiProject\\_%" and page_namespace = 14) '
             'and rd_title is null;')
    for row in wptools.query('wiki', query, None):
        project = row[0].decode('utf-8')
        cl_projectspace = []   # read as "category links, Wikipedia namespace"
        cl_categoryspace = []  # read as "category links, Category namespace"
        for match in wptools.query(
                'wiki',
                'select cl_to from categorylinks '
                'join page on categorylinks.cl_from = page.page_id '
                'where page_namespace = 4 and page_title = "{0}" '
                'and cl_to like "%\\_WikiProjects" and cl_to not like "Active\\_%" '
                'and cl_to not like "Semi-active\\_%" and cl_to not like "Inactive\\_%" '
                'and cl_to not like "Defunct\\_%";'.format(project),
                None):
            cl_projectspace.append(match[0].decode('utf-8').replace('_', ' '))
        for match in wptools.query(
                'wiki',
                'select cl_to from categorylinks '
                'join page on categorylinks.cl_from = page.page_id '
                'where page_namespace = 14 and page_title = "{0}" '
                'and cl_to like "%\\_WikiProjects" and cl_to not like "Active\\_%" '
                'and cl_to not like "Semi-active\\_%" and cl_to not like "Inactive\\_%" '
                'and cl_to not like "Defunct\\_%";'.format(project),
                None):
            cl_categoryspace.append(match[0].decode('utf-8').replace('_', ' '))
        cl_projectspace.sort()
        cl_categoryspace.sort()

        if cl_projectspace == cl_categoryspace:
            continue  # Don't bother generating a report if both category lists match perfectly

        both = list(set(cl_projectspace).intersection(cl_categoryspace))
        project = project.replace('_', ' ')
        output += "* '''{0}'''\n".format(project)
        output += "** [[Wikipedia:{0}]]: ".format(project)
        for entry in cl_projectspace:
            if entry in both:
                output += "<span style='color: #999'>{0}</span> – ".format(entry)
            else:
                output += "<span style='color: #FF0000'>{0}</span> – ".format(entry)
        output = output[:-2] + "\n"  # Truncate trailing en dash and add a line break
        output += "** [[:Category:{0}]]: ".format(project)
        for entry in cl_categoryspace:
            if entry in both:
                output += "<span style='color: #999'>{0}</span> – ".format(entry)
            else:
                output += "<span style='color: #FF0000'>{0}</span> – ".format(entry)
        output = output[:-2] + "\n"  # Truncate trailing en dash and add a line break

    return output
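# Nothing in this section saves the wikitext that go() returns; a caller
# would need to publish it along these lines (the report title here is an
# assumption, not taken from the codebase):
def publish(self):
    bot = pywikibot.Site('en', 'wikipedia')
    page = pywikibot.Page(bot, 'User:Reports bot/WikiProject category alignment')
    page.text = self.go()
    page.save('Updating report', minor=False)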
def main(): print("Loading...") wptools = WikiProjectTools() query = wptools.query('index', 'select pi_page, pi_project from projectindex;', None) pages = {} for row in query: pi_page = row[0] pi_project = row[1] try: pages[pi_project].append(pi_page) except KeyError: pages[pi_project] = [pi_page] # Compare! intersect_counts = {} regex = re.compile('/.*') for wikiproject_x in pages.keys(): # lol WikiProject X print("Working on: " + wikiproject_x) intersect_counts[wikiproject_x] = {} for wikiproject_y in pages.keys(): if wikiproject_x == wikiproject_y: continue # Don't compare a project to itself test1 = re.sub(regex, '', wikiproject_x) test2 = re.sub(regex, '', wikiproject_y) if test1 == test2: continue # Filters out comparisons where one is a subpage of another s = set(pages[wikiproject_x]) intersect_counts[wikiproject_x][wikiproject_y] = len([n for n in pages[wikiproject_y] if n in s]) bot = pywikibot.Site('en', 'wikipedia') print("Sorting and saving...") for project in intersect_counts.keys(): # Sorts from highest to lowest ordered = sorted(intersect_counts[project].items(), key=operator.itemgetter(1), reverse=True) saveto = 'Wikipedia:Related_WikiProjects/' + project[10:] page = pywikibot.Page(bot, saveto) draft = '' for x in range(0, 10): if ordered[x][1] > 0: draft += "* '''[[{0}|{1}]]''': {2} articles in common\n".format(ordered[x][0], ordered[x][0][10:].replace('_', ' '), str(ordered[x][1])) if page.text != draft: page.text = draft page.save('Updating', minor=False, async=True)
def main():
    wptools = WikiProjectTools()
    bot = pwb.Site("en", "wikipedia", user="******")  # username redacted in source

    # Generate list of WikiProjects with eponymous categories
    q = ("select page_title from page where page_namespace = 14 "
         "and page_title in (select page_title from page where "
         'page_namespace = 4 and page_title like "WikiProject_%" '
         "and page_is_redirect = 0);")
    pairs = [row[0].decode("utf-8") for row in wptools.query("wiki", q, None)]

    for pair in pairs:
        # Load WikiProject page
        project_page = pwb.Page(bot, "Wikipedia:" + pair)

        # Preserve only categories that aren't in the style "X WikiProjects"
        preserve = [c for c in pwb.textlib.getCategoryLinks(project_page.text)
                    if str(c)[-15:] != " WikiProjects]]"]

        # Check for presence of removable categories; otherwise, don't bother
        if preserve != pwb.textlib.getCategoryLinks(project_page.text):
            # Load WikiProject category
            project_cat = pwb.Page(bot, "Category:" + pair)

            # List categories to add to the project category
            page_cats = [c for c in pwb.textlib.getCategoryLinks(project_page.text)
                         if str(c)[-15:] == " WikiProjects]]"]
            cat_cats = [c for c in pwb.textlib.getCategoryLinks(project_cat.text)
                        if str(c)[-15:] == " WikiProjects]]"]
            to_add = list(set(page_cats) - set(cat_cats))

            # Make changes and save the pages
            project_cat.text = pwb.textlib.replaceCategoryLinks(project_cat.text, to_add, addOnly=True)
            project_page.text = pwb.textlib.replaceCategoryLinks(project_page.text, preserve)
            summary = "WikiProject category migration. See [[User:Harej bot/WikiProject category migration]]."
            project_page.save(summary, minor=False)
            project_cat.save(summary, minor=False)
class WikidataMagic:

    def __init__(self):
        self.wptools = WikiProjectTools()
        self.bot = pywikibot.Site('en', 'wikipedia')

    def entitydata(self, item):
        url = 'https://www.wikidata.org/wiki/Special:EntityData/' + item + '.json'
        r = requests.get(url)
        return r.json()

    def wikidataquery(self, query):
        url = 'https://wdq.wmflabs.org/api?q=' + query
        r = requests.get(url)
        return ['Q' + str(item) for item in r.json()['items']]

    def missing_from_enwiki(self, total_item_list):
        q = ("select pp_value from page_props "
             "where pp_propname = 'wikibase_item' and pp_value in {0}")
        q = q.format(tuple(total_item_list))
        on_enwiki = [item[0].decode('utf-8')
                     for item in self.wptools.query('wiki', q, None)
                     if item[0] is not None]
        return list(set(total_item_list) - set(on_enwiki))

    def missing_articles_report(self):
        config = self.wptools.query('index', 'select json from config;', None)
        config = json.loads(config[0][0])
        for entry in config['projects']:
            if 'wikidata_missing_articles' in entry:
                wikiproject = entry['name']  # e.g. "Wikipedia:WikiProject Something"
                wdq_query = entry['wikidata_missing_articles']

                # Coming up with the list of Wikidata items for missing articles
                items_for_report = self.wikidataquery(wdq_query)
                items_for_report = self.missing_from_enwiki(items_for_report)
                items_for_report = items_for_report[:100]  # Truncate list

                # Generate the report itself!
                save_to = wikiproject + "/Tasks/Wikidata Missing Article Report"
                content = ("{{WPX list start|title=From Wikidata|"
                           "intro=Automatically generated list of missing articles"
                           "<br />{{WPX last updated|" + save_to + "}}}}\n"
                           "{{#invoke:<includeonly>random|list|limit=3"
                           "</includeonly><noinclude>list|unbulleted</noinclude>\n")
                for item in items_for_report:
                    data = self.entitydata(item)
                    data = data['entities'][item]
                    if 'labels' in data and 'en' in data['labels']:
                        label = "[[" + data['labels']['en']['value'] + "]]"
                    else:
                        label = "No English title available"
                    if 'descriptions' in data and 'en' in data['descriptions']:
                        description = data['descriptions']['en']['value']
                    else:
                        description = "No English description available"
                    content += ("| {{WPX block|largetext='''" + label
                                + "'''|smalltext=" + description
                                + "<br />([[d:" + item
                                + "|More information on Wikidata]])"
                                + "|color={{{1|#37f}}}}}\n")

                # Wrap up the report and save
                content += "}}\n{{WPX list end|more=" + save_to + "}}"
                page = pywikibot.Page(self.bot, save_to)
                page.text = content
                page.save("Updating task list", minor=False, asynchronous=True)
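# Hypothetical entry point, assuming this class lives in a standalone script.
# Note that the WikiDataQuery service (wdq.wmflabs.org) queried above has
# since been retired, so wikidataquery() would need porting to a live backend.
if __name__ == '__main__':
    WikidataMagic().missing_articles_report()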
class WikiProjectNotifications:

    def __init__(self):
        self.wptools = WikiProjectTools()
        q = ('create table if not exists notifications '
             '(n_id int(11) NOT NULL auto_increment, '
             'n_project VARCHAR(255) character set utf8 collate utf8_unicode_ci, '
             'n_variant VARCHAR(255) character set utf8 collate utf8_unicode_ci, '
             'n_content TEXT character set utf8 collate utf8_unicode_ci, '
             'primary key (n_id)) '
             'engine=innodb character set=utf8;')
        # self.wptools.query('index', q, None)
        self.bot = pywikibot.Site('en', 'wikipedia', user='******')  # username redacted in source

        # Recognized notification variants
        # A variant that is not any of these kinds will cause an error
        # variantname --> template parameter name
        date = datetime.datetime.utcnow().strftime('%d %B %Y')
        self.contentwrapper = '<div style="max-width:500px; padding-bottom:2.5em;">'
        self.recognizedvariants = {
            'newmember': 'notification_when_a_new_member_joins',
            'newdiscussion': 'notification_when_a_new_discussion_topic_is_posted'}
        self.varianttext = {
            'newmember': '==New member report for ' + date + '==\n'
                         + self.contentwrapper
                         + 'The following users joined the WikiProject in the past day:\n',
            'newdiscussion': '==New discussion report for ' + date + '==\n'
                             + self.contentwrapper
                             + 'New discussions that are of interest to the WikiProject:\n'}

    def active_user(self, username):
        '''
        Determines if a username meets a basic threshold of activity.
        Takes string *username*, returns boolean.
        The threshold is one edit in the recentchanges tables
        (i.e. in the past 30 days).
        '''
        q = 'select count(*) from recentchanges_userindex where rc_user_text = "{0}"'.format(
            username.replace('_', ' '))
        if self.wptools.query('wiki', q, None)[0][0] > 0:
            return True
        else:
            return False

    def post(self, project, variant, content):
        '''
        Adds an item to the WikiProject Notification Center,
        to be included in the next update.
        '''
        if variant in self.recognizedvariants:
            q = 'insert into notifications (n_project, n_variant, n_content) values ("{0}", "{1}", "{2}");'
            q = q.format(project, variant, content)
            self.wptools.query('index', q, None)
        else:
            raise NotificationVariantError(variant)  # defined after this class

    def findsubscribers(self):
        '''
        Generates a dictionary of WikiProjects with notification centers
        and corresponding report subscribers.
        '''
        q = ('select page_title from templatelinks '
             'join page on page_id = tl_from and page_namespace = tl_from_namespace '
             'where page_namespace = 2 and tl_namespace = 10 '
             'and tl_title = "WikiProjectCard";')
        output = {}
        for row in self.wptools.query('wiki', q, None):
            title = row[0].decode('utf-8')
            components = title.split('/')  # e.g. ['Harej', 'WikiProjectCards', 'WikiProject_Women_in_Technology']
            username = components[0]

            # No notifications for inactive users
            if self.active_user(username) == False:
                continue

            # Carrying on...
            # The remainder of this method, and most of the method that
            # delivers the queued notifications, were redacted in the source:
            title = "User: "******"New notification", minor=False, asynchronous=True)

        # Deleting old records now that notifications have been sent out
        if len(id_to_delete) > 0:
            if len(id_to_delete) == 1:
                self.wptools.query(
                    'index',
                    'delete from notifications where n_id = {0};'.format(id_to_delete[0]),
                    None)
            else:
                self.wptools.query(
                    'index',
                    'delete from notifications where n_id in {0};'.format(tuple(id_to_delete)),
                    None)
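# NotificationVariantError is raised by post() above but is not defined in
# this section; a minimal definition consistent with that use:
class NotificationVariantError(Exception):
    def __init__(self, variant):
        super().__init__('Unrecognized notification variant: ' + str(variant))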
def main(self, rootpage):
    # Initializing...
    bot = pywikibot.Site('en', 'wikipedia')
    wptools = WikiProjectTools()
    config = json.loads(wptools.query('index', 'select json from config;', None)[0][0])

    # Get list of people who opted out
    optout = pywikibot.Page(bot, 'User:Reports bot/Opt-out')
    blacklist = []
    regexes = [re.findall(r'\[\[User:(.*?)\|', optout.text, re.I),
               re.findall(r'\{\{user\|(.*?)\}\}', optout.text, re.I),
               re.findall(r'\[\[:User:(.*?)\]', optout.text, re.I),
               re.findall(r'\[\[:User talk:(.*?)\]', optout.text, re.I)]
    for results in regexes:
        for user in results:
            blacklist.append(user)

    # Bots are to be excluded
    for result in wptools.query('wiki', "select user_name from user_groups left join user on user_id = ug_user where ug_group = 'bot';", None):
        blacklist.append(result[0].decode('utf-8'))

    # List of projects we are working on
    # Methodology: List from Project Index + List from Formal Definition, minus duplicates
    # This will cover all of our bases.
    articles = {}
    counter = 0
    while True:  # I am a bad man for doing this
        query = wptools.query('index', 'select pi_page, pi_project from projectindex where pi_id > {0} and pi_id <= {1};'.format(counter, counter + 1000000), None)
        if len(query) == 0:
            break
        for pair in query:
            # Normalizing by getting rid of the namespace prefix
            page = pair[0]
            page = page.replace('Draft_talk:', '')
            page = page.replace('Talk:', '')
            proj = pair[1][10:]  # Normalizing by getting rid of "Wikipedia:"
            try:
                articles[proj].append(page)
            except KeyError:
                articles[proj] = [page]
        counter += 1000000

    projects = [project for project in articles.keys()]

    q = ('select distinct page.page_title from page '
         'join categorylinks on page.page_id = categorylinks.cl_from '
         'left join redirect on page.page_id = redirect.rd_from '
         'where page_namespace = 4 '
         'and page_title not like "%/%" '
         'and rd_title is null '
         'and (cl_to in '
         '(select page.page_title from page '
         'where page_namespace = 14 and '
         'page_title like "%\\_WikiProjects" '
         'and page_title not like "%\\_for\\_WikiProjects" '
         'and page_title not like "%\\_of\\_WikiProjects") '
         'or page_title like "WikiProject\\_%");')
    formaldefinition = wptools.query('wiki', q, None)  # http://quarry.wmflabs.org/query/3509
    for row in formaldefinition:
        row = row[0].decode('utf-8')
        if row not in projects:
            projects.append(row)
    projects.sort()

    directories = {'All': ''}  # All projects, plus subdirectories to be defined below.
    directoryrow = {}

    # Alright! Let's run some reports!
    for project in projects:
        # Seeding directory row and profile page
        if project not in articles:
            articles[project] = []
        project_normalized = project.replace('_', ' ')

        # List of active project participants (less the blacklist)
        wp_editors = []
        start_date = time.strftime('%Y%m%d000000', time.gmtime(time.time() - (60 * 60 * 24 * 90)))  # 90 days
        end_date = time.strftime('%Y%m%d000000', time.gmtime(time.time()))  # Today
        query = ('select rev_user_text from page left join revision on page_id = rev_page '
                 'where (page_namespace = 4 OR page_namespace = 5) '
                 'and (page_title like "{0}/%%" OR page_title = "{0}") '
                 'and rev_timestamp > {1} and rev_timestamp < {2} '
                 'group by rev_user_text HAVING count(*) > 1;').format(project, start_date, end_date)
        for result in wptools.query('wiki', query, None):
            if result[0] is not None:
                user = result[0].decode('utf-8')
                if user not in blacklist:
                    wp_editors.append(user)
        wp_editors.sort()

        # List of active subject area editors (less the blacklist)
        start_date = time.strftime('%Y%m%d000000', time.gmtime(time.time() - (60 * 60 * 24 * 30)))  # 30 days
        end_date = time.strftime('%Y%m%d000000', time.gmtime(time.time()))  # Today
        if len(articles[project]) > 0:
            subject_editors = []
            packages = []
            for i in range(0, len(articles[project]), 10000):
                packages.append(articles[project][i:i + 10000])
            counter = 0
            for package in packages:
                counter += 1
                if len(package) > 1:
                    query_builder = ('select rev_user_text from page left join revision on page_id = rev_page '
                                     'where page_namespace in (0, 1, 118, 119) and page_title in {0} '
                                     'and rev_timestamp > {1} and rev_timestamp < {2} '
                                     'order by rev_user_text;').format(tuple(package), start_date, end_date)
                else:
                    query_builder = ('select rev_user_text from page left join revision on page_id = rev_page '
                                     'where page_namespace in (0, 1, 118, 119) and page_title = "{0}" '
                                     'and rev_timestamp > {1} and rev_timestamp < {2} '
                                     'order by rev_user_text;').format(package[0], start_date, end_date)
                for result in wptools.query('wiki', query_builder, None):
                    if result[0] is not None:
                        subject_editors.append(result[0].decode('utf-8'))

            # Convert the list to a dictionary with username as key and edit count as value
            subject_editors = dict(Counter(subject_editors))
            subject_editors_filtered = []
            for user in subject_editors.keys():
                if user not in blacklist:
                    if subject_editors[user] > 4:
                        subject_editors_filtered.append(user)
            subject_editors = subject_editors_filtered  # And now assigned back.
            subject_editors.sort()
        else:
            subject_editors = []

        # Generate and save the profile page
        wp_editors_formatted = ""
        subject_editors_formatted = ""
        if len(wp_editors) > 0:
            for editor in wp_editors:
                wp_editors_formatted += "\n* [[User:{0}|{0}]] ([[User talk:{0}|talk]])".format(editor)
        else:
            wp_editors_formatted = ""
        if len(subject_editors) > 0 and len(subject_editors) < 3200:
            for editor in subject_editors:
                subject_editors_formatted += "\n* [[User:{0}|{0}]] ([[User talk:{0}|talk]])".format(editor)
        else:
            subject_editors_formatted = ""
        profilepage = "{{{{WikiProject description page | project = {0} | list_of_active_wikiproject_participants = {1} | list_of_active_subject_area_editors = {2}}}}}".format(project_normalized, wp_editors_formatted, subject_editors_formatted)
        page = pywikibot.Page(bot, rootpage + '/Description/' + project_normalized)
        if profilepage != page.text:  # Checking if a change was made, to cut down on API queries
            page.text = profilepage
            page.save('Updating', minor=False, asynchronous=True, quiet=True)

        # Construct directory entry
        directoryrow[project] = "{{{{WikiProject directory entry | project = {0} | number_of_articles = {1} | wp_editors = {2} | scope_editors = {3}}}}}\n".format(project_normalized, len(articles[project]), len(wp_editors), len(subject_editors))

    # Assign directory entries to the relevant directory pages ("All entries" and the subdirectory pages)
    for entry in sorted(directoryrow.items(), key=operator.itemgetter(1)):  # Sorting into alphabetical order
        directories['All'] += entry[1]
    directories['All'] = "{{WikiProject directory top}}\n" + directories['All'] + "|}"

    wpcats = WikiProjectCategories()
    tree = wpcats.generate()
    index_primary = sorted([key for key in tree.keys()])
    index_secondary = {}
    indextext = "'''[[{0}/All|All WikiProjects]]'''\n\n".format(rootpage)
    for firstlevel in tree.keys():
        directories[firstlevel] = "={0}=\n".format(firstlevel.replace('_', ' '))
        directories[firstlevel] += self.listpull(wptools, projects, directoryrow, firstlevel)  # For immediate subcats of WikiProjects_by_area
        directories[firstlevel] += self.treeiterator(wptools, tree[firstlevel], projects, directoryrow, firstlevel)  # For descendants of those immediate subcats (sketched after this method)
        index_secondary[firstlevel] = sorted([key for key in tree[firstlevel].keys()])

    # Updating the directory index
    for firstlevel in index_primary:
        firstlevel_normalized = firstlevel.replace('_', ' ')
        indextext += ";[[{0}/{1}|{1}]]".format(rootpage, firstlevel_normalized)
        if len(tree[firstlevel]) > 0:
            indextext += " : "
            for secondlevel in index_secondary[firstlevel]:
                indextext += "[[{0}/{1}#{2}|{2}]] – ".format(rootpage, firstlevel_normalized, secondlevel.replace('_', ' '))
            indextext = indextext[:-3]  # Truncates the trailing dash (and is also a cute smiley face)
        indextext += "\n\n"
    saveindex = pywikibot.Page(bot, 'Template:WikiProject directory index')
    saveindex.text = indextext
    saveindex.save('Updating', minor=False, asynchronous=True, quiet=True)

    # Generate directories and save!
    for directory in directories.keys():
        contents = directories[directory]
        page = pywikibot.Page(bot, rootpage + "/" + directory)
        if contents != page.text:  # Checking if a change was made, to cut down on API save queries
            oldcontents = page.text
            page.text = contents
            page.save('Updating', minor=False, asynchronous=True, quiet=True)

            # Cleanup of obsolete description pages and "Related WikiProjects" pages
            if directory == 'All':
                oldcontents = mwph.parse(oldcontents)
                oldcontents = oldcontents.filter_templates()
                oldprojectlist = []
                for t in oldcontents:
                    if t.name.strip() == "WikiProject directory entry":
                        oldprojectlist.append(str(t.get('project').value))
                for oldproject in oldprojectlist:
                    oldproject = oldproject.strip().replace(' ', '_')  # Normalizing
                    if oldproject not in projects:
                        deletethis = pywikibot.Page(bot, rootpage + '/Description/' + oldproject)
                        deletethis.text = "{{db-g6|rationale=A bot has automatically tagged this page as obsolete. This means that the WikiProject described on this page has been deleted or made into a redirect}}\n"
                        deletethis.save('Nominating page for deletion', minor=False, asynchronous=True, quiet=True)
                        deletethis = pywikibot.Page(bot, 'Wikipedia:Related WikiProjects/' + oldproject)
                        if deletethis.text != "":
                            deletethis.text = "{{db-g6|rationale=A bot has automatically tagged this page as obsolete. This means that the WikiProject described on this page has been deleted or made into a redirect}}\n"
                            deletethis.save('Nominating page for deletion', minor=False, asynchronous=True, quiet=True)
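# listpull() and treeiterator() are called above but are not defined in this
# section. A sketch of treeiterator consistent with its call sites: it walks
# one branch of the category tree depth-first, delegating each category to
# listpull, which (per its use above) returns the directory rows for the
# projects found in that category. The heading format is a guess.
def treeiterator(self, wptools, tree, projects, directoryrow, parent):
    text = ''
    for subcategory, subtree in tree.items():
        text += '=={0}==\n'.format(subcategory.replace('_', ' '))
        text += self.listpull(wptools, projects, directoryrow, subcategory)
        if subtree:  # Leaf nodes map to None
            text += self.treeiterator(wptools, subtree, projects,
                                      directoryrow, subcategory)
    return text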
class WikiProjectMembers:

    def __init__(self):
        self.wptools = WikiProjectTools()
        self.wpn = WikiProjectNotifications()

    def queue_notification(self, project, username):
        '''Queue a new member notification'''
        # The exact wikitext was redacted in the source; a minimal
        # reconstruction of the bullet line:
        content = "* User:" + username
        self.wpn.post(project, "newmember", content)

    def run(self):
        bot = pywikibot.Site('en', 'wikipedia')
        q = ('select page_title from templatelinks '
             'join page on page_id = tl_from and page_namespace = tl_from_namespace '
             'where page_namespace = 2 and tl_namespace = 10 '
             'and tl_title = "WikiProjectCard";')

        # Generate list of WikiProjects and members through the WikiProjectCard system
        members = {}
        for row in self.wptools.query('wiki', q, None):
            title = row[0].decode('utf-8')
            components = title.split('/')  # e.g. ['Harej', 'WikiProjectCards', 'WikiProject_Women_in_Technology']
            # The rest of this loop was redacted in the source; the code below
            # implies it maps each WikiProject to its member usernames,
            # roughly as follows (reconstruction assumed):
            members.setdefault(components[2], []).append(components[0])

        for wikiproject in members.keys():  # loop header implied by the redacted span
            return_to_wikiproject = "{{{{Clickable button 2|Wikipedia:{0}|Return to WikiProject|class=mw-ui-neutral}}}}<span class='wp-formsGadget mw-ui-button mw-ui-progressive' data-mode='create' data-type='Join'>Join WikiProject</span>".format(wikiproject)
            lua_garbage = "{{#invoke:<includeonly>random|list|limit=3</includeonly><noinclude>list|unbulleted</noinclude>|"
            active = "<noinclude>" + return_to_wikiproject + "\n\n<div style='padding-top:1.5em; padding-bottom:2em;'>Our WikiProject members are below. Those who have not edited Wikipedia in over a month are moved to the [[Wikipedia:{0}/Members/Inactive|inactive members list]].</div>\n\n</noinclude>".format(wikiproject) + lua_garbage
            inactive = "<noinclude>" + return_to_wikiproject + "\n\n<div style='padding-top:1.5em; padding-bottom:2em;'>These are our members who have not edited in a while. Once they edit again, they will be moved back to the [[Wikipedia:{0}/Members|active members list]].</div>\n\n</noinclude>".format(wikiproject) + lua_garbage
            for member in members[wikiproject]:
                # Transclude the member's WikiProjectCard (the member name was
                # redacted in the source; reconstruction assumed):
                addition = "{{User:" + member + "/WikiProjectCards/" + wikiproject + "<includeonly>|mode=compact</includeonly>}}|"
                if self.wpn.active_user(member):
                    active += addition
                else:
                    inactive += addition
            active = active[:-1] + "}}"  # removing the trailing pipe and closing off the module
            inactive += "}}"

            # Generate the old list to prepare a diff
            page_active = pywikibot.Page(bot, "Wikipedia:" + wikiproject + "/Members")
            page_inactive = pywikibot.Page(bot, "Wikipedia:" + wikiproject + "/Members/Inactive")
            oldnames = []
            for text in [page_active.text, page_inactive.text]:
                contents = mwparserfromhell.parse(text)
                contents = contents.filter_templates()
                for t in contents:
                    if t.name[:5] == "User:":
                        # i.e. grab the username from the template
                        oldnames.append(str(t.name).split("/")[0][5:])
            newnames = list(set(members[wikiproject]) - set(oldnames))
            newnames.sort()
            print(newnames)

            # Anyone in the *newnames* set is a new user. Queue the notification!
            for member in newnames:
                self.queue_notification(wikiproject, member)

            # Now, save the pages.
            page_active.text = active
            page_active.save("Updating member list", minor=False, asynchronous=True)
            page_inactive.text = inactive
            page_inactive.save("Updating member list", minor=False, asynchronous=True)
class WikiProjectAssess:

    def __init__(self):
        self.bot = pywikibot.Site("en", "wikipedia")
        self.wptools = WikiProjectTools()
        self.projects = []
        self.predictorseed = {}
        self.unknownquality = {}
        self.unknownpriority = {}
        self.config = self.wptools.query("index", "select json from config;", None)
        self.config = json.loads(self.config[0][0])
        for entry in self.config["projects"]:
            if ("assessment_tools" in entry
                    and "at_category" in entry
                    and "at_unknown_quality" in entry
                    and "at_unknown_priority" in entry):
                project = entry["name"][10:]  # Normalizing title
                self.projects.append(project)
                self.predictorseed[project] = entry["at_category"].replace(" ", "_")
                self.unknownquality[project] = entry["at_unknown_quality"].replace(" ", "_")
                self.unknownpriority[project] = entry["at_unknown_priority"].replace(" ", "_")

    def qualitypredictor(self, pagetitles):
        """
        Makes a query to ORES that predicts the quality of an article.
        Takes list *pagetitles* as input.
        Returns a list of tuples (title, prediction).
        Input MUST be a list. If there is only one title, enter it as [title].
        """
        output = []

        # Split into packages of 50 titles each
        packages = [pagetitles[i:i + 50] for i in range(0, len(pagetitles), 50)]
        for package in packages:
            if len(package) > 1:
                q = ("select page_title, page_latest from page "
                     "where page_namespace = 0 and page_title in {0} "
                     "order by page_title limit 100;").format(tuple(package))
            else:
                q = ("select page_title, page_latest from page "
                     "where page_namespace = 0 "
                     'and page_title = "{0}";').format(package[0])
            revision_ids = {str(row[1]): row[0].decode("utf-8")
                            for row in self.wptools.query("wiki", q, None)}
            api_input = [rev_id for rev_id in revision_ids.keys()]

            # This is the original ORES endpoint used by the script; the
            # service has since moved to ores.wikimedia.org and the wp10
            # model has been renamed.
            api_url = "http://ores.wmflabs.org/scores/enwiki/wp10/?revids="
            for rev_id in api_input:
                api_url += rev_id + "|"
            api_url = api_url[:-1]  # Truncating the extra vertical pipe
            query = requests.get(api_url)
            query = query.json()
            for rev_id, result in query.items():
                pair = (revision_ids[rev_id], result["prediction"])
                output.append(pair)
        return output

    def qualitylist(self):
        for wikiproject, category in self.unknownquality.items():
            save_to = "User:Reports bot/" + wikiproject + "/Assessment/Assess for quality"
            q = ("select page_title from categorylinks "
                 "join page on cl_from = page_id "
                 'where cl_to = "{0}";').format(category.replace(" ", "_"))
            to_process = [row[0].decode("utf-8") for row in self.wptools.query("wiki", q, None)]
            to_process = self.qualitypredictor(to_process)

            contents = ("{{WPX list start|color={{{2|#086}}}<includeonly>|constrained=yes</includeonly>|title=Assess for quality"
                        "|intro=Determine the quality of these articles<br />"
                        "{{WPX last updated|" + save_to + "}}}}<br />\n\n"
                        "{{#invoke:<includeonly>random|list|limit=3"
                        "</includeonly><noinclude>list|unbulleted</noinclude>|")
            for pair in to_process:
                article = pair[0].replace("_", " ")
                prediction = pair[1]
                contents += ("{{WPX block|color={{{1|#37f}}}|largetext=<b>[["
                             + article + "]]</b> "
                             + "([[Talk:" + article + "|talk]])|smalltext="
                             + "Predicted class: " + prediction + "}}|")
            contents = contents[:-1] + "}}\n{{WPX list end|more=" + save_to + "}}"
            page = pywikibot.Page(self.bot, save_to)
            page.text = contents
            page.save("Updating listing", minor=False, asynchronous=True)

    def scopepredictor(self):
        for wikiproject, category in self.predictorseed.items():
            category_recs = []
            article_recs = []

            # This query produces a list of pages that belong to categories
            # that have been tagged by the WikiProject
            q = ("select page_namespace, page_title from page "
                 "join categorylinks on categorylinks.cl_from = page.page_id "
                 "where page_namespace in (0, 14) "
                 "and cl_to in ( "
                 "select page.page_title from page "
                 "join categorylinks on categorylinks.cl_from = page.page_id "
                 "where page_namespace = 15 "
                 'and cl_to = "{0}");').format(category)
            for row in self.wptools.query("wiki", q, None):
                ns = row[0]
                page = row[1].decode("utf-8")
                if ns == 0:
                    article_recs.append(page)
                elif ns == 14:
                    category_recs.append(page)

            # Filter against these lists:
            q = ("select pi_page from projectindex "
                 'where pi_project = "Wikipedia:{0}";').format(wikiproject.replace(" ", "_"))
            article_filter = [row[0].replace("Talk:", "")
                              for row in self.wptools.query("index", q, None)
                              if row[0].startswith("Talk")]
            q = ("select page_title from page "
                 "join categorylinks on cl_from = page_id "
                 "where page_namespace = 15 "
                 'and cl_to = "{0}";').format(category)
            category_filter = [row[0].decode("utf-8") for row in self.wptools.query("wiki", q, None)]

            # Now do the filtering...
            category_recs = list(set(category_recs) - set(category_filter))
            article_recs = list(set(article_recs) - set(article_filter))

            # Unite them together...
            recommendations = ([":Category:" + name for name in category_recs]
                               + [name for name in article_recs])

            # And lop it off at 100!
            recommendations = recommendations[:100]

            # Class prediction
            predicted_class = self.qualitypredictor(
                [page for page in recommendations if page.startswith(":Category:") == False]
            ) + [(page, "Category") for page in recommendations if page.startswith(":Category:") == True]
            predicted_class = {pair[0]: pair[1] for pair in predicted_class}

            save_to = "User:Reports bot/" + wikiproject + "/Assessment/Not tagged"
            contents = ("{{WPX list start|color={{{2|#086}}}<includeonly>|constrained=yes</includeonly>|title=Not tagged by the WikiProject|"
                        "intro=These pages are potentially in the WikiProject's"
                        " scope.<br />{{WPX last updated|" + save_to + "}}}}"
                        "<br />\n\n"
                        "{{#invoke:<includeonly>random|list|limit=3"
                        "</includeonly><noinclude>list|unbulleted</noinclude>|")
            for recommendation in recommendations:
                contents += ("{{WPX block|color={{{1|#37f}}}|largetext=<b>[["
                             + recommendation.replace("_", " ")
                             + "]]</b> ([[Talk:" + recommendation
                             + "|talk]])|smalltext=Predicted class: "
                             + predicted_class[recommendation] + "}}|")
            contents = contents.replace("Talk::Category:", "Category talk:")
            contents = contents[:-1] + "}}\n{{WPX list end|more=" + save_to + "}}"
            page = pywikibot.Page(self.bot, save_to)
            page.text = contents
            page.save("Updating listing", minor=False, asynchronous=True)
def main(): # This is used for Aaron Halfaker's API wrapper... loginfile = configparser.ConfigParser() loginfile.read([os.path.expanduser('~/.wiki.ini')]) username = loginfile.get('wiki', 'username') password = loginfile.get('wiki', 'password') # ...And this is for Pywikibot bot = pywikibot.Site('en', 'wikipedia') wptools = WikiProjectTools() now = datetime.datetime.utcnow() now = now.strftime('%Y%m%d%H%M%S') # converts timestamp to MediaWiki format # Pulling timestamp of the last time the script was run query = wptools.query('index', 'select lu_timestamp from lastupdated where lu_key = "new_discussions";', None) lastupdated = query[0][0] # Polling for newest talk page posts in the last thirty minutes query = wptools.query('wiki', 'select distinct recentchanges.rc_this_oldid, page.page_id, recentchanges.rc_title, recentchanges.rc_comment, recentchanges.rc_timestamp, page.page_namespace from recentchanges join page on recentchanges.rc_namespace = page.page_namespace and recentchanges.rc_title = page.page_title join categorylinks on page.page_id=categorylinks.cl_from where rc_timestamp >= {0} and rc_timestamp < {1} and rc_comment like "% new section" and rc_deleted = 0 and cl_to like "%_articles" and page_namespace not in (0, 2, 6, 8, 10, 12, 14, 100, 108, 118) order by rc_timestamp asc;'.format(lastupdated, now), None) # Cleaning up output namespace = {1: 'Talk:', 3: 'User_talk:', 4: 'Wikipedia:', 5: 'Wikipedia_talk:', 7: 'File_talk:', 9: 'MediaWiki_talk:', 11: 'Template_talk:', 13: 'Help_talk:', 15: 'Category_talk:', 101: 'Portal_talk:', 109: 'Book_talk:', 119: 'Draft_talk:', 447: 'Education_Program_talk:', 711: 'TimedText_talk:', 829: 'Module_talk:', 2600: 'Topic:'} output = [] for row in query: rc_id = row[0] page_id = row[1] rc_title = row[2].decode('utf-8') rc_comment = row[3].decode('utf-8') rc_comment = rc_comment[3:] # Truncate beginning part of the edit summary rc_comment = rc_comment[:-15] # Truncate end of the edit summary rc_timestamp = row[4].decode('utf-8') rc_timestamp = datetime.datetime.strptime(rc_timestamp, '%Y%m%d%H%M%S') rc_timestamp = rc_timestamp.strftime('%H:%M, %d %B %Y (UTC)') page_namespace = row[5] page_namespace = namespace[page_namespace] session = api.Session("https://en.wikipedia.org/w/api.php", user_agent='WPX Revert Checker') session.login(username, password) # Check if revision has been reverted reverted = reverts.api.check(session, rc_id, page_id, 3, None, 172800, None) if reverted is None: entry = {'title': (page_namespace + rc_title), 'section': rc_comment, 'timestamp': rc_timestamp} output.append(entry) # Loading list of WikiProjects signed up to get lists of new discussions config = json.loads(wptools.query('index', 'select json from config;', None)[0][0]) if config['defaults']['new_discussions'] == False: # i.e. if New Discussions is an opt-in system whitelist = [] # Whitelisted WikiProjects for new discussion lists for project in config['projects']: try: project['new_discussions'] except KeyError: continue else: if project['new_discussions'] == True: whitelist.append(project['name']) else: whitelist = None # A whitelist of [] is one where there is a whitelist, but it's just empty. # A whitelist of None is for situations where the need for a whitelist has been obviated. 
    # Generating a list of WikiProjects for each thread
    for thread in output:
        query = wptools.query('index', 'select distinct pi_project from projectindex where pi_page = %s;', (thread['title'],))
        thread['wikiprojects'] = []
        for row in query:
            wikiproject = row[0].replace('_', ' ')
            if (whitelist is None) or (wikiproject in whitelist):
                thread['wikiprojects'].append(wikiproject)
        for wikiproject in thread['wikiprojects']:
            saveto = wikiproject + '/Discussions'
            page = pywikibot.Page(bot, saveto)
            intro_garbage = '{{WPX header|Discussions|color={{{1|#37f}}}}}\n'
            intro_garbage += '{{{{WPX action box|color={{{{{{2|#086}}}}}}|title=Have a question?|content={{{{Clickable button 2|url=//en.wikipedia.org/wiki/Wikipedia_talk:{0}?action=edit&section=new|Ask the WikiProject|class=mw-ui-progressive mw-ui-block}}}}\n\n{{{{Clickable button 2|Wikipedia talk:{0}|View Other Discussions|class=mw-ui-block}}}}}}}}\n'.format(wikiproject[10:].replace(' ', '_'))
            intro_garbage += '{{{{WPX list start|intro={{{{WPX last updated|{0}}}}}}}}}\n\n'.format(saveto)
            draft = '<noinclude><div style="padding-bottom:1em;">{{{{Clickable button 2|{0}|Return to WikiProject|class=mw-ui-neutral}}}}</div>\n</noinclude>'.format(wikiproject) + intro_garbage
            submission = '{{{{WPX new discussion|color={{{{{{1|#37f}}}}}}|title={0}|section={1}|timestamp={2}}}}}\n'.format(thread['title'].replace('_', ' '), thread['section'], thread['timestamp'])
            notification = "* '''[[{0}#{1}|{1}]]''' on {0}".format(thread['title'].replace('_', ' '), thread['section'])
            queue_notification(wikiproject[10:].replace(' ', '_'), notification)

            index = mwparserfromhell.parse(page.text)
            index = index.filter_templates()
            templatelist = []
            for i in index:
                if i.name == "WPX new discussion":
                    templatelist.append(str(i))
            templatelist = templatelist[:14]  # Sayonara, old threads!

            page.text = draft + submission
            if len(templatelist) > 3:
                templatelist[2] += "<noinclude>"  # Anything after the third item will not be transcluded
                templatelist[len(templatelist) - 1] += "</noinclude>"
            for i in templatelist:
                page.text += i + "\n"
            page.text += "{{{{WPX list end|more={0}}}}}".format(saveto.replace(' ', '_'))
            page.save('New discussion on [[{0}]]'.format(thread['title'].replace('_', ' ')), minor=False)

    # Update the Last Updated field with the new timestamp
    wptools.query('index', 'update lastupdated set lu_timestamp = {0} where lu_key = "new_discussions";'.format(now), None)
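# A minimal sketch of how this script might be invoked when run directly,
# e.g. from a cron job. The __main__ guard is an assumption on my part and
# not part of the original file:
if __name__ == '__main__':
    main()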
def main(rootpage):
    d = WikiProjectDirectory()
    wptools = WikiProjectTools()
    wpcats = WikiProjectCategories()
    tree = wpcats.generate()
    bot = pywikibot.Site('en', 'wikipedia')
    directories = {}
    directoryrow = {}
    projects = []

    # Generate the directoryrow and projects lists based on the /All directory:
    page = pywikibot.Page(bot, rootpage + '/All')
    contents = mwph.parse(page.text)
    contents = contents.filter_templates()
    for t in contents:
        if t.name.strip() == "WikiProject directory entry":
            name = str(t.get('project').value).strip().replace(' ', '_')
            projects.append(name)
            directoryrow[name] = str(t) + "\n"

    # The rest of this is copied from directory.py
    index_primary = sorted(tree.keys())
    index_secondary = {}
    indextext = "'''[[{0}/All|All WikiProjects]]'''\n\n".format(rootpage)
    for firstlevel in tree.keys():
        directories[firstlevel] = "={0}=\n".format(firstlevel.replace('_', ' '))
        directories[firstlevel] += d.listpull(wptools, projects, directoryrow, firstlevel)  # For immediate subcats of WikiProjects_by_area
        directories[firstlevel] += d.treeiterator(wptools, tree[firstlevel], projects, directoryrow, firstlevel)  # For descendants of those immediate subcats
        index_secondary[firstlevel] = sorted(tree[firstlevel].keys())

    # Updating the directory index
    for firstlevel in index_primary:
        firstlevel_normalized = firstlevel.replace('_', ' ')
        indextext += ";[[{0}/{1}|{1}]]".format(rootpage, firstlevel_normalized)
        if len(tree[firstlevel]) > 0:
            indextext += " : "
            for secondlevel in index_secondary[firstlevel]:
                indextext += "[[{0}/{1}#{2}|{2}]] – ".format(rootpage, firstlevel_normalized, secondlevel.replace('_', ' '))
            indextext = indextext[:-3]  # Truncates the trailing dash (and is also a cute smiley face)
        indextext += "\n\n"
    saveindex = pywikibot.Page(bot, 'Template:WikiProject directory index')
    saveindex.text = indextext
    saveindex.save('Updating', minor=False, asynchronous=True)

    # Generate directories and save!
    for directory in directories.keys():
        contents = directories[directory]
        page = pywikibot.Page(bot, rootpage + "/" + directory)
        if contents != page.text:  # Only save if something changed, to cut down on API save queries
            page.text = contents
            page.save('Updating', minor=False, asynchronous=True)
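# A minimal sketch of how this entry point might be driven. The root page
# name below is a hypothetical example, not taken from the original file:
if __name__ == '__main__':
    main('Wikipedia:WikiProject Directory')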
class WikiProjectAssess:
    def __init__(self):
        self.bot = pywikibot.Site('en', 'wikipedia')
        self.wptools = WikiProjectTools()
        self.projects = []
        self.predictorseed = {}
        self.unknownquality = {}
        self.unknownpriority = {}

        self.config = self.wptools.query('index', 'select json from config;', None)
        self.config = json.loads(self.config[0][0])
        for entry in self.config['projects']:
            if 'assessment_tools' in entry \
            and 'at_category' in entry \
            and 'at_unknown_quality' in entry \
            and 'at_unknown_priority' in entry:
                project = entry['name'][10:]  # Normalizing title
                self.projects.append(project)
                self.predictorseed[project] = entry['at_category'].replace(' ', '_')
                self.unknownquality[project] = entry['at_unknown_quality'].replace(' ', '_')
                self.unknownpriority[project] = entry['at_unknown_priority'].replace(' ', '_')

    def qualitypredictor(self, pagetitles):
        '''
        Makes a query to ORES that predicts the quality of an article.
        Takes list *pagetitles* as input.
        Returns a list of tuples (title, prediction).
        Input MUST be a list. If there is only one title, enter it as [title].
        '''
        output = []

        # Split into packages of up to 50 titles each
        packages = [pagetitles[i:i+50] for i in range(0, len(pagetitles), 50)]
        for package in packages:
            if len(package) > 1:
                q = ('select page_title, page_latest from page '
                     'where page_namespace = 0 and page_title in {0} '
                     'order by page_title limit 100;').format(tuple(package))
            else:
                q = ('select page_title, page_latest from page '
                     'where page_namespace = 0 '
                     'and page_title = "{0}";').format(package[0])
            revision_ids = {str(row[1]): row[0].decode('utf-8')
                            for row in self.wptools.query('wiki', q, None)}

            # Batch the revision IDs into a single ORES request
            api_url = "http://ores.wmflabs.org/scores/enwiki/wp10/?revids="
            api_url += "|".join(revision_ids.keys())
            query = requests.get(api_url).json()
            for rev_id, result in query.items():
                output.append((revision_ids[rev_id], result['prediction']))
        return output

    def qualitylist(self):
        for wikiproject, category in self.unknownquality.items():
            save_to = "User:Reports bot/" + wikiproject + "/Assessment/Assess for quality"
            q = ('select page_title from categorylinks '
                 'join page on cl_from = page_id '
                 'where cl_to = "{0}";').format(category.replace(' ', '_'))
            to_process = [row[0].decode('utf-8')
                          for row in self.wptools.query('wiki', q, None)]
            to_process = self.qualitypredictor(to_process)
            contents = ("{{WPX list start|color={{{2|#086}}}<includeonly>|constrained=yes</includeonly>|title=Assess for quality"
                        "|intro=Determine the quality of these articles<br />"
                        "{{WPX last updated|" + save_to + "}}}}<br />\n\n"
                        "{{#invoke:<includeonly>random|list|limit=3"
                        "</includeonly><noinclude>list|unbulleted</noinclude>|")
            for pair in to_process:
                article = pair[0].replace("_", " ")
                prediction = pair[1]
                contents += ("{{WPX block|color={{{1|#37f}}}|largetext=<b>[[" + article + "]]</b> "
                             "([[Talk:" + article + "|talk]])|smalltext="
                             "Predicted class: " + prediction + "}}|")
            contents = contents[:-1] + "}}\n{{WPX list end|more=" + save_to + "}}"
            page = pywikibot.Page(self.bot, save_to)
            page.text = contents
            page.save("Updating listing", minor=False, asynchronous=True)

    def scopepredictor(self):
        for wikiproject, category in self.predictorseed.items():
            category_recs = []
            article_recs = []

            # This query produces a list of pages that belong to categories
            # that have been tagged by the WikiProject
            q = ('select page_namespace, page_title from page '
                 'join categorylinks on categorylinks.cl_from = page.page_id '
                 'where page_namespace in (0, 14) '
                 'and cl_to in ( '
                 'select page.page_title from page '
                 'join categorylinks on categorylinks.cl_from = page.page_id '
                 'where page_namespace = 15 '
                 'and cl_to = "{0}");').format(category)
            for row in self.wptools.query('wiki', q, None):
                ns = row[0]
                page = row[1].decode('utf-8')
                if ns == 0:
                    article_recs.append(page)
                elif ns == 14:
                    category_recs.append(page)

            # Filter against these lists:
            q = ('select pi_page from projectindex '
                 'where pi_project = "Wikipedia:{0}";').format(wikiproject.replace(' ', '_'))
            article_filter = [row[0].replace('Talk:', '')
                              for row in self.wptools.query('index', q, None)
                              if row[0].startswith('Talk')]
            q = ('select page_title from page '
                 'join categorylinks on cl_from = page_id '
                 'where page_namespace = 15 '
                 'and cl_to = "{0}";').format(category)
            category_filter = [row[0].decode('utf-8')
                               for row in self.wptools.query('wiki', q, None)]

            # Now do the filtering...
            category_recs = list(set(category_recs) - set(category_filter))
            article_recs = list(set(article_recs) - set(article_filter))

            # Unite them together...
            recommendations = [':Category:' + name for name in category_recs] + article_recs

            # And lop it off at 100!
            recommendations = recommendations[:100]

            # Class prediction: articles go through ORES; categories get a fixed label
            predicted_class = self.qualitypredictor(
                [page for page in recommendations if not page.startswith(':Category:')]
            ) + [(page, 'Category') for page in recommendations if page.startswith(':Category:')]
            predicted_class = {pair[0]: pair[1] for pair in predicted_class}

            save_to = "User:Reports bot/" + wikiproject + "/Assessment/Not tagged"
            contents = ("{{WPX list start|color={{{2|#086}}}<includeonly>|constrained=yes</includeonly>|title=Not tagged by the WikiProject|"
                        "intro=These pages are potentially in the WikiProject's"
                        " scope.<br />{{WPX last updated|" + save_to + "}}}}"
                        "<br />\n\n"
                        "{{#invoke:<includeonly>random|list|limit=3"
                        "</includeonly><noinclude>list|unbulleted</noinclude>|")
            for recommendation in recommendations:
                contents += ("{{WPX block|color={{{1|#37f}}}|largetext=<b>[["
                             + recommendation.replace('_', ' ')
                             + "]]</b> ([[Talk:" + recommendation
                             + "|talk]])|smalltext=Predicted class: "
                             + predicted_class[recommendation] + "}}|")
            contents = contents.replace("Talk::Category:", "Category talk:")
            contents = contents[:-1] + "}}\n{{WPX list end|more=" + save_to + "}}"
            page = pywikibot.Page(self.bot, save_to)
            page.text = contents
            page.save("Updating listing", minor=False, asynchronous=True)
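# A minimal usage sketch for the class above, assuming it runs as a
# standalone script; the __main__ guard and the run order are assumptions,
# not part of the original file:
if __name__ == '__main__':
    assessor = WikiProjectAssess()
    assessor.qualitylist()     # "Assess for quality" listings, via ORES predictions
    assessor.scopepredictor()  # "Not tagged" listings, seeded from tagged categories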
def main(self, rootpage):
    # Initializing...
    bot = pywikibot.Site('en', 'wikipedia')
    wptools = WikiProjectTools()
    config = json.loads(wptools.query('index', 'select json from config;', None)[0][0])

    # Get the list of people who opted out
    optout = pywikibot.Page(bot, 'User:Reports bot/Opt-out')
    blacklist = []
    regexes = [re.findall(r'\[\[User:(.*?)\|', optout.text, re.I),
               re.findall(r'\{\{user\|(.*?)\}\}', optout.text, re.I),
               re.findall(r'\[\[:User:(.*?)\]', optout.text, re.I),
               re.findall(r'\[\[:User talk:(.*?)\]', optout.text, re.I)]
    for results in regexes:
        for user in results:
            blacklist.append(user)

    # Bots are to be excluded
    for result in wptools.query('wiki', "select user_name from user_groups left join user on user_id = ug_user where ug_group = 'bot';", None):
        blacklist.append(result[0].decode('utf-8'))

    # List of projects we are working on
    # Methodology: list from Project Index + list from Formal Definition, minus duplicates
    # This will cover all of our bases.
    articles = {}
    counter = 0
    while True:  # I am a bad man for doing this
        query = wptools.query('index', 'select pi_page, pi_project from projectindex where pi_id > {0} and pi_id <= {1};'.format(counter, counter + 1000000), None)
        if len(query) == 0:
            break
        for pair in query:
            # Normalizing by getting rid of the namespace prefix
            page = pair[0]
            page = page.replace('Draft_talk:', '')
            page = page.replace('Talk:', '')
            proj = pair[1][10:]  # Normalizing by getting rid of "Wikipedia:"
            try:
                articles[proj].append(page)
            except KeyError:
                articles[proj] = [page]
        counter += 1000000

    projects = [project for project in articles.keys()]
    q = ('select distinct page.page_title from page '
         'join categorylinks on page.page_id = categorylinks.cl_from '
         'left join redirect on page.page_id = redirect.rd_from '
         'where page_namespace = 4 '
         'and page_title not like "%/%" '
         'and rd_title is null '
         'and (cl_to in '
         '(select page.page_title from page '
         'where page_namespace = 14 and '
         'page_title like "%\_WikiProjects" '
         'and page_title not like "%\_for\_WikiProjects" '
         'and page_title not like "%\_of\_WikiProjects") '
         'or page_title like "WikiProject\_%");')
    formaldefinition = wptools.query('wiki', q, None)  # http://quarry.wmflabs.org/query/3509
    for row in formaldefinition:
        row = row[0].decode('utf-8')
        if row not in projects:
            projects.append(row)
    projects.sort()

    directories = {'All': ''}  # All projects, plus subdirectories to be defined below
    directoryrow = {}

    # Alright! Let's run some reports!
    for project in projects:
        # Seeding the directory row and profile page
        if project not in articles:
            articles[project] = []
        project_normalized = project.replace('_', ' ')

        # List of active project participants (less blacklist)
        wp_editors = []
        start_date = time.strftime('%Y%m%d000000', time.gmtime(time.time() - (60 * 60 * 24 * 90)))  # 90 days ago
        end_date = time.strftime('%Y%m%d000000', time.gmtime(time.time()))  # Today
        query = "select rev_user_text from page left join revision on page_id = rev_page where (page_namespace = 4 OR page_namespace = 5) and (page_title like \"{0}/%%\" OR page_title = \"{0}\") and rev_timestamp > {1} and rev_timestamp < {2} group by rev_user_text HAVING count(*) > 1;".format(project, start_date, end_date)
        for result in wptools.query('wiki', query, None):
            if result[0] is not None:
                user = result[0].decode('utf-8')
                if user not in blacklist:
                    wp_editors.append(user)
        wp_editors.sort()

        # List of active subject area editors (less blacklist)
        start_date = time.strftime('%Y%m%d000000', time.gmtime(time.time() - (60 * 60 * 24 * 30)))  # 30 days ago
        end_date = time.strftime('%Y%m%d000000', time.gmtime(time.time()))  # Today

        if len(articles[project]) > 0:
            subject_editors = []
            packages = []
            for i in range(0, len(articles[project]), 10000):
                packages.append(articles[project][i:i + 10000])

            for package in packages:
                if len(package) > 1:
                    query_builder = 'select rev_user_text from page left join revision on page_id = rev_page where page_namespace in (0, 1, 118, 119) and page_title in {0} and rev_timestamp > {1} and rev_timestamp < {2} order by rev_user_text;'.format(tuple(package), start_date, end_date)
                else:
                    query_builder = 'select rev_user_text from page left join revision on page_id = rev_page where page_namespace in (0, 1, 118, 119) and page_title = "{0}" and rev_timestamp > {1} and rev_timestamp < {2} order by rev_user_text;'.format(package[0], start_date, end_date)
                for result in wptools.query('wiki', query_builder, None):
                    if result[0] is not None:
                        subject_editors.append(result[0].decode('utf-8'))

            # Convert the list to a dictionary with username as key and edit count as value
            subject_editors = dict(Counter(subject_editors))
            subject_editors_filtered = []
            for user in subject_editors.keys():
                if user not in blacklist:
                    if subject_editors[user] > 4:
                        subject_editors_filtered.append(user)
            subject_editors = subject_editors_filtered  # And now assigned back
            subject_editors.sort()
        else:
            subject_editors = []

        # Generate and save profile page
        wp_editors_formatted = ""
        subject_editors_formatted = ""
        if len(wp_editors) > 0:
            for editor in wp_editors:
                wp_editors_formatted += "\n* [[User:{0}|{0}]] ([[User talk:{0}|talk]])".format(editor)
        if len(subject_editors) > 0 and len(subject_editors) < 3200:
            for editor in subject_editors:
                subject_editors_formatted += "\n* [[User:{0}|{0}]] ([[User talk:{0}|talk]])".format(editor)

        profilepage = "{{{{WikiProject description page | project = {0} | list_of_active_wikiproject_participants = {1} | list_of_active_subject_area_editors = {2}}}}}".format(project_normalized, wp_editors_formatted, subject_editors_formatted)
        page = pywikibot.Page(bot, rootpage + '/Description/' + project_normalized)
        if profilepage != page.text:  # Only save if something changed, to cut down on API queries
            page.text = profilepage
            page.save('Updating', minor=False, asynchronous=True)

        # Construct directory entry
        directoryrow[project] = "{{{{WikiProject directory entry | project = {0} | number_of_articles = {1} | wp_editors = {2} | scope_editors = {3}}}}}\n".format(project_normalized, len(articles[project]), len(wp_editors), len(subject_editors))

    # Assign directory entries to the relevant directory pages
    # ("All entries" plus the relevant subdirectory pages)
    for entry in sorted(directoryrow.items(), key=operator.itemgetter(1)):  # Sorting into alphabetical order
        directories['All'] += entry[1]
    directories['All'] = "{{WikiProject directory top}}\n" + directories['All'] + "|}"

    wpcats = WikiProjectCategories()
    tree = wpcats.generate()
    index_primary = sorted(tree.keys())
    index_secondary = {}
    indextext = "'''[[{0}/All|All WikiProjects]]'''\n\n".format(rootpage)
    for firstlevel in tree.keys():
        directories[firstlevel] = "={0}=\n".format(firstlevel.replace('_', ' '))
        directories[firstlevel] += self.listpull(wptools, projects, directoryrow, firstlevel)  # For immediate subcats of WikiProjects_by_area
        directories[firstlevel] += self.treeiterator(wptools, tree[firstlevel], projects, directoryrow, firstlevel)  # For descendants of those immediate subcats
        index_secondary[firstlevel] = sorted(tree[firstlevel].keys())

    # Updating the directory index
    for firstlevel in index_primary:
        firstlevel_normalized = firstlevel.replace('_', ' ')
        indextext += ";[[{0}/{1}|{1}]]".format(rootpage, firstlevel_normalized)
        if len(tree[firstlevel]) > 0:
            indextext += " : "
            for secondlevel in index_secondary[firstlevel]:
                indextext += "[[{0}/{1}#{2}|{2}]] – ".format(rootpage, firstlevel_normalized, secondlevel.replace('_', ' '))
            indextext = indextext[:-3]  # Truncates the trailing dash (and is also a cute smiley face)
        indextext += "\n\n"
    saveindex = pywikibot.Page(bot, 'Template:WikiProject directory index')
    saveindex.text = indextext
    saveindex.save('Updating', minor=False, asynchronous=True)

    # Generate directories and save!
    for directory in directories.keys():
        contents = directories[directory]
        page = pywikibot.Page(bot, rootpage + "/" + directory)
        if contents != page.text:  # Only save if something changed, to cut down on API save queries
            oldcontents = page.text
            page.text = contents
            page.save('Updating', minor=False, asynchronous=True)

            # Cleanup of obsolete description pages and "Related WikiProjects" pages
            if directory == 'All':
                oldcontents = mwph.parse(oldcontents)
                oldcontents = oldcontents.filter_templates()
                oldprojectlist = []
                for t in oldcontents:
                    if t.name.strip() == "WikiProject directory entry":
                        oldprojectlist.append(str(t.get('project').value))
                for oldproject in oldprojectlist:
                    oldproject = oldproject.strip().replace(' ', '_')  # Normalizing
                    if oldproject not in projects:
                        deletethis = pywikibot.Page(bot, rootpage + '/Description/' + oldproject)
                        deletethis.text = "{{db-g6|rationale=A bot has automatically tagged this page as obsolete. This means that the WikiProject described on this page has been deleted or made into a redirect}}\n"
                        deletethis.save('Nominating page for deletion', minor=False, asynchronous=True)
                        deletethis = pywikibot.Page(bot, 'Wikipedia:Related WikiProjects/' + oldproject)
                        if deletethis.text != "":
                            deletethis.text = "{{db-g6|rationale=A bot has automatically tagged this page as obsolete. This means that the WikiProject described on this page has been deleted or made into a redirect}}\n"
                            deletethis.save('Nominating page for deletion', minor=False, asynchronous=True)
class PriorityPredictor:
    def __init__(self, viewdump=None):
        print("Initializing the Priority Predictor")
        self.wptools = WikiProjectTools()
        if viewdump is None:  # If a dumped JSON file of pageviews is not specified
            self.dump = getviewdump(self.wptools, 'en', days=30)
        else:
            with open(viewdump, 'r') as f:
                self.dump = json.load(f)  # Load pageviews from a dumped JSON file

    def loadproject(self, wikiproject, unknownpriority):
        self.projectcat = unknownpriority.replace("Unknown-", "")
        self.project = wikiproject
        self.score = {}  # Unsorted dictionary "article: value"; allows for easily looking up scores later

        # We need all the articles for a WikiProject, since the system works
        # by comparing stats for an article to the others.
        print("Preparing Priority Predictor for: " + self.project)
        self.articles = []  # List of strings (article titles)
        pageviews = []      # List of tuples (article title, log of view count)
        linkcount = []      # List of tuples (article title, log of link count)
        for row in self.wptools.query('index', 'select pi_page from projectindex where pi_project = "Wikipedia:{0}";'.format(self.project), None):
            if row[0].startswith("Talk:"):
                article = row[0][5:]  # Stripping out "Talk:"
                self.articles.append(article)
                pageviews.append((article, log(getpageviews(self.dump, article) + 1)))

        # Inbound link count
        # This is batched, thus broken out of the loop
        print("Getting inbound link count...")
        packages = []
        for i in range(0, len(self.articles), 10000):
            packages.append(self.articles[i:i+10000])
        for package in packages:
            for item in getlinkcount(self.wptools, package):
                linkcount.append(item)

        # "Internal Clout"
        # This measures, within a group of articles, the number of links to each other
        # Works amazingly well as a metric
        print("Measuring internal clout...")
        internalclout = getinternalclout(self.wptools, self.articles, self.articles)

        # SOPV, Second-Order Page Views
        # Calculates the page views of the articles linking to the article being assessed
        print("Measuring second-order page views...")
        sopv = getsopv(self.wptools, self.dump, self.articles)

        # Sorting...
        pageviews = sorted(pageviews, key=operator.itemgetter(1), reverse=True)
        linkcount = sorted(linkcount, key=operator.itemgetter(1), reverse=True)
        internalclout = sorted(internalclout, key=operator.itemgetter(1), reverse=True)
        sopv = sorted(sopv, key=operator.itemgetter(1), reverse=True)

        # Converting to dictionaries to weight the factors and add them all together
        print("Preparing weighted scores...")
        pageviews_weighted = {}
        linkcount_weighted = {}
        internalclout_weighted = {}
        sopv_weighted = {}

        # Weights assigned to the different factors
        self.weight_pageviews = 1
        self.weight_linkcount = 1
        self.weight_internalclout = 1
        self.weight_sopv = 1

        for article, count in pageviews:
            pageviews_weighted[article] = count * self.weight_pageviews
        for article, count in linkcount:
            linkcount_weighted[article] = count * self.weight_linkcount
        for article, count in internalclout:
            internalclout_weighted[article] = count * self.weight_internalclout
        for article, count in sopv:
            sopv_weighted[article] = count * self.weight_sopv

        for article in self.articles:
            if article in internalclout_weighted and article in pageviews_weighted \
            and article in linkcount_weighted and article in sopv_weighted:
                # Only the internal-clout and SOPV factors feed the score vector
                self.score[article] = [internalclout_weighted[article], sopv_weighted[article]]

        # Multiclass classification
        print("Making calculations...")
        self.threshold = {}
        self.scorelist = {}
        q = 'select page_title from categorylinks join page on cl_from = page_id where cl_type = "page" and cl_to = "{0}";'
        for priority in ['Top-', 'High-', 'Mid-', 'Low-']:
            prioritycategory = priority + self.projectcat
            self.scorelist[priority] = [(row[0].decode('utf-8'), self.score[row[0].decode('utf-8')])
                                        for row in self.wptools.query('wiki', q.format(prioritycategory), None)
                                        if row[0].decode('utf-8') in self.score]

        X = np.array([x[1] for x in self.scorelist['Top-']]
                     + [x[1] for x in self.scorelist['High-']]
                     + [x[1] for x in self.scorelist['Mid-']]
                     + [x[1] for x in self.scorelist['Low-']])
        y = np.array([0 for x in self.scorelist['Top-']]
                     + [1 for x in self.scorelist['High-']]
                     + [2 for x in self.scorelist['Mid-']]
                     + [3 for x in self.scorelist['Low-']])
        model = OneVsRestClassifier(LinearSVC(random_state=0)).fit(X, y)
        print(list(model.predict(X)))
        print(model.score(X, y))
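# A minimal usage sketch for the class above. The project name and the
# "Unknown-importance" category below are hypothetical examples; the
# __main__ guard is likewise an assumption, not part of the original file:
if __name__ == '__main__':
    pp = PriorityPredictor()  # or PriorityPredictor(viewdump='views.json') to reuse a saved dump
    pp.loadproject('WikiProject Example', 'Unknown-importance_Example_articles')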
class WikiProjectNotifications:
    def __init__(self):
        self.wptools = WikiProjectTools()
        q = ('create table if not exists notifications '
             '(n_id int(11) NOT NULL auto_increment, '
             'n_project VARCHAR(255) character set utf8 collate utf8_unicode_ci, '
             'n_variant VARCHAR(255) character set utf8 collate utf8_unicode_ci, '
             'n_content TEXT character set utf8 collate utf8_unicode_ci, '
             'primary key (n_id)) '
             'engine=innodb character set=utf8;')
        #self.wptools.query('index', q, None)
        self.bot = pywikibot.Site('en', 'wikipedia', user='******')

        # Recognized notification variants
        # A variant that is not any of these kinds will cause an error
        # variantname --> template parameter name
        date = datetime.datetime.utcnow().strftime('%d %B %Y')
        self.contentwrapper = '<div style="max-width:500px; padding-bottom:2.5em;">'
        self.recognizedvariants = {
            'newmember': 'notification_when_a_new_member_joins',
            'newdiscussion': 'notification_when_a_new_discussion_topic_is_posted'}
        self.varianttext = {
            'newmember': '==New member report for ' + date + '==\n'
                         + self.contentwrapper
                         + 'The following users joined the WikiProject in the past day:\n',
            'newdiscussion': '==New discussion report for ' + date + '==\n'
                             + self.contentwrapper
                             + 'New discussions that are of interest to the WikiProject:\n'}

    def active_user(self, username):
        '''
        Determines if a username meets a basic threshold of activity.
        Takes string *username*, returns boolean.
        The threshold is one edit in the recentchanges tables (i.e. in the past 30 days).
        '''
        q = 'select count(*) from recentchanges_userindex where rc_user_text = "{0}"'.format(username.replace('_', ' '))
        return self.wptools.query('wiki', q, None)[0][0] > 0

    def post(self, project, variant, content):
        '''
        Adds an item to the WikiProject Notification Center,
        to be included in the next update.
        '''
        if variant in self.recognizedvariants:
            q = 'insert into notifications (n_project, n_variant, n_content) values ("{0}", "{1}", "{2}");'
            q = q.format(project, variant, content)
            self.wptools.query('index', q, None)
        else:
            raise NotificationVariantError(variant)

    def findsubscribers(self):
        '''
        Generates a dictionary of WikiProjects with notification centers
        and corresponding report subscribers.
        '''
        q = ('select page_title from templatelinks '
             'join page on page_id = tl_from and page_namespace = tl_from_namespace '
             'where page_namespace = 2 and tl_namespace = 10 '
             'and tl_title = "WikiProjectCard";')
        output = {}
        for row in self.wptools.query('wiki', q, None):
            title = row[0].decode('utf-8')
            components = title.split('/')  # e.g. ['Harej', 'WikiProjectCards', 'WikiProject_Women_in_Technology']
            username = components[0]

            # No notifications for inactive users
            if not self.active_user(username):
                continue

            # Carrying on...
            # NOTE: the remainder of this method was redacted in the source
            # (a credential scrubber replaced a span of code with "******");
            # the surviving fragment is preserved verbatim below.
            title = "User: "******"New notification", minor=False, async=True, quiet=True)

        # Deleting old records now that notifications have been sent out
        if len(id_to_delete) > 0:
            if len(id_to_delete) == 1:
                self.wptools.query('index', 'delete from notifications where n_id = {0};'.format(id_to_delete[0]), None)
            else:
                self.wptools.query('index', 'delete from notifications where n_id in {0};'.format(tuple(id_to_delete)), None)
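# A minimal sketch of queueing a notification with the class above. The
# project name and message text are hypothetical, and the __main__ guard is
# an assumption, not part of the original file:
if __name__ == '__main__':
    wpn = WikiProjectNotifications()
    # Queue a "new member" item; it is stored in the notifications table
    # and delivered on the next findsubscribers() run.
    wpn.post('WikiProject_Example', 'newmember',
             "* [[User:Example|Example]] ([[User talk:Example|talk]])")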
def __init__(self):
    self.wptools = WikiProjectTools()
    self.bot = pywikibot.Site('en', 'wikipedia')