def parse(page):
    # Score a single page based on the text of its most recent revision.
    # Assumes a module-level scorer_model and the wikiclass package are
    # available (see the scoring example further down).
    res = []
    empty = True
    for rev in page:
        # iterate through to the last revision of the page
        empty = False
    if empty:
        return res
    res.append(page.title)
    res.append(wikiclass.score(scorer_model, rev.text)['prediction'])
    return [res]
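## A minimal, hypothetical driver for parse(), assuming the pages come from a
## MediaWiki XML dump read with mwxml and that wikiclass and scorer_model are
## already defined (as in the scoring example further down). This is a sketch,
## not part of the original pipeline, and the dump path is a placeholder.
import mwxml

dump = mwxml.Dump.from_file(open('enwiki-pages-meta-history.xml'))  # placeholder path
for page in dump:
    row = parse(page)
    if row:
        # row is [[page title, predicted assessment class]]
        print(row)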
def process_drafts(self, drafts):
    '''
    Go through the edit history of the given drafts, identify their AfC
    submission history, and get predictions for relevant revisions.
    '''

    ## current time, to be used in the revision query if necessary
    current_timestamp = dt.datetime.now(dt.timezone.utc)

    ## query to get all revisions for a live page from the revision table
    rev_live_query = '''SELECT rev_id, rev_timestamp
                        FROM enwiki.revision
                        WHERE rev_page = %(page_id)s
                        AND rev_timestamp >= %(start_timestamp)s
                        AND rev_timestamp < %(end_timestamp)s
                        ORDER BY rev_timestamp'''

    ## query to get all revisions for a deleted page from the archive table
    rev_archive_query = '''SELECT ar_rev_id AS rev_id, ar_timestamp AS rev_timestamp
                           FROM enwiki.archive
                           WHERE ar_page_id = %(page_id)s
                           AND ar_timestamp >= %(start_timestamp)s
                           AND ar_timestamp < %(end_timestamp)s
                           ORDER BY ar_timestamp'''

    for draft in drafts:
        ## get revisions between creation time and either publication,
        ## deletion, or current time depending on whether the page
        ## was deleted, published, or neither
        status = None
        end_timestamp = current_timestamp
        if draft.publication_timestamp:
            if draft.deletion_timestamp \
               and draft.deletion_timestamp < draft.publication_timestamp:
                status = 'deleted'
                end_timestamp = draft.deletion_timestamp
            else:
                status = 'published'
                end_timestamp = draft.publication_timestamp
        elif draft.deletion_timestamp:
            status = 'deleted'
            end_timestamp = draft.deletion_timestamp

        revisions = []
        with db.cursor(self.db_conn, 'dict') as db_cursor:
            ## Default query for a live page, switch to archive if deleted.
            db_query = rev_live_query
            if status == 'deleted':
                db_query = rev_archive_query

            db_cursor.execute(
                db_query,
                {'page_id': draft.page_id,
                 'start_timestamp': draft.creation_timestamp.strftime('%Y%m%d%H%M%S'),
                 'end_timestamp': end_timestamp.strftime('%Y%m%d%H%M%S')}
            )
            for row in db_cursor:
                revisions.append(
                    Revision(row['rev_id'],
                             row['rev_timestamp'].decode('utf-8')))

        # grab content for those revisions,
        # store in a map from revid to content
        revision_map = {}
        for revision in self.get_rev_content(revisions, status == 'deleted'):
            rev_id = revision['revid']
            content = revision['content']
            revision_map[rev_id] = content

        prev_rev_content = ''  # content of the previous revision, for diff

        ## set of AfC submissions present in the previous revision, so that
        ## we can identify withdrawn (deleted) submissions.
        previous_submissions = set()

        # go through the edit history in chronological order
        for revision in revisions:
            try:
                ## content of the current revision, used for parsing and scoring
                content = revision_map[revision.id]
                parsed_content = mwp.parse(content)
            except KeyError:
                ## have no content for this revision
                continue
            except mwp.parser.ParserError:
                ## parsing failed, unable to process this revision
                continue

            ## AfC submissions in this revision, which we'll diff against
            ## the previous one to identify withdrawn submissions
            current_submissions = set()

            templates = parsed_content.filter_templates()
            for template in templates:
                if template.name.lower() != 'afc submission' \
                   and not template.name.matches('Articles for creation'):
                    continue

                try:
                    status = template.params[0].value
                except IndexError:
                    status = ''
                try:
                    comment = template.params[1].value
                except IndexError:
                    comment = ''
                try:
                    username = str(template.get('u').value)
                except ValueError:
                    username = ''
                try:
                    namespace = int(str(template.get('ns').value))
                except ValueError:
                    namespace = 118
                try:
                    timestamp = dt.datetime.strptime(
                        str(template.get('ts').value), '%Y%m%d%H%M%S')
                except ValueError:
                    timestamp = revision.timestamp

                if status.upper() == 'D':
                    try:
                        decliner = str(template.get('decliner').value)
                    except ValueError:
                        decliner = ''
                    try:
                        decline_timestamp = dt.datetime.strptime(
                            str(template.get('declinets').value),
                            '%Y%m%d%H%M%S')
                    except ValueError:
                        decline_timestamp = revision.timestamp

                ## Is this an AfC submitted for review?
                afc_submitted = False
                if status == '':
                    afc_submitted = True

                ## Is this a new entry or an existing one?
                if timestamp in draft.afc_history:
                    afc = draft.afc_history[timestamp]
                    if afc.status != 'D' and status.upper() == 'D':
                        ## AfC was declined
                        afc.status = 'D'
                        afc.decliner = decliner
                        afc.decline_timestamp = decline_timestamp
                    elif afc.status == '' and status == '':
                        afc_submitted = False  # submitted previously
                    elif afc.status == 'T' and status == '':
                        afc.status = ''  # submitted now
                else:
                    afc = AfCSubmission(draft.page_id, timestamp,
                                        status.upper(), revision.id, None)
                    draft.afc_history[timestamp] = afc

                ## We have a submission, add it to the current set
                current_submissions.add(afc)

                if afc_submitted:
                    draft_results = wikiclass.score(self.draft_model, content)
                    wp10_results = wikiclass.score(self.wp10_model, content)

                    # store the predictions for this revision
                    afc.prediction = Prediction(revision.id, revision.timestamp)
                    afc.prediction.draft_pred = draft_results['prediction']
                    afc.prediction.ok_prob = draft_results['probability']['OK']
                    afc.prediction.attack_prob = draft_results['probability']['attack']
                    afc.prediction.vandal_prob = draft_results['probability']['vandalism']
                    afc.prediction.spam_prob = draft_results['probability']['spam']

                    afc.prediction.wp10_pred = wp10_results['prediction']
                    afc.prediction.stub_prob = wp10_results['probability']['Stub']
                    afc.prediction.start_prob = wp10_results['probability']['Start']
                    afc.prediction.c_prob = wp10_results['probability']['C']
                    afc.prediction.b_prob = wp10_results['probability']['B']
                    afc.prediction.ga_prob = wp10_results['probability']['GA']
                    afc.prediction.fa_prob = wp10_results['probability']['FA']

            ## We've completed checking all templates. Compare the current
            ## set with the previous set and mark submissions that have been
            ## submitted but since deleted as withdrawn ('W')
            submission_diff = previous_submissions - current_submissions
            for afc in submission_diff:
                if afc.status == '':
                    afc.status = 'W'
                    afc.withdraw_timestamp = revision.timestamp

            ## Now we can iterate:
            previous_submissions = current_submissions

    # ok, done
    return()
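## A sketch of the helper containers that process_drafts() above (and
## process_queues() below) appear to rely on. The real definitions are not
## included in this file; the field names and order here are reconstructed
## from how the attributes are used and should be treated as assumptions.
from collections import namedtuple

## (rev_id, timestamp) pairs pulled from the revision/archive tables
Revision = namedtuple('Revision', ['id', 'timestamp'])


class AfCSubmission:
    '''One {{AfC submission}} template instance found in a draft's history.'''
    def __init__(self, page_id, timestamp, status, rev_id, prediction):
        self.page_id = page_id
        self.timestamp = timestamp
        self.status = status  # e.g. '' (pending), 'T', 'D' (declined), 'W' (withdrawn)
        self.rev_id = rev_id
        self.prediction = prediction
        self.decliner = None
        self.decline_timestamp = None
        self.withdraw_timestamp = None


class Prediction:
    '''Draft-quality and wp10 predictions for a given revision.'''
    def __init__(self, rev_id, timestamp):
        self.rev_id = rev_id
        self.timestamp = timestamp
        ## the draft_pred/ok_prob/.../fa_prob attributes are assigned by
        ## process_drafts() after scoring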
def process_queues(self, revlists):
    '''
    Process the lists of revisions by grabbing revision content through
    the appropriate API calls, getting predictions based on the content,
    then updating the database with those predictions.

    :param revlists: RevisionLists named tuple with lists of archived
                     and live revisions that we will process.
    :type revlists: namedtuple
    '''

    ## Empty prediction result for draftquality, inserted if we didn't
    ## get a prediction back:
    draft_res_dummy = {'prediction': '',
                       'probability': {'spam': 0.0,
                                       'vandalism': 0.0,
                                       'attack': 0.0,
                                       'OK': 0.0}}
    ## Empty prediction result for wp10
    wp10_res_dummy = {'prediction': '',
                      'probability': {'FA': 0.0,
                                      'Start': 0.0,
                                      'B': 0.0,
                                      'Stub': 0.0,
                                      'C': 0.0,
                                      'GA': 0.0}}

    if not self.api_session:
        self.start_api_session()

    if not self.ores_api_session:
        self.start_ores_session()

    i = 0
    with db.cursor(self.db_conn) as db_cursor:
        for revision in self.get_rev_content(revlists.deleted, deleted=True):
            rev_id = revision['revid']
            rev_timestamp = dt.datetime.strptime(revision['timestamp'],
                                                 "%Y-%m-%dT%H:%M:%SZ")
            content = revision['content']

            draft_results = wikiclass.score(self.draft_model, content)
            wp10_results = wikiclass.score(self.wp10_model, content)

            self.insert_results(db_cursor, rev_id, rev_timestamp,
                                draft_results, wp10_results)

            i += 1
            if i % 1000 == 0:
                logging.info('inserted {} data predictions'.format(i))
                self.db_conn.commit()

        ## It's more efficient to use ORES for the live revisions.
        for revision, revision_pred in zip(
                revlists.live,
                self.ores_api_session.score(
                    self.ores_context, self.ores_models,
                    [r.id for r in revlists.live])):
            if 'score' not in revision_pred['draftquality'] \
               and 'score' not in revision_pred['wp10']:
                ## No data available, skip this revision
                continue
            elif 'score' not in revision_pred['draftquality']:
                revision_pred['draftquality']['score'] = draft_res_dummy
            elif 'score' not in revision_pred['wp10']:
                revision_pred['wp10']['score'] = wp10_res_dummy

            self.insert_results(db_cursor, revision.id, revision.timestamp,
                                revision_pred['draftquality']['score'],
                                revision_pred['wp10']['score'])

            i += 1
            if i % 1000 == 0:
                logging.info('inserted {} data predictions'.format(i))
                self.db_conn.commit()

    logging.info('done inserting predictions, {} in total'.format(i))
    self.db_conn.commit()

    # ok, done
    return()
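## Hypothetical setup for calling process_queues(): a RevisionLists namedtuple
## with 'deleted' and 'live' fields, as suggested by the attribute access
## above. The actual queue-building code is not part of this file.
from collections import namedtuple

RevisionLists = namedtuple('RevisionLists', ['deleted', 'live'])

## e.g., with `predictor` an instance of the class these methods belong to,
## and the two lists holding Revision tuples for archived and live revisions:
##   queues = RevisionLists(deleted=archived_revisions, live=live_revisions)
##   predictor.process_queues(queues)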
from pprint import pprint

import wikiclass
from revscoring import Model

scorer_model = Model.load(
    open('../revscoring_models/enwiki.nettrom_wp10.gradient_boosting.model', 'rb'))

# Classifies a revision of an article based on wikitext alone
text = "An '''anachronism''' {{cite }}(from the [[Ancient Greek|Greek]] <ref ..."
prediction_results = wikiclass.score(scorer_model, text)

# Print predicted assessment class and probabilities for all classes.
pprint(("assessment", prediction_results['prediction']))
pprint(("probs", prediction_results['probability']))
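## The printed result is the predicted assessment class (one of Stub, Start,
## C, B, GA, FA for this model) plus a dict mapping every class to its
## probability, e.g. (actual values depend on the model and input text):
##   ('assessment', 'Stub')
##   ('probs', {'B': ..., 'C': ..., 'FA': ..., 'GA': ..., 'Start': ..., 'Stub': ...})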
import wikiclass
from revscoring.scorer_models import MLScorerModel

import requests
from collections import Counter
import time
import os

model = MLScorerModel.load(
    open("/home/west1/github/wikiclass/models/enwiki.wp10.rf.model", "rb"))

datadir = os.environ['HOME'] + '/wikimedia/trunk/hoaxes/data/all_relevant_article_creation_content/'

print('\t'.join(['title', 'Stub', 'B', 'C', 'FA', 'Start', 'GA']))

for f in os.listdir(datadir):
    if not f.endswith(".txt"):
        continue
    with open(datadir + f, 'r') as markup_file:
        markup = markup_file.read()
    obj = wikiclass.score(model, markup)
    print('\t'.join([f,
                     str(obj['probability']['Stub']),
                     str(obj['probability']['B']),
                     str(obj['probability']['C']),
                     str(obj['probability']['FA']),
                     str(obj['probability']['Start']),
                     str(obj['probability']['GA'])]))