Example #1
## Assumes `wikiclass` is imported and `scorer_model` is loaded at module
## level (see Example #4).
def parse(page):
    '''
    Score the most recent revision of the given page. Returns a single
    [title, prediction] row, or an empty list if the page has no revisions.
    '''
    last_rev = None
    for rev in page:
        last_rev = rev

    if last_rev is None:
        return []

    prediction = wikiclass.score(scorer_model, last_rev.text)['prediction']
    return [[page.title, prediction]]
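
A minimal driver for parse(), assuming pages come from an mwxml dump iterator and that scorer_model is loaded as in Example #4 (the dump path below is a placeholder):

import mwxml

## Placeholder path; any pages-meta-history XML dump will do.
dump = mwxml.Dump.from_file(open('enwiki-pages-meta-history.xml'))

rows = []
for page in dump:
    rows.extend(parse(page))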
Example #2
    def process_drafts(self, drafts):
        '''
        Go through the edit history of the given drafts, identify their
        AfC submission history, and get predictions for relevant revisions.
        '''

        ## current time, to be used in the revision query if necessary
        current_timestamp = dt.datetime.now(dt.timezone.utc)

        ## query to get all revisions for a live page from the revision table
        rev_live_query = '''SELECT rev_id, rev_timestamp
                            FROM enwiki.revision
                            WHERE rev_page = %(page_id)s
                            AND rev_timestamp >= %(start_timestamp)s
                            AND rev_timestamp < %(end_timestamp)s
                            ORDER BY rev_timestamp'''

        ## query to get all revisions for a deleted page from the archive table
        rev_archive_query = '''SELECT ar_rev_id AS rev_id,
                                      ar_timestamp AS rev_timestamp
                               FROM enwiki.archive
                               WHERE ar_page_id = %(page_id)s
                               AND ar_timestamp >= %(start_timestamp)s
                               AND ar_timestamp < %(end_timestamp)s
                               ORDER BY ar_timestamp'''
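
        ## MediaWiki stores revision timestamps as 14-character
        ## 'YYYYMMDDHHMMSS' strings (binary(14) in the database), hence the
        ## strftime() formatting and the decode('utf-8') below.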
        
        for draft in drafts:
            ## get revisions between creation time and either publication,
            ## deletion, or current time depending on whether the page
            ## was deleted, published, or neither

            draft_status = None
            end_timestamp = current_timestamp
            
            if draft.publication_timestamp:
                if draft.deletion_timestamp \
                   and draft.deletion_timestamp < draft.publication_timestamp:
                    draft_status = 'deleted'
                    end_timestamp = draft.deletion_timestamp
                else:
                    draft_status = 'published'
                    end_timestamp = draft.publication_timestamp
            elif draft.deletion_timestamp:
                draft_status = 'deleted'
                end_timestamp = draft.deletion_timestamp

            revisions = []
            
            with db.cursor(self.db_conn, 'dict') as db_cursor:
                ## Default query for a live page; switch to archive if deleted.
                db_query = rev_live_query
                if draft_status == 'deleted':
                    db_query = rev_archive_query

                db_cursor.execute(
                    db_query,
                    {'page_id': draft.page_id,
                     'start_timestamp': draft.creation_timestamp.strftime(
                         '%Y%m%d%H%M%S'),
                     'end_timestamp': end_timestamp.strftime('%Y%m%d%H%M%S')}
                )
                for row in db_cursor:
                    revisions.append(Revision(row['rev_id'],
                                              row['rev_timestamp'].decode('utf-8')))
            
            # grab content for those revisions,
            # store in a map from revid to content
            revision_map = {}
            for revision in self.get_rev_content(revisions,
                                                 draft_status == 'deleted'):
                rev_id = revision['revid']
                content = revision['content']

                revision_map[rev_id] = content

            ## set of AfC submissions present in the previous revision, so that
            ## we can identify withdrawn (deleted) submissions.
            previous_submissions = set()
            
            # go through the edit history in chronological order
            for revision in revisions:
                try:
                    content = revision_map[revision.id]
                except KeyError:
                    ## have no content for this revision
                    continue

                try:
                    parsed_content = mwp.parse(content)
                except mwp.parser.ParserError:
                    ## parsing failed, unable to process this revision
                    continue

                ## AfC submissions in this revision, which we'll diff against
                ## the previous one to identify withdrawn submissions
                current_submissions = set()
                
                templates = parsed_content.filter_templates()
                for template in templates:
                    if template.name.lower() != 'afc submission' \
                       and not template.name.matches('Articles for creation'):
                        continue

                    try: 
                        status = template.params[0].value
                    except IndexError:
                        status = ''

                    try:
                        comment = template.params[1].value
                    except IndexError:
                        comment = ''

                    try:
                        username = str(template.get('u').value)
                    except ValueError:
                        username = ''

                    try:
                        namespace = int(str(template.get('ns').value))
                    except ValueError:
                        namespace = 118

                    try:
                        timestamp = dt.datetime.strptime(
                            str(template.get('ts').value), '%Y%m%d%H%M%S')
                    except ValueError:
                        timestamp = revision.timestamp

                    if status.upper() == 'D':
                        try:
                            decliner = str(template.get('decliner').value)
                        except ValueError:
                            decliner = ''
                        try:
                            decline_timestamp = dt.datetime.strptime(
                                str(template.get('declinets').value),
                                '%Y%m%d%H%M%S')
                        except ValueError:
                            decline_timestamp = revision.timestamp

                    ## Is this an AfC submitted for review?
                    afc_submitted = False
                    if status == '':
                        afc_submitted = True

                    ## Is this a new entry or an existing one?
                    if timestamp in draft.afc_history:
                        afc = draft.afc_history[timestamp]

                        if afc.status != 'D' and status.upper() == 'D':
                            ## AfC was declined
                            afc.status = 'D'
                            afc.decliner = decliner
                            afc.decline_timestamp = decline_timestamp
                        elif afc.status == '' and status == '':
                            afc_submitted = False # submitted previously
                        elif afc.status == 'T' and status == '':
                            afc.status = '' # submitted now
                    else:
                        afc = AfCSubmission(draft.page_id, timestamp,
                                            status.upper(),
                                            revision.id, None)
                        draft.afc_history[timestamp] = afc

                    ## We have a submission, add it to the current set
                    current_submissions.add(afc)
                       
                    if afc_submitted:
                        draft_results = wikiclass.score(self.draft_model, content)
                        wp10_results = wikiclass.score(self.wp10_model, content)

                        # store the predictions on the AfC entry
                        afc.prediction = Prediction(revision.id,
                                                    revision.timestamp)
                        afc.prediction.draft_pred = draft_results['prediction']
                        afc.prediction.ok_prob = draft_results['probability']['OK']
                        afc.prediction.attack_prob = draft_results['probability']['attack']
                        afc.prediction.vandal_prob = draft_results['probability']['vandalism']
                        afc.prediction.spam_prob = draft_results['probability']['spam']
                        
                        afc.prediction.wp10_pred = wp10_results['prediction']
                        afc.prediction.stub_prob = wp10_results['probability']['Stub']
                        afc.prediction.start_prob = wp10_results['probability']['Start']
                        afc.prediction.c_prob = wp10_results['probability']['C']
                        afc.prediction.b_prob = wp10_results['probability']['B']
                        afc.prediction.ga_prob = wp10_results['probability']['GA']
                        afc.prediction.fa_prob = wp10_results['probability']['FA']

                ## We've checked every template in this revision. Compare the
                ## current set of submissions with the previous one, and mark
                ## submissions that have since been removed as withdrawn ('W').
                submission_diff = previous_submissions - current_submissions
                for afc in submission_diff:
                    if afc.status == '':
                        afc.status = 'W'
                        afc.withdraw_timestamp = revision.timestamp

                ## Carry the current set over to the next revision:
                previous_submissions = current_submissions

        # ok, done
        return
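
For reference, a minimal sketch of the helper types process_drafts() relies on. These definitions are inferred from how the attributes are used above; they are assumptions, not the project's actual classes:

from collections import namedtuple

## Hypothetical reconstructions, inferred from usage in process_drafts().
Revision = namedtuple('Revision', ['id', 'timestamp'])

class Prediction:
    '''Model scores for one revision; probability fields are set later.'''
    def __init__(self, rev_id, timestamp):
        self.rev_id = rev_id
        self.timestamp = timestamp

class AfCSubmission:
    '''One AfC submission template found in a draft's edit history.'''
    def __init__(self, page_id, timestamp, status, rev_id, prediction):
        self.page_id = page_id
        self.timestamp = timestamp
        self.status = status          # '', 'T', 'D', or 'W'
        self.rev_id = rev_id
        self.prediction = prediction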
Example #3
    def process_queues(self, revlists):
        '''
        Process the lists of revisions by grabbing revision content through
        the appropriate API calls, getting predictions based on the content,
        then updating the database with those predictions.

        :param revlists: RevisionLists named tuple with lists of archived
                         and live revisions that we will process.
        :type revlists: namedtuple
        '''
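
        ## NOTE: the shape of `revlists` is an assumption based on usage
        ## below; something like
        ##   RevisionLists = namedtuple('RevisionLists', ['deleted', 'live'])
        ## where both fields hold Revision objects with `id` and `timestamp`.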

        ## Empty prediction result for draftquality, inserted if we didn't
        ## get a prediction back:
        draft_res_dummy = {'prediction': '',
                           'probability': {'spam': 0.0,
                                           'vandalism': 0.0,
                                           'attack': 0.0,
                                           'OK': 0.0}}

        ## Empty prediction result for wp10
        wp10_res_dummy = {'prediction': '',
                          'probability': {'FA': 0.0,
                                          'Start': 0.0,
                                          'B': 0.0,
                                          'Stub': 0.0,
                                          'C': 0.0,
                                          'GA': 0.0}}
        
        if not self.api_session:
            self.start_api_session()

        if not self.ores_api_session:
            self.start_ores_session()

        i = 0
        with db.cursor(self.db_conn) as db_cursor:
            for revision in self.get_rev_content(revlists.deleted,
                                                 deleted=True):
                rev_id = revision['revid']
                rev_timestamp = dt.datetime.strptime(revision['timestamp'],
                                                     "%Y-%m-%dT%H:%M:%SZ")
                content = revision['content']

                draft_results = wikiclass.score(self.draft_model, content)
                wp10_results = wikiclass.score(self.wp10_model, content)

                self.insert_results(db_cursor, rev_id, rev_timestamp,
                                    draft_results, wp10_results)

                i += 1
                if i % 1000 == 0:
                    logging.info('inserted {} data predictions'.format(i))
                    self.db_conn.commit()

            ## It's more efficient to use ORES for the live revisions.
            for revision, revision_pred in zip(
                    revlists.live, self.ores_api_session.score(
                        self.ores_context, self.ores_models,
                        [r.id for r in revlists.live])):
                if 'score' not in revision_pred['draftquality'] \
                   and 'score' not in revision_pred['wp10']:
                    ## No data available, skip this revision
                    continue
                elif 'score' not in revision_pred['draftquality']:
                    revision_pred['draftquality']['score'] = draft_res_dummy
                elif 'score' not in revision_pred['wp10']:
                    revision_pred['wp10']['score'] = wp10_res_dummy
                
                self.insert_results(db_cursor, revision.id, revision.timestamp,
                                    revision_pred['draftquality']['score'],
                                    revision_pred['wp10']['score'])
                i += 1
                if i % 1000 == 0:
                    logging.info('inserted {} data predictions'.format(i))
                    self.db_conn.commit()
        
        logging.info('done inserting predictions, {} in total'.format(i))
        self.db_conn.commit()
                    
        # ok, done
        return
Example #4
from pprint import pprint

import wikiclass
from revscoring import Model

with open('../revscoring_models/enwiki.nettrom_wp10.gradient_boosting.model',
          'rb') as model_file:
    scorer_model = Model.load(model_file)

# Classifies a revision of an article based on wikitext alone
text = "An '''anachronism''' {{cite }}(from the [[Ancient Greek|Greek]] <ref ..."
prediction_results = wikiclass.score(scorer_model, text)

# Print predicted assessment class and probabilities for all classes.
pprint(("assessment", prediction_results['prediction']))
pprint(("probs", prediction_results['probability']))
Example #5
import os

import wikiclass
from revscoring.scorer_models import MLScorerModel

with open("/home/west1/github/wikiclass/models/enwiki.wp10.rf.model",
          "rb") as model_file:
    model = MLScorerModel.load(model_file)

datadir = os.path.join(
    os.environ['HOME'],
    'wikimedia/trunk/hoaxes/data/all_relevant_article_creation_content/')

print('\t'.join(['title', 'Stub', 'B', 'C', 'FA', 'Start', 'GA']))
for f in os.listdir(datadir):
    if not f.endswith('.txt'):
        continue

    with open(os.path.join(datadir, f), 'r') as markup_file:
        markup = markup_file.read()

    obj = wikiclass.score(model, markup)
    print('\t'.join([
        f,
        str(obj['probability']['Stub']),
        str(obj['probability']['B']),
        str(obj['probability']['C']),
        str(obj['probability']['FA']),
        str(obj['probability']['Start']),
        str(obj['probability']['GA'])
    ]))