def __init__(self, diff_engine=None, revert_radius=None, revert_detector=None):
    """
    Configures diff processing and revert detection state.

    :Parameters:
        diff_engine : `object`
            Optional engine used to produce diffs.  Must expose a
            ``process()`` method (checked here) and a ``processor()``
            factory (called here to build ``self.diff_processor``).
        revert_radius : `int`
            Number of revisions to consider when detecting identity
            reverts.  Used to build an internal
            :class:`mwreverts.Detector` when no `revert_detector`
            is supplied.
        revert_detector : :class:`mwreverts.Detector`
            A pre-constructed revert detector.  Either this or
            `revert_radius` must be provided.

    :Raises:
        TypeError : if `diff_engine` lacks a ``process()`` method, or if
            neither `revert_detector` nor `revert_radius` is given.
    """
    if diff_engine is not None:
        if not hasattr(diff_engine, 'process'):
            # BUG FIX: previously `.format()` bound only to the second
            # string literal ("process() method.") because `.` binds
            # tighter than `+`, so the "{0}" placeholder was emitted
            # verbatim and the offending type was never shown.
            # Parenthesizing the concatenation applies the format to the
            # whole message.
            raise TypeError(("'diff_engine' of type {0} does not have a " +
                             "process() method.").format(type(diff_engine)))
        self.diff_engine = diff_engine
        self.diff_processor = self.diff_engine.processor()
    else:
        self.diff_engine, self.diff_processor = None, None

    # Either pass a detector or the revert radius so I can make one
    if revert_detector is None and revert_radius is None:
        raise TypeError("Either a 'revert_detector' or a " +
                        "'revert_radius' must be provided.")

    if revert_detector is None:
        self.revert_detector = mwreverts.Detector(int(revert_radius))
    else:
        self.revert_detector = revert_detector

    # Stores the last tokens
    self.last = Version()
def extract(self, page, verbose=False):
    """
    Processes an :class:`mwxml.Page` and returns a generator of
    first-observations of a project/label pair.

    :Parameters:
        page : :class:`mwxml.Page`
            Page to process
        verbose : `bool`
            print dots to stderr
    """
    # Guard clause: silently skip pages outside the namespaces of
    # interest (the original `pass`/`else` nesting yielded nothing too).
    if page.namespace not in self.namespaces:
        return

    if verbose:
        sys.stderr.write("\n{0}: ".format(page.title))
        sys.stderr.flush()

    revisions = OrderedDict()
    detector = mwreverts.Detector()

    # Process all of the revisions looking for reverts
    for revision in page:
        # Feed every revision to the detector, even ones whose label
        # extraction fails below -- revert detection is sha1-based.
        revert = detector.process(revision.sha1, revision.id)
        try:
            revision_text = revision.text or ""
            project_labels = set(self.extract_labels(revision_text))
        except Exception:
            # BUG FIX: was a bare `except:`, which also swallowed
            # KeyboardInterrupt/SystemExit.  Keep the best-effort skip,
            # but only for genuine errors.
            logger.warning("Could not extract labels from text:")
            logger.warning(traceback.format_exc())
            continue

        revisions[revision.id] = {
            'id': revision.id,
            'timestamp': revision.timestamp,
            'was_reverted': False,
            'is_a_revert': revert is not None,
            'reverted': revert.reverteds if revert is not None else [],
            'project_labels': project_labels
        }

        if revert is not None:
            # This revision is a revert.  Flip the reverted-status of the
            # revisions it undid.
            self.invert_reverted_status(
                revisions[revision.id]['reverted'], revisions)

    # Re-process revisions only considering those that were not
    # reverted
    last_labels = set()
    for rev_id, revision in revisions.items():
        if revision['was_reverted']:
            if verbose:
                sys.stderr.write("r")
                sys.stderr.flush()
            continue

        # Get the new labels
        new_labels = revision['project_labels'] - last_labels
        last_labels = revision['project_labels']

        # Log some verbose stuff
        if verbose:
            if len(new_labels) > 0:
                sys.stderr.write("l")
            else:
                sys.stderr.write(".")
            sys.stderr.flush()

        for project, label in new_labels:
            yield {
                'rev_id': revision['id'],
                'timestamp': revision['timestamp'],
                'project': project,
                'wp10': label
            }
def extract(self, page, verbose=False):
    """
    Processes an :class:`mwxml.Page` and returns a generator of
    first-observations of a project/label pair.

    :Parameters:
        page : :class:`mwxml.Page`
            Page to process
        verbose : `bool`
            print dots to stderr
    """
    # Guard clause: silently skip pages outside the namespaces of
    # interest (the original `pass`/`else` nesting yielded nothing too).
    if page.namespace not in self.namespaces:
        return

    if verbose:
        sys.stderr.write("\n{0}: ".format(page.title))
        sys.stderr.flush()

    labelings = {}
    last_labels = set()
    detector = mwreverts.Detector()

    # Process all of the revisions looking for new class labels
    for revision in page:
        revert = detector.process(revision.sha1, revision.id)
        try:
            revision_text = revision.text or ""
            project_labels = set(self.extract_labels(revision_text))
        except Exception:
            # BUG FIX: was a bare `except:`, which also swallowed
            # KeyboardInterrupt/SystemExit.  Keep the best-effort skip,
            # but only for genuine errors.
            logger.warning("Could not extract labels from text:")
            logger.warning(traceback.format_exc())
            continue

        if revert is not None:
            # This revision is a revert.  Mark the labelings of the
            # reverted revisions as reverted, and un-mark the labelings
            # of the revision that was restored.
            for rev_id in revert.reverteds:
                if rev_id in labelings:
                    for lab in labelings[rev_id]:
                        lab['reverted'] = True

            if revert.reverted_to in labelings:
                for lab in labelings[revert.reverted_to]:
                    lab['reverted'] = False

            if verbose:
                sys.stderr.write("r")
                sys.stderr.flush()
        else:
            # This revision is not a revert.
            # Get the new labels
            new_labels = project_labels - last_labels

            # Log some verbose stuff
            # BUG FIX: this was `if verbose and len(new_labels) > 0:`
            # with an `else` that wrote "." -- so the "." progress dot
            # was written to stderr even when verbose was False.  Nest
            # the symbol choice under the verbose check instead.
            if verbose:
                if len(new_labels) > 0:
                    sys.stderr.write("l")
                else:
                    sys.stderr.write(".")
                sys.stderr.flush()

            # Update lookup of rev_ids that affect labelings
            if len(new_labels) > 0:
                labelings[revision.id] = [
                    {'rev_id': revision.id,
                     'project': project,
                     'wp10': wp10,
                     'timestamp': revision.timestamp,
                     'reverted': False}
                    for project, wp10 in new_labels
                ]

            # Update state so we make an appropriate comparison next time
            last_labels = project_labels

    # Find first labelings and filter out reverted labelings
    first_observations = {}
    for observations in labelings.values():
        for ob in observations:
            if ob['reverted']:
                continue
            pair = (ob['project'], ob['wp10'])
            # Keep the earliest surviving observation of each pair.
            if pair in first_observations:
                if ob['timestamp'] < \
                        first_observations[pair]['timestamp']:
                    first_observations[pair] = ob
            else:
                first_observations[pair] = ob

    # All cleaned up.  Yield what we've got.
    for ob in first_observations.values():
        yield ob
def process_dump(dump, path):
    # Labels each revision in `dump` as maybe-damaging or not, using revert
    # behavior plus user trust/block signals, and yields
    # (rev_id, maybe_damaging, reason) tuples.
    #
    # NOTE(review): relies on names from enclosing scope (not visible here):
    # `revert_radius`, `start`, `end`, `revert_window`, `trusted_edits`,
    # `check_blocked`, `trusted_users`, `session`, `load_user_info`, `User`,
    # `user_recently_blocked`, `mwreverts`, `deque` -- presumably CLI args /
    # module globals; verify against the rest of the file.
    # NOTE(review): `path` is not used inside this function.
    for page in dump:
        # Fresh revert detector and sliding window of pending revisions
        # per page.  A revision is only emitted once `revert_radius`
        # later revisions have been seen (or the page ends), so revert
        # information has had a chance to arrive.
        detector = mwreverts.Detector(radius=revert_radius)
        window = deque(maxlen=revert_radius)
        for revision in page:
            revision.maybe_damaging = None
            revision.reason = None
            # Feed the detector BEFORE the date filters so reverts that
            # straddle the start boundary are still detected.  The
            # revision object itself is stored so it can be mutated when
            # a later revert points back at it.
            revert = detector.process(revision.sha1, revision)
            if start and revision.timestamp < start:
                continue
            if end and revision.timestamp > end:
                continue
            window.append(revision)
            if revert is not None:
                # A revert!
                for reverted in revert.reverteds:
                    if (revision.timestamp - reverted.timestamp) <= revert_window and \
                       reverted.user is not None and \
                       revision.user is not None and \
                       reverted.user.text != revision.user.text and \
                       reverted.maybe_damaging is not False:
                        # Happened within the window
                        # wasn't a self revert and hasn't
                        # already been marked good.
                        reverted.maybe_damaging = True
                        reverted.reason = "Reverted by someone else"
                # NOTE(review): `reverted_to.user` is dereferenced without a
                # None check (unlike the loop above) -- presumably safe for
                # this dump, but confirm deleted/suppressed users can't
                # reach here.
                if revert.reverted_to.maybe_damaging and \
                   revert.reverted_to.user.text != revision.user.text:
                    # Reverted back by someone else.  Mark it good
                    # again.
                    revert.reverted_to.maybe_damaging = False
                    revert.reverted_to.reason = "Reverted back by " + \
                                                "someone else"
            # Get user info
            load_user_data = trusted_edits or check_blocked
            if revision.user.id is not None and revision.user.id > 0 and \
               load_user_data:
                # Registered user and we actually need the data.
                info = load_user_info(revision.user.text, session)
            else:
                # Anonymous user (or data not needed): zero edits, no groups.
                info = User(revision.user.id, 0, set())
            # assumes revision.timestamp supports integer-seconds
            # arithmetic -- TODO confirm (mwtypes.Timestamp does)
            two_days_later = revision.timestamp + (60 * 60 * 24 * 2)
            if trusted_users and info.id in trusted_users:
                revision.maybe_damaging = False
                revision.reason = "In trusted group"
            elif check_blocked and user_recently_blocked(
                    revision.user.text, session, two_days_later):
                # User was blocked. Edits may be damaging!
                revision.maybe_damaging = True
                revision.reason = "User was blocked from editing"
            elif trusted_edits and info.editcount >= trusted_edits:
                revision.maybe_damaging = False
                revision.reason = "Enough edits to be trusted"
            else:
                revision.reason = "Unknown"
            # Window full: the oldest revision can no longer be affected
            # by future reverts, so emit it.
            if len(window) == revert_radius:
                old_revision = window.popleft()
                yield (old_revision.id, old_revision.maybe_damaging,
                       old_revision.reason)
        # Page exhausted: flush whatever is still pending in the window.
        for old_revision in window:
            yield (old_revision.id, old_revision.maybe_damaging,
                   old_revision.reason)
def process_dump(dump):
    """
    Processes a dump and, for each page with revisions in 2019-2021,
    yields one page-info dict followed by one dict per target-range
    revision, annotated with revert relationships (is_revert,
    is_reverted, self-revert flags, revert targets/sizes, time to
    revert).

    Revisions from 2014 onward are fed to the revert detector so that
    reverts occurring early in the target range can still be resolved;
    only revisions in [2019-01-01, 2021-01-01] are emitted.
    """
    # BUG FIX: this boundary was previously built as a naive datetime
    # while all the others are pinned to UTC, so detector_start_timestamp
    # silently depended on the machine's local timezone.  Pin it to UTC
    # like the rest.
    detector_start_date = datetime.fromisoformat('2014-01-01').replace(
        tzinfo=pytz.UTC)  # 5 years before the start of 2019
    detector_start_timestamp = int(detector_start_date.timestamp())
    start_date = datetime.fromisoformat('2019-01-01').replace(
        tzinfo=pytz.UTC)  # start of 2019
    start_timestamp = int(start_date.timestamp())
    midpoint_date = datetime.fromisoformat('2020-01-01').replace(
        tzinfo=pytz.UTC)
    midpoint_timestamp = int(midpoint_date.timestamp())
    end_date = datetime.fromisoformat('2021-01-01').replace(tzinfo=pytz.UTC)
    end_timestamp = int(end_date.timestamp())

    duplicate_reverted_count = 0
    for page in dump:
        is_page_redirect = int(page.redirect is not None)
        page_namespace = page.namespace
        page_id = page.id
        target_range_rev_count = 0
        target_range_midpoint_rev_count = 0
        rev_list = []            # rev_data dicts, in revision order
        rev_dict = {}            # rev_id -> rev_data (target range only)
        rev_user_text_dict = {}  # rev_id -> user_text for pre-2019 revs
        prev_timestamp = None
        prev_rev_size_bytes = None

        # we use a new detector for each page
        detector = mwreverts.Detector(radius=15)

        for revision in page:
            # convert each revision to json and extract the relevant
            # info from it
            rev_doc = revision.to_json()
            rev_id = rev_doc['id']
            rev_timestamp = int(
                dateutil.parser.isoparse(rev_doc['timestamp']).timestamp())
            rev_size_bytes = rev_doc['bytes']

            if rev_timestamp < detector_start_timestamp:
                # skip all initial page revisions
                prev_timestamp = rev_timestamp
                prev_rev_size_bytes = rev_size_bytes
                continue
            elif rev_timestamp > end_timestamp:
                # skip all revisions after the period of interest
                continue

            rev_user_text = ""
            rev_user_id = ""
            if 'user' in rev_doc:
                # Escape tabs/newlines so user names stay single-field in
                # downstream TSV-style output.
                rev_user_text = rev_doc['user']['text'].replace(
                    '\t', '\\t').replace(
                    '\n', '\\n') if 'text' in rev_doc['user'] else None
                rev_user_id = rev_doc['user']['id'] if 'id' in rev_doc[
                    'user'] else None

            if rev_timestamp < start_timestamp:
                # process the sha1 so that we can identify reverts once
                # we are in the range of interest
                checksum = rev_doc.get('sha1') or mwreverts.DummyChecksum()
                detector.process(checksum, rev_doc)
                prev_timestamp = rev_timestamp
                prev_rev_size_bytes = rev_size_bytes
                rev_user_text_dict[rev_id] = rev_user_text
                continue

            # after this point, we are after 2018!
            target_range_rev_count += 1
            if rev_timestamp < midpoint_timestamp:
                # still before the midpoint
                target_range_midpoint_rev_count += 1

            seconds_to_prev = None
            if prev_timestamp is not None:
                seconds_to_prev = rev_timestamp - prev_timestamp
            delta_bytes = None
            if prev_rev_size_bytes is not None:
                delta_bytes = rev_size_bytes - prev_rev_size_bytes

            assert rev_doc['page']['id'] == page_id
            assert rev_doc['page']['namespace'] == page_namespace
            page_title = rev_doc['page']['title']

            rev_data = {
                'page_id': page_id,
                'rev_id': rev_id,
                'prev_rev_id': rev_doc['parent_id']
                               if 'parent_id' in rev_doc else None,
                'is_minor': rev_doc['minor'],
                'user_text': rev_user_text,
                'user_id': rev_user_id,
                'rev_timestamp': rev_timestamp,
                'seconds_to_prev': seconds_to_prev,
                'curr_bytes': rev_size_bytes,
                'delta_bytes': delta_bytes,
                # kept for reference; summaries are deliberately not emitted
                #'edit_summary': rev_doc['comment'] if 'comment' in rev_doc else None,
                'has_edit_summary': 'comment' in rev_doc,
                'is_reverted': False,
                'is_revert': False,
                'is_reverted_to_by_other': False,
                'is_self_reverted': False,
                'is_self_revert': False,
                'revert_target_id': None,
                'revert_set_size': None,
                'revert_id': None,
                'seconds_to_revert': None,
            }
            rev_list.append(rev_data)
            rev_dict[rev_id] = rev_data

            # now, we check if we have identified a new revert
            checksum = rev_doc.get('sha1') or mwreverts.DummyChecksum()
            revert = detector.process(checksum, rev_doc)
            # we only consider reverts in the target timerange
            if revert:
                revert_json = revert.to_json()
                reverting_id = revert_json['reverting']['id']
                reverted_to_id = revert_json['reverted_to']['id']
                reverteds_ids = [rev['id']
                                 for rev in revert_json['reverteds']]
                assert reverting_id == rev_id
                rev_data['is_revert'] = True
                rev_data['revert_target_id'] = reverted_to_id
                rev_data['revert_set_size'] = len(reverteds_ids)

                # compute is_reverted_to_by_other
                if reverted_to_id in rev_dict:
                    # else the reverted_to target happened before the
                    # target period
                    reverted_to_rev_data = rev_dict[reverted_to_id]
                    # once true, is_reverted_to_by_other is always true
                    if not reverted_to_rev_data['is_reverted_to_by_other']:
                        # is_reverted_to_by_other means "was a revert
                        # target" BUT a revert target from a revert
                        # authored by a DIFFERENT user
                        reverted_to_rev_data[
                            'is_reverted_to_by_other'] = rev_data[
                                'user_text'] != reverted_to_rev_data[
                                    'user_text']

                # true in most cases; if false, not enough info to know
                # if a self revert
                is_self_revert = rev_data['user_text'] is not None

                # identify the reverted revisions
                # which is all revs in reverteds_ids that have not
                # previously been reverted
                # NOTE: loop variable renamed from `rev_id` -- the
                # original shadowed the current revision's id.
                actual_reverteds_ids = set()
                revert_set_size = 0
                for reverted_rev_id in reverteds_ids:
                    if reverted_rev_id not in rev_dict:
                        # this revision happened before the target period
                        revert_set_size += 1
                        if reverted_rev_id in rev_user_text_dict:
                            if rev_user_text_dict[
                                    reverted_rev_id] is None or \
                                    rev_user_text_dict[
                                        reverted_rev_id] != \
                                    rev_data['user_text']:
                                is_self_revert = False
                        else:
                            # note: in rare circumstances, we can miss
                            # self-reverts, iff (a) reverted rev is
                            # before 2014, (b) reverting rev is after
                            # 2019.  In this case, we assume not a self
                            # revert
                            is_self_revert = False
                            print(
                                f"Revision {reverted_rev_id} reverted by revision {reverting_id} after more than 5 years."
                            )
                        continue
                    if not rev_dict[reverted_rev_id]['is_reverted']:
                        # first time this revision is reverted: count it
                        actual_reverteds_ids.add(reverted_rev_id)
                        revert_set_size += 1
                    else:
                        # COMMENT FIX: this branch (not the one above) is
                        # the "already considered reverted by an
                        # intervening revision" case.
                        duplicate_reverted_count += 1

                assert revert_set_size > 0, str(
                    revert_set_size) + " size set: " + str(
                        reverted_to_id) + " " + str(reverteds_ids) + " " + str(
                            reverting_id)
                rev_data['revert_set_size'] = revert_set_size

                if len(actual_reverteds_ids) == 0:
                    # no actual revisions to update as reverted, so
                    # assume not a self-revert
                    is_self_revert = False

                # update the data of the reverted revisions
                for reverted_rev_id in actual_reverteds_ids:
                    reverted_rev_data = rev_dict[reverted_rev_id]
                    reverted_rev_data['is_reverted'] = True
                    reverted_rev_data['revert_id'] = reverting_id
                    if reverted_rev_data['user_text'] is None or \
                            reverted_rev_data['user_text'] != \
                            rev_data['user_text']:
                        # at least one reverted id is not by this user,
                        # so not a self-revert
                        is_self_revert = False
                    reverted_rev_data['seconds_to_revert'] = rev_data[
                        'rev_timestamp'] - reverted_rev_data['rev_timestamp']
                    reverted_rev_data['revert_target_id'] = reverted_to_id
                    reverted_rev_data['revert_set_size'] = revert_set_size

                rev_data['is_self_revert'] = is_self_revert
                if is_self_revert:
                    # need to update all of the reverteds as well
                    for reverted_rev_id in actual_reverteds_ids:
                        rev_dict[reverted_rev_id]['is_self_reverted'] = True

            prev_timestamp = rev_timestamp
            prev_rev_size_bytes = rev_size_bytes

        if target_range_rev_count > 0:
            # emit page info (page_title is always bound here: a nonzero
            # count implies at least one revision reached the assignment)
            page_info = {
                'page_id': page_id,
                'wiki_namespace': page_namespace,
                'page_title': page_title,
                'full_rev_count': target_range_rev_count,  # corresponds to 2019-2021 revs
                'range_rev_count': target_range_midpoint_rev_count,  # corresponds to 2019-2020 revs
                'is_page_redirect': is_page_redirect,
            }
            yield page_info
            # emit revision data from target range
            for rev_data in rev_list:
                yield rev_data

    if duplicate_reverted_count > 0:
        print(
            f"Identified {duplicate_reverted_count} reverted ids that turned out to be already reverted."
        )
def process_dump(dump):
    """
    Yields one flat tuple per revision falling in [2010-01-01,
    2020-01-01]: (page_id, page_namespace, is_page_redirect, rev_id,
    rev_timestamp, rev_user_text, rev_user_id, is_revert_target,
    is_reverted, is_reverting).
    """
    # Inclusive timestamp bounds for the window of interest.
    window_start = int(datetime.fromisoformat('2010-01-01').timestamp())
    window_end = int(datetime.fromisoformat('2020-01-01').timestamp())

    for page in dump:
        page_id = page.id
        page_namespace = page.namespace
        is_page_redirect = int(page.redirect is not None)

        rev_count = 0
        collected = []         # [page_id, rev_id, timestamp, user_text, user_id]
        reverting_ids = set()  # revisions that performed a revert
        reverted_ids = set()   # revisions that were undone by a revert
        target_ids = set()     # revisions that a revert restored

        # we use a new detector for each page
        detector = mwreverts.Detector(radius=15)

        for revision in page:
            rev_count += 1
            # Convert the revision to json and pull out the fields we need.
            rev_doc = revision.to_json()
            rev_id = rev_doc['id']
            rev_timestamp = int(
                datetime.strptime(rev_doc['timestamp'],
                                  "%Y-%m-%dT%H:%M:%SZ").timestamp())

            user_doc = rev_doc.get('user', {})
            rev_user_text = user_doc.get('text', "")
            rev_user_id = user_doc.get('id', "")

            collected.append(
                [page_id, rev_id, rev_timestamp, rev_user_text, rev_user_id])

            # Feed the detector; fall back to a dummy checksum when the
            # sha1 is missing.
            checksum = rev_doc.get('sha1') or mwreverts.DummyChecksum()
            revert = detector.process(checksum, rev_doc)

            # Only record reverts whose reverting edit lies in the window.
            if revert and window_start <= rev_timestamp <= window_end:
                revert_doc = revert.to_json()
                reverting_ids.add(revert_doc['reverting']['id'])
                target_ids.add(revert_doc['reverted_to']['id'])
                reverted_ids.update(r['id'] for r in revert_doc['reverteds'])

        # Emit every in-window revision with its revert classification,
        # back to the central process.
        for page_id, rev_id, rev_timestamp, rev_user_text, \
                rev_user_id in collected:
            if window_start <= rev_timestamp <= window_end:
                yield (page_id, page_namespace, is_page_redirect, rev_id,
                       rev_timestamp, rev_user_text, rev_user_id,
                       int(rev_id in target_ids),
                       int(rev_id in reverted_ids),
                       int(rev_id in reverting_ids))