Пример #1
0
    def __init__(self,
                 diff_engine=None,
                 revert_radius=None,
                 revert_detector=None):
        """
        Sets up the diff processor, the revert detector and the initial
        (empty) version state.

        :Parameters:
            diff_engine : object
                Optional.  Must expose a ``process`` attribute; its
                ``processor()`` method is called once and the result is
                stored as ``self.diff_processor``.  When `None`, both
                ``self.diff_engine`` and ``self.diff_processor`` are `None`.
            revert_radius : `int`
                Converted with ``int()`` and used to construct a
                :class:`mwreverts.Detector` when no `revert_detector` is
                given.
            revert_detector : object
                A ready-made detector.  Takes precedence over
                `revert_radius`.

        :Raises:
            TypeError : if `diff_engine` lacks a ``process`` attribute, or
            if neither `revert_detector` nor `revert_radius` is provided.
        """
        if diff_engine is None:
            self.diff_engine, self.diff_processor = None, None
        else:
            # NOTE(review): the check looks for `process` while the call
            # below uses `processor()` -- confirm which one is intended.
            if not hasattr(diff_engine, 'process'):
                # Join the literals *before* formatting; `.format` binds
                # tighter than `+`, so formatting only the last literal
                # would leave the "{0}" placeholder unfilled.
                raise TypeError(
                    ("'diff_engine' of type {0} does not have a "
                     "process() method.").format(type(diff_engine)))
            self.diff_engine = diff_engine
            self.diff_processor = self.diff_engine.processor()

        # Either pass a detector or the revert radius so I can make one
        if revert_detector is None and revert_radius is None:
            raise TypeError("Either a 'revert_detector' or a " +
                            "'revert_radius' must be provided.")

        if revert_detector is None:
            self.revert_detector = mwreverts.Detector(int(revert_radius))
        else:
            self.revert_detector = revert_detector

        # Stores the last tokens
        self.last = Version()
Пример #2
0
    def extract(self, page, verbose=False):
        """
        Processes an :class:`mwxml.Page` and returns a generator of
        first-observations of a project/label pair.

        Works in two passes: the first collects the labels of every
        revision and records revert information; the second walks the
        revisions in order, skips those flagged ``was_reverted`` and yields
        each (project, label) pair that was not present in the previous
        non-reverted revision.

        :Parameters:
            page : :class:`mwxml.Page`
                Page to process
            verbose : `bool`
                print dots to stderr

        :Returns:
            A generator of `dict` with the keys ``rev_id``, ``timestamp``,
            ``project`` and ``wp10``.  Nothing is yielded for pages whose
            namespace is not in ``self.namespaces``.
        """
        if page.namespace not in self.namespaces:
            return

        if verbose:
            sys.stderr.write("\n{0}: ".format(page.title))
            sys.stderr.flush()

        revisions = OrderedDict()
        detector = mwreverts.Detector()

        # Process all of the revisions looking for reverts
        for revision in page:

            revert = detector.process(revision.sha1, revision.id)
            try:
                revision_text = revision.text or ""
                project_labels = set(self.extract_labels(revision_text))
            except Exception:
                # Best effort: a revision whose labels cannot be extracted
                # is logged and skipped.  Only `Exception` is caught so that
                # KeyboardInterrupt/SystemExit still propagate.
                logger.warning("Could not extract labels from text:")
                logger.warning(traceback.format_exc())
                continue

            is_a_revert = revert is not None
            revisions[revision.id] = {
                'id': revision.id,
                'timestamp': revision.timestamp,
                'was_reverted': False,
                'is_a_revert': is_a_revert,
                'reverted': revert.reverteds if is_a_revert else [],
                'project_labels': project_labels
            }

            if is_a_revert:
                # This revision is a revert.
                # presumably flips 'was_reverted' on the affected entries;
                # verify against invert_reverted_status.
                self.invert_reverted_status(
                    revisions[revision.id]['reverted'], revisions)

        # Re-process revisions only considering those that were not
        # reverted
        last_labels = set()
        for revision in revisions.values():
            if revision['was_reverted']:
                if verbose:
                    sys.stderr.write("r")
                    sys.stderr.flush()
                continue

            # Get the new labels
            new_labels = revision['project_labels'] - last_labels
            last_labels = revision['project_labels']

            # Log some verbose stuff
            if verbose:
                sys.stderr.write("l" if new_labels else ".")
                sys.stderr.flush()

            for project, label in new_labels:
                yield {
                    'rev_id': revision['id'],
                    'timestamp': revision['timestamp'],
                    'project': project,
                    'wp10': label
                }
Пример #3
0
    def extract(self, page, verbose=False):
        """
        Processes an :class:`mwxml.Page` and returns a generator of
        first-observations of a project/label pair.

        Labels introduced by a non-revert revision are recorded against
        that revision.  When a later revision is a revert, the labelings of
        the reverted revisions are flagged ``reverted`` and those of the
        reverted-to revision are un-flagged.  Finally, for each
        (project, wp10) pair the non-reverted observation with the smallest
        timestamp is yielded.

        :Parameters:
            page : :class:`mwxml.Page`
                Page to process
            verbose : `bool`
                print dots to stderr

        :Returns:
            A generator of `dict` with the keys ``rev_id``, ``project``,
            ``wp10``, ``timestamp`` and ``reverted``.  Nothing is yielded
            for pages whose namespace is not in ``self.namespaces``.
        """
        if page.namespace not in self.namespaces:
            return

        if verbose:
            sys.stderr.write("\n{0}: ".format(page.title))
            sys.stderr.flush()

        labelings = {}
        last_labels = set()
        detector = mwreverts.Detector()

        # Process all of the revisions looking for new class labels
        for revision in page:

            revert = detector.process(revision.sha1, revision.id)

            try:
                revision_text = revision.text or ""
                project_labels = set(self.extract_labels(revision_text))
            except Exception:
                # Best effort: log and skip this revision.  Only `Exception`
                # is caught so KeyboardInterrupt/SystemExit still propagate.
                logger.warning("Could not extract labels from text:")
                logger.warning(traceback.format_exc())
                continue

            if revert is not None:
                # This revision is a revert.
                for rev_id in revert.reverteds:
                    for lab in labelings.get(rev_id, ()):
                        lab['reverted'] = True

                for lab in labelings.get(revert.reverted_to, ()):
                    lab['reverted'] = False

                if verbose:
                    sys.stderr.write("r")
                    sys.stderr.flush()
            else:
                # This revision is not a revert.  Get the new labels
                new_labels = project_labels - last_labels

                # Log some verbose stuff.  Progress marks are written only
                # when `verbose` is set; previously the "." was written
                # even when it was not.
                if verbose:
                    sys.stderr.write("l" if new_labels else ".")
                    sys.stderr.flush()

                # Update lookup of rev_ids that affect labelings
                if new_labels:
                    labelings[revision.id] = [
                        {'rev_id': revision.id,
                         'project': project,
                         'wp10': wp10,
                         'timestamp': revision.timestamp,
                         'reverted': False}
                        for project, wp10 in new_labels
                    ]

            # Update state so we make an appropriate comparison next time
            last_labels = project_labels

        # Find first labelings and filter out reverted labelings
        first_observations = {}
        for observations in labelings.values():
            for ob in observations:
                if ob['reverted']:
                    continue
                pair = (ob['project'], ob['wp10'])
                earliest = first_observations.get(pair)
                if earliest is None or ob['timestamp'] < earliest['timestamp']:
                    first_observations[pair] = ob

        # All cleaned up.  Yield what we've got.
        for ob in first_observations.values():
            yield ob
Пример #4
0
    def process_dump(dump, path):
        """
        Labels every revision of every page in `dump` as possibly damaging
        or not and yields the result.

        A revision is marked as maybe-damaging when it is reverted by a
        different user within `revert_window`, or when its author was
        recently blocked; it is marked as not damaging when it is reverted
        back to by someone else, when its author is in a trusted group, or
        when the author has enough edits.  Revisions are buffered in a
        window so that later reverts can still update them before they are
        emitted.

        :Parameters:
            dump : iterable of pages
                Each page is iterated for its revisions.
            path : `str`
                Not used by this function.

        :Returns:
            A generator of ``(rev_id, maybe_damaging, reason)`` tuples.
        """
        # Whether per-user info must be fetched is the same for every
        # revision, so work it out once.
        load_user_data = trusted_edits or check_blocked

        for page in dump:
            detector = mwreverts.Detector(radius=revert_radius)
            window = deque(maxlen=revert_radius)
            for revision in page:
                revision.maybe_damaging = None
                revision.reason = None
                # The revision object itself is the detector payload so that
                # reverted / reverted-to revisions can be annotated in place.
                revert = detector.process(revision.sha1, revision)

                if start and revision.timestamp < start:
                    continue
                if end and revision.timestamp > end:
                    continue
                window.append(revision)

                # `user` may be None (the revert check below already
                # allowed for that); every access is guarded accordingly.
                user = revision.user

                if revert is not None:
                    # A revert!
                    for reverted in revert.reverteds:
                        if (revision.timestamp -
                            reverted.timestamp) <= revert_window and \
                           reverted.user is not None and \
                           user is not None and \
                           reverted.user.text != user.text and \
                           reverted.maybe_damaging is not False:
                            # Happened within the window
                            # wasn't a self revert and hasn't
                            # already been marked good.
                            reverted.maybe_damaging = True
                            reverted.reason = "Reverted by someone else"

                    reverted_to = revert.reverted_to
                    if reverted_to.maybe_damaging and \
                       reverted_to.user is not None and \
                       user is not None and \
                       reverted_to.user.text != user.text:
                        # Reverted back to by someone else.  Mark it good
                        # again.
                        reverted_to.maybe_damaging = False
                        reverted_to.reason = "Reverted back by " + \
                                             "someone else"

                # Get user info
                user_id = user.id if user is not None else None
                if user_id is not None and user_id > 0 and load_user_data:
                    info = load_user_info(user.text, session)
                else:
                    info = User(user_id, 0, set())

                two_days_later = revision.timestamp + (60 * 60 * 24 * 2)
                if trusted_users and info.id in trusted_users:
                    revision.maybe_damaging = False
                    revision.reason = "In trusted group"
                elif check_blocked and user is not None and \
                        user_recently_blocked(
                            user.text, session, two_days_later):
                    # User was blocked. Edits may be damaging!
                    revision.maybe_damaging = True
                    revision.reason = "User was blocked from editing"
                elif trusted_edits and info.editcount >= trusted_edits:
                    revision.maybe_damaging = False
                    revision.reason = "Enough edits to be trusted"
                else:
                    revision.reason = "Unknown"

                # Once the buffer holds `revert_radius` revisions, emit the
                # oldest one.
                if len(window) == revert_radius:
                    old_revision = window.popleft()
                    yield (old_revision.id, old_revision.maybe_damaging,
                           old_revision.reason)

            # Flush whatever is still buffered for this page.
            for old_revision in window:
                yield (old_revision.id, old_revision.maybe_damaging,
                       old_revision.reason)
Пример #5
0
def process_dump(dump):
    """
    Extract per-revision revert metadata for revisions made in 2019-2020.

    For every page in `dump` a fresh revert detector (radius 15) is used.
    Revisions before 2014 are skipped entirely, revisions from 2014 up to
    2019 are only fed to the detector (so that later reverts can be
    recognised), and revisions from 2019 up to the start of 2021 are
    recorded and annotated with revert information.

    :Parameters:
        dump : iterable of pages
            Each page is iterated for its revisions; every revision must
            provide ``to_json()``.

    :Returns:
        A generator.  For each page with at least one revision in the
        target range it yields one page-info `dict` (keys ``page_id``,
        ``wiki_namespace``, ``page_title``, ``full_rev_count``,
        ``range_rev_count``, ``is_page_redirect``) followed by one `dict`
        per target-range revision.
    """
    # Every boundary is pinned to UTC.  A naive datetime would be
    # interpreted in the host's local timezone by `.timestamp()`, making
    # this boundary inconsistent with the UTC-aware ones below.
    detector_start_date = datetime.fromisoformat('2014-01-01').replace(
        tzinfo=pytz.UTC)  # 5 years before the start of 2019
    detector_start_timestamp = int(detector_start_date.timestamp())
    start_date = datetime.fromisoformat('2019-01-01').replace(
        tzinfo=pytz.UTC)  # start of 2019
    start_timestamp = int(start_date.timestamp())
    midpoint_date = datetime.fromisoformat('2020-01-01').replace(
        tzinfo=pytz.UTC)
    midpoint_timestamp = int(midpoint_date.timestamp())
    end_date = datetime.fromisoformat('2021-01-01').replace(tzinfo=pytz.UTC)
    end_timestamp = int(end_date.timestamp())
    duplicate_reverted_count = 0
    for page in dump:
        is_page_redirect = int(page.redirect is not None)
        page_namespace = page.namespace
        page_id = page.id
        target_range_rev_count = 0
        target_range_midpoint_rev_count = 0

        rev_list = []  # target-range revisions, in page order
        rev_dict = {}  # rev_id -> rev_data for target-range revisions
        rev_user_text_dict = {}  # rev_id -> user text for 2014-2018 revisions

        prev_timestamp = None
        prev_rev_size_bytes = None

        # we use a new detector for each page
        detector = mwreverts.Detector(radius=15)
        for revision in page:
            # convert each revision to json and extract the relevant info from it
            rev_doc = revision.to_json()
            rev_id = rev_doc['id']
            rev_timestamp = int(
                dateutil.parser.isoparse(rev_doc['timestamp']).timestamp())
            rev_size_bytes = rev_doc['bytes']

            if rev_timestamp < detector_start_timestamp:
                # skip all initial page revisions
                prev_timestamp = rev_timestamp
                prev_rev_size_bytes = rev_size_bytes
                continue
            elif rev_timestamp > end_timestamp:
                # skip all revisions after the period of interest
                continue

            # "" means the revision has no user entry at all; None means the
            # user entry exists but lacks that field.
            rev_user_text = ""
            rev_user_id = ""
            if 'user' in rev_doc:
                user_doc = rev_doc['user']
                if 'text' in user_doc:
                    # escape tabs/newlines in the user name
                    rev_user_text = user_doc['text'].replace(
                        '\t', '\\t').replace('\n', '\\n')
                else:
                    rev_user_text = None
                rev_user_id = user_doc['id'] if 'id' in user_doc else None

            if rev_timestamp < start_timestamp:
                # process the sha1 so that we can identify reverts once we are in the range of interest
                checksum = rev_doc.get('sha1') or mwreverts.DummyChecksum()
                detector.process(checksum, rev_doc)
                prev_timestamp = rev_timestamp
                prev_rev_size_bytes = rev_size_bytes
                rev_user_text_dict[rev_id] = rev_user_text
                continue
            # after this point, we are after 2018!
            target_range_rev_count += 1
            if rev_timestamp < midpoint_timestamp:
                # still before the midpoint
                target_range_midpoint_rev_count += 1

            seconds_to_prev = None
            if prev_timestamp is not None:
                seconds_to_prev = rev_timestamp - prev_timestamp
            delta_bytes = None
            if prev_rev_size_bytes is not None:
                delta_bytes = rev_size_bytes - prev_rev_size_bytes

            assert rev_doc['page']['id'] == page_id
            assert rev_doc['page']['namespace'] == page_namespace
            page_title = rev_doc['page']['title']

            rev_data = {
                'page_id': page_id,
                'rev_id': rev_id,
                'prev_rev_id':
                rev_doc['parent_id'] if 'parent_id' in rev_doc else None,
                'is_minor': rev_doc['minor'],
                'user_text': rev_user_text,
                'user_id': rev_user_id,
                'rev_timestamp': rev_timestamp,
                'seconds_to_prev': seconds_to_prev,
                'curr_bytes': rev_size_bytes,
                'delta_bytes': delta_bytes,
                'has_edit_summary': 'comment' in rev_doc,
                'is_reverted': False,
                'is_revert': False,
                'is_reverted_to_by_other': False,
                'is_self_reverted': False,
                'is_self_revert': False,
                'revert_target_id': None,
                'revert_set_size': None,
                'revert_id': None,
                'seconds_to_revert': None,
            }
            rev_list.append(rev_data)
            rev_dict[rev_id] = rev_data

            # now, we check if we have identified a new revert
            checksum = rev_doc.get('sha1') or mwreverts.DummyChecksum()
            revert = detector.process(checksum, rev_doc)

            # we only consider reverts in the target timerange
            if revert:
                revert_json = revert.to_json()

                reverting_id = revert_json['reverting']['id']
                reverted_to_id = revert_json['reverted_to']['id']
                reverteds_ids = [rev['id'] for rev in revert_json['reverteds']]

                assert reverting_id == rev_id
                rev_data['is_revert'] = True
                rev_data['revert_target_id'] = reverted_to_id

                # compute is_reverted_to_by_other
                if reverted_to_id in rev_dict:
                    # else the reverted_to target happened before the target period
                    reverted_to_rev_data = rev_dict[reverted_to_id]
                    # once true, is_reverted_to_by_other is always true
                    if not reverted_to_rev_data['is_reverted_to_by_other']:
                        # is_reverted_to_by_other means "was a revert target"
                        # BUT a revert target from a revert authored by a DIFFERENT user
                        reverted_to_rev_data['is_reverted_to_by_other'] = (
                            rev_data['user_text'] !=
                            reverted_to_rev_data['user_text'])

                # true in most cases; if false, not enough info to know if a self revert
                is_self_revert = rev_data['user_text'] is not None

                # identify the reverted revisions
                # which is all revs in reverteds_ids that have not previously been reverted
                actual_reverteds_ids = set()
                revert_set_size = 0
                # A distinct loop variable is used so the current revision's
                # `rev_id` is not overwritten.
                for reverted_id in reverteds_ids:
                    if reverted_id not in rev_dict:
                        # this revision happened before the target period
                        revert_set_size += 1
                        if reverted_id in rev_user_text_dict:
                            reverted_user_text = rev_user_text_dict[reverted_id]
                            if reverted_user_text is None or \
                                    reverted_user_text != rev_data['user_text']:
                                is_self_revert = False
                        else:
                            # note: in rare circumstances, we can miss self-reverts, iff
                            # (a) reverted rev is before 2014, (b) reverting rev is after 2019
                            # In this case, we assume not a self revert
                            is_self_revert = False
                            print(
                                f"Revision {reverted_id} reverted by revision {reverting_id} after more than 5 years."
                            )
                        continue
                    if not rev_dict[reverted_id]['is_reverted']:
                        # this revision has not yet been reverted by an
                        # intervening revision, so this revert claims it
                        actual_reverteds_ids.add(reverted_id)
                        revert_set_size += 1
                    else:
                        # already reverted by an earlier revert
                        duplicate_reverted_count += 1
                assert revert_set_size > 0, str(
                    revert_set_size) + " size set: " + str(
                        reverted_to_id) + " " + str(reverteds_ids) + " " + str(
                            reverting_id)
                rev_data['revert_set_size'] = revert_set_size
                if len(actual_reverteds_ids) == 0:
                    # no actual revisions to update as reverted, so assume not a self-revert
                    is_self_revert = False

                # update the data of the reverted revisions
                for reverted_id in actual_reverteds_ids:
                    reverted_rev_data = rev_dict[reverted_id]
                    reverted_rev_data['is_reverted'] = True
                    reverted_rev_data['revert_id'] = reverting_id
                    if reverted_rev_data['user_text'] is None or \
                            reverted_rev_data['user_text'] != \
                            rev_data['user_text']:
                        # at least one reverted id is not by this user, so not a self-revert
                        is_self_revert = False
                    reverted_rev_data['seconds_to_revert'] = rev_data[
                        'rev_timestamp'] - reverted_rev_data['rev_timestamp']
                    reverted_rev_data['revert_target_id'] = reverted_to_id
                    reverted_rev_data['revert_set_size'] = revert_set_size
                rev_data['is_self_revert'] = is_self_revert
                if is_self_revert:
                    # need to update all of the reverteds as well; this has
                    # to wait until the flag is final, hence a second pass
                    for reverted_id in actual_reverteds_ids:
                        rev_dict[reverted_id]['is_self_reverted'] = True

            prev_timestamp = rev_timestamp
            prev_rev_size_bytes = rev_size_bytes

        if target_range_rev_count > 0:
            # emit page info
            page_info = {
                'page_id': page_id,
                'wiki_namespace': page_namespace,
                'page_title': page_title,
                'full_rev_count':
                target_range_rev_count,  # corresponds to 2019-2021 revs
                'range_rev_count':
                target_range_midpoint_rev_count,  # corresponds to 2019-2020 revs
                'is_page_redirect': is_page_redirect,
            }
            yield page_info
            # emit revision data from target range
            for rev_data in rev_list:
                yield rev_data
    if duplicate_reverted_count > 0:
        print(
            f"Identified {duplicate_reverted_count} reverted ids that turned out to be already reverted."
        )
def process_dump(dump):
    """
    Classify the revisions of every page in `dump` by their role in reverts.

    A fresh revert detector (radius 15) is used per page.  Only reverts
    whose reverting revision falls between 2010-01-01 and 2020-01-01 UTC
    (inclusive) are considered, and only revisions inside that same range
    are emitted.

    :Parameters:
        dump : iterable of pages
            Each page is iterated for its revisions; every revision must
            provide ``to_json()``.

    :Returns:
        A generator of tuples ``(page_id, page_namespace,
        is_page_redirect, rev_id, rev_timestamp, rev_user_text,
        rev_user_id, is_revert_target, is_reverted, is_reverting)`` where
        the three trailing flags and ``is_page_redirect`` are 0/1 ints and
        ``rev_timestamp`` is an integer epoch value.
    """
    # Local import keeps this function self-contained.
    from datetime import timezone

    # The revision timestamps carry a trailing "Z" (UTC).  Both they and
    # the range boundaries are made UTC-aware; naive datetimes would be
    # interpreted in the host's local timezone by `.timestamp()`.
    start_timestamp = int(
        datetime.fromisoformat('2010-01-01').replace(
            tzinfo=timezone.utc).timestamp())
    end_timestamp = int(
        datetime.fromisoformat('2020-01-01').replace(
            tzinfo=timezone.utc).timestamp())

    for page in dump:
        is_page_redirect = int(page.redirect is not None)
        page_namespace = page.namespace
        page_id = page.id

        rev_tups = []
        is_revert_target_set = set()
        is_reverted_set = set()
        is_reverting_set = set()

        # we use a new detector for each page
        detector = mwreverts.Detector(radius=15)
        for revision in page:
            # convert each revision to json and extract the relevant info from it
            rev_doc = revision.to_json()
            rev_id = rev_doc['id']
            rev_timestamp = int(
                datetime.strptime(
                    rev_doc['timestamp'],
                    "%Y-%m-%dT%H:%M:%SZ").replace(
                        tzinfo=timezone.utc).timestamp())

            # Missing user information is represented by empty strings.
            rev_user_text = ""
            rev_user_id = ""
            if 'user' in rev_doc:
                user_doc = rev_doc['user']
                rev_user_text = user_doc['text'] if 'text' in user_doc else ""
                rev_user_id = user_doc['id'] if 'id' in user_doc else ""
            rev_tups.append(
                (rev_id, rev_timestamp, rev_user_text, rev_user_id))

            # now, we check if we have identified a new revert
            checksum = rev_doc.get('sha1') or mwreverts.DummyChecksum()
            revert = detector.process(checksum, rev_doc)

            # we only consider reverts in the target timerange
            if revert and start_timestamp <= rev_timestamp <= end_timestamp:
                revert_json = revert.to_json()

                # keep track of which revision ids are reverts/reverting/reverted-to-targets
                is_reverting_set.add(revert_json['reverting']['id'])
                is_revert_target_set.add(revert_json['reverted_to']['id'])
                is_reverted_set.update(
                    rev['id'] for rev in revert_json['reverteds'])

        # having processed for reverts, we output all revisions along with their types back to the central process
        for rev_id, rev_timestamp, rev_user_text, rev_user_id in rev_tups:
            if not (start_timestamp <= rev_timestamp <= end_timestamp):
                continue
            is_revert_target = int(rev_id in is_revert_target_set)
            is_reverted = int(rev_id in is_reverted_set)
            is_reverting = int(rev_id in is_reverting_set)
            yield (page_id, page_namespace, is_page_redirect, rev_id,
                   rev_timestamp, rev_user_text, rev_user_id,
                   is_revert_target, is_reverted, is_reverting)