Exemplo n.º 1
0
    def ingest_edits(cls, json_batch):
        """
        Ingest a batch of edit events and update the batches they belong to.

        Each truthy entry of ``json_batch`` whose ``'namespace'`` is listed in
        ``settings.WATCHED_NAMESPACES`` is processed as follows:

        * reverts (comment matched by ``cls.reverted_re``), deletions and
          restorations (``'log_action'``) are recorded, whether or not the
          event is later matched to a tool;
        * the event is matched against every ``Tool``; unmatched events are
          skipped;
        * the corresponding ``Batch`` is fetched or created, and its counters
          (``nb_edits``, ``total_diffsize``, ``ended``, ``nb_new_pages``,
          ``nb_distinct_pages``, ``nb_reverted_edits``) are updated in memory;
        * an ``Edit`` is built with ``Edit.from_json`` and queued for creation.

        Afterwards the queued edits are inserted, the batches and their tags
        are saved, and edits undone by a revert, deletion or restoration are
        marked as reverted.

        :param json_batch: iterable of dict-like events. The keys read here are
            ``'namespace'``, ``'timestamp'`` (passed to
            ``datetime.fromtimestamp``), ``'comment'``, ``'user'``, ``'title'``
            and, optionally, ``'log_action'`` and ``'length'`` (a mapping with
            ``'new'`` / ``'old'``). Falsy entries are ignored.
        :returns: None
        """
        # Map from (tool shortid, batch uid) to Batch object
        batches = {}
        model_edits = []
        reverted_ids = []
        deleted_pages = {}  # map: title -> timestamp of the last deletion seen
        restored_pages = {}  # map: title -> timestamp of the last restoration seen
        modified_pages = defaultdict(
            set)  # map: batch_key -> set of touched pages
        new_tags = defaultdict(set)  # map: batch id -> ids of tags to attach

        tools = Tool.objects.all()

        for edit_json in json_batch:
            if not edit_json or edit_json.get(
                    'namespace') not in settings.WATCHED_NAMESPACES:
                continue
            timestamp = datetime.fromtimestamp(edit_json['timestamp'], tz=UTC)

            # First, check if this is a revert
            revert_match = cls.reverted_re.match(edit_json['comment'])
            if revert_match:
                reverted_ids.append(int(revert_match.group(1)))

            # or a deletion
            if edit_json.get('log_action') == 'delete':
                deleted_pages[edit_json['title']] = timestamp

            # or a restore
            if edit_json.get('log_action') == 'restore':
                restored_pages[edit_json['title']] = timestamp

            # Then, try to match the edit with a tool (first match wins)
            match = None
            matching_tool = None
            for tool in tools:
                match = tool.match(edit_json['user'], edit_json['comment'])
                if match is not None:
                    matching_tool = tool
                    break

            if match is None:
                continue

            # The user name is truncated when a batch is created, so the
            # ownership check below must use the same truncated form.
            # Comparing against the raw name would make every new batch of a
            # user with an over-long name look foreign and get deleted.
            batch_user = match.user[:MAX_CHARFIELD_LENGTH]

            # Try to find an existing batch for that edit
            batch_key = (matching_tool.shortid, match.uid)
            batch = batches.get(batch_key)

            created = False
            if not batch:
                batch, created = Batch.objects.get_or_create(
                    tool=matching_tool,
                    uid=match.uid,
                    defaults={
                        'user': batch_user,
                        'summary': match.summary[:MAX_CHARFIELD_LENGTH],
                        'started': timestamp,
                        'ended': timestamp,
                        'nb_edits': 0,
                        'nb_distinct_pages': 0,
                        'nb_new_pages': 0,
                        'nb_reverted_edits': 0,
                        'total_diffsize': 0,
                    })

            # Check that the batch is owned by the right user
            if batch.user != batch_user:
                if created:
                    batch.delete()
                continue

            batch.nb_edits += 1
            # 'length' may be absent or null, and so may its 'new' / 'old' entries
            length_obj = edit_json.get('length') or {}
            batch.total_diffsize += (length_obj.get('new')
                                     or 0) - (length_obj.get('old') or 0)
            batch.ended = max(batch.ended, timestamp)

            batches[batch_key] = batch

            # Create the edit object
            model_edit = Edit.from_json(edit_json, batch)
            model_edits.append(model_edit)

            # Extract tags from the edit
            edit_tags = Tag.extract(model_edit)
            missing_tags = [
                tag.id for tag in edit_tags if tag.id not in batch.tag_ids
            ]
            new_tags[batch.id].update(missing_tags)

            # Take note of the modified page, for computation of the number of entities edited by a batch
            modified_pages[batch_key].add(edit_json['title'])
            # And the number of new pages
            if model_edit.changetype == 'new':
                batch.nb_new_pages += 1

        # if we saw some deletions which match any creations or undeletions we know of, mark them as deleted.
        # We do this before creating the previous edits in the same batch, because deletions and restorations
        # do not come with unique ids to identify the creation, deletion or restoration that they undo
        # (this is a notion that we introduce ourselves) so if a deletion and the corresponding revert happen
        # in the same batch we need to inspect the order in which they happened.
        if deleted_pages:
            cls.mark_as_reverted(
                Edit.objects.filter(title__in=deleted_pages.keys(),
                                    changetype__in=['new', 'restore']))
            for edit in model_edits:
                if (edit.title in deleted_pages
                        and edit.changetype in ['new', 'restore']
                        and edit.timestamp < deleted_pages.get(edit.title)):
                    edit.reverted = True
                    edit.batch.nb_reverted_edits += 1
        # finally if we saw some undeletions which match any deletions we know of, mark them as undone
        if restored_pages:
            cls.mark_as_reverted(
                Edit.objects.filter(title__in=restored_pages.keys(),
                                    changetype='delete'))
            for edit in model_edits:
                if (edit.title in restored_pages
                        and edit.changetype == 'delete'
                        and edit.timestamp < restored_pages.get(edit.title)):
                    edit.reverted = True
                    edit.batch.nb_reverted_edits += 1

        # Create all Edit objects update all the batch objects
        if batches:
            # Update the number of modified pages: only count the pages that
            # no already-stored edit of the batch has touched.
            for batch_key, pages in modified_pages.items():
                # every key of modified_pages was registered in batches first
                batch = batches[batch_key]
                existing_pages = set(
                    batch.edits.filter(title__in=pages).values_list('title',
                                                                    flat=True))
                unseen_pages = pages - existing_pages
                batch.nb_distinct_pages += len(unseen_pages)

            # Create all the edit objects
            try:
                with transaction.atomic():
                    Edit.objects.bulk_create(model_edits)
            except IntegrityError:
                # Oops! Some of them existed already!
                # Let's add them one by one instead.
                for edit in model_edits:
                    try:
                        Edit.objects.get(id=edit.id)
                    except Edit.DoesNotExist:
                        edit.save()
                    else:
                        # this edit was already seen: we need to remove it
                        # from the associated batch count
                        batch_key = (edit.batch.tool.shortid, edit.batch.uid)
                        batch = batches.get(batch_key)
                        if batch:
                            batch.nb_edits -= 1
                            batch.total_diffsize -= edit.newlength - edit.oldlength
                            if edit.changetype == 'new':
                                batch.nb_new_pages -= 1
                            if edit.reverted:
                                batch.nb_reverted_edits -= 1

            # update batch objects
            Batch.objects.bulk_update(list(batches.values()),
                                      update_fields=[
                                          'ended', 'nb_edits',
                                          'nb_distinct_pages',
                                          'nb_reverted_edits', 'nb_new_pages',
                                          'total_diffsize'
                                      ])

            # update tags for batches
            if new_tags:
                Tag.add_tags_to_batches(new_tags)

        # If we saw any "undo" edit, mark all matching edits as reverted.
        # We do this after creating the latest edits because it could be possible that
        # an edit from the batch we just processed was undone in the same go.
        if reverted_ids:
            cls.mark_as_reverted(
                Edit.objects.filter(newrevid__in=reverted_ids))
Exemplo n.º 2
0
    def ingest_edits(cls, json_batch):
        """
        Ingest a batch of edit events and update the batches they belong to.

        Each truthy entry of ``json_batch`` is checked for a revert comment
        (``cls.reverted_re``) and then matched against every ``Tool``. Matched
        events are attached to a ``Batch`` (fetched or created by tool and
        uid), turned into an ``Edit`` with ``Edit.from_json`` and queued.
        The queued edits are then inserted, the ``ended`` / ``nb_edits``
        fields of the batches are saved, new tags are attached, and edits
        whose ``newrevid`` was named by a revert are flagged as reverted.

        :param json_batch: iterable of dict-like events. The keys read here are
            ``'timestamp'`` (passed to ``datetime.fromtimestamp``),
            ``'comment'`` and ``'user'``. Falsy entries are ignored.
        :returns: None
        """
        # Map from (tool shortid, batch uid) to Batch object
        batches = {}
        model_edits = []
        reverted_ids = []
        new_tags = defaultdict(set)  # map: batch id -> ids of tags to attach

        tools = Tool.objects.all()

        for edit_json in json_batch:
            if not edit_json:
                continue
            timestamp = datetime.fromtimestamp(edit_json['timestamp'], tz=UTC)

            # First, check if this is a revert
            revert_match = cls.reverted_re.match(edit_json['comment'])
            if revert_match:
                reverted_ids.append(int(revert_match.group(1)))

            # Then, try to match the edit with a tool (first match wins).
            # A revert is not exclusive: the same event can also be matched.
            match = None
            matching_tool = None
            for tool in tools:
                match = tool.match(edit_json['user'], edit_json['comment'])
                if match is not None:
                    matching_tool = tool
                    break

            if match is None:
                continue

            # Try to find an existing batch for that edit
            batch_key = (matching_tool.shortid, match.uid)
            batch = batches.get(batch_key)

            created = False
            if not batch:
                batch, created = Batch.objects.get_or_create(
                    tool=matching_tool,
                    uid=match.uid,
                    defaults={
                        'user': match.user,
                        'summary': match.summary,
                        'started': timestamp,
                        'ended': timestamp,
                        'nb_edits': 0,
                    })

            # Check that the batch is owned by the right user
            if batch.user != match.user:
                if created:
                    batch.delete()
                continue

            batch.nb_edits += 1
            batch.ended = max(batch.ended, timestamp)

            batches[batch_key] = batch

            # Create the edit object
            model_edit = Edit.from_json(edit_json, batch)
            model_edits.append(model_edit)

            # Extract tags from the edit
            edit_tags = Tag.extract(model_edit)
            missing_tags = [
                tag.id for tag in edit_tags if tag.id not in batch.tag_ids
            ]
            new_tags[batch.id].update(missing_tags)

        # Create all Edit objects update all the batch objects
        if batches:
            # Create all the edit objects
            try:
                with transaction.atomic():
                    Edit.objects.bulk_create(model_edits)
            except IntegrityError:
                # Oops! Some of them existed already!
                # Let's add them one by one instead.
                for edit in model_edits:
                    try:
                        Edit.objects.get(id=edit.id)
                    except Edit.DoesNotExist:
                        edit.save()
                    else:
                        # this edit was already seen: we need to remove it
                        # from the associated batch count
                        batch_key = (edit.batch.tool.shortid, edit.batch.uid)
                        batch = batches.get(batch_key)
                        if batch:
                            batch.nb_edits -= 1

            # update batch objects
            Batch.objects.bulk_update(list(batches.values()),
                                      update_fields=['ended', 'nb_edits'])

            # update tags for batches
            if new_tags:
                Tag.add_tags_to_batches(new_tags)

        # If we saw any "undo" edit, mark all matching edits as reverted
        if reverted_ids:
            Edit.objects.filter(newrevid__in=reverted_ids).update(
                reverted=True)