Example #1
    def handle(self, **options):
        self.days = options['days']
        self.concurrency = options['concurrency']
        self.project = options['project']

        self.stdout.write("Removing expired values for LostPasswordHash\n")
        LostPasswordHash.objects.filter(
            date_added__lte=timezone.now() - timedelta(hours=48)
        ).delete()

        if self.project:
            self.stderr.write("Bulk NodeStore deletion not available for project selection\n")
        else:
            self.stdout.write("Removing old NodeStore values\n")
            cutoff = timezone.now() - timedelta(days=self.days)
            try:
                nodestore.cleanup(cutoff)
            except NotImplementedError:
                self.stderr.write("NodeStore backend does not support cleanup operation\n")

        for model, dtfield in self.BULK_DELETES:
            self.stdout.write("Removing {model} for days={days} project={project}\n".format(
                model=model.__name__,
                days=self.days,
                project=self.project or '*',
            ))
            BulkDeleteQuery(
                model=model,
                dtfield=dtfield,
                days=self.days,
                project_id=self.project,
            ).execute()

        # EventMapping is fairly expensive and is special cased as it's likely you
        # won't need a reference to an event for nearly as long
        self.stdout.write("Removing expired values for EventMapping\n")
        BulkDeleteQuery(
            model=EventMapping,
            dtfield='date_added',
            days=min(self.days, 7),
            project_id=self.project,
        ).execute()

        for model, dtfield in self.GENERIC_DELETES:
            self.stdout.write("Removing {model} for days={days} project={project}\n".format(
                model=model.__name__,
                days=self.days,
                project=self.project or '*',
            ))
            BulkDeleteQuery(
                model=model,
                dtfield=dtfield,
                days=self.days,
                project_id=self.project,
            ).execute_generic()
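
The command above funnels every deletion through Sentry's `BulkDeleteQuery` helper, whose implementation is not part of this listing. As a rough mental model, a date-bounded chunked delete can be sketched like this (the helper name, chunk size, and `project_id` filter are illustrative assumptions, not Sentry's actual code):

from datetime import timedelta

from django.utils import timezone


def bulk_delete_by_date(model, dtfield, days, project_id=None, chunk_size=10000):
    # Sketch of a chunked, date-bounded delete (not Sentry's verified code).
    # Deleting fixed-size primary-key chunks keeps every DELETE statement
    # small, so no single transaction locks the whole table.
    cutoff = timezone.now() - timedelta(days=days)
    queryset = model.objects.filter(**{'%s__lte' % dtfield: cutoff})
    if project_id is not None:
        queryset = queryset.filter(project_id=project_id)
    while True:
        pks = list(queryset.values_list('pk', flat=True)[:chunk_size])
        if not pks:
            break
        model.objects.filter(pk__in=pks).delete()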
Example #2
    def handle(self, **options):
        self.days = options['days']
        self.concurrency = options['concurrency']
        self.project = options['project']

        self.stdout.write("Removing expired values for LostPasswordHash\n")
        LostPasswordHash.objects.filter(
            date_added__lte=timezone.now() - timedelta(hours=48)
        ).delete()

        if self.project:
            self.stderr.write("Bulk NodeStore deletion not available for project selection\n")
        else:
            self.stdout.write("Removing old NodeStore values\n")
            cutoff = timezone.now() - timedelta(days=self.days)
            try:
                nodestore.cleanup(cutoff)
            except NotImplementedError:
                self.stderr.write("NodeStore backend does not support cleanup operation\n")

        for model, dtfield in self.BULK_DELETES:
            self.stdout.write("Removing {model} for days={days} project={project}\n".format(
                model=model.__name__,
                days=self.days,
                project=self.project or '*',
            ))
            self.bulk_delete(model, dtfield)

        for model, dtfield in self.GENERIC_DELETES:
            self.stdout.write("Removing {model} for days={days} project={project}\n".format(
                model=model.__name__,
                days=self.days,
                project=self.project or '*',
            ))
            self.generic_delete(model, dtfield)

        # EventMapping is fairly expensive and is special cased as it's likely you
        # won't need a reference to an event for nearly as long
        self.stdout.write("Removing expired values for EventMapping\n")
        self.bulk_delete(EventMapping, 'date_added', days=min(self.days, 7))
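
This variant factors the repeated `BulkDeleteQuery` construction into `self.bulk_delete` and `self.generic_delete`, whose bodies are not shown. Based on how Example #1 calls `BulkDeleteQuery` directly, they plausibly look like the following reconstruction (an assumption, not the command's verified source):

    def bulk_delete(self, model, dtfield, days=None):
        # Assumed helper: mirrors the direct BulkDeleteQuery(...).execute()
        # calls in Example #1.
        BulkDeleteQuery(
            model=model,
            dtfield=dtfield,
            days=self.days if days is None else days,
            project_id=self.project,
        ).execute()

    def generic_delete(self, model, dtfield, days=None):
        # Assumed helper: same query, routed through execute_generic().
        BulkDeleteQuery(
            model=model,
            dtfield=dtfield,
            days=self.days if days is None else days,
            project_id=self.project,
        ).execute_generic()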
Example #3
def cleanup(days, project, concurrency, max_procs, silent, model, router, timed):
    """Delete a portion of trailing data based on creation date.

    All data that is older than `--days` will be deleted.  The default for
    this is 30 days.  By default all projects are truncated, but the cleanup
    can be limited to a specific project with the `--project` flag, which
    accepts a project ID or a string of the form `org/project` where both
    are slugs.
    """
    if concurrency < 1:
        click.echo('Error: Minimum concurrency is 1', err=True)
        raise click.Abort()

    import math
    import multiprocessing
    import pickle
    import subprocess
    import sys
    from django.db import router as db_router
    from sentry.app import nodestore
    from sentry.db.deletion import BulkDeleteQuery
    from sentry import models

    if timed:
        import time
        from sentry.utils import metrics
        start_time = time.time()

    # list of models which this query is restricted to
    model_list = {m.lower() for m in model}

    def is_filtered(model):
        if router is not None and db_router.db_for_write(model) != router:
            return True
        if not model_list:
            return False
        return model.__name__.lower() not in model_list

    # Deletions that use `BulkDeleteQuery` (and don't need to worry about child relations)
    # (model, datetime_field, order_by)
    BULK_QUERY_DELETES = [
        (models.EventMapping, 'date_added', '-date_added'),
        (models.GroupHashTombstone, 'deleted_at', None),
        (models.GroupEmailThread, 'date', None),
        (models.GroupRuleStatus, 'date_added', None),
    ] + EXTRA_BULK_QUERY_DELETES

    # Deletions that use the `deletions` code path (which handles their child relations)
    # (model, datetime_field, order_by)
    DELETES = (
        (models.Event, 'datetime', 'datetime'),
        (models.Group, 'last_seen', 'last_seen'),
    )

    if not silent:
        click.echo('Removing expired values for LostPasswordHash')

    if is_filtered(models.LostPasswordHash):
        if not silent:
            click.echo('>> Skipping LostPasswordHash')
    else:
        models.LostPasswordHash.objects.filter(
            date_added__lte=timezone.now() - timedelta(hours=48)
        ).delete()

    for model in [models.ApiGrant, models.ApiToken]:
        if not silent:
            click.echo('Removing expired values for {}'.format(model.__name__))

        if is_filtered(model):
            if not silent:
                click.echo('>> Skipping {}'.format(model.__name__))
        else:
            model.objects.filter(expires_at__lt=timezone.now()).delete()

    project_id = None
    if project:
        click.echo(
            "Bulk NodeStore deletion not available for project selection", err=True)
        project_id = get_project(project)
        if project_id is None:
            click.echo('Error: Project not found', err=True)
            raise click.Abort()
    else:
        if not silent:
            click.echo("Removing old NodeStore values")

        cutoff = timezone.now() - timedelta(days=days)
        try:
            nodestore.cleanup(cutoff)
        except NotImplementedError:
            click.echo(
                "NodeStore backend does not support cleanup operation", err=True)

    for bqd in BULK_QUERY_DELETES:
        if len(bqd) == 4:
            model, dtfield, order_by, chunk_size = bqd
        else:
            chunk_size = 10000
            model, dtfield, order_by = bqd

        if not silent:
            click.echo(
                "Removing {model} for days={days} project={project}".format(
                    model=model.__name__,
                    days=days,
                    project=project or '*',
                )
            )
        if is_filtered(model):
            if not silent:
                click.echo('>> Skipping %s' % model.__name__)
        else:
            BulkDeleteQuery(
                model=model,
                dtfield=dtfield,
                days=days,
                project_id=project_id,
                order_by=order_by,
            ).execute(chunk_size=chunk_size)

    for model, dtfield, order_by in DELETES:
        if not silent:
            click.echo(
                "Removing {model} for days={days} project={project}".format(
                    model=model.__name__,
                    days=days,
                    project=project or '*',
                )
            )

        if is_filtered(model):
            if not silent:
                click.echo('>> Skipping %s' % model.__name__)
        else:
            if concurrency > 1:
                shard_ids = range(concurrency)
                num_procs = min(multiprocessing.cpu_count(), max_procs)
                threads_per_proc = int(math.ceil(
                    concurrency / float(num_procs)))

                pids = []
                for shard_id_chunk in chunker(shard_ids, threads_per_proc):
                    pid = subprocess.Popen([
                        sys.argv[0],
                        'cleanup_chunk',
                        '--days', six.binary_type(days),
                    ] + (['--project_id', six.binary_type(project_id)] if project_id else []) + [
                        '--model', pickle.dumps(model),
                        '--dtfield', dtfield,
                        '--order_by', order_by,
                        '--num_shards', six.binary_type(concurrency),
                        '--shard_ids', ",".join([six.binary_type(s)
                                                 for s in shard_id_chunk]),
                    ])
                    pids.append(pid)

                total_pid_count = len(pids)
                click.echo(
                    "%s concurrent processes forked, waiting on them to complete." % total_pid_count)

                complete = 0
                for pid in pids:
                    pid.wait()
                    complete += 1
                    click.echo(
                        "%s/%s concurrent processes are finished." % (complete, total_pid_count))

            else:
                task = create_deletion_task(
                    days, project_id, model, dtfield, order_by)
                _chunk_until_complete(task)

    # Clean up FileBlob instances which are no longer used and aren't super
    # recent (as there could be a race between blob creation and reference)
    if not silent:
        click.echo("Cleaning up unused FileBlob references")
    if is_filtered(models.FileBlob):
        if not silent:
            click.echo('>> Skipping FileBlob')
    else:
        cleanup_unused_files(silent)

    if timed:
        duration = int(time.time() - start_time)
        metrics.timing('cleanup.duration', duration, instance=router)
        click.echo("Clean up took %s second(s)." % duration)
Example #4
def cleanup(days, project, concurrency, silent, model, router, timed):
    """Delete a portion of trailing data based on creation date.

    All data that is older than `--days` will be deleted.  The default for
    this is 30 days.  By default all projects are truncated, but the cleanup
    can be limited to a specific project with the `--project` flag, which
    accepts a project ID or a string of the form `org/project` where both
    are slugs.
    """
    if concurrency < 1:
        click.echo('Error: Minimum concurrency is 1', err=True)
        raise click.Abort()

    os.environ['_SENTRY_CLEANUP'] = '1'

    # Make sure we fork off multiprocessing pool
    # before we import or configure the app
    from multiprocessing import Process, JoinableQueue as Queue

    pool = []
    task_queue = Queue(1000)
    for _ in xrange(concurrency):
        p = Process(target=multiprocess_worker, args=(task_queue,))
        p.daemon = True
        p.start()
        pool.append(p)

    from sentry.runner import configure
    configure()

    from django.db import router as db_router
    from sentry.app import nodestore
    from sentry.db.deletion import BulkDeleteQuery
    from sentry import models

    if timed:
        import time
        from sentry.utils import metrics
        start_time = time.time()

    # list of models which this query is restricted to
    model_list = {m.lower() for m in model}

    def is_filtered(model):
        if router is not None and db_router.db_for_write(model) != router:
            return True
        if not model_list:
            return False
        return model.__name__.lower() not in model_list

    # Deletions that use `BulkDeleteQuery` (and don't need to worry about child relations)
    # (model, datetime_field, order_by)
    BULK_QUERY_DELETES = [
        (models.EventMapping, 'date_added', '-date_added'),
        (models.EventAttachment, 'date_added', None),
        (models.UserReport, 'date_added', None),
        (models.GroupEmailThread, 'date', None),
        (models.GroupRuleStatus, 'date_added', None),
    ] + EXTRA_BULK_QUERY_DELETES

    # Deletions that use the `deletions` code path (which handles their child relations)
    # (model, datetime_field, order_by)
    DELETES = (
        (models.Event, 'datetime', 'datetime'),
        (models.Group, 'last_seen', 'last_seen'),
    )

    if not silent:
        click.echo('Removing expired values for LostPasswordHash')

    if is_filtered(models.LostPasswordHash):
        if not silent:
            click.echo('>> Skipping LostPasswordHash')
    else:
        models.LostPasswordHash.objects.filter(
            date_added__lte=timezone.now() - timedelta(hours=48)
        ).delete()

    if is_filtered(models.OrganizationMember):
        if not silent:
            click.echo('>> Skipping OrganizationMember')
    else:
        if not silent:
            click.echo('Removing expired values for OrganizationMember')
        expired_threshold = timezone.now() - timedelta(days=days)
        models.OrganizationMember.delete_expired(expired_threshold)

    for model in [models.ApiGrant, models.ApiToken]:
        if not silent:
            click.echo(u'Removing expired values for {}'.format(model.__name__))

        if is_filtered(model):
            if not silent:
                click.echo(u'>> Skipping {}'.format(model.__name__))
        else:
            model.objects.filter(
                expires_at__lt=(timezone.now() - timedelta(days=days)),
            ).delete()

    project_id = None
    if project:
        click.echo(
            "Bulk NodeStore deletion not available for project selection", err=True)
        project_id = get_project(project)
        if project_id is None:
            click.echo('Error: Project not found', err=True)
            raise click.Abort()
    else:
        if not silent:
            click.echo("Removing old NodeStore values")

        cutoff = timezone.now() - timedelta(days=days)
        try:
            nodestore.cleanup(cutoff)
        except NotImplementedError:
            click.echo(
                "NodeStore backend does not support cleanup operation", err=True)

    for bqd in BULK_QUERY_DELETES:
        if len(bqd) == 4:
            model, dtfield, order_by, chunk_size = bqd
        else:
            chunk_size = 10000
            model, dtfield, order_by = bqd

        if not silent:
            click.echo(
                u"Removing {model} for days={days} project={project}".format(
                    model=model.__name__,
                    days=days,
                    project=project or '*',
                )
            )
        if is_filtered(model):
            if not silent:
                click.echo('>> Skipping %s' % model.__name__)
        else:
            BulkDeleteQuery(
                model=model,
                dtfield=dtfield,
                days=days,
                project_id=project_id,
                order_by=order_by,
            ).execute(chunk_size=chunk_size)

    for model, dtfield, order_by in DELETES:
        if not silent:
            click.echo(
                u"Removing {model} for days={days} project={project}".format(
                    model=model.__name__,
                    days=days,
                    project=project or '*',
                )
            )

        if is_filtered(model):
            if not silent:
                click.echo('>> Skipping %s' % model.__name__)
        else:
            imp = '.'.join((model.__module__, model.__name__))

            q = BulkDeleteQuery(
                model=model,
                dtfield=dtfield,
                days=days,
                project_id=project_id,
                order_by=order_by,
            )

            for chunk in q.iterator(chunk_size=100):
                task_queue.put((imp, chunk))

            task_queue.join()

    # Clean up FileBlob instances which are no longer used and aren't super
    # recent (as there could be a race between blob creation and reference)
    if not silent:
        click.echo("Cleaning up unused FileBlob references")
    if is_filtered(models.FileBlob):
        if not silent:
            click.echo('>> Skipping FileBlob')
    else:
        cleanup_unused_files(silent)

    # Shut down our pool
    for _ in pool:
        task_queue.put(_STOP_WORKER)

    # And wait for it to drain
    for p in pool:
        p.join()

    if timed:
        duration = int(time.time() - start_time)
        metrics.timing('cleanup.duration', duration, instance=router)
        click.echo("Clean up took %s second(s)." % duration)
Example #5
def cleanup(days, project, concurrency, silent, model, router, timed):
    """Delete a portion of trailing data based on creation date.

    All data that is older than `--days` will be deleted.  The default for
    this is 30 days.  By default all projects are truncated, but the cleanup
    can be limited to a specific project with the `--project` flag, which
    accepts a project ID or a string of the form `org/project` where both
    are slugs.
    """
    if concurrency < 1:
        click.echo('Error: Minimum concurrency is 1', err=True)
        raise click.Abort()

    os.environ['_SENTRY_CLEANUP'] = '1'

    # Make sure we fork off multiprocessing pool
    # before we import or configure the app
    from multiprocessing import Process, JoinableQueue as Queue

    pool = []
    task_queue = Queue(1000)
    for _ in xrange(concurrency):
        p = Process(target=multiprocess_worker, args=(task_queue, ))
        p.daemon = True
        p.start()
        pool.append(p)

    from sentry.runner import configure
    configure()

    from django.db import router as db_router
    from sentry.app import nodestore
    from sentry.db.deletion import BulkDeleteQuery
    from sentry import models

    if timed:
        import time
        from sentry.utils import metrics
        start_time = time.time()

    # list of models which this query is restricted to
    model_list = {m.lower() for m in model}

    def is_filtered(model):
        if router is not None and db_router.db_for_write(model) != router:
            return True
        if not model_list:
            return False
        return model.__name__.lower() not in model_list

    # Deletions that use `BulkDeleteQuery` (and don't need to worry about child relations)
    # (model, datetime_field, order_by)
    BULK_QUERY_DELETES = [
        (models.EventMapping, 'date_added', '-date_added'),
        (models.EventAttachment, 'date_added', None),
        (models.UserReport, 'date_added', None),
        (models.GroupEmailThread, 'date', None),
        (models.GroupRuleStatus, 'date_added', None),
    ] + EXTRA_BULK_QUERY_DELETES

    # Deletions that use the `deletions` code path (which handles their child relations)
    # (model, datetime_field, order_by)
    DELETES = (
        (models.Event, 'datetime', 'datetime'),
        (models.Group, 'last_seen', 'last_seen'),
    )

    if not silent:
        click.echo('Removing expired values for LostPasswordHash')

    if is_filtered(models.LostPasswordHash):
        if not silent:
            click.echo('>> Skipping LostPasswordHash')
    else:
        models.LostPasswordHash.objects.filter(date_added__lte=timezone.now() -
                                               timedelta(hours=48)).delete()

    if is_filtered(models.OrganizationMember):
        if not silent:
            click.echo('>> Skipping OrganizationMember')
    else:
        if not silent:
            click.echo('Removing expired values for OrganizationMember')
        expired_threshold = timezone.now() - timedelta(days=days)
        models.OrganizationMember.delete_expired(expired_threshold)

    for model in [models.ApiGrant, models.ApiToken]:
        if not silent:
            click.echo(u'Removing expired values for {}'.format(
                model.__name__))

        if is_filtered(model):
            if not silent:
                click.echo(u'>> Skipping {}'.format(model.__name__))
        else:
            model.objects.filter(expires_at__lt=(
                timezone.now() -
                timedelta(days=API_TOKEN_TTL_IN_DAYS)), ).delete()

    project_id = None
    if project:
        click.echo(
            "Bulk NodeStore deletion not available for project selection",
            err=True)
        project_id = get_project(project)
        if project_id is None:
            click.echo('Error: Project not found', err=True)
            raise click.Abort()
    else:
        if not silent:
            click.echo("Removing old NodeStore values")

        cutoff = timezone.now() - timedelta(days=days)
        try:
            nodestore.cleanup(cutoff)
        except NotImplementedError:
            click.echo("NodeStore backend does not support cleanup operation",
                       err=True)

    for bqd in BULK_QUERY_DELETES:
        if len(bqd) == 4:
            model, dtfield, order_by, chunk_size = bqd
        else:
            chunk_size = 10000
            model, dtfield, order_by = bqd

        if not silent:
            click.echo(
                u"Removing {model} for days={days} project={project}".format(
                    model=model.__name__,
                    days=days,
                    project=project or '*',
                ))
        if is_filtered(model):
            if not silent:
                click.echo('>> Skipping %s' % model.__name__)
        else:
            BulkDeleteQuery(
                model=model,
                dtfield=dtfield,
                days=days,
                project_id=project_id,
                order_by=order_by,
            ).execute(chunk_size=chunk_size)

    for model, dtfield, order_by in DELETES:
        if not silent:
            click.echo(
                u"Removing {model} for days={days} project={project}".format(
                    model=model.__name__,
                    days=days,
                    project=project or '*',
                ))

        if is_filtered(model):
            if not silent:
                click.echo('>> Skipping %s' % model.__name__)
        else:
            imp = '.'.join((model.__module__, model.__name__))

            q = BulkDeleteQuery(
                model=model,
                dtfield=dtfield,
                days=days,
                project_id=project_id,
                order_by=order_by,
            )

            for chunk in q.iterator(chunk_size=100):
                task_queue.put((imp, chunk))

            task_queue.join()

    # Clean up FileBlob instances which are no longer used and aren't super
    # recent (as there could be a race between blob creation and reference)
    if not silent:
        click.echo("Cleaning up unused FileBlob references")
    if is_filtered(models.FileBlob):
        if not silent:
            click.echo('>> Skipping FileBlob')
    else:
        cleanup_unused_files(silent)

    # Shut down our pool
    for _ in pool:
        task_queue.put(_STOP_WORKER)

    # And wait for it to drain
    for p in pool:
        p.join()

    if timed:
        duration = int(time.time() - start_time)
        metrics.timing('cleanup.duration',
                       duration,
                       instance=router,
                       sample_rate=1.0)
        click.echo("Clean up took %s second(s)." % duration)
Example #6
def cleanup(days, project, concurrency, max_procs, silent, model, router,
            timed):
    """Delete a portion of trailing data based on creation date.

    All data that is older than `--days` will be deleted.  The default for
    this is 30 days.  By default all projects are truncated, but the cleanup
    can be limited to a specific project with the `--project` flag, which
    accepts a project ID or a string of the form `org/project` where both
    are slugs.
    """
    if concurrency < 1:
        click.echo('Error: Minimum concurrency is 1', err=True)
        raise click.Abort()

    import math
    import multiprocessing
    import pickle
    import subprocess
    import sys
    from django.db import router as db_router
    from sentry.app import nodestore
    from sentry.db.deletion import BulkDeleteQuery
    from sentry import models

    if timed:
        import time
        from sentry.utils import metrics
        start_time = time.time()

    # list of models which this query is restricted to
    model_list = {m.lower() for m in model}

    def is_filtered(model):
        if router is not None and db_router.db_for_write(model) != router:
            return True
        if not model_list:
            return False
        return model.__name__.lower() not in model_list

    # Deletions that use `BulkDeleteQuery` (and don't need to worry about child relations)
    # (model, datetime_field, order_by)
    BULK_QUERY_DELETES = [
        (models.EventMapping, 'date_added', '-date_added'),
        (models.GroupHashTombstone, 'deleted_at', None),
        (models.GroupEmailThread, 'date', None),
        (models.GroupRuleStatus, 'date_added', None),
    ] + EXTRA_BULK_QUERY_DELETES

    # Deletions that use the `deletions` code path (which handles their child relations)
    # (model, datetime_field, order_by)
    DELETES = (
        (models.Event, 'datetime', 'datetime'),
        (models.Group, 'last_seen', 'last_seen'),
    )

    if not silent:
        click.echo('Removing expired values for LostPasswordHash')

    if is_filtered(models.LostPasswordHash):
        if not silent:
            click.echo('>> Skipping LostPasswordHash')
    else:
        models.LostPasswordHash.objects.filter(date_added__lte=timezone.now() -
                                               timedelta(hours=48)).delete()

    for model in [models.ApiGrant, models.ApiToken]:
        if not silent:
            click.echo('Removing expired values for {}'.format(model.__name__))

        if is_filtered(model):
            if not silent:
                click.echo('>> Skipping {}'.format(model.__name__))
        else:
            model.objects.filter(expires_at__lt=timezone.now()).delete()

    project_id = None
    if project:
        click.echo(
            "Bulk NodeStore deletion not available for project selection",
            err=True)
        project_id = get_project(project)
        if project_id is None:
            click.echo('Error: Project not found', err=True)
            raise click.Abort()
    else:
        if not silent:
            click.echo("Removing old NodeStore values")

        cutoff = timezone.now() - timedelta(days=days)
        try:
            nodestore.cleanup(cutoff)
        except NotImplementedError:
            click.echo("NodeStore backend does not support cleanup operation",
                       err=True)

    for bqd in BULK_QUERY_DELETES:
        if len(bqd) == 4:
            model, dtfield, order_by, chunk_size = bqd
        else:
            chunk_size = 10000
            model, dtfield, order_by = bqd

        if not silent:
            click.echo(
                "Removing {model} for days={days} project={project}".format(
                    model=model.__name__,
                    days=days,
                    project=project or '*',
                ))
        if is_filtered(model):
            if not silent:
                click.echo('>> Skipping %s' % model.__name__)
        else:
            BulkDeleteQuery(
                model=model,
                dtfield=dtfield,
                days=days,
                project_id=project_id,
                order_by=order_by,
            ).execute(chunk_size=chunk_size)

    for model, dtfield, order_by in DELETES:
        if not silent:
            click.echo(
                "Removing {model} for days={days} project={project}".format(
                    model=model.__name__,
                    days=days,
                    project=project or '*',
                ))

        if is_filtered(model):
            if not silent:
                click.echo('>> Skipping %s' % model.__name__)
        else:
            if concurrency > 1:
                shard_ids = range(concurrency)
                num_procs = min(multiprocessing.cpu_count(), max_procs)
                threads_per_proc = int(
                    math.ceil(concurrency / float(num_procs)))

                pids = []
                for shard_id_chunk in chunker(shard_ids, threads_per_proc):
                    pid = subprocess.Popen([
                        sys.argv[0],
                        'cleanup_chunk',
                        '--days',
                        six.binary_type(days),
                    ] + (
                        ['--project_id',
                         six.binary_type(project_id)] if project_id else []
                    ) + [
                        '--model',
                        pickle.dumps(model),
                        '--dtfield',
                        dtfield,
                        '--order_by',
                        order_by,
                        '--num_shards',
                        six.binary_type(concurrency),
                        '--shard_ids',
                        ",".join([six.binary_type(s) for s in shard_id_chunk]),
                    ])
                    pids.append(pid)

                total_pid_count = len(pids)
                click.echo(
                    "%s concurrent processes forked, waiting on them to complete."
                    % total_pid_count)

                complete = 0
                for pid in pids:
                    pid.wait()
                    complete += 1
                    click.echo("%s/%s concurrent processes are finished." %
                               (complete, total_pid_count))

            else:
                task = create_deletion_task(days, project_id, model, dtfield,
                                            order_by)
                _chunk_until_complete(task)

    # Clean up FileBlob instances which are no longer used and aren't super
    # recent (as there could be a race between blob creation and reference)
    if not silent:
        click.echo("Cleaning up unused FileBlob references")
    if is_filtered(models.FileBlob):
        if not silent:
            click.echo('>> Skipping FileBlob')
    else:
        cleanup_unused_files(silent)

    if timed:
        duration = int(time.time() - start_time)
        metrics.timing('cleanup.duration', duration, instance=router)
        click.echo("Clean up took %s second(s)." % duration)
Example #7
def cleanup(days, project, concurrency):
    """Delete a portion of trailing data based on creation date.

    All data that is older than `--days` will be deleted.  The default for
    this is 30 days.  By default all projects are truncated, but the cleanup
    can be limited to a specific project with the `--project` flag, which
    accepts a project ID or a string of the form `org/project` where both
    are slugs.
    """
    from sentry.app import nodestore
    from sentry.db.deletion import BulkDeleteQuery
    from sentry.models import (
        Event, EventMapping, Group, GroupRuleStatus, GroupTagValue,
        LostPasswordHash, TagValue, GroupEmailThread,
    )

    # these models should be safe to delete without cascades, in order
    BULK_DELETES = (
        (GroupRuleStatus, 'date_added'),
        (GroupTagValue, 'last_seen'),
        (TagValue, 'last_seen'),
        (GroupEmailThread, 'date'),
    )

    GENERIC_DELETES = (
        (Event, 'datetime'),
        (Group, 'last_seen'),
    )

    click.echo("Removing expired values for LostPasswordHash")
    LostPasswordHash.objects.filter(
        date_added__lte=timezone.now() - timedelta(hours=48)
    ).delete()

    project_id = None
    if project:
        click.echo("Bulk NodeStore deletion not available for project selection", err=True)
        project_id = get_project(project)
        if project_id is None:
            click.echo('Error: Project not found', err=True)
            raise click.Abort()
    else:
        click.echo("Removing old NodeStore values")
        cutoff = timezone.now() - timedelta(days=days)
        try:
            nodestore.cleanup(cutoff)
        except NotImplementedError:
            click.echo("NodeStore backend does not support cleanup operation", err=True)

    for model, dtfield in BULK_DELETES:
        click.echo("Removing {model} for days={days} project={project}".format(
            model=model.__name__,
            days=days,
            project=project or '*',
        ))
        BulkDeleteQuery(
            model=model,
            dtfield=dtfield,
            days=days,
            project_id=project_id,
        ).execute()

    # EventMapping is fairly expensive and is special cased as it's likely you
    # won't need a reference to an event for nearly as long
    click.echo("Removing expired values for EventMapping")
    BulkDeleteQuery(
        model=EventMapping,
        dtfield='date_added',
        days=min(days, 7),
        project_id=project_id,
    ).execute()

    # Clean up FileBlob instances which are no longer used and aren't super
    # recent (as there could be a race between blob creation and reference)
    click.echo("Cleaning up unused FileBlob references")
    cleanup_unused_files()

    for model, dtfield in GENERIC_DELETES:
        click.echo("Removing {model} for days={days} project={project}".format(
            model=model.__name__,
            days=days,
            project=project or '*',
        ))
        BulkDeleteQuery(
            model=model,
            dtfield=dtfield,
            days=days,
            project_id=project_id,
        ).execute_generic()
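
`get_project` appears in most of these variants but is never shown. Per the docstring, `--project` accepts either a numeric ID or an `org/project` slug pair, so a plausible reconstruction (an assumption, including the field names) is:

def get_project(value):
    # Assumed helper: resolve --project into a project primary key,
    # returning None when nothing matches.
    from sentry.models import Project

    if '/' in value:
        org_slug, project_slug = value.split('/', 1)
        try:
            return Project.objects.get(
                organization__slug=org_slug,
                slug=project_slug,
            ).id
        except Project.DoesNotExist:
            return None
    try:
        return int(value)
    except ValueError:
        return None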
Example #8
def cleanup(days, project, concurrency):
    """Delete a portion of trailing data based on creation date.

    All data that is older than `--days` will be deleted.  The default for
    this is 30 days.  By default all projects are truncated, but the cleanup
    can be limited to a specific project with the `--project` flag, which
    accepts a project ID or a string of the form `org/project` where both
    are slugs.
    """
    from sentry.app import nodestore
    from sentry.db.deletion import BulkDeleteQuery
    from sentry.models import (
        Event,
        EventMapping,
        Group,
        GroupRuleStatus,
        GroupTagValue,
        LostPasswordHash,
        TagValue,
        GroupEmailThread,
    )

    # these models should be safe to delete without cascades, in order
    BULK_DELETES = (
        (GroupRuleStatus, 'date_added'),
        (GroupTagValue, 'last_seen'),
        (TagValue, 'last_seen'),
        (GroupEmailThread, 'date'),
    )

    GENERIC_DELETES = (
        (Event, 'datetime'),
        (Group, 'last_seen'),
    )

    click.echo("Removing expired values for LostPasswordHash")
    LostPasswordHash.objects.filter(date_added__lte=timezone.now() -
                                    timedelta(hours=48)).delete()

    project_id = None
    if project:
        click.echo(
            "Bulk NodeStore deletion not available for project selection",
            err=True)
        project_id = get_project(project)
        if project_id is None:
            click.echo('Error: Project not found', err=True)
            raise click.Abort()
    else:
        click.echo("Removing old NodeStore values")
        cutoff = timezone.now() - timedelta(days=days)
        try:
            nodestore.cleanup(cutoff)
        except NotImplementedError:
            click.echo("NodeStore backend does not support cleanup operation",
                       err=True)

    for model, dtfield in BULK_DELETES:
        click.echo("Removing {model} for days={days} project={project}".format(
            model=model.__name__,
            days=days,
            project=project or '*',
        ))
        BulkDeleteQuery(
            model=model,
            dtfield=dtfield,
            days=days,
            project_id=project_id,
        ).execute()

    # EventMapping is fairly expensive and is special cased as it's likely you
    # won't need a reference to an event for nearly as long
    click.echo("Removing expired values for EventMapping")
    BulkDeleteQuery(
        model=EventMapping,
        dtfield='date_added',
        days=min(days, 7),
        project_id=project_id,
    ).execute()

    # Clean up FileBlob instances which are no longer used and aren't super
    # recent (as there could be a race between blob creation and reference)
    click.echo("Cleaning up unused FileBlob references")
    cleanup_unused_files()

    for model, dtfield in GENERIC_DELETES:
        click.echo("Removing {model} for days={days} project={project}".format(
            model=model.__name__,
            days=days,
            project=project or '*',
        ))
        BulkDeleteQuery(
            model=model,
            dtfield=dtfield,
            days=days,
            project_id=project_id,
        ).execute_generic()
Example #9
def cleanup(days, project, concurrency, silent, model, router, timed):
    """Delete a portion of trailing data based on creation date.

    All data that is older than `--days` will be deleted.  The default for
    this is 30 days.  By default all projects are truncated, but the cleanup
    can be limited to a specific project with the `--project` flag, which
    accepts a project ID or a string of the form `org/project` where both
    are slugs.
    """
    if concurrency < 1:
        click.echo('Error: Minimum concurrency is 1', err=True)
        raise click.Abort()

    from threading import Thread
    from django.db import router as db_router
    from sentry.app import nodestore
    from sentry.db.deletion import BulkDeleteQuery
    from sentry import deletions
    from sentry import models

    if timed:
        import time
        from sentry.utils import metrics
        start_time = time.time()

    # list of models which this query is restricted to
    model_list = {m.lower() for m in model}

    def is_filtered(model):
        if router is not None and db_router.db_for_write(model) != router:
            return True
        if not model_list:
            return False
        return model.__name__.lower() not in model_list

    # Deletions that use `BulkDeleteQuery` (and don't need to worry about child relations)
    # (model, datetime_field, order_by)
    BULK_QUERY_DELETES = (
        (models.GroupEmailThread, 'date', None),
        (models.GroupRuleStatus, 'date_added', None),
        (models.GroupTagValue, 'last_seen', None),
        (models.TagValue, 'last_seen', None),
        (models.EventTag, 'date_added', 'date_added'),
    )

    # Deletions that use the `deletions` code path (which handles their child relations)
    # (model, datetime_field, order_by)
    DELETES = (
        (models.Event, 'datetime', None),
        (models.Group, 'last_seen', 'last_seen'),
    )

    if not silent:
        click.echo('Removing expired values for LostPasswordHash')

    if is_filtered(models.LostPasswordHash):
        if not silent:
            click.echo('>> Skipping LostPasswordHash')
    else:
        models.LostPasswordHash.objects.filter(date_added__lte=timezone.now() -
                                               timedelta(hours=48)).delete()

    for model in [models.ApiGrant, models.ApiToken]:
        if not silent:
            click.echo('Removing expired values for {}'.format(model.__name__))

        if is_filtered(model):
            if not silent:
                click.echo('>> Skipping {}'.format(model.__name__))
        else:
            model.objects.filter(expires_at__lt=timezone.now()).delete()

    project_id = None
    if project:
        click.echo(
            "Bulk NodeStore deletion not available for project selection",
            err=True)
        project_id = get_project(project)
        if project_id is None:
            click.echo('Error: Project not found', err=True)
            raise click.Abort()
    else:
        if not silent:
            click.echo("Removing old NodeStore values")

        cutoff = timezone.now() - timedelta(days=days)
        try:
            nodestore.cleanup(cutoff)
        except NotImplementedError:
            click.echo(
                "NodeStore backend does not support cleanup operation",
                err=True)

    for model, dtfield, order_by in BULK_QUERY_DELETES:
        if not silent:
            click.echo(
                "Removing {model} for days={days} project={project}".format(
                    model=model.__name__,
                    days=days,
                    project=project or '*',
                ))
        if is_filtered(model):
            if not silent:
                click.echo('>> Skipping %s' % model.__name__)
        else:
            BulkDeleteQuery(
                model=model,
                dtfield=dtfield,
                days=days,
                project_id=project_id,
                order_by=order_by,
            ).execute()

    for model, dtfield, order_by in DELETES:
        if not silent:
            click.echo(
                "Removing {model} for days={days} project={project}".format(
                    model=model.__name__,
                    days=days,
                    project=project or '*',
                ))

        if is_filtered(model):
            if not silent:
                click.echo('>> Skipping %s' % model.__name__)
        else:
            query = {
                '{}__lte'.format(dtfield):
                (timezone.now() - timedelta(days=days)),
            }

            if project_id:
                if 'project' in model._meta.get_all_field_names():
                    query['project'] = project_id
                else:
                    query['project_id'] = project_id

            task = deletions.get(
                model=model,
                query=query,
                order_by=order_by,
                transaction_id=uuid4().hex,
            )

            def _chunk_until_complete(num_shards=None, shard_id=None):
                has_more = True
                while has_more:
                    has_more = task.chunk(num_shards=num_shards,
                                          shard_id=shard_id)

            if concurrency > 1:
                threads = []
                for shard_id in range(concurrency):
                    t = Thread(target=(
                        lambda shard_id=shard_id: _chunk_until_complete(
                            num_shards=concurrency, shard_id=shard_id)))
                    t.start()
                    threads.append(t)

                for t in threads:
                    t.join()
            else:
                _chunk_until_complete()

    # EventMapping is fairly expensive and is special cased as it's likely you
    # won't need a reference to an event for nearly as long
    if not silent:
        click.echo("Removing expired values for EventMapping")
    if is_filtered(models.EventMapping):
        if not silent:
            click.echo('>> Skipping EventMapping')
    else:
        BulkDeleteQuery(model=models.EventMapping,
                        dtfield='date_added',
                        days=min(days, 7),
                        project_id=project_id,
                        order_by='-date_added').execute()

    # Clean up FileBlob instances which are no longer used and aren't super
    # recent (as there could be a race between blob creation and reference)
    if not silent:
        click.echo("Cleaning up unused FileBlob references")
    if is_filtered(models.FileBlob):
        if not silent:
            click.echo('>> Skipping FileBlob')
    else:
        cleanup_unused_files(silent)

    if timed:
        duration = int(time.time() - start_time)
        metrics.timing('cleanup.duration', duration, instance=router)
        click.echo("Clean up took %s second(s)." % duration)
Example #10
def cleanup(days, project, concurrency, silent, model):
    """Delete a portion of trailing data based on creation date.

    All data that is older than `--days` will be deleted.  The default for
    this is 30 days.  By default all projects are truncated, but the cleanup
    can be limited to a specific project with the `--project` flag, which
    accepts a project ID or a string of the form `org/project` where both
    are slugs.
    """
    if concurrency < 1:
        click.echo('Error: Minimum concurrency is 1', err=True)
        raise click.Abort()

    from threading import Thread
    from sentry.app import nodestore
    from sentry.db.deletion import BulkDeleteQuery
    from sentry.models import (
        ApiGrant, ApiToken, Event, EventMapping, Group, GroupRuleStatus,
        GroupTagValue, LostPasswordHash, TagValue, GroupEmailThread,
    )

    models = {m.lower() for m in model}

    def is_filtered(model):
        if not models:
            return False
        return model.lower() not in models

    # these models should be safe to delete without cascades, in order
    BULK_DELETES = (
        (GroupRuleStatus, 'date_added'),
        (GroupTagValue, 'last_seen'),
        (TagValue, 'last_seen'),
        (GroupEmailThread, 'date'),
    )

    GENERIC_DELETES = (
        (Event, 'datetime'),
        (Group, 'last_seen'),
    )

    if not silent:
        click.echo('Removing expired values for LostPasswordHash')

    if is_filtered('LostPasswordHash'):
        if not silent:
            click.echo('>> Skipping LostPasswordHash')
    else:
        LostPasswordHash.objects.filter(
            date_added__lte=timezone.now() - timedelta(hours=48)
        ).delete()

    for model in [ApiGrant, ApiToken]:
        if not silent:
            click.echo('Removing expired values for {}'.format(model.__name__))

        if is_filtered(model.__name__):
            if not silent:
                click.echo('>> Skipping {}'.format(model.__name__))
        else:
            model.objects.filter(
                expires_at__lt=timezone.now()
            ).delete()

    project_id = None
    if project:
        click.echo("Bulk NodeStore deletion not available for project selection", err=True)
        project_id = get_project(project)
        if project_id is None:
            click.echo('Error: Project not found', err=True)
            raise click.Abort()
    else:
        if not silent:
            click.echo("Removing old NodeStore values")
        if is_filtered('NodeStore'):
            if not silent:
                click.echo('>> Skipping NodeStore')
        else:
            cutoff = timezone.now() - timedelta(days=days)
            try:
                nodestore.cleanup(cutoff)
            except NotImplementedError:
                click.echo("NodeStore backend does not support cleanup operation", err=True)

    for model, dtfield in BULK_DELETES:
        if not silent:
            click.echo("Removing {model} for days={days} project={project}".format(
                model=model.__name__,
                days=days,
                project=project or '*',
            ))
        if is_filtered(model.__name__):
            if not silent:
                click.echo('>> Skipping %s' % model.__name__)
        else:
            BulkDeleteQuery(
                model=model,
                dtfield=dtfield,
                days=days,
                project_id=project_id,
            ).execute()

    # EventMapping is fairly expensive and is special cased as it's likely you
    # won't need a reference to an event for nearly as long
    if not silent:
        click.echo("Removing expired values for EventMapping")
    if is_filtered('EventMapping'):
        if not silent:
            click.echo('>> Skipping EventMapping')
    else:
        query = BulkDeleteQuery(
            model=EventMapping,
            dtfield='date_added',
            days=min(days, 7),
            project_id=project_id,
        )
        if concurrency > 1:
            click.echo("Running concurrent %d threads" % concurrency)
            threads = []
            for shard_id in range(concurrency):
                t = Thread(target=lambda shard_id=shard_id: query.execute_sharded(concurrency, shard_id))
                t.start()
                threads.append(t)

            for t in threads:
                t.join()
            click.echo("OK! concurrent %d threads" % concurrency)
        else:
            query.execute_generic()

    # Clean up FileBlob instances which are no longer used and aren't super
    # recent (as there could be a race between blob creation and reference)
    if not silent:
        click.echo("Cleaning up unused FileBlob references")
    if is_filtered('FileBlob'):
        if not silent:
            click.echo('>> Skipping FileBlob')
    else:
        cleanup_unused_files(silent)

    for model, dtfield in GENERIC_DELETES:
        if not silent:
            click.echo("Removing {model} for days={days} project={project}".format(
                model=model.__name__,
                days=days,
                project=project or '*',
            ))
        if is_filtered(model.__name__):
            if not silent:
                click.echo('>> Skipping %s' % model.__name__)
        else:
            query = BulkDeleteQuery(
                model=model,
                dtfield=dtfield,
                days=days,
                project_id=project_id,
            )
            if concurrency > 1:
                threads = []
                for shard_id in range(concurrency):
                    t = Thread(target=lambda shard_id=shard_id: query.execute_sharded(concurrency, shard_id))
                    t.start()
                    threads.append(t)

                for t in threads:
                    t.join()
            else:
                query.execute_generic()
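
This variant and Example #11 below fan one `BulkDeleteQuery` out across threads via `execute_sharded(num_shards, shard_id)`, which is also not shown. The usual approach is to give each thread a disjoint slice of rows, for instance by primary key modulo the shard count; a sketch of that idea (assuming integer primary keys, not the verified implementation):

from datetime import timedelta

from django.utils import timezone


def execute_sharded_sketch(model, dtfield, days, num_shards, shard_id,
                           chunk_size=1000):
    # Walk primary keys in ascending order so every pass makes forward
    # progress, then delete only the keys that belong to this shard.
    # Disjoint shards let several threads share one table without
    # overlapping chunks.
    cutoff = timezone.now() - timedelta(days=days)
    queryset = model.objects.filter(**{'%s__lte' % dtfield: cutoff})
    last_pk = 0
    while True:
        batch = list(
            queryset.filter(pk__gt=last_pk)
            .order_by('pk')
            .values_list('pk', flat=True)[:chunk_size]
        )
        if not batch:
            break
        last_pk = batch[-1]
        mine = [pk for pk in batch if pk % num_shards == shard_id]
        if mine:
            model.objects.filter(pk__in=mine).delete()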
Example #11
def cleanup(days, project, concurrency, silent, model):
    """Delete a portion of trailing data based on creation date.

    All data that is older than `--days` will be deleted.  The default for
    this is 30 days.  By default all projects are truncated, but the cleanup
    can be limited to a specific project with the `--project` flag, which
    accepts a project ID or a string of the form `org/project` where both
    are slugs.
    """
    if concurrency < 1:
        click.echo('Error: Minimum concurrency is 1', err=True)
        raise click.Abort()

    from threading import Thread
    from sentry.app import nodestore
    from sentry.db.deletion import BulkDeleteQuery
    from sentry.models import (
        ApiGrant, ApiToken, Event, EventMapping, Group, GroupRuleStatus,
        GroupTagValue, LostPasswordHash, TagValue, GroupEmailThread,
    )

    models = {m.lower() for m in model}

    def is_filtered(model):
        if not models:
            return False
        return model.lower() not in models

    # these models should be safe to delete without cascades, in order
    BULK_DELETES = (
        (GroupRuleStatus, 'date_added'),
        (GroupTagValue, 'last_seen'),
        (TagValue, 'last_seen'),
        (GroupEmailThread, 'date'),
    )

    GENERIC_DELETES = (
        (Event, 'datetime'),
        (Group, 'last_seen'),
    )

    if not silent:
        click.echo('Removing expired values for LostPasswordHash')

    if is_filtered('LostPasswordHash'):
        if not silent:
            click.echo('>> Skipping LostPasswordHash')
    else:
        LostPasswordHash.objects.filter(
            date_added__lte=timezone.now() - timedelta(hours=48)
        ).delete()

    for model in [ApiGrant, ApiToken]:
        if not silent:
            click.echo('Removing expired values for {}'.format(model.__name__))

        if is_filtered(model.__name__):
            if not silent:
                click.echo('>> Skipping {}'.format(model.__name__))
        else:
            model.objects.filter(
                expires_at__lt=timezone.now()
            ).delete()

    project_id = None
    if project:
        click.echo("Bulk NodeStore deletion not available for project selection", err=True)
        project_id = get_project(project)
        if project_id is None:
            click.echo('Error: Project not found', err=True)
            raise click.Abort()
    else:
        if not silent:
            click.echo("Removing old NodeStore values")
        if is_filtered('NodeStore'):
            if not silent:
                click.echo('>> Skipping NodeStore')
        else:
            cutoff = timezone.now() - timedelta(days=days)
            try:
                nodestore.cleanup(cutoff)
            except NotImplementedError:
                click.echo("NodeStore backend does not support cleanup operation", err=True)

    for model, dtfield in BULK_DELETES:
        if not silent:
            click.echo("Removing {model} for days={days} project={project}".format(
                model=model.__name__,
                days=days,
                project=project or '*',
            ))
        if is_filtered(model.__name__):
            if not silent:
                click.echo('>> Skipping %s' % model.__name__)
        else:
            BulkDeleteQuery(
                model=model,
                dtfield=dtfield,
                days=days,
                project_id=project_id,
            ).execute()

    # EventMapping is fairly expensive and is special cased as it's likely you
    # won't need a reference to an event for nearly as long
    if not silent:
        click.echo("Removing expired values for EventMapping")
    if is_filtered('EventMapping'):
        if not silent:
            click.echo('>> Skipping EventMapping')
    else:
        BulkDeleteQuery(
            model=EventMapping,
            dtfield='date_added',
            days=min(days, 7),
            project_id=project_id,
        ).execute()

    # Clean up FileBlob instances which are no longer used and aren't super
    # recent (as there could be a race between blob creation and reference)
    if not silent:
        click.echo("Cleaning up unused FileBlob references")
    if is_filtered('FileBlob'):
        if not silent:
            click.echo('>> Skipping FileBlob')
    else:
        cleanup_unused_files(silent)

    for model, dtfield in GENERIC_DELETES:
        if not silent:
            click.echo("Removing {model} for days={days} project={project}".format(
                model=model.__name__,
                days=days,
                project=project or '*',
            ))
        if is_filtered(model.__name__):
            if not silent:
                click.echo('>> Skipping %s' % model.__name__)
        else:
            query = BulkDeleteQuery(
                model=model,
                dtfield=dtfield,
                days=days,
                project_id=project_id,
            )
            if concurrency > 1:
                threads = []
                for shard_id in range(concurrency):
                    t = Thread(target=lambda shard_id=shard_id: query.execute_sharded(concurrency, shard_id))
                    t.start()
                    threads.append(t)

                for t in threads:
                    t.join()
            else:
                query.execute_generic()
Example #12
def cleanup(days, project, concurrency):
    "Delete a portion of trailing data based on creation date."

    from datetime import timedelta
    from django.utils import timezone

    from sentry.app import nodestore
    from sentry.db.deletion import BulkDeleteQuery
    from sentry.models import (
        Event, EventMapping, Group, GroupRuleStatus, GroupTagValue,
        LostPasswordHash, TagValue, GroupEmailThread,
    )

    # these models should be safe to delete without cascades, in order
    BULK_DELETES = (
        (GroupRuleStatus, 'date_added'),
        (GroupTagValue, 'last_seen'),
        (TagValue, 'last_seen'),
        (GroupEmailThread, 'date'),
    )

    GENERIC_DELETES = (
        (Event, 'datetime'),
        (Group, 'last_seen'),
    )

    click.echo("Removing expired values for LostPasswordHash")
    LostPasswordHash.objects.filter(
        date_added__lte=timezone.now() - timedelta(hours=48)
    ).delete()

    if project:
        click.echo("Bulk NodeStore deletion not available for project selection", err=True)
    else:
        click.echo("Removing old NodeStore values")
        cutoff = timezone.now() - timedelta(days=days)
        try:
            nodestore.cleanup(cutoff)
        except NotImplementedError:
            click.echo("NodeStore backend does not support cleanup operation", err=True)

    for model, dtfield in BULK_DELETES:
        click.echo("Removing {model} for days={days} project={project}".format(
            model=model.__name__,
            days=days,
            project=project or '*',
        ))
        BulkDeleteQuery(
            model=model,
            dtfield=dtfield,
            days=days,
            project_id=project,
        ).execute()

    # EventMapping is fairly expensive and is special cased as it's likely you
    # won't need a reference to an event for nearly as long
    click.echo("Removing expired values for EventMapping")
    BulkDeleteQuery(
        model=EventMapping,
        dtfield='date_added',
        days=min(days, 7),
        project_id=project,
    ).execute()

    for model, dtfield in GENERIC_DELETES:
        click.echo("Removing {model} for days={days} project={project}".format(
            model=model.__name__,
            days=days,
            project=project or '*',
        ))
        BulkDeleteQuery(
            model=model,
            dtfield=dtfield,
            days=days,
            project_id=project,
        ).execute_generic()
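
Finally, `cleanup_unused_files` is called in almost every variant but defined in none of them. The surrounding comments say it drops `FileBlob` rows that nothing references anymore, while sparing very recent blobs to avoid the creation/reference race. Under those assumptions (the field and relation names here are guesses, not verified against Sentry's schema), the logic would be roughly:

from datetime import timedelta

from django.utils import timezone


def cleanup_unused_files(silent=False, cutoff_hours=24):
    # Sketch only: `timestamp` and the `fileblobindex` reverse relation
    # are assumed names for illustration.
    from sentry.models import FileBlob

    if not silent:
        print('Removing unused FileBlob rows')
    cutoff = timezone.now() - timedelta(hours=cutoff_hours)
    unused = FileBlob.objects.filter(
        timestamp__lte=cutoff,        # skip blobs created very recently
        fileblobindex__isnull=True,   # keep blobs that are still referenced
    )
    for blob in unused.iterator():
        blob.delete()  # expected to also remove the backing stored file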