def test_chunked():
    assert list(chunked(range(5), 5)) == [
        [0, 1, 2, 3, 4],
    ]

    assert list(chunked(range(10), 4)) == [
        [0, 1, 2, 3],
        [4, 5, 6, 7],
        [8, 9],
    ]
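# A minimal sketch of what chunked() could look like, consistent with the test
# above: it lazily yields lists of at most `size` items, with a shorter final
# chunk for any remainder. The real helper in the codebase may differ.
def chunked(iterable, size):
    chunk = []
    for item in iterable:
        chunk.append(item)
        if len(chunk) == size:
            yield chunk
            chunk = []
    if chunk:
        yield chunk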
def _query_tsdb_groups_chunked(func, issue_ids, start, stop, rollup):
    combined = {}
    for chunk in chunked(issue_ids, BATCH_SIZE):
        combined.update(func(tsdb.models.group, chunk, start, stop, rollup=rollup))
    return combined
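# Hedged usage sketch: the helper above accepts any tsdb range function, so
# get_event_counts() below amounts to calling it with func=tsdb.get_sums. The
# issue IDs and the one-hour window here are illustrative assumptions.
from datetime import timedelta

from django.utils import timezone

from sentry.app import tsdb

issue_ids = [1, 2, 3]  # example group IDs, assumed for illustration
stop = timezone.now()
start = stop - timedelta(hours=1)
counts = _query_tsdb_groups_chunked(tsdb.get_sums, issue_ids, start, stop, rollup=3600)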
def get_event_counts(issue_ids, start, stop, rollup):
    combined = {}
    for chunk in chunked(issue_ids, BATCH_SIZE):
        combined.update(
            tsdb.get_sums(tsdb.models.group, chunk, start, stop, rollup=rollup))
    return combined
def organizations(metrics, since, until):
    """
    Fetch metrics for organizations.
    """
    from django.utils import timezone

    from sentry.app import tsdb
    from sentry.models import Organization

    stdout = click.get_text_stream('stdout')
    stderr = click.get_text_stream('stderr')

    def aggregate(series):
        return sum(value for timestamp, value in series)

    metrics = OrderedDict(
        (name, getattr(tsdb.models, name)) for name in metrics)
    if not metrics:
        return

    if until is None:
        until = timezone.now()

    if since is None:
        since = until - timedelta(minutes=60)

    if until < since:
        raise click.ClickException(
            u'invalid time range provided: {} to {}'.format(since, until))

    stderr.write(
        u'Dumping {} from {} to {}...\n'.format(
            ', '.join(metrics.keys()),
            since,
            until,
        ),
    )

    objects = Organization.objects.all()

    for chunk in chunked(objects, 100):
        instances = OrderedDict((instance.pk, instance) for instance in chunk)

        results = {}
        for metric in metrics.values():
            results[metric] = tsdb.get_range(metric, instances.keys(), since, until)

        for key, instance in six.iteritems(instances):
            values = []
            for metric in metrics.values():
                values.append(aggregate(results[metric][key]))

            stdout.write(
                u'{} {} {}\n'.format(
                    instance.id,
                    instance.slug,
                    ' '.join(map(six.binary_type, values)),
                ),
            )
def organizations(metrics, since, until):
    """
    Fetch metrics for organizations.
    """
    from django.utils import timezone

    from sentry.app import tsdb
    from sentry.models import Organization

    stdout = click.get_text_stream('stdout')
    stderr = click.get_text_stream('stderr')

    def aggregate(series):
        return sum(value for timestamp, value in series)

    metrics = OrderedDict((name, getattr(tsdb.models, name)) for name in metrics)
    if not metrics:
        return

    if until is None:
        until = timezone.now()

    if since is None:
        since = until - timedelta(minutes=60)

    if until < since:
        raise click.ClickException('invalid time range provided: {} to {}'.format(since, until))

    stderr.write(
        'Dumping {} from {} to {}...\n'.format(
            ', '.join(metrics.keys()),
            since,
            until,
        ),
    )

    objects = Organization.objects.all()

    for chunk in chunked(objects, 100):
        instances = OrderedDict((instance.pk, instance) for instance in chunk)

        results = {}
        for metric in metrics.values():
            results[metric] = tsdb.get_range(metric, instances.keys(), since, until)

        for key, instance in six.iteritems(instances):
            values = []
            for metric in metrics.values():
                values.append(aggregate(results[metric][key]))

            stdout.write(
                '{} {} {}\n'.format(
                    instance.id,
                    instance.slug,
                    ' '.join(map(six.binary_type, values)),
                ),
            )
def update_user_reports(**kwargs: Any) -> None:
    now = timezone.now()
    user_reports = UserReport.objects.filter(
        group_id__isnull=True, environment_id__isnull=True, date_added__gte=now - timedelta(days=1)
    )

    # We do one query per project, just to avoid the small case that two projects have the same event ID
    project_map: Dict[int, Any] = {}
    for r in user_reports:
        project_map.setdefault(r.project_id, []).append(r)

    # Logging values
    total_reports = len(user_reports)
    reports_with_event = 0
    updated_reports = 0
    samples = None

    MAX_EVENTS = kwargs.get("max_events", 5000)
    for project_id, reports in project_map.items():
        event_ids = [r.event_id for r in reports]
        report_by_event = {r.event_id: r for r in reports}
        events = []
        for event_id_chunk in chunked(event_ids, MAX_EVENTS):
            snuba_filter = eventstore.Filter(
                project_ids=[project_id],
                event_ids=event_id_chunk,
                start=now - timedelta(days=2),
                end=now + timedelta(minutes=5),  # Just to catch clock skew
            )
            events_chunk = eventstore.get_events(filter=snuba_filter)
            events.extend(events_chunk)

        for event in events:
            report = report_by_event.get(event.event_id)
            if report:
                reports_with_event += 1
                report.update(group_id=event.group_id, environment_id=event.get_environment().id)
                updated_reports += 1

        if not samples and len(reports) <= 10:
            samples = {
                "project_id": project_id,
                "event_ids": event_ids,
                "reports_event_ids": {r.id: r.event_id for r in reports},
            }

    logger.info(
        "update_user_reports.records_updated",
        extra={
            "reports_to_update": total_reports,
            "reports_with_event": reports_with_event,
            "updated_reports": updated_reports,
            "samples": samples,
        },
    )
def _query(self, project, retention_window_start, group_queryset, tags, environment,
           sort_by, limit, cursor, count_hits, paginator_options, **parameters):

    # TODO: Product decision: we currently search Group.message to handle
    # the `query` parameter, because that's what we've always done. We could
    # do that search against every event in Snuba instead, but results may
    # differ.

    now = timezone.now()
    end = parameters.get('date_to') or (now + ALLOWED_FUTURE_DELTA)

    # TODO: Presumably we want to search back to the project's full retention,
    # which may be higher than 90 days in the future, but apparently
    # `retention_window_start` can be None?
    start = max(
        filter(None, [
            retention_window_start,
            parameters.get('date_from'),
            now - timedelta(days=90)
        ])
    )
    assert start < end

    # TODO: It's possible `first_release` could be handled by Snuba.
    if environment is not None:
        group_queryset = ds.QuerySetBuilder({
            'first_release': ds.CallbackCondition(
                lambda queryset, version: queryset.extra(
                    where=[
                        '{} = {}'.format(
                            ds.get_sql_column(GroupEnvironment, 'first_release_id'),
                            ds.get_sql_column(Release, 'id'),
                        ),
                        '{} = %s'.format(
                            ds.get_sql_column(Release, 'organization'),
                        ),
                        '{} = %s'.format(
                            ds.get_sql_column(Release, 'version'),
                        ),
                    ],
                    params=[project.organization_id, version],
                    tables=[Release._meta.db_table],
                ),
            ),
        }).build(
            group_queryset.extra(
                where=[
                    u'{} = {}'.format(
                        ds.get_sql_column(Group, 'id'),
                        ds.get_sql_column(GroupEnvironment, 'group_id'),
                    ),
                    u'{} = %s'.format(
                        ds.get_sql_column(GroupEnvironment, 'environment_id'),
                    ),
                ],
                params=[environment.id],
                tables=[GroupEnvironment._meta.db_table],
            ),
            parameters,
        )
    else:
        group_queryset = ds.QuerySetBuilder({
            'first_release': ds.CallbackCondition(
                lambda queryset, version: queryset.filter(
                    first_release__organization_id=project.organization_id,
                    first_release__version=version,
                ),
            ),
        }).build(
            group_queryset,
            parameters,
        )

    # pre-filter query
    candidate_hashes = dict(
        GroupHash.objects.filter(
            group__in=group_queryset
        ).values_list(
            'hash', 'group_id'
        )[:MAX_PRE_SNUBA_CANDIDATES + 1]
    )
    metrics.timing('snuba.search.num_candidates', len(candidate_hashes))

    if not candidate_hashes:
        # no matches could possibly be found from this point on
        metrics.incr('snuba.search.no_candidates')
        return Paginator(Group.objects.none()).get_result()
    elif len(candidate_hashes) > MAX_PRE_SNUBA_CANDIDATES:
        # If the pre-filter query didn't include anything to significantly
        # filter down the number of results (from 'first_release', 'query',
        # 'status', 'bookmarked_by', 'assigned_to', 'unassigned',
        # 'subscribed_by', 'active_at_from', or 'active_at_to') then it
        # might have surpassed the MAX_PRE_SNUBA_CANDIDATES. In this case,
        # we *don't* want to pass candidates down to Snuba, and instead we
        # want Snuba to do all the filtering/sorting it can and *then* apply
        # this queryset to the results from Snuba, which we call
        # post-filtering.
        metrics.incr('snuba.search.too_many_candidates')
        candidate_hashes = None

    sort, extra_aggregations, score_fn = sort_strategies[sort_by]

    # {group_id: group_score, ...}
    snuba_groups = snuba_search(
        project_id=project.id,
        environment_id=environment and environment.id,
        tags=tags,
        start=start,
        end=end,
        sort=sort,
        extra_aggregations=extra_aggregations,
        score_fn=score_fn,
        candidate_hashes=candidate_hashes,
        **parameters
    )
    metrics.timing('snuba.search.num_snuba_results', len(snuba_groups))

    if candidate_hashes:
        # pre-filtered candidates were passed down to Snuba,
        # so we're finished with filtering
        result_groups = snuba_groups.items()
    else:
        # pre-filtered candidates were *not* passed down to Snuba,
        # so we need to do post-filtering to verify Sentry DB predicates
        result_groups = []
        i = 0
        for i, chunk in enumerate(chunked(snuba_groups.items(), MAX_POST_SNUBA_CHUNK), 1):
            filtered_group_ids = group_queryset.filter(
                id__in=[gid for gid, _ in chunk]
            ).values_list('id', flat=True)

            result_groups.extend(
                (group_id, snuba_groups[group_id])
                for group_id in filtered_group_ids
            )

        metrics.timing('snuba.search.num_post_filters', i)

    paginator_results = SequencePaginator(
        [(score, id) for (id, score) in result_groups],
        reverse=True,
        **paginator_options
    ).get_result(limit, cursor, count_hits=count_hits)

    groups = Group.objects.in_bulk(paginator_results.results)
    paginator_results.results = [groups[k] for k in paginator_results.results if k in groups]

    return paginator_results
def band(n, value):
    assert len(value) % n == 0
    # Use floor division so the chunk size stays an integer on Python 3.
    return list(chunked(value, len(value) // n))
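# Quick usage sketch for band(): splitting a six-element sequence into two
# equal bands of three. Assumes chunked() yields lists, as in the test above.
assert band(2, [1, 2, 3, 4, 5, 6]) == [
    [1, 2, 3],
    [4, 5, 6],
]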
def _query(self, project, retention_window_start, group_queryset, tags, environment,
           sort_by, limit, cursor, count_hits, paginator_options, **parameters):

    # TODO: Product decision: we currently search Group.message to handle
    # the `query` parameter, because that's what we've always done. We could
    # do that search against every event in Snuba instead, but results may
    # differ.

    now = timezone.now()
    end = parameters.get('date_to') or (now + ALLOWED_FUTURE_DELTA)

    # TODO: Presumably we want to search back to the project's full retention,
    # which may be higher than 90 days in the future, but apparently
    # `retention_window_start` can be None?
    start = max(
        filter(None, [
            retention_window_start,
            parameters.get('date_from'),
            now - timedelta(days=90)
        ])
    )
    assert start < end

    # TODO: It's possible `first_release` could be handled by Snuba.
    if environment is not None:
        group_queryset = ds.QuerySetBuilder({
            'first_release': ds.CallbackCondition(
                lambda queryset, version: queryset.extra(
                    where=[
                        '{} = {}'.format(
                            ds.get_sql_column(GroupEnvironment, 'first_release_id'),
                            ds.get_sql_column(Release, 'id'),
                        ),
                        '{} = %s'.format(
                            ds.get_sql_column(Release, 'organization'),
                        ),
                        '{} = %s'.format(
                            ds.get_sql_column(Release, 'version'),
                        ),
                    ],
                    params=[project.organization_id, version],
                    tables=[Release._meta.db_table],
                ),
            ),
        }).build(
            group_queryset.extra(
                where=[
                    '{} = {}'.format(
                        ds.get_sql_column(Group, 'id'),
                        ds.get_sql_column(GroupEnvironment, 'group_id'),
                    ),
                    '{} = %s'.format(
                        ds.get_sql_column(GroupEnvironment, 'environment_id'),
                    ),
                ],
                params=[environment.id],
                tables=[GroupEnvironment._meta.db_table],
            ),
            parameters,
        )
    else:
        group_queryset = ds.QuerySetBuilder({
            'first_release': ds.CallbackCondition(
                lambda queryset, version: queryset.filter(
                    first_release__organization_id=project.organization_id,
                    first_release__version=version,
                ),
            ),
        }).build(
            group_queryset,
            parameters,
        )

    # pre-filter query
    candidate_hashes = dict(
        GroupHash.objects.filter(
            group__in=group_queryset
        ).values_list(
            'hash', 'group_id'
        )[:MAX_PRE_SNUBA_CANDIDATES + 1]
    )
    metrics.timing('snuba.search.num_candidates', len(candidate_hashes))

    if not candidate_hashes:
        # no matches could possibly be found from this point on
        metrics.incr('snuba.search.no_candidates')
        return Paginator(Group.objects.none()).get_result()
    elif len(candidate_hashes) > MAX_PRE_SNUBA_CANDIDATES:
        # If the pre-filter query didn't include anything to significantly
        # filter down the number of results (from 'first_release', 'query',
        # 'status', 'bookmarked_by', 'assigned_to', 'unassigned',
        # 'subscribed_by', 'active_at_from', or 'active_at_to') then it
        # might have surpassed the MAX_PRE_SNUBA_CANDIDATES. In this case,
        # we *don't* want to pass candidates down to Snuba, and instead we
        # want Snuba to do all the filtering/sorting it can and *then* apply
        # this queryset to the results from Snuba, which we call
        # post-filtering.
        metrics.incr('snuba.search.too_many_candidates')
        candidate_hashes = None

    sort, extra_aggregations, score_fn = sort_strategies[sort_by]

    # {group_id: group_score, ...}
    snuba_groups = snuba_search(
        project_id=project.id,
        environment_id=environment and environment.id,
        tags=tags,
        start=start,
        end=end,
        sort=sort,
        extra_aggregations=extra_aggregations,
        score_fn=score_fn,
        candidate_hashes=candidate_hashes,
        **parameters
    )
    metrics.timing('snuba.search.num_snuba_results', len(snuba_groups))

    if candidate_hashes:
        # pre-filtered candidates were passed down to Snuba,
        # so we're finished with filtering
        result_groups = snuba_groups.items()
    else:
        # pre-filtered candidates were *not* passed down to Snuba,
        # so we need to do post-filtering to verify Sentry DB predicates
        result_groups = []
        i = 0
        for i, chunk in enumerate(chunked(snuba_groups.items(), MAX_POST_SNUBA_CHUNK), 1):
            filtered_group_ids = group_queryset.filter(
                id__in=[gid for gid, _ in chunk]
            ).values_list('id', flat=True)

            result_groups.extend(
                (group_id, snuba_groups[group_id])
                for group_id in filtered_group_ids
            )

        metrics.timing('snuba.search.num_post_filters', i)

    paginator_results = SequencePaginator(
        [(score, id) for (id, score) in result_groups],
        reverse=True,
        **paginator_options
    ).get_result(limit, cursor, count_hits=count_hits)

    groups = Group.objects.in_bulk(paginator_results.results)
    paginator_results.results = [groups[k] for k in paginator_results.results if k in groups]

    return paginator_results