def start_group_reprocessing(project_id, group_id, max_events=None, acting_user_id=None):
    from django.db import transaction

    with transaction.atomic():
        group = models.Group.objects.get(id=group_id)
        original_status = group.status
        original_short_id = group.short_id
        group.status = models.GroupStatus.REPROCESSING
        # satisfy unique constraint of (project_id, short_id)
        # we manually tested that multiple groups with (project_id=1,
        # short_id=null) can exist in postgres
        group.short_id = None
        group.save()

        # Create a duplicate row that has the same attributes by nulling out
        # the primary key and saving
        group.pk = group.id = None
        new_group = group  # rename variable just to avoid confusion
        del group
        new_group.status = original_status
        new_group.short_id = original_short_id

        # this will be incremented by the events that are reprocessed
        new_group.times_seen = 0
        new_group.save()

        # This migrates all models that are associated with a group but not
        # directly with an event, i.e. everything but event attachments and user
        # reports. Those other updates are run per-event (in
        # post-process-forwarder) to not cause too much load on pg.
        for model in GROUP_MODELS_TO_MIGRATE:
            model.objects.filter(group_id=group_id).update(group_id=new_group.id)

        models.GroupRedirect.objects.create(
            organization_id=new_group.project.organization_id,
            group_id=new_group.id,
            previous_group_id=group_id,
        )

        models.Activity.objects.create(
            type=models.Activity.REPROCESS,
            project=new_group.project,
            ident=six.text_type(group_id),
            group=new_group,
            user_id=acting_user_id,
        )

    # Get event counts of issue (for all environments etc). This was copypasted
    # and simplified from groupserializer.
    event_count = snuba.aliased_query(
        aggregations=[["count()", "", "times_seen"]],  # select
        dataset=snuba.Dataset.Events,  # from
        conditions=[["group_id", "=", group_id], ["project_id", "=", project_id]],  # where
        referrer="reprocessing2.start_group_reprocessing",
    )["data"][0]["times_seen"]

    if max_events is not None:
        event_count = min(event_count, max_events)

    key = _get_sync_counter_key(group_id)
    _get_sync_redis_client().setex(key, _REDIS_SYNC_TTL, event_count)
def __get_event_id_from_filter(self, filter=None, orderby=None):  # NOQA
    columns = [Columns.EVENT_ID.value.alias, Columns.PROJECT_ID.value.alias]

    try:
        # This query uses the discover dataset to enable
        # getting events across both errors and transactions, which is
        # required when doing pagination in discover
        result = snuba.aliased_query(
            selected_columns=columns,
            conditions=filter.conditions,
            filter_keys=filter.filter_keys,
            start=filter.start,
            end=filter.end,
            limit=1,
            referrer="eventstore.get_next_or_prev_event_id",
            orderby=orderby,
            dataset=snuba.Dataset.Discover,
        )
    except (snuba.QueryOutsideRetentionError, snuba.QueryOutsideGroupActivityError):
        # This can happen when the date conditions for paging
        # and the current event generate impossible conditions.
        return None

    if "error" in result or len(result["data"]) == 0:
        return None

    row = result["data"][0]
    return (six.text_type(row["project_id"]), six.text_type(row["event_id"]))
def _execute_seen_stats_query(
    self, item_list, start=None, end=None, conditions=None, environment_ids=None
):
    project_ids = list({item.project_id for item in item_list})
    group_ids = [item.id for item in item_list]
    aggregations = [
        ["count()", "", "times_seen"],
        ["min", "timestamp", "first_seen"],
        ["max", "timestamp", "last_seen"],
        ["uniq", "tags[sentry:user]", "count"],
    ]
    filters = {"project_id": project_ids, "group_id": group_ids}
    if self.environment_ids:
        filters["environment"] = self.environment_ids

    result = aliased_query(
        dataset=Dataset.Events,
        start=start,
        end=end,
        groupby=["group_id"],
        conditions=conditions,
        filter_keys=filters,
        aggregations=aggregations,
        referrer="serializers.GroupSerializerSnuba._execute_seen_stats_query",
    )
    seen_data = {
        issue["group_id"]: fix_tag_value_data(
            dict(filter(lambda key: key[0] != "group_id", issue.items()))
        )
        for issue in result["data"]
    }
    user_counts = {item_id: value["count"] for item_id, value in seen_data.items()}
    last_seen = {item_id: value["last_seen"] for item_id, value in seen_data.items()}
    if start or end or conditions:
        first_seen = {item_id: value["first_seen"] for item_id, value in seen_data.items()}
        times_seen = {item_id: value["times_seen"] for item_id, value in seen_data.items()}
    else:
        if environment_ids:
            first_seen = {
                ge["group_id"]: ge["first_seen__min"]
                for ge in GroupEnvironment.objects.filter(
                    group_id__in=[item.id for item in item_list],
                    environment_id__in=environment_ids,
                )
                .values("group_id")
                .annotate(Min("first_seen"))
            }
        else:
            first_seen = {item.id: item.first_seen for item in item_list}
        times_seen = {item.id: item.times_seen for item in item_list}

    attrs = {}
    for item in item_list:
        attrs[item] = {
            "times_seen": times_seen.get(item.id, 0),
            "first_seen": first_seen.get(item.id),
            "last_seen": last_seen.get(item.id),
            "user_count": user_counts.get(item.id, 0),
        }

    return attrs
def get_group_seen_values_for_environments(
    self, project_ids, group_id_list, environment_ids, snuba_filters, start=None, end=None
):
    # Get the total times seen, first seen, and last seen across multiple environments
    filters = {"project_id": project_ids, "group_id": group_id_list}
    if environment_ids:
        filters["environment"] = environment_ids

    aggregations = [
        ["count()", "", "times_seen"],
        ["min", SEEN_COLUMN, "first_seen"],
        ["max", SEEN_COLUMN, "last_seen"],
    ]

    result = snuba.aliased_query(
        dataset=snuba.Dataset.Events,
        start=start,
        end=end,
        groupby=["group_id"],
        conditions=snuba_filters,
        filter_keys=filters,
        aggregations=aggregations,
        referrer="tagstore.get_group_seen_values_for_environments",
    )

    return {
        issue["group_id"]: fix_tag_value_data(
            dict(filter(lambda key: key[0] != "group_id", six.iteritems(issue)))
        )
        for issue in result["data"]
    }
def __get_events(
    self,
    filter,
    additional_columns=None,
    orderby=None,
    limit=DEFAULT_LIMIT,
    offset=DEFAULT_OFFSET,
    referrer=None,
    should_bind_nodes=False,
):
    assert filter, "You must provide a filter"
    cols = self.__get_columns(additional_columns)
    orderby = orderby or DESC_ORDERING

    result = snuba.aliased_query(
        selected_columns=cols,
        start=filter.start,
        end=filter.end,
        conditions=filter.conditions,
        filter_keys=filter.filter_keys,
        orderby=orderby,
        limit=limit,
        offset=offset,
        referrer=referrer,
        dataset=snuba.Dataset.Events,
    )

    if "error" not in result:
        events = [self.__make_event(evt) for evt in result["data"]]
        if should_bind_nodes:
            self.bind_nodes(events)
        return events

    return []
def get_groups_user_counts(
    self, project_ids, group_ids, environment_ids, snuba_filters, start=None, end=None
):
    filters = {"project_id": project_ids, "group_id": group_ids}
    if environment_ids:
        filters["environment"] = environment_ids

    aggregations = [["uniq", "tags[sentry:user]", "count"]]

    result = snuba.aliased_query(
        dataset=snuba.Dataset.Events,
        start=start,
        end=end,
        groupby=["group_id"],
        conditions=snuba_filters,
        filter_keys=filters,
        aggregations=aggregations,
        referrer="tagstore.get_groups_user_counts",
    )

    return defaultdict(int, {issue["group_id"]: issue["count"] for issue in result["data"]})
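A minimal usage sketch for the function above; `tagstore` (an instance of the backend defining this method), the ids, and the time window are assumptions for illustration only. It shows why returning a `defaultdict(int)` matters: a group with no matching rows reads back as 0 instead of raising `KeyError`.

# Hypothetical usage; assumes a configured Sentry app and made-up ids.
from datetime import datetime, timedelta

counts = tagstore.get_groups_user_counts(
    project_ids=[1],
    group_ids=[10, 11],
    environment_ids=None,
    snuba_filters=[],
    start=datetime.utcnow() - timedelta(days=14),
    end=datetime.utcnow(),
)
counts[10]     # distinct users seen for group 10 (0 if absent from the result)
counts[99999]  # missing key falls back to int() == 0, no KeyError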
def start_group_reprocessing(project_id, group_id, max_events=None):
    from sentry.models.group import Group, GroupStatus
    from sentry.models.grouphash import GroupHash
    from django.db import transaction

    with transaction.atomic():
        Group.objects.filter(id=group_id).update(status=GroupStatus.REPROCESSING)

        # Remove all grouphashes such that new events get sorted into a
        # different group.
        GroupHash.objects.filter(group_id=group_id).delete()

    # Get event counts of issue (for all environments etc). This was copypasted
    # and simplified from groupserializer.
    event_count = snuba.aliased_query(
        aggregations=[["count()", "", "times_seen"]],  # select
        dataset=snuba.Dataset.Events,  # from
        conditions=[["group_id", "=", group_id], ["project_id", "=", project_id]],  # where
        referrer="reprocessing2.start_group_reprocessing",
    )["data"][0]["times_seen"]

    if max_events is not None:
        event_count = min(event_count, max_events)

    key = _get_sync_counter_key(group_id)
    _get_sync_redis_client().setex(key, _REDIS_SYNC_TTL, event_count)
def get_events(
    self,
    filter,
    additional_columns=None,
    orderby=None,
    limit=DEFAULT_LIMIT,
    offset=DEFAULT_OFFSET,
    referrer="eventstore.get_events",
):
    """
    Get events from Snuba.
    """
    assert filter, "You must provide a filter"
    cols = self.__get_columns(additional_columns)
    orderby = orderby or DESC_ORDERING

    result = snuba.aliased_query(
        selected_columns=cols,
        start=filter.start,
        end=filter.end,
        conditions=filter.conditions,
        filter_keys=filter.filter_keys,
        orderby=orderby,
        limit=limit,
        offset=offset,
        referrer=referrer,
        dataset=snuba.Dataset.Events,
    )

    if "error" not in result:
        return [self.__make_event(evt) for evt in result["data"]]

    return []
def start_group_reprocessing(project_id, group_id, max_events=None, acting_user_id=None):
    from django.db import transaction

    with transaction.atomic():
        group = models.Group.objects.get(id=group_id)
        original_status = group.status
        if original_status == models.GroupStatus.REPROCESSING:
            # This is supposed to be a rather unlikely UI race when two people
            # click reprocessing in the UI at the same time.
            #
            # During reprocessing the button is greyed out.
            raise RuntimeError("Cannot reprocess group that is currently being reprocessed")

        original_short_id = group.short_id
        group.status = models.GroupStatus.REPROCESSING
        # satisfy unique constraint of (project_id, short_id)
        # we manually tested that multiple groups with (project_id=1,
        # short_id=null) can exist in postgres
        group.short_id = None
        group.save()

        # Create a duplicate row that has the same attributes by nulling out
        # the primary key and saving
        group.pk = group.id = None
        new_group = group  # rename variable just to avoid confusion
        del group
        new_group.status = original_status
        new_group.short_id = original_short_id

        # this will be incremented by the events that are reprocessed
        new_group.times_seen = 0
        new_group.save()

        # This migrates all models that are associated with a group but not
        # directly with an event, i.e. everything but event attachments and user
        # reports. Those other updates are run per-event (in
        # post-process-forwarder) to not cause too much load on pg.
        for model in GROUP_MODELS_TO_MIGRATE:
            model.objects.filter(group_id=group_id).update(group_id=new_group.id)

    # Get event counts of issue (for all environments etc). This was copypasted
    # and simplified from groupserializer.
    event_count = snuba.aliased_query(
        aggregations=[["count()", "", "times_seen"]],  # select
        dataset=snuba.Dataset.Events,  # from
        conditions=[["group_id", "=", group_id], ["project_id", "=", project_id]],  # where
        referrer="reprocessing2.start_group_reprocessing",
    )["data"][0]["times_seen"]

    if max_events is not None:
        event_count = min(event_count, max_events)

    # Create activity on *old* group as that will serve the landing page for our
    # reprocessing status
    #
    # Later the activity is migrated to the new group where it is used to serve
    # the success message.
    models.Activity.objects.create(
        type=models.Activity.REPROCESS,
        project=new_group.project,
        ident=six.text_type(group_id),
        group_id=group_id,
        user_id=acting_user_id,
        data={"eventCount": event_count, "oldGroupId": group_id, "newGroupId": new_group.id},
    )

    client = _get_sync_redis_client()
    client.setex(_get_sync_counter_key(group_id), _REDIS_SYNC_TTL, event_count)

    return new_group.id
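The duplicate-row trick above (`group.pk = group.id = None` followed by `save()`) is a general Django idiom: clearing the primary key makes `save()` issue an INSERT instead of an UPDATE. Below is a minimal self-contained sketch using a hypothetical `Widget` model (not part of the code above); it assumes a configured Django app, and note that unique constraints need extra handling, as the `short_id` nulling above demonstrates.

from django.db import models


class Widget(models.Model):  # hypothetical model, for demonstration only
    name = models.CharField(max_length=64)


def clone_widget(widget_id):
    widget = Widget.objects.get(id=widget_id)
    widget.pk = widget.id = None  # Django now treats the instance as unsaved
    widget.save()  # INSERTs a new row copying every non-pk field
    return widget.id  # the freshly assigned primary key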
def snuba_search(
    self,
    start,
    end,
    project_ids,
    environment_ids,
    sort_field,
    cursor=None,
    group_ids=None,
    limit=None,
    offset=0,
    get_sample=False,
    search_filters=None,
):
    """
    Returns a tuple of:
    * a sorted list of (group_id, group_score) tuples sorted descending by score,
    * the count of total results (rows) available for this query.
    """
    filters = {"project_id": project_ids}

    if environment_ids is not None:
        filters["environment"] = environment_ids

    if group_ids:
        filters["group_id"] = sorted(group_ids)

    conditions = []
    having = []
    for search_filter in search_filters:
        if (
            # Don't filter on postgres fields here, they're not available
            search_filter.key.name in self.postgres_only_fields
            # We special case date
            or search_filter.key.name == "date"
        ):
            continue
        converted_filter = convert_search_filter_to_snuba_query(search_filter)
        converted_filter = self._transform_converted_filter(
            search_filter, converted_filter, project_ids, environment_ids
        )
        if converted_filter is not None:
            # Ensure that no user-generated tag that clashes with aggregation_defs is added to having
            if search_filter.key.name in self.aggregation_defs and not search_filter.key.is_tag:
                having.append(converted_filter)
            else:
                conditions.append(converted_filter)

    extra_aggregations = self.dependency_aggregations.get(sort_field, [])
    required_aggregations = set([sort_field, "total"] + extra_aggregations)
    for h in having:
        alias = h[0]
        required_aggregations.add(alias)

    aggregations = []
    for alias in required_aggregations:
        aggregations.append(self.aggregation_defs[alias] + [alias])

    if cursor is not None:
        having.append((sort_field, ">=" if cursor.is_prev else "<=", cursor.value))

    selected_columns = []
    if get_sample:
        query_hash = md5(json.dumps(conditions).encode("utf-8")).hexdigest()[:8]
        selected_columns.append(("cityHash64", ("'{}'".format(query_hash), "group_id"), "sample"))
        sort_field = "sample"
        orderby = [sort_field]
        referrer = "search_sample"
    else:
        # Get the top matching groups by score, i.e. the actual search results
        # in the order that we want them.
        orderby = [
            "-{}".format(sort_field),
            "group_id",  # ensure stable sort within the same score
        ]
        referrer = "search"

    snuba_results = snuba.aliased_query(
        dataset=self.dataset,
        start=start,
        end=end,
        selected_columns=selected_columns,
        groupby=["group_id"],
        conditions=conditions,
        having=having,
        filter_keys=filters,
        aggregations=aggregations,
        orderby=orderby,
        referrer=referrer,
        limit=limit,
        offset=offset,
        totals=True,  # Needs to have totals_mode=after_having_exclusive so we get groups matching HAVING only
        turbo=get_sample,  # Turn off FINAL when in sampling mode
        sample=1,  # Don't use clickhouse sampling, even when in turbo mode.
        condition_resolver=snuba.get_snuba_column_name,
    )
    rows = snuba_results["data"]
    total = snuba_results["totals"]["total"]

    if not get_sample:
        metrics.timing("snuba.search.num_result_groups", len(rows))

    return [(row["group_id"], row[sort_field]) for row in rows], total
def transform_aliases_and_query(**kwargs):
    """
    Convert aliases in selected_columns, groupby, aggregation, conditions,
    orderby and arrayjoin fields to their internal Snuba format and post the
    query to Snuba. Convert back translated aliases before returning snuba
    results.

    :deprecated: This method is deprecated. You should use sentry.snuba.discover instead.
    """
    arrayjoin_map = {"error": "exception_stacks", "stack": "exception_frames"}
    translated_columns = {}
    derived_columns = set()

    selected_columns = kwargs.get("selected_columns")
    groupby = kwargs.get("groupby")
    aggregations = kwargs.get("aggregations")
    conditions = kwargs.get("conditions")
    filter_keys = kwargs["filter_keys"]
    arrayjoin = kwargs.get("arrayjoin")
    orderby = kwargs.get("orderby")
    having = kwargs.get("having", [])
    dataset = Dataset.Events

    if selected_columns:
        for (idx, col) in enumerate(selected_columns):
            if isinstance(col, list):
                # if list, means there are potentially nested functions and need to
                # iterate and translate potential columns
                parse_columns_in_functions(col)
                selected_columns[idx] = col
                translated_columns[col[2]] = col[2]
                derived_columns.add(col[2])
            else:
                name = get_snuba_column_name(col)
                selected_columns[idx] = name
                translated_columns[name] = col

    if groupby:
        for (idx, col) in enumerate(groupby):
            if col not in derived_columns:
                name = get_snuba_column_name(col)
            else:
                name = col

            groupby[idx] = name
            translated_columns[name] = col

    for aggregation in aggregations or []:
        derived_columns.add(aggregation[2])
        if isinstance(aggregation[1], str):
            aggregation[1] = get_snuba_column_name(aggregation[1])
        elif isinstance(aggregation[1], (set, tuple, list)):
            aggregation[1] = [get_snuba_column_name(col) for col in aggregation[1]]

    for col in list(filter_keys.keys()):
        name = get_snuba_column_name(col)
        filter_keys[name] = filter_keys.pop(col)

    if conditions:
        aliased_conditions = []
        for condition in conditions:
            field = condition[0]
            if not isinstance(field, (list, tuple)) and field in derived_columns:
                having.append(condition)
            else:
                aliased_conditions.append(condition)
        kwargs["conditions"] = aliased_conditions

    if having:
        kwargs["having"] = having

    if orderby:
        orderby = orderby if isinstance(orderby, (list, tuple)) else [orderby]
        translated_orderby = []

        for field_with_order in orderby:
            field = field_with_order.lstrip("-")
            translated_orderby.append(
                "{}{}".format(
                    "-" if field_with_order.startswith("-") else "",
                    field if field in derived_columns else get_snuba_column_name(field),
                )
            )

        kwargs["orderby"] = translated_orderby

    kwargs["arrayjoin"] = arrayjoin_map.get(arrayjoin, arrayjoin)
    kwargs["dataset"] = dataset

    result = aliased_query(**kwargs)

    snuba_filter = eventstore.Filter(
        rollup=kwargs.get("rollup"),
        start=kwargs.get("start"),
        end=kwargs.get("end"),
        orderby=kwargs.get("orderby"),
    )
    return transform_data(result, translated_columns, snuba_filter)
def __get_events(
    self,
    filter,  # NOQA
    orderby=None,
    limit=DEFAULT_LIMIT,
    offset=DEFAULT_OFFSET,
    referrer=None,
    should_bind_nodes=False,
):
    assert filter, "You must provide a filter"  # NOQA
    cols = self.__get_columns()
    orderby = orderby or DESC_ORDERING

    # This is an optimization for the Group.filter_by_event_id query where we
    # have a single event ID and want to check all accessible projects for a
    # direct hit. In this case it's usually faster to go to nodestore first.
    if (
        filter.event_ids
        and filter.project_ids
        and len(filter.event_ids) * len(filter.project_ids) < min(limit, NODESTORE_LIMIT)
        and offset == 0
        and should_bind_nodes
    ):
        event_list = [
            Event(project_id=project_id, event_id=event_id)
            for event_id in filter.event_ids
            for project_id in filter.project_ids
        ]
        self.bind_nodes(event_list)

        nodestore_events = [event for event in event_list if len(event.data)]

        if nodestore_events:
            event_ids = {event.event_id for event in nodestore_events}
            project_ids = {event.project_id for event in nodestore_events}
            start = min(event.datetime for event in nodestore_events)
            end = max(event.datetime for event in nodestore_events) + timedelta(seconds=1)

            result = snuba.aliased_query(
                selected_columns=cols,
                start=start,
                end=end,
                conditions=filter.conditions,
                filter_keys={"project_id": project_ids, "event_id": event_ids},
                orderby=orderby,
                limit=len(nodestore_events),
                offset=DEFAULT_OFFSET,
                referrer=referrer,
                dataset=snuba.Dataset.Events,
            )

            if "error" not in result:
                events = [self.__make_event(evt) for evt in result["data"]]

                # Bind previously fetched node data
                nodestore_dict = {
                    (e.event_id, e.project_id): e.data.data for e in nodestore_events
                }
                for event in events:
                    node_data = nodestore_dict[(event.event_id, event.project_id)]
                    event.data.bind_data(node_data)
                return events

        return []

    result = snuba.aliased_query(
        selected_columns=cols,
        start=filter.start,
        end=filter.end,
        conditions=filter.conditions,
        filter_keys=filter.filter_keys,
        orderby=orderby,
        limit=limit,
        offset=offset,
        referrer=referrer,
        dataset=snuba.Dataset.Events,
    )

    if "error" not in result:
        events = [self.__make_event(evt) for evt in result["data"]]
        if should_bind_nodes:
            self.bind_nodes(events)
        return events

    return []
def snuba_search(
    self,
    start: datetime,
    end: datetime,
    project_ids: Sequence[int],
    environment_ids: Sequence[int],
    sort_field: str,
    organization_id: int,
    cursor: Optional[Cursor] = None,
    group_ids: Optional[Sequence[int]] = None,
    limit: Optional[int] = None,
    offset: int = 0,
    get_sample: bool = False,
    search_filters: Optional[Sequence[SearchFilter]] = None,
) -> Tuple[List[Tuple[int, Any]], int]:
    """
    Returns a tuple of:
    * a sorted list of (group_id, group_score) tuples sorted descending by score,
    * the count of total results (rows) available for this query.
    """
    filters = {"project_id": project_ids}

    environments = None
    if environment_ids is not None:
        filters["environment"] = environment_ids
        environments = list(
            Environment.objects.filter(
                organization_id=organization_id, id__in=environment_ids
            ).values_list("name", flat=True)
        )

    if group_ids:
        filters["group_id"] = sorted(group_ids)

    conditions = []
    having = []
    for search_filter in search_filters:
        if (
            # Don't filter on postgres fields here, they're not available
            search_filter.key.name in self.postgres_only_fields
            # We special case date
            or search_filter.key.name == "date"
        ):
            continue
        converted_filter = convert_search_filter_to_snuba_query(
            search_filter,
            params={
                "organization_id": organization_id,
                "project_id": project_ids,
                "environment": environments,
            },
        )
        converted_filter = self._transform_converted_filter(
            search_filter, converted_filter, project_ids, environment_ids
        )
        if converted_filter is not None:
            # Ensure that no user-generated tag that clashes with aggregation_defs is added to having
            if search_filter.key.name in self.aggregation_defs and not search_filter.key.is_tag:
                having.append(converted_filter)
            else:
                conditions.append(converted_filter)

    extra_aggregations = self.dependency_aggregations.get(sort_field, [])
    required_aggregations = set([sort_field, "total"] + extra_aggregations)
    for h in having:
        alias = h[0]
        required_aggregations.add(alias)

    aggregations = []
    for alias in required_aggregations:
        aggregation = self.aggregation_defs[alias]
        if callable(aggregation):
            # TODO: If we want to expand this pattern we should probably figure out
            # more generic things to pass here.
            aggregation = aggregation(start, end)
        aggregations.append(aggregation + [alias])

    if cursor is not None:
        having.append((sort_field, ">=" if cursor.is_prev else "<=", cursor.value))

    selected_columns = []
    if get_sample:
        query_hash = md5(json.dumps(conditions).encode("utf-8")).hexdigest()[:8]
        selected_columns.append(["cityHash64", [f"'{query_hash}'", "group_id"], "sample"])
        sort_field = "sample"
        orderby = [sort_field]
        referrer = "search_sample"
    else:
        # Get the top matching groups by score, i.e. the actual search results
        # in the order that we want them.
        orderby = [
            f"-{sort_field}",
            "group_id",  # ensure stable sort within the same score
        ]
        referrer = "search"

    snuba_results = snuba.aliased_query(
        dataset=self.dataset,
        start=start,
        end=end,
        selected_columns=selected_columns,
        groupby=["group_id"],
        conditions=conditions,
        having=having,
        filter_keys=filters,
        aggregations=aggregations,
        orderby=orderby,
        referrer=referrer,
        limit=limit,
        offset=offset,
        totals=True,  # Needs to have totals_mode=after_having_exclusive so we get groups matching HAVING only
        turbo=get_sample,  # Turn off FINAL when in sampling mode
        sample=1,  # Don't use clickhouse sampling, even when in turbo mode.
        condition_resolver=snuba.get_snuba_column_name,
    )
    rows = snuba_results["data"]
    total = snuba_results["totals"]["total"]

    if not get_sample:
        metrics.timing("snuba.search.num_result_groups", len(rows))

    return [(row["group_id"], row[sort_field]) for row in rows], total
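A hedged sketch of how a caller might consume `snuba_search` above: the method name and keyword arguments come from the snippet, but `backend`, the id values, the window, and the sort field value are assumptions for illustration. The second element of the returned tuple is the total row count, which paging logic can compare against the page size.

# Assumed: `backend` is an instance of the search backend defined above.
from datetime import datetime, timedelta

end = datetime.utcnow()
start = end - timedelta(days=90)
results, total = backend.snuba_search(
    start=start,
    end=end,
    project_ids=[project_id],
    environment_ids=None,
    sort_field="date",  # assumed to be one of the keys in backend.aggregation_defs
    organization_id=organization_id,
    search_filters=[],  # must be an iterable; the loop above does not handle None
    limit=100,
    offset=0,
)
group_ids = [group_id for group_id, _score in results]  # already sorted by score
has_more = total > len(results)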