def test_merge_with_event_integrity(self):
    """Merging a group into another moves its events to the destination
    group while leaving each event's stored payload (`data["extra"]`)
    intact.
    """
    project = self.create_project()

    # Two events with distinct fingerprints, so each creates its own group.
    event1 = self.store_event(
        data={
            "event_id": "a" * 32,
            "timestamp": iso_format(before_now(seconds=1)),
            "fingerprint": ["group-1"],
            "extra": {"foo": "bar"},
        },
        project_id=project.id,
    )
    group1 = event1.group
    event2 = self.store_event(
        data={
            "event_id": "b" * 32,
            "timestamp": iso_format(before_now(seconds=1)),
            "fingerprint": ["group-2"],
            "extra": {"foo": "baz"},
        },
        project_id=project.id,
    )
    group2 = event2.group

    # Merge group1 into group2, bracketed by the eventstream start/end
    # signals, with tasks executed eagerly inside self.tasks().
    with self.tasks():
        eventstream_state = eventstream.start_merge(project.id, [group1.id], group2.id)
        merge_groups([group1.id], group2.id)
        eventstream.end_merge(eventstream_state)

    # The source group is deleted by the merge.
    assert not Group.objects.filter(id=group1.id).exists()

    # Both events now belong to the destination group and retain their
    # original node data.
    event1 = eventstore.get_event_by_id(project.id, event1.event_id)
    assert event1.group_id == group2.id
    Event.objects.bind_nodes([event1], "data")
    assert event1.data["extra"]["foo"] == "bar"

    event2 = eventstore.get_event_by_id(project.id, event2.event_id)
    assert event2.group_id == group2.id
    Event.objects.bind_nodes([event2], "data")
    assert event2.data["extra"]["foo"] == "baz"
def test_unmerge(self):
    """End-to-end test of merging a group and then unmerging one
    fingerprint back out into a new destination group.

    Creates 17 events across three fingerprints (group1/group2/group3),
    merges the first group into the second, then unmerges the first
    fingerprint into a fresh destination group, and asserts that counts,
    user reports, group hashes, releases, tag values, tsdb series, and
    similarity features all end up attributed to the correct group.
    """
    # Fixed, microsecond-free base time so timestamp assertions are exact.
    now = before_now(minutes=5).replace(microsecond=0, tzinfo=pytz.utc)

    def time_from_now(offset=0):
        # Helper for expected first_seen/last_seen values.
        return now + timedelta(seconds=offset)

    project = self.create_project()

    # Deterministic generators shared by all created events: a global event
    # sequence number, cycling tag values, and two alternating users.
    sequence = itertools.count(0)
    tag_values = itertools.cycle(["red", "green", "blue"])
    user_values = itertools.cycle([{"id": 1}, {"id": 2}])

    def create_message_event(template, parameters, environment, release, fingerprint="group1"):
        # Stores one event (plus a UserReport and a similarity-features
        # record) with a deterministic event_id derived from the sequence.
        i = next(sequence)
        event_id = uuid.UUID(fields=(i, 0x0, 0x1000, 0x80, 0x80, 0x808080808080)).hex
        tags = [["color", next(tag_values)]]
        if release:
            tags.append(["sentry:release", release])
        event = self.store_event(
            data={
                "event_id": event_id,
                "message": template % parameters,
                "type": "default",
                "user": next(user_values),
                "tags": tags,
                "fingerprint": [fingerprint],
                "timestamp": iso_format(now + timedelta(seconds=i)),
                "environment": environment,
                "release": release,
            },
            project_id=project.id,
        )
        UserReport.objects.create(
            project_id=project.id,
            group_id=event.group.id,
            event_id=event_id,
            name="Log Hat",
            email="*****@*****.**",
            comments="Quack",
        )
        features.record([event])
        return event

    # `events` maps fingerprint hash -> list of events, in insertion order:
    # index 0 = group1 (10 events), 1 = group2 (6 events), 2 = group3 (1).
    events = OrderedDict()

    for event in (
        create_message_event(
            "This is message #%s.", i, environment="production", release="version"
        )
        for i in xrange(10)
    ):
        events.setdefault(get_fingerprint(event), []).append(event)

    for event in (
        create_message_event(
            "This is message #%s!",
            i,
            environment="production",
            release="version2",
            fingerprint="group2",
        )
        for i in xrange(10, 16)
    ):
        events.setdefault(get_fingerprint(event), []).append(event)

    # Sequence value 16 is consumed here even though the message says 17;
    # this single event is the only one in the "staging" environment.
    event = create_message_event(
        "This is message #%s!", 17, environment="staging", release="version3",
        fingerprint="group3",
    )
    events.setdefault(get_fingerprint(event), []).append(event)

    merge_source, source, destination = list(Group.objects.all())

    assert len(events) == 3
    assert sum(map(len, events.values())) == 17

    production_environment = Environment.objects.get(
        organization_id=project.organization_id, name="production"
    )

    # Merge group1 (merge_source) into group2 (source).
    with self.tasks():
        eventstream_state = eventstream.start_merge(project.id, [merge_source.id], source.id)
        merge_groups.delay([merge_source.id], source.id)
        eventstream.end_merge(eventstream_state)

    # After the merge, the combined group carries all 16 production events'
    # color tags (colors cycle red/green/blue starting at red).
    assert set(
        [
            (gtv.value, gtv.times_seen)
            for gtv in tagstore.get_group_tag_values(
                project.id, source.id, production_environment.id, "color"
            )
        ]
    ) == set([("red", 6), ("green", 5), ("blue", 5)])

    similar_items = features.compare(source)
    assert len(similar_items) == 2
    assert similar_items[0][0] == source.id
    assert similar_items[0][1]["message:message:character-shingles"] == 1.0
    assert similar_items[1][0] == destination.id
    assert similar_items[1][1]["message:message:character-shingles"] < 1.0

    # Unmerge the first fingerprint (group1's hash) out of `source` into
    # `destination`, in batches of 5.
    with self.tasks():
        eventstream_state = eventstream.start_unmerge(
            project.id, [list(events.keys())[0]], source.id, destination.id
        )
        unmerge.delay(
            project.id, source.id, destination.id, [list(events.keys())[0]], None, batch_size=5
        )
        eventstream.end_unmerge(eventstream_state)

    # The originally-merged group no longer exists.
    assert (
        list(
            Group.objects.filter(id=merge_source.id).values_list(
                "times_seen", "first_seen", "last_seen"
            )
        )
        == []
    )

    # `source` keeps group2's 6 events (sequence 10..15); `destination`
    # holds group1's 10 events (0..9) plus the staging event (16).
    assert list(
        Group.objects.filter(id=source.id).values_list("times_seen", "first_seen", "last_seen")
    ) == [(6, time_from_now(10), time_from_now(15))]

    assert list(
        Group.objects.filter(id=destination.id).values_list(
            "times_seen", "first_seen", "last_seen"
        )
    ) == [(11, time_from_now(0), time_from_now(16))]

    assert source.id != destination.id
    assert source.project == destination.project

    # NOTE: despite the variable name, these are the event ids expected to
    # remain attached to `source` (the group2 events).
    destination_event_ids = map(lambda event: event.event_id, list(events.values())[1])

    assert set(
        UserReport.objects.filter(group_id=source.id).values_list("event_id", flat=True)
    ) == set(destination_event_ids)

    assert set(
        GroupHash.objects.filter(group_id=source.id).values_list("hash", flat=True)
    ) == set(itertools.islice(events.keys(), 2))

    assert set(
        GroupRelease.objects.filter(group_id=source.id).values_list(
            "environment", "first_seen", "last_seen"
        )
    ) == set([(u"production", time_from_now(10), time_from_now(15))])

    assert set(
        [
            (gtv.value, gtv.times_seen)
            for gtv in tagstore.get_group_tag_values(
                project.id, destination.id, production_environment.id, "color"
            )
        ]
    ) == set([(u"red", 4), (u"green", 3), (u"blue", 3)])

    destination_event_ids = map(
        lambda event: event.event_id, list(events.values())[0] + list(events.values())[2]
    )

    assert set(
        UserReport.objects.filter(group_id=destination.id).values_list("event_id", flat=True)
    ) == set(destination_event_ids)

    assert set(
        GroupHash.objects.filter(group_id=destination.id).values_list("hash", flat=True)
    ) == set(itertools.islice(events.keys(), 2, 3))

    assert set(
        GroupRelease.objects.filter(group_id=destination.id).values_list(
            "environment", "first_seen", "last_seen"
        )
    ) == set(
        [
            ("production", time_from_now(0), time_from_now(9)),
            ("staging", time_from_now(16), time_from_now(16)),
        ]
    )

    assert set(
        [
            (gtk.value, gtk.times_seen)
            for gtk in tagstore.get_group_tag_values(
                project.id, destination.id, production_environment.id, "color"
            )
        ]
    ) == set([("red", 4), ("blue", 3), ("green", 3)])

    rollup_duration = 3600

    time_series = tsdb.get_range(
        tsdb.models.group,
        [source.id, destination.id],
        now - timedelta(seconds=rollup_duration),
        time_from_now(17),
        rollup_duration,
    )

    environment_time_series = tsdb.get_range(
        tsdb.models.group,
        [source.id, destination.id],
        now - timedelta(seconds=rollup_duration),
        time_from_now(17),
        rollup_duration,
        environment_ids=[production_environment.id],
    )

    def get_expected_series_values(rollup, events, function=None):
        # Fold `events` into {rollup_bucket_timestamp: aggregate} using
        # `function`; defaults to a simple per-bucket count.
        # NOTE(review): buckets use the closed-over `rollup_duration`, not
        # the `rollup` parameter — equivalent here since callers always
        # pass rollup_duration.
        if function is None:

            def function(aggregate, event):
                return (aggregate if aggregate is not None else 0) + 1

        expected = {}
        for event in events:
            k = float((to_timestamp(event.datetime) // rollup_duration) * rollup_duration)
            expected[k] = function(expected.get(k), event)

        return expected

    def assert_series_contains(expected, actual, default=0):
        # Every expected bucket matches, and any extra buckets in `actual`
        # hold only the default value.
        actual = dict(actual)

        for key, value in expected.items():
            assert actual.get(key, 0) == value

        for key in set(actual.keys()) - set(expected.keys()):
            assert actual.get(key, 0) == default

    assert_series_contains(
        get_expected_series_values(rollup_duration, list(events.values())[1]),
        time_series[source.id],
        0,
    )

    assert_series_contains(
        get_expected_series_values(
            rollup_duration, list(events.values())[0] + list(events.values())[2]
        ),
        time_series[destination.id],
        0,
    )

    assert_series_contains(
        get_expected_series_values(rollup_duration, list(events.values())[1]),
        environment_time_series[source.id],
        0,
    )

    # The environment-filtered series excludes the staging event; [:-1]
    # drops the last group1 event (presumably to account for bucketing of
    # the environment-scoped rollup — TODO confirm).
    assert_series_contains(
        get_expected_series_values(
            rollup_duration, list(events.values())[0][:-1] + list(events.values())[2]
        ),
        environment_time_series[destination.id],
        0,
    )

    time_series = tsdb.get_distinct_counts_series(
        tsdb.models.users_affected_by_group,
        [source.id, destination.id],
        now - timedelta(seconds=rollup_duration),
        time_from_now(17),
        rollup_duration,
    )

    environment_time_series = tsdb.get_distinct_counts_series(
        tsdb.models.users_affected_by_group,
        [source.id, destination.id],
        now - timedelta(seconds=rollup_duration),
        time_from_now(17),
        rollup_duration,
        environment_id=production_environment.id,
    )

    def collect_by_user_tag(aggregate, event):
        # Accumulates the set of distinct user tag values per bucket.
        aggregate = aggregate if aggregate is not None else set()
        aggregate.add(get_event_user_from_interface(event.data["user"]).tag_value)
        return aggregate

    for series in [time_series, environment_time_series]:
        assert_series_contains(
            {
                timestamp: len(values)
                for timestamp, values in get_expected_series_values(
                    rollup_duration, list(events.values())[1], collect_by_user_tag
                ).items()
            },
            series[source.id],
        )

        assert_series_contains(
            {
                timestamp: len(values)
                for timestamp, values in get_expected_series_values(
                    rollup_duration,
                    list(events.values())[0] + list(events.values())[2],
                    collect_by_user_tag,
                ).items()
            },
            time_series[destination.id],
        )

    def strip_zeroes(data):
        # Removes zero-valued entries from frequency-series buckets in
        # place so they compare equal to the expected sparse dicts.
        for group_id, series in data.items():
            for _, values in series:
                for key, val in list(values.items()):
                    if val == 0:
                        values.pop(key)

        return data

    def collect_by_release(group, aggregate, event):
        # Accumulates per-bucket counts keyed by GroupRelease id for the
        # given group.
        aggregate = aggregate if aggregate is not None else {}
        release = event.get_tag("sentry:release")
        if not release:
            return aggregate
        release = GroupRelease.objects.get(
            group_id=group.id,
            environment=event.data["environment"],
            release_id=Release.objects.get(
                organization_id=project.organization_id, version=release
            ).id,
        ).id
        aggregate[release] = aggregate.get(release, 0) + 1
        return aggregate

    items = {}
    for i in [source.id, destination.id]:
        items[i] = list(GroupRelease.objects.filter(group_id=i).values_list("id", flat=True))

    time_series = strip_zeroes(
        tsdb.get_frequency_series(
            tsdb.models.frequent_releases_by_group,
            items,
            now - timedelta(seconds=rollup_duration),
            time_from_now(17),
            rollup_duration,
        )
    )

    assert_series_contains(
        get_expected_series_values(
            rollup_duration,
            list(events.values())[1],
            functools.partial(collect_by_release, source),
        ),
        time_series[source.id],
        {},
    )

    assert_series_contains(
        get_expected_series_values(
            rollup_duration,
            list(events.values())[0] + list(events.values())[2],
            functools.partial(collect_by_release, destination),
        ),
        time_series[destination.id],
        {},
    )

    items = {}
    for i in [source.id, destination.id]:
        items[i] = list(Environment.objects.all().values_list("id", flat=True))

    time_series = strip_zeroes(
        tsdb.get_frequency_series(
            tsdb.models.frequent_environments_by_group,
            items,
            now - timedelta(seconds=rollup_duration),
            time_from_now(17),
            rollup_duration,
        )
    )

    def collect_by_environment(aggregate, event):
        # Accumulates per-bucket counts keyed by Environment id.
        aggregate = aggregate if aggregate is not None else {}
        environment = Environment.objects.get(
            organization_id=project.organization_id, name=event.data["environment"]
        ).id
        aggregate[environment] = aggregate.get(environment, 0) + 1
        return aggregate

    assert_series_contains(
        get_expected_series_values(
            rollup_duration, list(events.values())[1], collect_by_environment
        ),
        time_series[source.id],
        {},
    )

    assert_series_contains(
        get_expected_series_values(
            rollup_duration,
            list(events.values())[0] + list(events.values())[2],
            collect_by_environment,
        ),
        time_series[destination.id],
        {},
    )

    # After unmerge, each group is most similar to itself (score 1.0 for
    # message shingles, None for the exception-based features since no
    # events carry exceptions) and less similar to the other group.
    source_similar_items = features.compare(source)
    assert source_similar_items[0] == (
        source.id,
        {
            "exception:message:character-shingles": None,
            "exception:stacktrace:application-chunks": None,
            "exception:stacktrace:pairs": None,
            "message:message:character-shingles": 1.0,
        },
    )
    assert source_similar_items[1][0] == destination.id
    assert source_similar_items[1][1]["message:message:character-shingles"] < 1.0

    destination_similar_items = features.compare(destination)
    assert destination_similar_items[0] == (
        destination.id,
        {
            "exception:message:character-shingles": None,
            "exception:stacktrace:application-chunks": None,
            "exception:stacktrace:pairs": None,
            "message:message:character-shingles": 1.0,
        },
    )
    assert destination_similar_items[1][0] == source.id
    assert destination_similar_items[1][1]["message:message:character-shingles"] < 1.0
def merge_groups(
    from_object_ids=None,
    to_object_id=None,
    transaction_id=None,
    recursed=False,
    eventstream_state=None,
    **kwargs
):
    """Merge one or more "from" groups into the group ``to_object_id``.

    Processes a single "from" group per invocation, then re-queues itself
    with ``recursed=True`` until ``from_object_ids`` is empty; on the final
    iteration it signals ``eventstream.end_merge``. Note that
    ``from_object_ids`` is mutated in place (ids are removed as they finish).

    Returns False on malformed input, otherwise None.
    """
    # TODO(mattrobenolt): Write tests for all of this
    from sentry.models import (
        Activity,
        Group,
        GroupAssignee,
        GroupEnvironment,
        GroupHash,
        GroupRuleStatus,
        GroupSubscription,
        Environment,
        EventAttachment,
        UserReport,
        GroupRedirect,
        GroupMeta,
        get_group_with_redirect,
    )

    if not (from_object_ids and to_object_id):
        logger.error("group.malformed.missing_params", extra={"transaction_id": transaction_id})
        return False

    # Operate on one "from" group per task iteration. The task is recursed
    # until each group has been merged.
    from_object_id = from_object_ids[0]

    try:
        # Follow redirects in case the destination itself was already merged.
        new_group, _ = get_group_with_redirect(to_object_id)
    except Group.DoesNotExist:
        logger.warn(
            "group.malformed.invalid_id",
            extra={"transaction_id": transaction_id, "old_object_ids": from_object_ids},
        )
        return False

    if not recursed:
        # Log only once, on the initial (non-recursed) invocation.
        logger.info(
            "merge.queued",
            extra={
                "transaction_id": transaction_id,
                "new_group_id": new_group.id,
                "old_group_ids": from_object_ids,
                # TODO(jtcunning): figure out why these are full seq scans and/or alternative solution
                # 'new_event_id': getattr(new_group.event_set.order_by('-id').first(), 'id', None),
                # 'old_event_id': getattr(group.event_set.order_by('-id').first(), 'id', None),
                # 'new_hash_id': getattr(new_group.grouphash_set.order_by('-id').first(), 'id', None),
                # 'old_hash_id': getattr(group.grouphash_set.order_by('-id').first(), 'id', None),
            },
        )

    try:
        group = Group.objects.select_related("project").get(id=from_object_id)
    except Group.DoesNotExist:
        # Skip the invalid id but keep going with the rest of the list.
        from_object_ids.remove(from_object_id)
        logger.warn(
            "group.malformed.invalid_id",
            extra={"transaction_id": transaction_id, "old_object_id": from_object_id},
        )
    else:
        model_list = tuple(EXTRA_MERGE_MODELS) + (
            Activity,
            GroupAssignee,
            GroupEnvironment,
            GroupHash,
            GroupRuleStatus,
            GroupSubscription,
            EventAttachment,
            UserReport,
            GroupRedirect,
            GroupMeta,
        )

        # merge_objects migrates a bounded batch of rows; has_more=True
        # means this same "from" group still has rows left to move.
        has_more = merge_objects(
            model_list, group, new_group, logger=logger, transaction_id=transaction_id
        )

        if not has_more:
            # There are no more objects to merge for *this* "from" group, remove it
            # from the list of "from" groups that are being merged, and finish the
            # work for this group.
            from_object_ids.remove(from_object_id)

            similarity.merge(group.project, new_group, [group], allow_unsafe=True)

            environment_ids = list(
                Environment.objects.filter(projects=group.project).values_list("id", flat=True)
            )

            # Fold the old group's tsdb data (counts, distinct counts,
            # frequencies) into the destination group.
            for model in [tsdb.models.group]:
                tsdb.merge(
                    model,
                    new_group.id,
                    [group.id],
                    environment_ids=environment_ids
                    if model in tsdb.models_with_environment_support
                    else None,
                )

            for model in [tsdb.models.users_affected_by_group]:
                tsdb.merge_distinct_counts(
                    model,
                    new_group.id,
                    [group.id],
                    environment_ids=environment_ids
                    if model in tsdb.models_with_environment_support
                    else None,
                )

            for model in [
                tsdb.models.frequent_releases_by_group,
                tsdb.models.frequent_environments_by_group,
            ]:
                tsdb.merge_frequencies(
                    model,
                    new_group.id,
                    [group.id],
                    environment_ids=environment_ids
                    if model in tsdb.models_with_environment_support
                    else None,
                )

            previous_group_id = group.id

            # Create the redirect and delete the old group atomically so a
            # failure can't leave the group gone without a redirect.
            with transaction.atomic():
                GroupRedirect.create_for_group(group, new_group)
                group.delete()
            delete_logger.info(
                "object.delete.executed",
                extra={
                    "object_id": previous_group_id,
                    "transaction_id": transaction_id,
                    "model": Group.__name__,
                },
            )

            new_group.update(
                # TODO(dcramer): ideally these would be SQL clauses
                first_seen=min(group.first_seen, new_group.first_seen),
                last_seen=max(group.last_seen, new_group.last_seen),
            )
            try:
                # it's possible to hit an out of range value for counters
                new_group.update(
                    times_seen=F("times_seen") + group.times_seen,
                    num_comments=F("num_comments") + group.num_comments,
                )
            except DataError:
                pass

    if from_object_ids:
        # This task is recursed until `from_object_ids` is empty and all
        # "from" groups have merged into the `to_group_id`.
        merge_groups.delay(
            from_object_ids=from_object_ids,
            to_object_id=to_object_id,
            transaction_id=transaction_id,
            recursed=True,
            eventstream_state=eventstream_state,
        )
    elif eventstream_state:
        # All `from_object_ids` have been merged!
        eventstream.end_merge(eventstream_state)
def merge_group(
    from_object_id=None, to_object_id=None, transaction_id=None,
    recursed=False, eventstream_state=None, **kwargs
):
    """Merge a single "from" group into the "to" group.

    Re-queues itself with ``recursed=True`` while ``merge_objects`` reports
    more rows to migrate; on the final pass it merges tsdb/feature data,
    deletes the old group, records a redirect, and signals
    ``eventstream.end_merge``.

    NOTE(review): a second definition of ``merge_group`` appears later in
    this file and shadows this one at import time.
    """
    # TODO(mattrobenolt): Write tests for all of this
    from sentry.models import (
        Activity, Group, GroupAssignee, GroupEnvironment, GroupHash,
        GroupRuleStatus, GroupSubscription, Environment, EventMapping,
        Event, UserReport, GroupRedirect, GroupMeta,
    )

    if not (from_object_id and to_object_id):
        logger.error(
            'group.malformed.missing_params',
            extra={
                'transaction_id': transaction_id,
            }
        )
        return

    try:
        group = Group.objects.get(id=from_object_id)
    except Group.DoesNotExist:
        logger.warn(
            'group.malformed.invalid_id',
            extra={
                'transaction_id': transaction_id,
                'old_object_id': from_object_id,
            }
        )
        return

    try:
        new_group = Group.objects.get(id=to_object_id)
    except Group.DoesNotExist:
        # NOTE(review): logs the *source* id even though the destination
        # lookup failed — the extra payload may be misleading.
        logger.warn(
            'group.malformed.invalid_id',
            extra={
                'transaction_id': transaction_id,
                'old_object_id': from_object_id,
            }
        )
        return

    if not recursed:
        # Log only once, on the initial (non-recursed) invocation.
        logger.info(
            'merge.queued',
            extra={
                'transaction_id': transaction_id,
                'new_group_id': new_group.id,
                'old_group_id': group.id,
                # TODO(jtcunning): figure out why these are full seq scans and/or alternative solution
                # 'new_event_id': getattr(new_group.event_set.order_by('-id').first(), 'id', None),
                # 'old_event_id': getattr(group.event_set.order_by('-id').first(), 'id', None),
                # 'new_hash_id': getattr(new_group.grouphash_set.order_by('-id').first(), 'id', None),
                # 'old_hash_id': getattr(group.grouphash_set.order_by('-id').first(), 'id', None),
            }
        )

    model_list = tuple(EXTRA_MERGE_MODELS) + (
        Activity, GroupAssignee, GroupEnvironment, GroupHash, GroupRuleStatus,
        GroupSubscription, EventMapping, Event, UserReport, GroupRedirect,
        GroupMeta,
    )

    # Migrates a bounded batch of rows; True means more remain for this pair.
    has_more = merge_objects(
        model_list,
        group,
        new_group,
        logger=logger,
        transaction_id=transaction_id,
    )

    if has_more:
        # Not finished yet: requeue and bail before any destructive steps.
        merge_group.delay(
            from_object_id=from_object_id,
            to_object_id=to_object_id,
            transaction_id=transaction_id,
            recursed=True,
            eventstream_state=eventstream_state,
        )
        return

    features.merge(new_group, [group], allow_unsafe=True)

    environment_ids = list(
        Environment.objects.filter(
            projects=group.project
        ).values_list('id', flat=True)
    )

    # Fold the old group's tsdb data (counts, distinct counts, frequencies)
    # into the destination group.
    for model in [tsdb.models.group]:
        tsdb.merge(
            model,
            new_group.id,
            [group.id],
            environment_ids=environment_ids
            if model in tsdb.models_with_environment_support else None
        )

    for model in [tsdb.models.users_affected_by_group]:
        tsdb.merge_distinct_counts(
            model,
            new_group.id,
            [group.id],
            environment_ids=environment_ids
            if model in tsdb.models_with_environment_support else None,
        )

    for model in [
        tsdb.models.frequent_releases_by_group,
        tsdb.models.frequent_environments_by_group
    ]:
        tsdb.merge_frequencies(
            model,
            new_group.id,
            [group.id],
            environment_ids=environment_ids
            if model in tsdb.models_with_environment_support else None,
        )

    previous_group_id = group.id
    group.delete()
    delete_logger.info(
        'object.delete.executed',
        extra={
            'object_id': previous_group_id,
            'transaction_id': transaction_id,
            'model': Group.__name__,
        }
    )

    # A redirect may already exist (e.g. task retry) — tolerate the
    # duplicate rather than failing the merge.
    try:
        with transaction.atomic():
            GroupRedirect.objects.create(
                group_id=new_group.id,
                previous_group_id=previous_group_id,
            )
    except IntegrityError:
        pass

    new_group.update(
        # TODO(dcramer): ideally these would be SQL clauses
        first_seen=min(group.first_seen, new_group.first_seen),
        last_seen=max(group.last_seen, new_group.last_seen),
    )
    try:
        # it's possible to hit an out of range value for counters
        new_group.update(
            times_seen=F('times_seen') + group.times_seen,
            num_comments=F('num_comments') + group.num_comments,
        )
    except DataError:
        pass

    if eventstream_state:
        eventstream.end_merge(eventstream_state)
def merge_group(
    from_object_id=None, to_object_id=None, transaction_id=None,
    recursed=False, eventstream_state=None, **kwargs
):
    """Merge a single "from" group into the "to" group.

    Re-queues itself with ``recursed=True`` while ``merge_objects`` reports
    more rows to migrate; on the final pass it merges tsdb/feature data,
    deletes the old group, records a redirect, and signals
    ``eventstream.end_merge``.

    NOTE(review): this re-definition shadows an earlier ``merge_group`` in
    this file. Fixes applied here: the redundant ``last_task`` flag was
    removed (the ``has_more`` branch returns early, so the flag was always
    True at the final check), and the deprecated ``logger.warn`` alias was
    replaced with ``logger.warning``.
    """
    # TODO(mattrobenolt): Write tests for all of this
    from sentry.models import (
        Activity, Group, GroupAssignee, GroupEnvironment, GroupHash,
        GroupRuleStatus, GroupSubscription, Environment, EventMapping,
        Event, UserReport, GroupRedirect, GroupMeta,
    )

    if not (from_object_id and to_object_id):
        logger.error(
            'group.malformed.missing_params',
            extra={
                'transaction_id': transaction_id,
            }
        )
        return

    try:
        group = Group.objects.get(id=from_object_id)
    except Group.DoesNotExist:
        logger.warning(
            'group.malformed.invalid_id',
            extra={
                'transaction_id': transaction_id,
                'old_object_id': from_object_id,
            }
        )
        return

    try:
        new_group = Group.objects.get(id=to_object_id)
    except Group.DoesNotExist:
        logger.warning(
            'group.malformed.invalid_id',
            extra={
                'transaction_id': transaction_id,
                'old_object_id': from_object_id,
            }
        )
        return

    if not recursed:
        # Log only once, on the initial (non-recursed) invocation.
        logger.info(
            'merge.queued',
            extra={
                'transaction_id': transaction_id,
                'new_group_id': new_group.id,
                'old_group_id': group.id,
                # TODO(jtcunning): figure out why these are full seq scans and/or alternative solution
                # 'new_event_id': getattr(new_group.event_set.order_by('-id').first(), 'id', None),
                # 'old_event_id': getattr(group.event_set.order_by('-id').first(), 'id', None),
                # 'new_hash_id': getattr(new_group.grouphash_set.order_by('-id').first(), 'id', None),
                # 'old_hash_id': getattr(group.grouphash_set.order_by('-id').first(), 'id', None),
            }
        )

    model_list = tuple(EXTRA_MERGE_MODELS) + (
        Activity, GroupAssignee, GroupEnvironment, GroupHash, GroupRuleStatus,
        GroupSubscription, EventMapping, Event, UserReport, GroupRedirect,
        GroupMeta,
    )

    # Migrates a bounded batch of rows; True means more remain for this pair.
    has_more = merge_objects(
        model_list,
        group,
        new_group,
        logger=logger,
        transaction_id=transaction_id,
    )

    if has_more:
        # Not finished yet: requeue and bail before any destructive steps.
        # Because of this early return, reaching the code below means this
        # is the final iteration — no separate "last task" flag is needed.
        merge_group.delay(
            from_object_id=from_object_id,
            to_object_id=to_object_id,
            transaction_id=transaction_id,
            recursed=True,
            eventstream_state=eventstream_state,
        )
        return

    features.merge(new_group, [group], allow_unsafe=True)

    environment_ids = list(
        Environment.objects.filter(
            projects=group.project
        ).values_list('id', flat=True)
    )

    # Fold the old group's tsdb data (counts, distinct counts, frequencies)
    # into the destination group.
    for model in [tsdb.models.group]:
        tsdb.merge(
            model,
            new_group.id,
            [group.id],
            environment_ids=environment_ids
            if model in tsdb.models_with_environment_support else None
        )

    for model in [tsdb.models.users_affected_by_group]:
        tsdb.merge_distinct_counts(
            model,
            new_group.id,
            [group.id],
            environment_ids=environment_ids
            if model in tsdb.models_with_environment_support else None,
        )

    for model in [
        tsdb.models.frequent_releases_by_group,
        tsdb.models.frequent_environments_by_group
    ]:
        tsdb.merge_frequencies(
            model,
            new_group.id,
            [group.id],
            environment_ids=environment_ids
            if model in tsdb.models_with_environment_support else None,
        )

    previous_group_id = group.id
    group.delete()
    delete_logger.info(
        'object.delete.executed',
        extra={
            'object_id': previous_group_id,
            'transaction_id': transaction_id,
            'model': Group.__name__,
        }
    )

    # A redirect may already exist (e.g. task retry) — tolerate the
    # duplicate rather than failing the merge.
    try:
        with transaction.atomic():
            GroupRedirect.objects.create(
                group_id=new_group.id,
                previous_group_id=previous_group_id,
            )
    except IntegrityError:
        pass

    new_group.update(
        # TODO(dcramer): ideally these would be SQL clauses
        first_seen=min(group.first_seen, new_group.first_seen),
        last_seen=max(group.last_seen, new_group.last_seen),
    )
    try:
        # it's possible to hit an out of range value for counters
        new_group.update(
            times_seen=F('times_seen') + group.times_seen,
            num_comments=F('num_comments') + group.num_comments,
        )
    except DataError:
        pass

    if eventstream_state:
        eventstream.end_merge(eventstream_state)