Example #1
    def save(self,
             project_id,
             raw=False,
             assume_normalized=False,
             cache_key=None):
        """
        After normalizing and processing an event, save adjacent models such as
        releases and environments to postgres and write the event into
        eventstream. From there it will be picked up by Snuba and
        post-processing.

        We re-insert events with duplicate IDs into Snuba, which is responsible
        for deduplicating events. Since deduplication in Snuba is on the primary
        key (based on event ID, project ID and day), events with the same ID are
        only deduplicated if their timestamps fall on the same day. The latest
        event always wins and overwrites the values of events received earlier
        that day.

        Since we increment counters and frequencies here before events get
        inserted into the eventstream, these numbers may be larger than the
        total number of events if we receive duplicate event IDs that fall on
        the same day (and do not hit the cache first).
        """

        # Normalize if needed
        if not self._normalized:
            if not assume_normalized:
                self.normalize()
            self._normalized = True

        with metrics.timer("event_manager.save.project.get_from_cache"):
            project = Project.objects.get_from_cache(id=project_id)

        with metrics.timer("event_manager.save.organization.get_from_cache"):
            project._organization_cache = Organization.objects.get_from_cache(
                id=project.organization_id)

        job = {"data": self._data, "project_id": project_id, "raw": raw}
        jobs = [job]
        projects = {project.id: project}

        _pull_out_data(jobs, projects)

        # Right now the event type is the signal to skip the group. This
        # is going to change a lot.
        if job["event"].get_event_type() == "transaction":
            issueless_event = True
        else:
            issueless_event = False

        _get_or_create_release_many(jobs, projects)

        # XXX: remove
        if job["dist"] and job["release"]:
            job["dist"] = job["release"].add_dist(job["dist"],
                                                  job["event"].datetime)
            # don't allow a conflicting 'dist' tag
            pop_tag(job["data"], "dist")
            set_tag(job["data"], "sentry:dist", job["dist"].name)
        else:
            job["dist"] = None

        _get_event_user_many(jobs, projects)

        with metrics.timer("event_manager.load_grouping_config"):
            # At this point we want to normalize the in_app values in case the
            # clients did not set this appropriately so far.
            grouping_config = load_grouping_config(
                get_grouping_config_dict_for_event_data(job["data"], project))

        with metrics.timer("event_manager.normalize_stacktraces_for_grouping"):
            normalize_stacktraces_for_grouping(job["data"], grouping_config)

        _derive_plugin_tags_many(jobs, projects)
        _derive_interface_tags_many(jobs)

        with metrics.timer("event_manager.apply_server_fingerprinting"):
            # The active grouping config was put into the event in the
            # normalize step before.  We now also make sure that the
            # fingerprint was set to '{{ default }}' just in case someone
            # removed it from the payload.  The call to get_hashes will then
            # look at `grouping_config` to pick the right parameters.
            job["data"]["fingerprint"] = job["data"].get("fingerprint") or [
                "{{ default }}"
            ]
            apply_server_fingerprinting(
                job["data"], get_fingerprinting_config_for_project(project))

        with metrics.timer("event_manager.event.get_hashes"):
            # Here we try to use the grouping config that was requested in the
            # event.  If that config has since been deleted (because it was an
            # experimental grouping config) we fall back to the default.
            try:
                hashes = job["event"].get_hashes()
            except GroupingConfigNotFound:
                job["data"][
                    "grouping_config"] = get_grouping_config_dict_for_project(
                        project)
                hashes = job["event"].get_hashes()

        job["data"]["hashes"] = hashes

        _materialize_metadata_many(jobs)

        job["received_timestamp"] = received_timestamp = job["event"].data.get(
            "received") or float(job["event"].datetime.strftime("%s"))

        if not issueless_event:
            # The group gets the same metadata as the event when it's flushed but
            # additionally the `last_received` key is set.  This key is used by
            # _save_aggregate.
            group_metadata = dict(job["materialized_metadata"])
            group_metadata["last_received"] = received_timestamp
            kwargs = {
                "platform": job["platform"],
                "message": job["event"].search_message,
                "culprit": job["culprit"],
                "logger": job["logger_name"],
                "level": LOG_LEVELS_MAP.get(job["level"]),
                "last_seen": job["event"].datetime,
                "first_seen": job["event"].datetime,
                "active_at": job["event"].datetime,
                "data": group_metadata,
            }

            if job["release"]:
                kwargs["first_release"] = job["release"]

            try:
                job["group"], job["is_new"], job[
                    "is_regression"] = _save_aggregate(event=job["event"],
                                                       hashes=hashes,
                                                       release=job["release"],
                                                       **kwargs)
            except HashDiscarded:
                event_discarded.send_robust(project=project,
                                            sender=EventManager)

                metrics.incr(
                    "events.discarded",
                    skip_internal=True,
                    tags={
                        "organization_id": project.organization_id,
                        "platform": job["platform"]
                    },
                )
                raise
            job["event"].group = job["group"]
        else:
            job["group"] = None
            job["is_new"] = False
            job["is_regression"] = False

        _send_event_saved_signal_many(jobs, projects)

        # store a reference to the group id to guarantee validation of isolation
        # XXX(markus): No clue what this does
        job["event"].data.bind_ref(job["event"])

        _get_or_create_environment_many(jobs, projects)

        if job["group"]:
            group_environment, job[
                "is_new_group_environment"] = GroupEnvironment.get_or_create(
                    group_id=job["group"].id,
                    environment_id=job["environment"].id,
                    defaults={"first_release": job["release"] or None},
                )
        else:
            job["is_new_group_environment"] = False

        _get_or_create_release_associated_models(jobs, projects)

        if job["release"] and job["group"]:
            job["grouprelease"] = GroupRelease.get_or_create(
                group=job["group"],
                release=job["release"],
                environment=job["environment"],
                datetime=job["event"].datetime,
            )

        _tsdb_record_all_metrics(jobs)

        if job["group"]:
            UserReport.objects.filter(project=project,
                                      event_id=job["event"].event_id).update(
                                          group=job["group"],
                                          environment=job["environment"])

        # Ensure the _metrics key exists. This is usually created during
        # ingestion and prefilled with ingestion sizes.
        event_metrics = job["event"].data.get("_metrics") or {}
        job["event"].data["_metrics"] = event_metrics

        # Capture the actual size that goes into node store.
        event_metrics["bytes.stored.event"] = len(
            json.dumps(dict(job["event"].data.items())))

        if not issueless_event:
            # Load attachments first, but persist them at the very end, after
            # posting to the eventstream, to ensure that all counters and
            # eventstream writes have happened.
            attachments = get_attachments(cache_key, job["event"])
            for attachment in attachments:
                key = "bytes.stored.%s" % (attachment.type, )
                event_metrics[key] = (event_metrics.get(key) or 0) + len(
                    attachment.data)

        _nodestore_save_many(jobs)

        if job["release"] and not issueless_event:
            if job["is_new"]:
                buffer.incr(
                    ReleaseProject,
                    {"new_groups": 1},
                    {
                        "release_id": job["release"].id,
                        "project_id": project.id
                    },
                )
            if job["is_new_group_environment"]:
                buffer.incr(
                    ReleaseProjectEnvironment,
                    {"new_issues_count": 1},
                    {
                        "project_id": project.id,
                        "release_id": job["release"].id,
                        "environment_id": job["environment"].id,
                    },
                )

        if not raw:
            if not project.first_event:
                project.update(first_event=job["event"].datetime)
                first_event_received.send_robust(project=project,
                                                 event=job["event"],
                                                 sender=Project)

        _eventstream_insert_many(jobs)

        if not issueless_event:
            # Do this last to ensure signals get emitted even if connection to the
            # file store breaks temporarily.
            save_attachments(attachments, job["event"])

        metric_tags = {"from_relay": "_relay_processed" in job["data"]}

        metrics.timing("events.latency",
                       received_timestamp - job["recorded_timestamp"],
                       tags=metric_tags)
        metrics.timing("events.size.data.post_save",
                       job["event"].size,
                       tags=metric_tags)
        metrics.incr(
            "events.post_save.normalize.errors",
            amount=len(job["data"].get("errors") or ()),
            tags=metric_tags,
        )

        self._data = job["event"].data.data
        return job["event"]
Example #2
    def validate_data(self, project, data):
        # TODO(dcramer): move project out of the data packet
        data['project'] = project.id

        data['errors'] = []

        if data.get('culprit'):
            if not isinstance(data['culprit'], six.string_types):
                raise APIForbidden('Invalid value for culprit')

        if not data.get('event_id'):
            data['event_id'] = uuid.uuid4().hex
        elif not isinstance(data['event_id'], six.string_types):
            raise APIForbidden('Invalid value for event_id')

        if len(data['event_id']) > 32:
            self.log.debug(
                'Discarded value for event_id due to length (%d chars)',
                len(data['event_id']))
            data['errors'].append({
                'type': EventError.VALUE_TOO_LONG,
                'name': 'event_id',
                'value': data['event_id'],
            })
            data['event_id'] = uuid.uuid4().hex
        elif not is_event_id(data['event_id']):
            self.log.debug('Discarded invalid value for event_id: %r',
                           data['event_id'],
                           exc_info=True)
            data['errors'].append({
                'type': EventError.INVALID_DATA,
                'name': 'event_id',
                'value': data['event_id'],
            })
            data['event_id'] = uuid.uuid4().hex

        if 'timestamp' in data:
            try:
                self._process_data_timestamp(data)
            except InvalidTimestamp:
                self.log.debug('Discarded invalid value for timestamp: %r',
                               data['timestamp'],
                               exc_info=True)
                data['errors'].append({
                    'type': EventError.INVALID_DATA,
                    'name': 'timestamp',
                    'value': data['timestamp'],
                })
                del data['timestamp']

        if 'fingerprint' in data:
            try:
                self._process_fingerprint(data)
            except InvalidFingerprint:
                self.log.debug('Discarded invalid value for fingerprint: %r',
                               data['fingerprint'],
                               exc_info=True)
                data['errors'].append({
                    'type': EventError.INVALID_DATA,
                    'name': 'fingerprint',
                    'value': data['fingerprint'],
                })
                del data['fingerprint']

        if 'platform' not in data or data['platform'] not in VALID_PLATFORMS:
            data['platform'] = 'other'

        if data.get('modules') and type(data['modules']) != dict:
            self.log.debug('Discarded invalid type for modules: %s',
                           type(data['modules']))
            data['errors'].append({
                'type': EventError.INVALID_DATA,
                'name': 'modules',
                'value': data['modules'],
            })
            del data['modules']

        if data.get('extra') is not None and type(data['extra']) != dict:
            self.log.debug('Discarded invalid type for extra: %s',
                           type(data['extra']))
            data['errors'].append({
                'type': EventError.INVALID_DATA,
                'name': 'extra',
                'value': data['extra'],
            })
            del data['extra']

        if data.get('tags') is not None:
            if type(data['tags']) == dict:
                data['tags'] = list(data['tags'].items())
            elif not isinstance(data['tags'], (list, tuple)):
                self.log.debug('Discarded invalid type for tags: %s',
                               type(data['tags']))
                data['errors'].append({
                    'type': EventError.INVALID_DATA,
                    'name': 'tags',
                    'value': data['tags'],
                })
                del data['tags']

        if data.get('tags'):
            # remove any tags with invalid or over-long keys or values
            tags = []
            for pair in data['tags']:
                try:
                    k, v = pair
                except ValueError:
                    self.log.debug('Discarded invalid tag value: %r', pair)
                    data['errors'].append({
                        'type': EventError.INVALID_DATA,
                        'name': 'tags',
                        'value': pair,
                    })
                    continue

                if not isinstance(k, six.string_types):
                    try:
                        k = six.text_type(k)
                    except Exception:
                        self.log.debug('Discarded invalid tag key: %r',
                                       type(k))
                        data['errors'].append({
                            'type': EventError.INVALID_DATA,
                            'name': 'tags',
                            'value': pair,
                        })
                        continue

                if not isinstance(v, six.string_types):
                    try:
                        v = six.text_type(v)
                    except Exception:
                        self.log.debug('Discarded invalid tag value: %s=%r', k,
                                       type(v))
                        data['errors'].append({
                            'type': EventError.INVALID_DATA,
                            'name': 'tags',
                            'value': pair,
                        })
                        continue

                if len(k) > MAX_TAG_KEY_LENGTH or len(v) > MAX_TAG_VALUE_LENGTH:
                    self.log.debug('Discarded invalid tag: %s=%s', k, v)
                    data['errors'].append({
                        'type': EventError.INVALID_DATA,
                        'name': 'tags',
                        'value': pair,
                    })
                    continue

                # support tags with spaces by converting spaces to dashes
                k = k.replace(' ', '-')

                if tagstore.is_reserved_key(k):
                    self.log.debug('Discarding reserved tag key: %s', k)
                    data['errors'].append({
                        'type': EventError.INVALID_DATA,
                        'name': 'tags',
                        'value': pair,
                    })
                    continue

                if not tagstore.is_valid_key(k):
                    self.log.debug('Discarded invalid tag key: %s', k)
                    data['errors'].append({
                        'type': EventError.INVALID_DATA,
                        'name': 'tags',
                        'value': pair,
                    })
                    continue

                if not tagstore.is_valid_value(v):
                    self.log.debug('Discarded invalid tag value: %s', v)
                    data['errors'].append({
                        'type': EventError.INVALID_DATA,
                        'name': 'tags',
                        'value': pair,
                    })
                    continue

                tags.append((k, v))
            data['tags'] = tags

        for k in list(data):
            if k in CLIENT_RESERVED_ATTRS:
                continue

            value = data.pop(k)

            if not value:
                self.log.debug('Ignored empty interface value: %s', k)
                continue

            try:
                interface = get_interface(k)
            except ValueError:
                self.log.debug('Ignored unknown attribute: %s', k)
                data['errors'].append({
                    'type': EventError.INVALID_ATTRIBUTE,
                    'name': k,
                })
                continue

            if type(value) != dict:
                # HACK(dcramer): the exception/breadcrumbs interface supports a
                # list as the value. We should change this in a new protocol
                # version.
                if type(value) in (list, tuple):
                    value = {'values': value}
                else:
                    self.log.debug('Invalid parameter for value: %s (%r)', k,
                                   type(value))
                    data['errors'].append({
                        'type': EventError.INVALID_DATA,
                        'name': k,
                        'value': value,
                    })
                    continue

            try:
                inst = interface.to_python(value)
                data[inst.get_path()] = inst.to_json()
            except Exception as e:
                if isinstance(e, InterfaceValidationError):
                    log = self.log.debug
                else:
                    log = self.log.error
                log('Discarded invalid value for interface: %s (%r)',
                    k,
                    value,
                    exc_info=True)
                data['errors'].append({
                    'type': EventError.INVALID_DATA,
                    'name': k,
                    'value': value,
                })

        # TODO(dcramer): ideally this logic would happen in normalize, but today
        # we don't do "validation" there (create errors)

        # message is coerced to an interface, as it's used as a pure
        # index of searchable strings
        # See GH-3248
        message = data.pop('message', None)
        if message:
            if 'sentry.interfaces.Message' not in data:
                value = {
                    'message': message,
                }
            elif not data['sentry.interfaces.Message'].get('formatted'):
                value = data['sentry.interfaces.Message']
                value['formatted'] = message
            else:
                value = None

            if value is not None:
                k = 'sentry.interfaces.Message'
                interface = get_interface(k)
                try:
                    inst = interface.to_python(value)
                    data[inst.get_path()] = inst.to_json()
                except Exception as e:
                    if isinstance(e, InterfaceValidationError):
                        log = self.log.debug
                    else:
                        log = self.log.error
                    log('Discarded invalid value for interface: %s (%r)',
                        k,
                        value,
                        exc_info=True)
                    data['errors'].append({
                        'type': EventError.INVALID_DATA,
                        'name': k,
                        'value': value,
                    })

        level = data.get('level') or DEFAULT_LOG_LEVEL
        if isinstance(level, six.string_types) and not level.isdigit():
            # assume it's something like 'warning'
            try:
                data['level'] = LOG_LEVELS_MAP[level]
            except KeyError:
                self.log.debug('Discarded invalid level value: %s', level)
                data['errors'].append({
                    'type': EventError.INVALID_DATA,
                    'name': 'level',
                    'value': level,
                })
                data['level'] = LOG_LEVELS_MAP.get(DEFAULT_LOG_LEVEL,
                                                   DEFAULT_LOG_LEVEL)

        if data.get('release'):
            data['release'] = six.text_type(data['release'])
            if len(data['release']) > 64:
                data['errors'].append({
                    'type': EventError.VALUE_TOO_LONG,
                    'name': 'release',
                    'value': data['release'],
                })
                del data['release']

        if data.get('dist'):
            data['dist'] = six.text_type(data['dist']).strip()
            if not data.get('release'):
                data['dist'] = None
            elif len(data['dist']) > 64:
                data['errors'].append({
                    'type': EventError.VALUE_TOO_LONG,
                    'name': 'dist',
                    'value': data['dist'],
                })
                del data['dist']
            elif _dist_re.match(data['dist']) is None:
                data['errors'].append({
                    'type': EventError.INVALID_DATA,
                    'name': 'dist',
                    'value': data['dist'],
                })
                del data['dist']

        if data.get('environment'):
            data['environment'] = six.text_type(data['environment'])
            if len(data['environment']) > 64:
                data['errors'].append({
                    'type': EventError.VALUE_TOO_LONG,
                    'name': 'environment',
                    'value': data['environment'],
                })
                del data['environment']

        if data.get('time_spent'):
            try:
                data['time_spent'] = int(data['time_spent'])
            except (ValueError, TypeError):
                data['errors'].append({
                    'type': EventError.INVALID_DATA,
                    'name': 'time_spent',
                    'value': data['time_spent'],
                })
                del data['time_spent']
            else:
                if data['time_spent'] > BoundedIntegerField.MAX_VALUE:
                    data['errors'].append({
                        'type': EventError.VALUE_TOO_LONG,
                        'name': 'time_spent',
                        'value': data['time_spent'],
                    })
                    del data['time_spent']

        return data
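validate_data never rejects an event outright: each invalid field is logged, recorded under data['errors'], and then dropped or replaced with a safe default. A minimal sketch of that pattern (not Sentry code; record_invalid and the error type strings are illustrative):

import uuid

def record_invalid(data, name, error_type, make_default=None):
    # Record the offending value, then drop it or substitute a safe default.
    data.setdefault('errors', []).append({
        'type': error_type,
        'name': name,
        'value': data[name],
    })
    if make_default is not None:
        data[name] = make_default()
    else:
        del data[name]

data = {'event_id': 'x' * 40}
if len(data['event_id']) > 32:
    record_invalid(data, 'event_id', 'value_too_long',
                   make_default=lambda: uuid.uuid4().hex)
assert len(data['event_id']) == 32 and data['errors'][0]['name'] == 'event_id'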
Example #3
    def normalize(self, request_env=None):
        request_env = request_env or {}
        data = self.data
        errors = data['errors'] = []

        # Ignore event metadata for now.
        data.pop('_meta', None)

        # Before validating with a schema, attempt to cast values to their desired types
        # so that the schema doesn't have to take every type variation into account.
        text = six.text_type
        fp_types = six.string_types + six.integer_types + (float, )

        def to_values(v):
            return {'values': v} if v and isinstance(v, (tuple, list)) else v

        def stringify(f):
            if isinstance(f, float):
                return text(int(f)) if abs(f) < (1 << 53) else None
            return text(f)

        casts = {
            'environment': lambda v: text(v) if v is not None else v,
            'fingerprint': lambda v: list(x for x in map(stringify, v) if x is not None) if isinstance(v, list) and all(isinstance(f, fp_types) for f in v) else v,
            'release': lambda v: text(v) if v is not None else v,
            'dist': lambda v: text(v).strip() if v is not None else v,
            'time_spent': lambda v: int(v) if v is not None else v,
            'tags': lambda v: [(text(v_k).replace(' ', '-').strip(), text(v_v).strip()) for (v_k, v_v) in dict(v).items()],
            'timestamp': lambda v: process_timestamp(v),
            'platform': lambda v: v if v in VALID_PLATFORMS else 'other',
            'logentry': lambda v: v if isinstance(v, dict) else {'message': v},

            # These can be sent as lists and need to be converted to {'values': [...]}
            'exception': to_values,
            'breadcrumbs': to_values,
            'threads': to_values,
        }

        for c in casts:
            if c in data:
                try:
                    data[c] = casts[c](data[c])
                except InvalidTimestamp as it:
                    errors.append({'type': it.args[0], 'name': c, 'value': data[c]})
                    del data[c]
                except Exception:
                    errors.append({'type': EventError.INVALID_DATA, 'name': c, 'value': data[c]})
                    del data[c]

        # raw 'message' is coerced to the Message interface, as it's used as a
        # pure index of searchable strings. If both a raw 'message' and a Message
        # interface exist, try to add the former as the 'formatted' attribute
        # of the latter. See GH-3248
        msg_str = data.pop('message', None)
        if msg_str:
            msg_if = data.get('logentry')
            msg_meta = data.get('_meta', {}).get('message')

            if not msg_if:
                msg_if = data['logentry'] = {'message': msg_str}
                if msg_meta:
                    data.setdefault('_meta', {}).setdefault('logentry', {})['message'] = msg_meta

            if msg_if.get('message') != msg_str:
                if not msg_if.get('formatted'):
                    msg_if['formatted'] = msg_str
                    if msg_meta:
                        data.setdefault('_meta', {}).setdefault(
                            'logentry', {})['formatted'] = msg_meta

        # Fill in IP addresses marked as {{auto}}
        client_ip = request_env.get('client_ip')
        if client_ip:
            if get_path(data, ['sentry.interfaces.Http', 'env', 'REMOTE_ADDR']) == '{{auto}}':
                data['sentry.interfaces.Http']['env']['REMOTE_ADDR'] = client_ip

            if get_path(data, ['request', 'env', 'REMOTE_ADDR']) == '{{auto}}':
                data['request']['env']['REMOTE_ADDR'] = client_ip

            if get_path(data, ['sentry.interfaces.User', 'ip_address']) == '{{auto}}':
                data['sentry.interfaces.User']['ip_address'] = client_ip

            if get_path(data, ['user', 'ip_address']) == '{{auto}}':
                data['user']['ip_address'] = client_ip

        # Validate main event body and tags against schema.
        # XXX(ja): jsonschema does not like CanonicalKeyDict, so we need to pass
        #          in the inner data dict.
        is_valid, event_errors = validate_and_default_interface(data.data, 'event')
        errors.extend(event_errors)
        if 'tags' in data:
            is_valid, tag_errors = validate_and_default_interface(data['tags'], 'tags', name='tags')
            errors.extend(tag_errors)

        # Validate interfaces
        for k in list(data):
            if k in CLIENT_RESERVED_ATTRS:
                continue

            value = data.pop(k)

            if not value:
                self.logger.debug('Ignored empty interface value: %s', k)
                continue

            try:
                interface = get_interface(k)
            except ValueError:
                self.logger.debug('Ignored unknown attribute: %s', k)
                errors.append({'type': EventError.INVALID_ATTRIBUTE, 'name': k})
                continue

            try:
                inst = interface.to_python(value)
                data[inst.get_path()] = inst.to_json()
            except Exception as e:
                log = self.logger.debug if isinstance(
                    e, InterfaceValidationError) else self.logger.error
                log('Discarded invalid value for interface: %s (%r)', k, value, exc_info=True)
                errors.append({'type': EventError.INVALID_DATA, 'name': k, 'value': value})

        # Additional data coercion and defaulting
        level = data.get('level') or DEFAULT_LOG_LEVEL
        if isinstance(level, int) or (isinstance(level, six.string_types) and level.isdigit()):
            level = LOG_LEVELS.get(int(level), DEFAULT_LOG_LEVEL)
        data['level'] = LOG_LEVELS_MAP.get(level, LOG_LEVELS_MAP[DEFAULT_LOG_LEVEL])

        if data.get('dist') and not data.get('release'):
            data['dist'] = None

        timestamp = data.get('timestamp')
        if not timestamp:
            timestamp = timezone.now()

        # TODO (alex) can this all be replaced by utcnow?
        # it looks like the only time that this would even be hit is when timestamp
        # is not defined, as the earlier process_timestamp already converts existing
        # timestamps to floats.
        if isinstance(timestamp, datetime):
            # We must convert date to local time so Django doesn't mess it up
            # based on TIME_ZONE
            if settings.TIME_ZONE:
                if not timezone.is_aware(timestamp):
                    timestamp = timestamp.replace(tzinfo=timezone.utc)
            elif timezone.is_aware(timestamp):
                timestamp = timestamp.replace(tzinfo=None)
            timestamp = float(timestamp.strftime('%s'))

        data['timestamp'] = timestamp
        data['received'] = float(timezone.now().strftime('%s'))

        data.setdefault('checksum', None)
        data.setdefault('culprit', None)
        data.setdefault('dist', None)
        data.setdefault('environment', None)
        data.setdefault('extra', {})
        data.setdefault('fingerprint', None)
        data.setdefault('logger', DEFAULT_LOGGER_NAME)
        data.setdefault('platform', None)
        data.setdefault('server_name', None)
        data.setdefault('site', None)
        data.setdefault('tags', [])
        data.setdefault('transaction', None)

        # Fix case where legacy apps pass 'environment' as a tag
        # instead of a top level key.
        # TODO (alex) save() just reinserts the environment into the tags
        if not data.get('environment'):
            tagsdict = dict(data['tags'])
            if 'environment' in tagsdict:
                data['environment'] = tagsdict['environment']
                del tagsdict['environment']
                data['tags'] = list(tagsdict.items())

        # the SDKs currently do not describe event types, and we must infer
        # them from available attributes
        data['type'] = eventtypes.infer(data).key
        data['version'] = self.version

        exception = data.get('sentry.interfaces.Exception')
        stacktrace = data.get('sentry.interfaces.Stacktrace')
        if exception and len(exception['values']) == 1 and stacktrace:
            exception['values'][0]['stacktrace'] = stacktrace
            del data['sentry.interfaces.Stacktrace']

        # Exception mechanism needs SDK information to resolve proper names in
        # exception meta (such as signal names). "SDK Information" really means
        # the operating system version the event was generated on. Some
        # normalization still works without sdk_info, such as mach_exception
        # names (they can only occur on macOS).
        if exception:
            sdk_info = get_sdk_from_event(data)
            for ex in exception['values']:
                if 'mechanism' in ex:
                    normalize_mechanism_meta(ex['mechanism'], sdk_info)

        # If there is no User ip_address, update it either from the Http
        # interface or the client_ip of the request.
        auth = request_env.get('auth')
        is_public = auth and auth.is_public
        add_ip_platforms = ('javascript', 'cocoa', 'objc')

        http_ip = data.get('sentry.interfaces.Http', {}).get('env', {}).get('REMOTE_ADDR')
        if http_ip:
            data.setdefault('sentry.interfaces.User', {}).setdefault('ip_address', http_ip)
        elif client_ip and (is_public or data.get('platform') in add_ip_platforms):
            data.setdefault('sentry.interfaces.User', {}).setdefault('ip_address', client_ip)

        # Trim values
        data['logger'] = trim(data['logger'].strip(), 64)
        trim_dict(data['extra'], max_size=settings.SENTRY_MAX_EXTRA_VARIABLE_SIZE)

        if data['culprit']:
            data['culprit'] = trim(data['culprit'], MAX_CULPRIT_LENGTH)

        if data['transaction']:
            data['transaction'] = trim(data['transaction'], MAX_CULPRIT_LENGTH)

        return data
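normalize drives its coercion from a table of per-field cast functions: each cast is applied in a loop, and a failure records an error and removes the field instead of failing the event. A standalone sketch of the pattern (not Sentry code; the field names and error type are illustrative, and str stands in for six.text_type):

casts = {
    'release': lambda v: str(v),
    'time_spent': lambda v: int(v),
}

def apply_casts(data, casts):
    # Coerce each known field; on failure, record an error and drop the field.
    errors = data.setdefault('errors', [])
    for field, cast in casts.items():
        if field not in data:
            continue
        try:
            data[field] = cast(data[field])
        except Exception:
            errors.append({'type': 'invalid_data', 'name': field,
                           'value': data[field]})
            del data[field]
    return data

result = apply_casts({'release': 1.0, 'time_spent': 'abc'}, casts)
assert result['release'] == '1.0' and 'time_spent' not in result
assert result['errors'][0]['name'] == 'time_spent'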
Example #4
    def save(self,
             project_id,
             raw=False,
             assume_normalized=False,
             cache_key=None):
        """
        We re-insert events with duplicate IDs into Snuba, which is responsible
        for deduplicating events. Since deduplication in Snuba is on the primary
        key (based on event ID, project ID and day), events with the same ID are
        only deduplicated if their timestamps fall on the same day. The latest
        event always wins and overwrites the values of events received earlier
        that day.

        Since we increment counters and frequencies here before events get
        inserted into the eventstream, these numbers may be larger than the
        total number of events if we receive duplicate event IDs that fall on
        the same day (and do not hit the cache first).
        """

        # Normalize if needed
        if not self._normalized:
            if not assume_normalized:
                self.normalize()
            self._normalized = True

        data = self._data

        project = Project.objects.get_from_cache(id=project_id)
        project._organization_cache = Organization.objects.get_from_cache(
            id=project.organization_id)

        # Pull out the culprit
        culprit = self.get_culprit()

        # Pull the toplevel data we're interested in
        level = data.get("level")

        # TODO(mitsuhiko): this code path should be gone by July 2018.
        # This is going to be fine because no code actually still depends
        # on integers here.  When we need an integer it will be converted
        # into one later.  Old workers used to send integers here.
        if level is not None and isinstance(level, six.integer_types):
            level = LOG_LEVELS[level]

        transaction_name = data.get("transaction")
        logger_name = data.get("logger")
        release = data.get("release")
        dist = data.get("dist")
        environment = data.get("environment")
        recorded_timestamp = data.get("timestamp")

        # We need to swap out the data with the one internal to the newly
        # created event object
        event = self._get_event_instance(project_id=project_id)
        self._data = data = event.data.data

        event._project_cache = project

        date = event.datetime
        platform = event.platform
        event_id = event.event_id

        if transaction_name:
            transaction_name = force_text(transaction_name)

        # Right now the event type is the signal to skip the group. This
        # is going to change a lot.
        if event.get_event_type() == "transaction":
            issueless_event = True
        else:
            issueless_event = False

        # Some top-level attributes are duplicated into tags (logger, level,
        # environment, transaction).  These are different from legacy
        # attributes, which are normalized into tags ahead of time (site,
        # server_name).
        setdefault_path(data, "tags", value=[])
        set_tag(data, "level", level)
        if logger_name:
            set_tag(data, "logger", logger_name)
        if environment:
            set_tag(data, "environment", environment)
        if transaction_name:
            set_tag(data, "transaction", transaction_name)

        if release:
            # don't allow a conflicting 'release' tag
            pop_tag(data, "release")
            release = Release.get_or_create(project=project,
                                            version=release,
                                            date_added=date)
            set_tag(data, "sentry:release", release.version)

        if dist and release:
            dist = release.add_dist(dist, date)
            # don't allow a conflicting 'dist' tag
            pop_tag(data, "dist")
            set_tag(data, "sentry:dist", dist.name)
        else:
            dist = None

        event_user = self._get_event_user(project, data)
        if event_user:
            # don't allow a conflicting 'user' tag
            pop_tag(data, "user")
            set_tag(data, "sentry:user", event_user.tag_value)

        # At this point we want to normalize the in_app values in case the
        # clients did not set this appropriately so far.
        grouping_config = load_grouping_config(
            get_grouping_config_dict_for_event_data(data, project))
        normalize_stacktraces_for_grouping(data, grouping_config)

        for plugin in plugins.for_project(project, version=None):
            added_tags = safe_execute(plugin.get_tags,
                                      event,
                                      _with_transaction=False)
            if added_tags:
                # plugins should not override user-provided tags
                for key, value in added_tags:
                    if get_tag(data, key) is None:
                        set_tag(data, key, value)

        for path, iface in six.iteritems(event.interfaces):
            for k, v in iface.iter_tags():
                set_tag(data, k, v)
            # Get rid of ephemeral interface data
            if iface.ephemeral:
                data.pop(iface.path, None)

        # The active grouping config was put into the event in the
        # normalize step before.  We now also make sure that the
        # fingerprint was set to '{{ default }}' just in case someone
        # removed it from the payload.  The call to get_hashes will then
        # look at `grouping_config` to pick the right parameters.
        data["fingerprint"] = data.get("fingerprint") or ["{{ default }}"]
        apply_server_fingerprinting(
            data, get_fingerprinting_config_for_project(project))

        # Here we try to use the grouping config that was requested in the
        # event.  If that config has since been deleted (because it was an
        # experimental grouping config) we fall back to the default.
        try:
            hashes = event.get_hashes()
        except GroupingConfigNotFound:
            data["grouping_config"] = get_grouping_config_dict_for_project(
                project)
            hashes = event.get_hashes()

        data["hashes"] = hashes

        # We want to freeze not just the metadata and type but also the
        # derived attributes.  The reason for this is that we push this
        # data into Kafka for Snuba processing and our post-processing
        # picks up the data right from the Snuba topic.  For most usage,
        # however, the data is dynamically overridden by Event.title and
        # Event.location (see Event.as_dict)
        materialized_metadata = self.materialize_metadata()
        data.update(materialized_metadata)
        data["culprit"] = culprit

        received_timestamp = event.data.get("received") or float(
            event.datetime.strftime("%s"))

        if not issueless_event:
            # The group gets the same metadata as the event when it's flushed but
            # additionally the `last_received` key is set.  This key is used by
            # _save_aggregate.
            group_metadata = dict(materialized_metadata)
            group_metadata["last_received"] = received_timestamp
            kwargs = {
                "platform": platform,
                "message": event.search_message,
                "culprit": culprit,
                "logger": logger_name,
                "level": LOG_LEVELS_MAP.get(level),
                "last_seen": date,
                "first_seen": date,
                "active_at": date,
                "data": group_metadata,
            }

            if release:
                kwargs["first_release"] = release

            try:
                group, is_new, is_regression = self._save_aggregate(
                    event=event, hashes=hashes, release=release, **kwargs)
            except HashDiscarded:
                event_discarded.send_robust(project=project,
                                            sender=EventManager)

                metrics.incr(
                    "events.discarded",
                    skip_internal=True,
                    tags={
                        "organization_id": project.organization_id,
                        "platform": platform
                    },
                )
                raise
            else:
                event_saved.send_robust(project=project,
                                        event_size=event.size,
                                        sender=EventManager)
            event.group = group
        else:
            group = None
            is_new = False
            is_regression = False
            event_saved.send_robust(project=project,
                                    event_size=event.size,
                                    sender=EventManager)

        # store a reference to the group id to guarantee validation of isolation
        event.data.bind_ref(event)

        environment = Environment.get_or_create(project=project,
                                                name=environment)

        if group:
            group_environment, is_new_group_environment = GroupEnvironment.get_or_create(
                group_id=group.id,
                environment_id=environment.id,
                defaults={"first_release": release if release else None},
            )
        else:
            is_new_group_environment = False

        if release:
            ReleaseEnvironment.get_or_create(project=project,
                                             release=release,
                                             environment=environment,
                                             datetime=date)

            ReleaseProjectEnvironment.get_or_create(project=project,
                                                    release=release,
                                                    environment=environment,
                                                    datetime=date)

            if group:
                grouprelease = GroupRelease.get_or_create(
                    group=group,
                    release=release,
                    environment=environment,
                    datetime=date)

        counters = [(tsdb.models.project, project.id)]

        if group:
            counters.append((tsdb.models.group, group.id))

        if release:
            counters.append((tsdb.models.release, release.id))

        tsdb.incr_multi(counters,
                        timestamp=event.datetime,
                        environment_id=environment.id)

        frequencies = []

        if group:
            frequencies.append((tsdb.models.frequent_environments_by_group, {
                group.id: {
                    environment.id: 1
                }
            }))

            if release:
                frequencies.append((tsdb.models.frequent_releases_by_group, {
                    group.id: {
                        grouprelease.id: 1
                    }
                }))
        if frequencies:
            tsdb.record_frequency_multi(frequencies, timestamp=event.datetime)

        if group:
            UserReport.objects.filter(project=project,
                                      event_id=event_id).update(
                                          group=group, environment=environment)

        # Ensure the _metrics key exists. This is usually created during
        # ingestion and prefilled with ingestion sizes.
        event_metrics = event.data.get("_metrics") or {}
        event.data["_metrics"] = event_metrics

        # Capture the actual size that goes into node store.
        event_metrics["bytes.stored.event"] = len(
            json.dumps(dict(event.data.items())))

        # Load attachments first, but persist them at the very end, after
        # posting to the eventstream, to ensure that all counters and
        # eventstream writes have happened.
        attachments = self.get_attachments(cache_key, event)
        for attachment in attachments:
            key = "bytes.stored.%s" % (attachment.type, )
            event_metrics[key] = (event_metrics.get(key) or 0) + len(
                attachment.data)

        # Write the event to Nodestore
        event.data.save()

        if event_user:
            counters = [(tsdb.models.users_affected_by_project, project.id,
                         (event_user.tag_value, ))]

            if group:
                counters.append((tsdb.models.users_affected_by_group, group.id,
                                 (event_user.tag_value, )))

            tsdb.record_multi(counters,
                              timestamp=event.datetime,
                              environment_id=environment.id)

        if release:
            if is_new:
                buffer.incr(
                    ReleaseProject,
                    {"new_groups": 1},
                    {
                        "release_id": release.id,
                        "project_id": project.id
                    },
                )
            if is_new_group_environment:
                buffer.incr(
                    ReleaseProjectEnvironment,
                    {"new_issues_count": 1},
                    {
                        "project_id": project.id,
                        "release_id": release.id,
                        "environment_id": environment.id,
                    },
                )

        if not raw:
            if not project.first_event:
                project.update(first_event=date)
                first_event_received.send_robust(project=project,
                                                 event=event,
                                                 sender=Project)

        eventstream.insert(
            group=group,
            event=event,
            is_new=is_new,
            is_regression=is_regression,
            is_new_group_environment=is_new_group_environment,
            primary_hash=hashes[0],
            # We are choosing to skip consuming the event back
            # in the eventstream if it's flagged as raw.
            # This means that we want to publish the event
            # through the event stream, but we don't care
            # about post processing and handling the commit.
            skip_consume=raw,
        )

        # Do this last to ensure signals get emitted even if connection to the
        # file store breaks temporarily.
        self.save_attachments(attachments, event)

        metric_tags = {"from_relay": "_relay_processed" in self._data}

        metrics.timing("events.latency",
                       received_timestamp - recorded_timestamp,
                       tags=metric_tags)
        metrics.timing("events.size.data.post_save",
                       event.size,
                       tags=metric_tags)
        metrics.incr(
            "events.post_save.normalize.errors",
            amount=len(self._data.get("errors") or ()),
            tags=metric_tags,
        )

        return event
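The plugin loop above only applies a plugin tag when the user has not already set that key. A minimal sketch of that precedence rule (not Sentry code; these get_tag/set_tag helpers are simplified stand-ins for Sentry's tag utilities):

def get_tag(data, key):
    return dict(data.get('tags') or []).get(key)

def set_tag(data, key, value):
    # Replace any existing pair for this key.
    tags = [(k, v) for k, v in (data.get('tags') or []) if k != key]
    tags.append((key, value))
    data['tags'] = tags

data = {'tags': [('browser', 'Firefox')]}                     # user-provided
added_tags = [('browser', 'Chrome'), ('runtime', 'CPython')]  # from a plugin
for key, value in added_tags:
    if get_tag(data, key) is None:  # user tags win over plugin tags
        set_tag(data, key, value)
assert dict(data['tags']) == {'browser': 'Firefox', 'runtime': 'CPython'}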
Example #5
    def normalize(self, request_env=None):
        request_env = request_env or {}
        data = self.data
        errors = data['errors'] = []

        # Before validating with a schema, attempt to cast values to their desired types
        # so that the schema doesn't have to take every type variation into account.
        text = six.text_type
        fp_types = six.string_types + six.integer_types + (float, )

        def to_values(v):
            return {'values': v} if v and isinstance(v, (tuple, list)) else v

        def convert_fingerprint(values):
            rv = values[:]
            bad_float = False
            for idx, item in enumerate(rv):
                if isinstance(item, float) and \
                   (abs(item) >= (1 << 53) or int(item) != item):
                    bad_float = True
                rv[idx] = text(item)
            if bad_float:
                metrics.incr(
                    'events.bad_float_fingerprint',
                    skip_internal=True,
                    tags={
                        'project_id': data.get('project'),
                    },
                )
            return rv

        casts = {
            'environment': lambda v: text(v) if v is not None else v,
            'fingerprint': lambda v: convert_fingerprint(v)
            if isinstance(v, list) and all(isinstance(f, fp_types) for f in v)
            else v,
            'release': lambda v: text(v) if v is not None else v,
            'dist': lambda v: text(v).strip() if v is not None else v,
            'time_spent': lambda v: int(v) if v is not None else v,
            'tags': lambda v: [(text(v_k).replace(' ', '-').strip(),
                                text(v_v).strip())
                               for (v_k, v_v) in dict(v).items()],
            'timestamp': lambda v: process_timestamp(v),
            'platform': lambda v: v if v in VALID_PLATFORMS else 'other',
            'sentry.interfaces.Message': lambda v: v
            if isinstance(v, dict) else {'message': v},

            # These can be sent as lists and need to be converted to {'values': [...]}
            'exception': to_values,
            'sentry.interfaces.Exception': to_values,
            'breadcrumbs': to_values,
            'sentry.interfaces.Breadcrumbs': to_values,
            'threads': to_values,
            'sentry.interfaces.Threads': to_values,
        }

        for c in casts:
            if c in data:
                try:
                    data[c] = casts[c](data[c])
                except InvalidTimestamp as it:
                    errors.append({
                        'type': it.args[0],
                        'name': c,
                        'value': data[c]
                    })
                    del data[c]
                except Exception:
                    errors.append({
                        'type': EventError.INVALID_DATA,
                        'name': c,
                        'value': data[c]
                    })
                    del data[c]

        # raw 'message' is coerced to the Message interface, as it's used as a
        # pure index of searchable strings. If both a raw 'message' and a Message
        # interface exist, try to add the former as the 'formatted' attribute
        # of the latter. See GH-3248
        msg_str = data.pop('message', None)
        if msg_str:
            msg_if = data.setdefault('sentry.interfaces.Message',
                                     {'message': msg_str})
            if msg_if.get('message') != msg_str:
                msg_if.setdefault('formatted', msg_str)

        # Fill in IP addresses marked as {{auto}}
        client_ip = request_env.get('client_ip')
        if client_ip:
            if get_path(data, ['sentry.interfaces.Http', 'env',
                               'REMOTE_ADDR']) == '{{auto}}':
                data['sentry.interfaces.Http']['env']['REMOTE_ADDR'] = client_ip

            if get_path(data, ['request', 'env', 'REMOTE_ADDR']) == '{{auto}}':
                data['request']['env']['REMOTE_ADDR'] = client_ip

            if get_path(data, ['sentry.interfaces.User',
                               'ip_address']) == '{{auto}}':
                data['sentry.interfaces.User']['ip_address'] = client_ip

            if get_path(data, ['user', 'ip_address']) == '{{auto}}':
                data['user']['ip_address'] = client_ip

        # Validate main event body and tags against schema
        is_valid, event_errors = validate_and_default_interface(data, 'event')
        errors.extend(event_errors)
        if 'tags' in data:
            is_valid, tag_errors = validate_and_default_interface(data['tags'],
                                                                  'tags',
                                                                  name='tags')
            errors.extend(tag_errors)

        # Validate interfaces
        for k in list(data):
            if k in CLIENT_RESERVED_ATTRS:
                continue

            value = data.pop(k)

            if not value:
                self.logger.debug('Ignored empty interface value: %s', k)
                continue

            try:
                interface = get_interface(k)
            except ValueError:
                self.logger.debug('Ignored unknown attribute: %s', k)
                errors.append({
                    'type': EventError.INVALID_ATTRIBUTE,
                    'name': k
                })
                continue

            try:
                inst = interface.to_python(value)
                data[inst.get_path()] = inst.to_json()
            except Exception as e:
                log = self.logger.debug if isinstance(
                    e, InterfaceValidationError) else self.logger.error
                log('Discarded invalid value for interface: %s (%r)',
                    k,
                    value,
                    exc_info=True)
                errors.append({
                    'type': EventError.INVALID_DATA,
                    'name': k,
                    'value': value
                })

        # Additional data coercion and defaulting
        level = data.get('level') or DEFAULT_LOG_LEVEL
        if isinstance(level, int) or (isinstance(level, six.string_types)
                                      and level.isdigit()):
            level = LOG_LEVELS.get(int(level), DEFAULT_LOG_LEVEL)
        data['level'] = LOG_LEVELS_MAP.get(level,
                                           LOG_LEVELS_MAP[DEFAULT_LOG_LEVEL])

        if data.get('dist') and not data.get('release'):
            data['dist'] = None

        timestamp = data.get('timestamp')
        if not timestamp:
            timestamp = timezone.now()

        # TODO (alex) can this all be replaced by utcnow?
        # it looks like the only time that this would even be hit is when timestamp
        # is not defined, as the earlier process_timestamp already converts existing
        # timestamps to floats.
        if isinstance(timestamp, datetime):
            # We must convert date to local time so Django doesn't mess it up
            # based on TIME_ZONE
            if settings.TIME_ZONE:
                if not timezone.is_aware(timestamp):
                    timestamp = timestamp.replace(tzinfo=timezone.utc)
            elif timezone.is_aware(timestamp):
                timestamp = timestamp.replace(tzinfo=None)
            timestamp = float(timestamp.strftime('%s'))
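            # NOTE: '%s' is a platform-specific (glibc) strftime code rather
            # than a documented Python one; for UTC datetimes,
            # calendar.timegm(timestamp.utctimetuple()) is the portable way
            # to get the same epoch value.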

        data['timestamp'] = timestamp
        data['received'] = float(timezone.now().strftime('%s'))

        data.setdefault('checksum', None)
        data.setdefault('culprit', None)
        data.setdefault('dist', None)
        data.setdefault('environment', None)
        data.setdefault('extra', {})
        data.setdefault('fingerprint', None)
        data.setdefault('logger', DEFAULT_LOGGER_NAME)
        data.setdefault('platform', None)
        data.setdefault('server_name', None)
        data.setdefault('site', None)
        data.setdefault('tags', [])
        data.setdefault('transaction', None)

        # Fix case where legacy apps pass 'environment' as a tag
        # instead of a top level key.
        # TODO (alex) save() just reinserts the environment into the tags
        if not data.get('environment'):
            tagsdict = dict(data['tags'])
            if 'environment' in tagsdict:
                data['environment'] = tagsdict['environment']
                del tagsdict['environment']
                data['tags'] = tagsdict.items()

        # the SDKs currently do not describe event types, and we must infer
        # them from available attributes
        data['type'] = eventtypes.infer(data).key
        data['version'] = self.version

        exception = data.get('sentry.interfaces.Exception')
        stacktrace = data.get('sentry.interfaces.Stacktrace')
        if exception and len(exception['values']) == 1 and stacktrace:
            exception['values'][0]['stacktrace'] = stacktrace
            del data['sentry.interfaces.Stacktrace']

        # Exception mechanism needs SDK information to resolve proper names in
        # exception meta (such as signal names). "SDK Information" really means
        # the operating system version the event was generated on. Some
        # normalization still works without sdk_info, such as mach_exception
        # names (they can only occur on macOS).
        if exception:
            sdk_info = get_sdk_from_event(data)
            for ex in exception['values']:
                if 'mechanism' in ex:
                    normalize_mechanism_meta(ex['mechanism'], sdk_info)

        # If there is no User ip_address, update it from either the Http
        # interface or the client_ip of the request.
        auth = request_env.get('auth')
        is_public = auth and auth.is_public
        add_ip_platforms = ('javascript', 'cocoa', 'objc')

        http_ip = data.get('sentry.interfaces.Http',
                           {}).get('env', {}).get('REMOTE_ADDR')
        if http_ip:
            data.setdefault('sentry.interfaces.User',
                            {}).setdefault('ip_address', http_ip)
        elif client_ip and (is_public
                            or data.get('platform') in add_ip_platforms):
            data.setdefault('sentry.interfaces.User',
                            {}).setdefault('ip_address', client_ip)

        # Trim values
        data['logger'] = trim(data['logger'].strip(), 64)
        trim_dict(data['extra'],
                  max_size=settings.SENTRY_MAX_EXTRA_VARIABLE_SIZE)

        if data['culprit']:
            data['culprit'] = trim(data['culprit'], MAX_CULPRIT_LENGTH)

        if data['transaction']:
            data['transaction'] = trim(data['transaction'], MAX_CULPRIT_LENGTH)

        return data
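
The '{{auto}}' substitution at the top of this example can be exercised on its own. Below is a minimal sketch; get_path here is a stand-in for the real helper (assumed behavior: walk nested dicts and return None on a missing key), not Sentry's implementation:

def get_path(data, path):
    # Stand-in helper: walk nested dicts, returning None when a key is absent.
    for key in path:
        if not isinstance(data, dict):
            return None
        data = data.get(key)
    return data


def backfill_auto_ips(data, client_ip):
    # Replace '{{auto}}' placeholders with the connecting client's IP,
    # mirroring the REMOTE_ADDR / ip_address handling above.
    for path in (['request', 'env', 'REMOTE_ADDR'], ['user', 'ip_address']):
        if get_path(data, path) == '{{auto}}':
            target = data
            for key in path[:-1]:
                target = target[key]
            target[path[-1]] = client_ip


data = {'user': {'ip_address': '{{auto}}'}}
backfill_auto_ips(data, '203.0.113.7')
assert data['user']['ip_address'] == '203.0.113.7'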
Example #6
initial_fields = {
    'culprit':
    lambda event: generate_culprit(
        event.data,
        event.platform,
    ),
    'data':
    lambda event: {
        'last_received':
        event.data.get('received') or float(event.datetime.strftime('%s')),
        'type':
        event.data['type'],
        'metadata':
        event.data['metadata'],
    },
    'last_seen':
    lambda event: event.datetime,
    'level':
    lambda event: LOG_LEVELS_MAP.get(
        event.get_tag('level'),
        logging.ERROR,
    ),
    'message':
    lambda event: event.message,
    'times_seen':
    lambda event: 0,
}

backfill_fields = {
    'platform':
    lambda caches, data, event: event.platform,
    'logger':
    lambda caches, data, event: event.get_tag('logger') or DEFAULT_LOGGER_NAME,
    'first_seen':
    lambda caches, data, event: event.datetime,
    'active_at':
    lambda caches, data, event: event.datetime,
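
Each entry in these maps pairs a Group column with a getter; a consumer (not shown in this snippet) would presumably evaluate every lambda against an event to seed or backfill the row. A sketch of that dispatch, using a hypothetical two-entry map:

import collections
import datetime

example_fields = {
    'last_seen': lambda event: event.datetime,
    'times_seen': lambda event: 0,
}


def build_group_defaults(event, field_map):
    # Evaluate each getter against the event, yielding kwargs for a Group.
    return {name: getter(event) for name, getter in field_map.items()}


FakeEvent = collections.namedtuple('FakeEvent', ['datetime'])
defaults = build_group_defaults(FakeEvent(datetime.datetime(2019, 1, 1)),
                                example_fields)
assert defaults == {'last_seen': datetime.datetime(2019, 1, 1),
                    'times_seen': 0}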
Example #7
    def save(self, project_id, raw=False, assume_normalized=False):
        # Normalize if needed
        if not self._normalized:
            if not assume_normalized:
                self.normalize()
            self._normalized = True

        data = self._data

        project = Project.objects.get_from_cache(id=project_id)
        project._organization_cache = Organization.objects.get_from_cache(
            id=project.organization_id)

        # Check to make sure we're not about to do a bunch of work that's
        # already been done if we've processed an event with this ID. (This
        # isn't a perfect solution -- this doesn't handle ``EventMapping`` and
        # there's a race condition between here and when the event is actually
        # saved, but it's an improvement. See GH-7677.)
        try:
            event = Event.objects.get(
                project_id=project.id,
                event_id=data['event_id'],
            )
        except Event.DoesNotExist:
            pass
        else:
            # Make sure we cache on the project before returning
            event._project_cache = project
            logger.info(
                'duplicate.found',
                exc_info=True,
                extra={
                    'event_uuid': data['event_id'],
                    'project_id': project.id,
                    'model': Event.__name__,
                }
            )
            return event

        # Pull out the culprit
        culprit = self.get_culprit()

        # Pull the toplevel data we're interested in
        level = data.get('level')

        # TODO(mitsuhiko): this code path should be gone by July 2018.
        # This is going to be fine because no code actually still depends
        # on integers here.  When we need an integer it will be converted
        # into one later.  Old workers used to send integers here.
        if level is not None and isinstance(level, six.integer_types):
            level = LOG_LEVELS[level]

        transaction_name = data.get('transaction')
        logger_name = data.get('logger')
        release = data.get('release')
        dist = data.get('dist')
        environment = data.get('environment')
        recorded_timestamp = data.get('timestamp')

        # We need to swap out the data with the one internal to the newly
        # created event object
        event = self._get_event_instance(project_id=project_id)
        self._data = data = event.data.data

        event._project_cache = project

        date = event.datetime
        platform = event.platform
        event_id = event.event_id

        if transaction_name:
            transaction_name = force_text(transaction_name)

        # Some of the data that are toplevel attributes are duplicated
        # into tags (logger, level, environment, transaction).  These are
        # different from legacy attributes which are normalized into tags
        # ahead of time (site, server_name).
        setdefault_path(data, 'tags', value=[])
        set_tag(data, 'level', level)
        if logger_name:
            set_tag(data, 'logger', logger_name)
        if environment:
            set_tag(data, 'environment', environment)
        if transaction_name:
            set_tag(data, 'transaction', transaction_name)

        if release:
            # don't allow a conflicting 'release' tag
            pop_tag(data, 'release')
            release = Release.get_or_create(
                project=project,
                version=release,
                date_added=date,
            )
            set_tag(data, 'sentry:release', release.version)

        if dist and release:
            dist = release.add_dist(dist, date)
            # don't allow a conflicting 'dist' tag
            pop_tag(data, 'dist')
            set_tag(data, 'sentry:dist', dist.name)
        else:
            dist = None

        event_user = self._get_event_user(project, data)
        if event_user:
            # don't allow a conflicting 'user' tag
            pop_tag(data, 'user')
            set_tag(data, 'sentry:user', event_user.tag_value)

        # At this point we want to normalize the in_app values in case the
        # clients did not set this appropriately so far.
        grouping_config = load_grouping_config(
            get_grouping_config_dict_for_event_data(data, project))
        normalize_stacktraces_for_grouping(data, grouping_config)

        for plugin in plugins.for_project(project, version=None):
            added_tags = safe_execute(plugin.get_tags, event, _with_transaction=False)
            if added_tags:
                # plugins should not override user provided tags
                for key, value in added_tags:
                    if get_tag(data, key) is None:
                        set_tag(data, key, value)

        for path, iface in six.iteritems(event.interfaces):
            for k, v in iface.iter_tags():
                set_tag(data, k, v)
            # Get rid of ephemeral interface data
            if iface.ephemeral:
                data.pop(iface.path, None)

        # The active grouping config was put into the event in the
        # normalize step before.  We now also make sure that the
        # fingerprint was set to `'{{ default }}'` just in case someone
        # removed it from the payload.  The call to get_hashes will then
        # look at `grouping_config` to pick the right parameters.
        data['fingerprint'] = data.get('fingerprint') or ['{{ default }}']
        apply_server_fingerprinting(data, get_fingerprinting_config_for_project(project))

        # Here we try to use the grouping config that was requested in the
        # event.  If that config has since been deleted (because it was an
        # experimental grouping config) we fall back to the default.
        try:
            hashes = event.get_hashes()
        except GroupingConfigNotFound:
            data['grouping_config'] = get_grouping_config_dict_for_project(project)
            hashes = event.get_hashes()

        data['hashes'] = hashes

        # we want to freeze not just the metadata and type but also the
        # derived attributes.  The reason for this is that we push this
        # data into kafka for snuba processing and our postprocessing
        # picks up the data right from the snuba topic.  For most usage,
        # however, the data is dynamically overridden by Event.title and
        # Event.location (see Event.as_dict).
        materialized_metadata = self.materialize_metadata()
        event_metadata = materialized_metadata['metadata']
        data.update(materialized_metadata)
        data['culprit'] = culprit

        # index components into ``Event.message``
        # See GH-3248
        event.message = self.get_search_message(event_metadata, culprit)
        received_timestamp = event.data.get('received') or float(event.datetime.strftime('%s'))

        # The group gets the same metadata as the event when it's flushed but
        # additionally the `last_received` key is set.  This key is used by
        # _save_aggregate.
        group_metadata = dict(materialized_metadata)
        group_metadata['last_received'] = received_timestamp
        kwargs = {
            'platform': platform,
            'message': event.message,
            'culprit': culprit,
            'logger': logger_name,
            'level': LOG_LEVELS_MAP.get(level),
            'last_seen': date,
            'first_seen': date,
            'active_at': date,
            'data': group_metadata,
        }

        if release:
            kwargs['first_release'] = release

        try:
            group, is_new, is_regression, is_sample = self._save_aggregate(
                event=event, hashes=hashes, release=release, **kwargs
            )
        except HashDiscarded:
            event_discarded.send_robust(
                project=project,
                sender=EventManager,
            )

            metrics.incr(
                'events.discarded',
                skip_internal=True,
                tags={
                    'organization_id': project.organization_id,
                    'platform': platform,
                },
            )
            raise
        else:
            event_saved.send_robust(
                project=project,
                event_size=event.size,
                sender=EventManager,
            )

        event.group = group
        # store a reference to the group id to guarantee validation of isolation
        event.data.bind_ref(event)

        # When an event was sampled, the canonical source of truth
        # is the EventMapping table since we aren't going to be writing out an actual
        # Event row. Otherwise, if the Event isn't being sampled, we can safely
        # rely on the Event table itself as the source of truth and ignore
        # EventMapping since it's redundant information.
        if is_sample:
            try:
                with transaction.atomic(using=router.db_for_write(EventMapping)):
                    EventMapping.objects.create(project=project, group=group, event_id=event_id)
            except IntegrityError:
                logger.info(
                    'duplicate.found',
                    exc_info=True,
                    extra={
                        'event_uuid': event_id,
                        'project_id': project.id,
                        'group_id': group.id,
                        'model': EventMapping.__name__,
                    }
                )
                return event

        environment = Environment.get_or_create(
            project=project,
            name=environment,
        )

        group_environment, is_new_group_environment = GroupEnvironment.get_or_create(
            group_id=group.id,
            environment_id=environment.id,
            defaults={
                'first_release': release if release else None,
            },
        )

        if release:
            ReleaseEnvironment.get_or_create(
                project=project,
                release=release,
                environment=environment,
                datetime=date,
            )

            ReleaseProjectEnvironment.get_or_create(
                project=project,
                release=release,
                environment=environment,
                datetime=date,
            )

            grouprelease = GroupRelease.get_or_create(
                group=group,
                release=release,
                environment=environment,
                datetime=date,
            )

        counters = [
            (tsdb.models.group, group.id),
            (tsdb.models.project, project.id),
        ]

        if release:
            counters.append((tsdb.models.release, release.id))

        tsdb.incr_multi(counters, timestamp=event.datetime, environment_id=environment.id)

        frequencies = [
            # (tsdb.models.frequent_projects_by_organization, {
            #     project.organization_id: {
            #         project.id: 1,
            #     },
            # }),
            # (tsdb.models.frequent_issues_by_project, {
            #     project.id: {
            #         group.id: 1,
            #     },
            # })
            (tsdb.models.frequent_environments_by_group, {
                group.id: {
                    environment.id: 1,
                },
            })
        ]

        if release:
            frequencies.append(
                (tsdb.models.frequent_releases_by_group, {
                    group.id: {
                        grouprelease.id: 1,
                    },
                })
            )

        tsdb.record_frequency_multi(frequencies, timestamp=event.datetime)

        UserReport.objects.filter(
            project=project,
            event_id=event_id,
        ).update(
            group=group,
            environment=environment,
        )

        # Update any event attachment that arrived before the event group was defined.
        EventAttachment.objects.filter(
            project_id=project.id,
            event_id=event_id,
        ).update(
            group_id=group.id,
        )

        # save the event unless it's been sampled
        if not is_sample:
            try:
                with transaction.atomic(using=router.db_for_write(Event)):
                    event.save()
            except IntegrityError:
                logger.info(
                    'duplicate.found',
                    exc_info=True,
                    extra={
                        'event_uuid': event_id,
                        'project_id': project.id,
                        'group_id': group.id,
                        'model': Event.__name__,
                    }
                )
                return event

            tagstore.delay_index_event_tags(
                organization_id=project.organization_id,
                project_id=project.id,
                group_id=group.id,
                environment_id=environment.id,
                event_id=event.id,
                tags=event.tags,
                date_added=event.datetime,
            )

        if event_user:
            tsdb.record_multi(
                (
                    (tsdb.models.users_affected_by_group, group.id, (event_user.tag_value, )),
                    (tsdb.models.users_affected_by_project, project.id, (event_user.tag_value, )),
                ),
                timestamp=event.datetime,
                environment_id=environment.id,
            )
        if release:
            if is_new:
                buffer.incr(
                    ReleaseProject, {'new_groups': 1}, {
                        'release_id': release.id,
                        'project_id': project.id,
                    }
                )
            if is_new_group_environment:
                buffer.incr(
                    ReleaseProjectEnvironment, {'new_issues_count': 1}, {
                        'project_id': project.id,
                        'release_id': release.id,
                        'environment_id': environment.id,
                    }
                )

        safe_execute(
            Group.objects.add_tags,
            group,
            environment,
            event.get_tags(),
            _with_transaction=False)

        if not raw:
            if not project.first_event:
                project.update(first_event=date)
                first_event_received.send_robust(project=project, group=group, sender=Project)

        eventstream.insert(
            group=group,
            event=event,
            is_new=is_new,
            is_sample=is_sample,
            is_regression=is_regression,
            is_new_group_environment=is_new_group_environment,
            primary_hash=hashes[0],
            # We are choosing to skip consuming the event back
            # in the eventstream if it's flagged as raw.
            # This means that we want to publish the event
            # through the event stream, but we don't care
            # about post processing and handling the commit.
            skip_consume=raw,
        )

        metrics.timing(
            'events.latency',
            received_timestamp - recorded_timestamp,
            tags={
                'project_id': project.id,
            },
        )

        metrics.timing(
            'events.size.data.post_save',
            event.size,
            tags={'project_id': project.id}
        )

        return event
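
The `counters` and `frequencies` lists above batch time-series writes into single tsdb calls. A hypothetical in-memory stand-in shows the shape of `incr_multi` (the real backend also handles rollups, environments and TTLs):

import collections


class InMemoryTSDB(object):
    # Hypothetical stand-in for the tsdb backend: one running counter per
    # (model, key) pair; rollups and environment filtering are omitted.
    def __init__(self):
        self.counts = collections.Counter()

    def incr_multi(self, items, timestamp=None, environment_id=None):
        for model, key in items:
            self.counts[(model, key)] += 1


tsdb = InMemoryTSDB()
tsdb.incr_multi([('group', 42), ('project', 1)])
tsdb.incr_multi([('group', 42)])
assert tsdb.counts[('group', 42)] == 2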
Example #8
initial_fields = {
    "culprit":
    lambda event: _generate_culprit(event),
    "data":
    lambda event: {
        "last_received":
        event.data.get("received") or float(event.datetime.strftime("%s")),
        "type":
        event.data["type"],
        "metadata":
        event.data["metadata"],
    },
    "last_seen":
    lambda event: event.datetime,
    "level":
    lambda event: LOG_LEVELS_MAP.get(event.get_tag("level"), logging.ERROR),
    "message":
    lambda event: event.message,
    "times_seen":
    lambda event: 0,
}

backfill_fields = {
    "platform":
    lambda caches, data, event: event.platform,
    "logger":
    lambda caches, data, event: event.get_tag("logger") or DEFAULT_LOGGER_NAME,
    "first_seen":
    lambda caches, data, event: event.datetime,
    "active_at":
    lambda caches, data, event: event.datetime,
Example #9
    def validate_data(self, project, data):
        # TODO(dcramer): move project out of the data packet
        data['project'] = project.id

        data['errors'] = []

        if data.get('culprit'):
            if not isinstance(data['culprit'], six.string_types):
                raise APIForbidden('Invalid value for culprit')

        if not data.get('event_id'):
            data['event_id'] = uuid.uuid4().hex
        elif not isinstance(data['event_id'], six.string_types):
            raise APIForbidden('Invalid value for event_id')

        if len(data['event_id']) > 32:
            self.log.debug(
                'Discarded value for event_id due to length (%d chars)',
                len(data['event_id']))
            data['errors'].append({
                'type': EventError.VALUE_TOO_LONG,
                'name': 'event_id',
                'value': data['event_id'],
            })
            data['event_id'] = uuid.uuid4().hex
        elif not is_event_id(data['event_id']):
            self.log.debug(
                'Discarded invalid value for event_id: %r',
                data['event_id'], exc_info=True)
            data['errors'].append({
                'type': EventError.INVALID_DATA,
                'name': 'event_id',
                'value': data['event_id'],
            })
            data['event_id'] = uuid.uuid4().hex

        if 'timestamp' in data:
            try:
                self._process_data_timestamp(data)
            except InvalidTimestamp as e:
                self.log.debug(
                    'Discarded invalid value for timestamp: %r',
                    data['timestamp'], exc_info=True)
                data['errors'].append({
                    'type': EventError.INVALID_DATA,
                    'name': 'timestamp',
                    'value': data['timestamp'],
                })
                del data['timestamp']

        if 'fingerprint' in data:
            try:
                self._process_fingerprint(data)
            except InvalidFingerprint as e:
                self.log.debug(
                    'Discarded invalid value for fingerprint: %r',
                    data['fingerprint'], exc_info=True)
                data['errors'].append({
                    'type': EventError.INVALID_DATA,
                    'name': 'fingerprint',
                    'value': data['fingerprint'],
                })
                del data['fingerprint']

        if 'platform' not in data or data['platform'] not in VALID_PLATFORMS:
            data['platform'] = 'other'

        if data.get('modules') and type(data['modules']) != dict:
            self.log.debug(
                'Discarded invalid type for modules: %s',
                type(data['modules']))
            data['errors'].append({
                'type': EventError.INVALID_DATA,
                'name': 'modules',
                'value': data['modules'],
            })
            del data['modules']

        if data.get('extra') is not None and type(data['extra']) != dict:
            self.log.debug(
                'Discarded invalid type for extra: %s',
                type(data['extra']))
            data['errors'].append({
                'type': EventError.INVALID_DATA,
                'name': 'extra',
                'value': data['extra'],
            })
            del data['extra']

        if data.get('tags') is not None:
            if type(data['tags']) == dict:
                data['tags'] = list(data['tags'].items())
            elif not isinstance(data['tags'], (list, tuple)):
                self.log.debug(
                    'Discarded invalid type for tags: %s', type(data['tags']))
                data['errors'].append({
                    'type': EventError.INVALID_DATA,
                    'name': 'tags',
                    'value': data['tags'],
                })
                del data['tags']

        if data.get('tags'):
            # remove any values which are over 32 characters
            tags = []
            for pair in data['tags']:
                try:
                    k, v = pair
                except ValueError:
                    self.log.debug('Discarded invalid tag value: %r', pair)
                    data['errors'].append({
                        'type': EventError.INVALID_DATA,
                        'name': 'tags',
                        'value': pair,
                    })
                    continue

                if not isinstance(k, six.string_types):
                    try:
                        k = six.text_type(k)
                    except Exception:
                        self.log.debug('Discarded invalid tag key: %r', type(k))
                        data['errors'].append({
                            'type': EventError.INVALID_DATA,
                            'name': 'tags',
                            'value': pair,
                        })
                        continue

                if not isinstance(v, six.string_types):
                    try:
                        v = six.text_type(v)
                    except Exception:
                        self.log.debug('Discarded invalid tag value: %s=%r',
                                       k, type(v))
                        data['errors'].append({
                            'type': EventError.INVALID_DATA,
                            'name': 'tags',
                            'value': pair,
                        })
                        continue

                if len(k) > MAX_TAG_KEY_LENGTH or len(v) > MAX_TAG_VALUE_LENGTH:
                    self.log.debug('Discarded invalid tag: %s=%s', k, v)
                    data['errors'].append({
                        'type': EventError.INVALID_DATA,
                        'name': 'tags',
                        'value': pair,
                    })
                    continue

                # support tags with spaces by converting them
                k = k.replace(' ', '-')

                if TagKey.is_reserved_key(k):
                    self.log.debug('Discarding reserved tag key: %s', k)
                    data['errors'].append({
                        'type': EventError.INVALID_DATA,
                        'name': 'tags',
                        'value': pair,
                    })
                    continue

                if not TagKey.is_valid_key(k):
                    self.log.debug('Discarded invalid tag key: %s', k)
                    data['errors'].append({
                        'type': EventError.INVALID_DATA,
                        'name': 'tags',
                        'value': pair,
                    })
                    continue

                if not TagValue.is_valid_value(v):
                    self.log.debug('Discarded invalid tag value: %s', v)
                    data['errors'].append({
                        'type': EventError.INVALID_DATA,
                        'name': 'tags',
                        'value': pair,
                    })
                    continue

                tags.append((k, v))
            data['tags'] = tags

        for k in list(iter(data)):
            if k in CLIENT_RESERVED_ATTRS:
                continue

            value = data.pop(k)

            if not value:
                self.log.debug('Ignored empty interface value: %s', k)
                continue

            try:
                interface = get_interface(k)
            except ValueError:
                self.log.debug('Ignored unknown attribute: %s', k)
                data['errors'].append({
                    'type': EventError.INVALID_ATTRIBUTE,
                    'name': k,
                })
                continue

            if type(value) != dict:
                # HACK(dcramer): the exception/breadcrumbs interface supports a
                # list as the value. We should change this in a new protocol
                # version.
                if type(value) in (list, tuple):
                    value = {'values': value}
                else:
                    self.log.debug(
                        'Invalid parameter for value: %s (%r)', k, type(value))
                    data['errors'].append({
                        'type': EventError.INVALID_DATA,
                        'name': k,
                        'value': value,
                    })
                    continue

            try:
                inst = interface.to_python(value)
                data[inst.get_path()] = inst.to_json()
            except Exception as e:
                if isinstance(e, InterfaceValidationError):
                    log = self.log.debug
                else:
                    log = self.log.error
                log('Discarded invalid value for interface: %s (%r)', k, value,
                    exc_info=True)
                data['errors'].append({
                    'type': EventError.INVALID_DATA,
                    'name': k,
                    'value': value,
                })

        # TODO(dcramer): ideally this logic would happen in normalize, but today
        # we don't do "validation" there (create errors)

        # message is coerced to an interface, as it's used as a pure
        # index of searchable strings
        # See GH-3248
        message = data.pop('message', None)
        if message:
            if 'sentry.interfaces.Message' not in data:
                value = {
                    'message': message,
                }
            elif not data['sentry.interfaces.Message'].get('formatted'):
                value = data['sentry.interfaces.Message']
                value['formatted'] = message
            else:
                value = None

            if value is not None:
                k = 'sentry.interfaces.Message'
                interface = get_interface(k)
                try:
                    inst = interface.to_python(value)
                    data[inst.get_path()] = inst.to_json()
                except Exception as e:
                    if isinstance(e, InterfaceValidationError):
                        log = self.log.debug
                    else:
                        log = self.log.error
                    log('Discarded invalid value for interface: %s (%r)', k, value,
                        exc_info=True)
                    data['errors'].append({
                        'type': EventError.INVALID_DATA,
                        'name': k,
                        'value': value,
                    })

        level = data.get('level') or DEFAULT_LOG_LEVEL
        if isinstance(level, six.string_types) and not level.isdigit():
            # assume it's something like 'warning'
            try:
                data['level'] = LOG_LEVELS_MAP[level]
            except KeyError as e:
                self.log.debug(
                    'Discarded invalid logger value: %s', level)
                data['errors'].append({
                    'type': EventError.INVALID_DATA,
                    'name': 'level',
                    'value': level,
                })
                data['level'] = LOG_LEVELS_MAP.get(
                    DEFAULT_LOG_LEVEL, DEFAULT_LOG_LEVEL)

        if data.get('release'):
            data['release'] = six.text_type(data['release'])
            if len(data['release']) > 64:
                data['errors'].append({
                    'type': EventError.VALUE_TOO_LONG,
                    'name': 'release',
                    'value': data['release'],
                })
                del data['release']

        if data.get('dist'):
            data['dist'] = six.text_type(data['dist']).strip()
            if not data.get('release'):
                data['dist'] = None
            elif len(data['dist']) > 64:
                data['errors'].append({
                    'type': EventError.VALUE_TOO_LONG,
                    'name': 'dist',
                    'value': data['dist'],
                })
                del data['dist']
            elif _dist_re.match(data['dist']) is None:
                data['errors'].append({
                    'type': EventError.INVALID_DATA,
                    'name': 'dist',
                    'value': data['dist'],
                })
                del data['dist']

        if data.get('environment'):
            data['environment'] = six.text_type(data['environment'])
            if len(data['environment']) > 64:
                data['errors'].append({
                    'type': EventError.VALUE_TOO_LONG,
                    'name': 'environment',
                    'value': data['environment'],
                })
                del data['environment']

        if data.get('time_spent'):
            try:
                data['time_spent'] = int(data['time_spent'])
            except (ValueError, TypeError):
                data['errors'].append({
                    'type': EventError.INVALID_DATA,
                    'name': 'time_spent',
                    'value': data['time_spent'],
                })
                del data['time_spent']
            else:
                if data['time_spent'] > BoundedIntegerField.MAX_VALUE:
                    data['errors'].append({
                        'type': EventError.VALUE_TOO_LONG,
                        'name': 'time_spent',
                        'value': data['time_spent'],
                    })
                    del data['time_spent']

        return data
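
Most of the tag loop above is one reject-and-record pattern applied to successive checks. A condensed standalone version of the coercion and length checks (the limits here are illustrative, not Sentry's actual constants):

MAX_TAG_KEY_LENGTH = 32      # illustrative; real limits come from settings
MAX_TAG_VALUE_LENGTH = 200


def sanitize_tags(pairs):
    # Keep (key, value) pairs that unpack, coerce to text and fit the
    # limits; spaces in keys become dashes, as in the loop above.
    tags, errors = [], []
    for pair in pairs:
        try:
            k, v = pair
        except ValueError:
            errors.append(pair)
            continue
        k, v = str(k), str(v)
        if len(k) > MAX_TAG_KEY_LENGTH or len(v) > MAX_TAG_VALUE_LENGTH:
            errors.append(pair)
            continue
        tags.append((k.replace(' ', '-'), v))
    return tags, errors


tags, errors = sanitize_tags([('my key', 'value'), ('bad',)])
assert tags == [('my-key', 'value')] and errors == [('bad',)]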
Example #10
    def normalize(self):
        data = self.data
        errors = data.get('errors', [])

        # Before validating with a schema, attempt to cast values to their desired types
        # so that the schema doesn't have to take every type variation into account.
        text = six.text_type
        fp_types = six.string_types + six.integer_types + (float, )

        def to_values(v):
            return {'values': v} if v and isinstance(v, (tuple, list)) else v

        casts = {
            'environment':
            lambda v: text(v) if v is not None else v,
            'fingerprint':
            lambda v: list(map(text, v))
            if isinstance(v, list) and all(isinstance(f, fp_types)
                                           for f in v) else v,
            'release':
            lambda v: text(v) if v is not None else v,
            'dist':
            lambda v: text(v).strip() if v is not None else v,
            'time_spent':
            lambda v: int(v) if v is not None else v,
            'tags':
            lambda v: [(text(v_k).replace(' ', '-').strip(), text(v_v).strip())
                       for (v_k, v_v) in dict(v).items()],
            'timestamp':
            lambda v: process_timestamp(v),
            'platform':
            lambda v: v if v in VALID_PLATFORMS else 'other',

            # These can be sent as lists and need to be converted to {'values': [...]}
            'exception':
            to_values,
            'sentry.interfaces.Exception':
            to_values,
            'breadcrumbs':
            to_values,
            'sentry.interfaces.Breadcrumbs':
            to_values,
            'threads':
            to_values,
            'sentry.interfaces.Threads':
            to_values,
        }

        for c in casts:
            if c in data:
                try:
                    data[c] = casts[c](data[c])
                except Exception as e:
                    errors.append({
                        'type': EventError.INVALID_DATA,
                        'name': c,
                        'value': data[c]
                    })
                    del data[c]

        # raw 'message' is coerced to the Message interface, as it's used as a
        # pure index of searchable strings. If both a raw 'message' and a Message
        # interface exist, try and add the former as the 'formatted' attribute
        # of the latter.
        # See GH-3248
        msg_str = data.pop('message', None)
        if msg_str:
            msg_if = data.setdefault('sentry.interfaces.Message',
                                     {'message': msg_str})
            if msg_if.get('message') != msg_str:
                msg_if.setdefault('formatted', msg_str)

        # Validate main event body and tags against schema
        is_valid, event_errors = validate_and_default_interface(data, 'event')
        errors.extend(event_errors)

        if 'tags' in data:
            is_valid, tag_errors = validate_and_default_interface(data['tags'],
                                                                  'tags',
                                                                  name='tags')
            errors.extend(tag_errors)

        # Validate interfaces
        for k in list(iter(data)):
            if k in CLIENT_RESERVED_ATTRS:
                continue

            value = data.pop(k)

            if not value:
                self.logger.debug('Ignored empty interface value: %s', k)
                continue

            try:
                interface = get_interface(k)
            except ValueError:
                self.logger.debug('Ignored unknown attribute: %s', k)
                errors.append({
                    'type': EventError.INVALID_ATTRIBUTE,
                    'name': k
                })
                continue

            try:
                inst = interface.to_python(value)
                data[inst.get_path()] = inst.to_json()
            except Exception as e:
                if isinstance(e, InterfaceValidationError):
                    log = self.logger.debug
                else:
                    log = self.logger.error
                log('Discarded invalid value for interface: %s (%r)',
                    k, value, exc_info=True)
                errors.append({
                    'type': EventError.INVALID_DATA,
                    'name': k,
                    'value': value
                })

        level = data.get('level') or DEFAULT_LOG_LEVEL
        if isinstance(level, int) or (isinstance(level, six.string_types)
                                      and level.isdigit()):
            level = LOG_LEVELS.get(int(level), DEFAULT_LOG_LEVEL)
        data['level'] = LOG_LEVELS_MAP.get(level,
                                           LOG_LEVELS_MAP[DEFAULT_LOG_LEVEL])

        if data.get('dist') and not data.get('release'):
            data['dist'] = None

        timestamp = data.get('timestamp')
        if not timestamp:
            timestamp = timezone.now()

        # TODO (alex) can this all be replaced by utcnow?
        # it looks like the only time that this would even be hit is when timestamp
        # is not defined, as the earlier process_timestamp already converts existing
        # timestamps to floats.
        if isinstance(timestamp, datetime):
            # We must convert date to local time so Django doesn't mess it up
            # based on TIME_ZONE
            if settings.TIME_ZONE:
                if not timezone.is_aware(timestamp):
                    timestamp = timestamp.replace(tzinfo=timezone.utc)
            elif timezone.is_aware(timestamp):
                timestamp = timestamp.replace(tzinfo=None)
            timestamp = float(timestamp.strftime('%s'))

        data['timestamp'] = timestamp
        data['received'] = float(timezone.now().strftime('%s'))

        data.setdefault('culprit', None)
        data.setdefault('transaction', None)
        data.setdefault('server_name', None)
        data.setdefault('site', None)
        data.setdefault('checksum', None)
        data.setdefault('fingerprint', None)
        data.setdefault('platform', None)
        data.setdefault('dist', None)
        data.setdefault('environment', None)
        data.setdefault('extra', {})
        data.setdefault('tags', [])

        # Fix case where legacy apps pass 'environment' as a tag
        # instead of a top level key.
        # TODO (alex) save() just reinserts the environment into the tags
        if not data.get('environment'):
            tagsdict = dict(data['tags'])
            if 'environment' in tagsdict:
                data['environment'] = tagsdict['environment']
                del tagsdict['environment']
                data['tags'] = tagsdict.items()

        # the SDKs currently do not describe event types, and we must infer
        # them from available attributes
        data['type'] = eventtypes.infer(data).key
        data['version'] = self.version

        exception = data.get('sentry.interfaces.Exception')
        stacktrace = data.get('sentry.interfaces.Stacktrace')
        if exception and len(exception['values']) == 1 and stacktrace:
            exception['values'][0]['stacktrace'] = stacktrace
            del data['sentry.interfaces.Stacktrace']

        if 'sentry.interfaces.Http' in data:
            try:
                ip_address = validate_ip(
                    data['sentry.interfaces.Http'].get('env',
                                                       {}).get('REMOTE_ADDR'),
                    required=False,
                )
                if ip_address:
                    data.setdefault('sentry.interfaces.User',
                                    {}).setdefault('ip_address', ip_address)
            except ValueError:
                pass

        # Trim values
        logger = data.get('logger', DEFAULT_LOGGER_NAME)
        data['logger'] = trim(logger.strip(), 64)

        trim_dict(data['extra'],
                  max_size=settings.SENTRY_MAX_EXTRA_VARIABLE_SIZE)

        if data['culprit']:
            data['culprit'] = trim(data['culprit'], MAX_CULPRIT_LENGTH)

        if data['transaction']:
            data['transaction'] = trim(data['transaction'], MAX_CULPRIT_LENGTH)

        data['errors'] = errors

        return data
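
A note on the `to_values` cast above: it only wraps non-empty lists and tuples, so payloads that already use the `{'values': [...]}` shape pass through untouched:

def to_values(v):
    return {'values': v} if v and isinstance(v, (tuple, list)) else v


assert to_values([{'type': 'ValueError'}]) == {'values': [{'type': 'ValueError'}]}
assert to_values({'values': []}) == {'values': []}  # already wrapped: unchanged
assert to_values([]) == []                          # empty list: left alone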
Example #11
    return result


initial_fields = {
    'culprit': lambda event: generate_culprit(
        event.data,
        event.platform,
    ),
    'data': lambda event: {
        'last_received': event.data.get('received') or float(event.datetime.strftime('%s')),
        'type': event.data['type'],
        'metadata': event.data['metadata'],
    },
    'last_seen': lambda event: event.datetime,
    'level': lambda event: LOG_LEVELS_MAP.get(
        event.get_tag('level'),
        logging.ERROR,
    ),
    'message': lambda event: event.message,
    'times_seen': lambda event: 0,
}


backfill_fields = {
    'platform': lambda caches, data, event: event.platform,
    'logger': lambda caches, data, event: event.get_tag('logger') or DEFAULT_LOGGER_NAME,
    'first_seen': lambda caches, data, event: event.datetime,
    'active_at': lambda caches, data, event: event.datetime,
    'first_release': lambda caches, data, event: caches['Release'](
        caches['Project'](event.project_id).organization_id,
        event.get_tag('sentry:release'),
    ) if event.get_tag('sentry:release') else data.get('first_release', None),
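
The `caches` mapping threaded through backfill_fields reads like a table of memoized model lookups keyed by model name. A hypothetical construction of such a table (the real factory is not part of this snippet):

import functools


def make_caches(lookups):
    # Wrap each lookup in an unbounded LRU cache so repeated backfills of
    # the same project or release hit the database only once.
    return {name: functools.lru_cache(maxsize=None)(fn)
            for name, fn in lookups.items()}


calls = []
caches = make_caches({'Project': lambda pk: calls.append(pk) or {'id': pk}})
caches['Project'](1)
caches['Project'](1)
assert calls == [1]  # the second call was served from the cache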
Example #12
    def save(self, project_id, raw=False, assume_normalized=False, start_time=None, cache_key=None):
        """
        After normalizing and processing an event, save adjacent models such as
        releases and environments to postgres and write the event into
        eventstream. From there it will be picked up by Snuba and
        post-processing.

        We re-insert events with duplicate IDs into Snuba, which is responsible
        for deduplicating events. Since deduplication in Snuba is on the primary
        key (based on event ID, project ID and day), events with same IDs are only
        deduplicated if their timestamps fall on the same day. The latest event
        always wins and overwrites the value of events received earlier in that day.

        Since we increment counters and frequencies here before events get inserted
        to eventstream these numbers may be larger than the total number of
        events if we receive duplicate event IDs that fall on the same day
        (that do not hit cache first).
        """

        # Normalize if needed
        if not self._normalized:
            if not assume_normalized:
                self.normalize(project_id=project_id)
            self._normalized = True

        with metrics.timer("event_manager.save.project.get_from_cache"):
            project = Project.objects.get_from_cache(id=project_id)

        projects = {project.id: project}

        if self._data.get("type") == "transaction":
            self._data["project"] = int(project_id)
            job = {"data": self._data, "start_time": start_time}
            jobs = save_transaction_events([job], projects)
            return jobs[0]["event"]

        with metrics.timer("event_manager.save.organization.get_from_cache"):
            project._organization_cache = Organization.objects.get_from_cache(
                id=project.organization_id
            )

        job = {"data": self._data, "project_id": project_id, "raw": raw, "start_time": start_time}
        jobs = [job]

        _pull_out_data(jobs, projects)
        _get_or_create_release_many(jobs, projects)
        _get_event_user_many(jobs, projects)

        job["project_key"] = None
        if job["key_id"] is not None:
            with metrics.timer("event_manager.load_project_key"):
                try:
                    job["project_key"] = ProjectKey.objects.get_from_cache(id=job["key_id"])
                except ProjectKey.DoesNotExist:
                    pass

        with metrics.timer("event_manager.load_grouping_config"):
            # At this point we want to normalize the in_app values in case the
            # clients did not set this appropriately so far.
            grouping_config = load_grouping_config(
                get_grouping_config_dict_for_event_data(job["data"], project)
            )

        with metrics.timer("event_manager.normalize_stacktraces_for_grouping"):
            normalize_stacktraces_for_grouping(job["data"], grouping_config)

        _derive_plugin_tags_many(jobs, projects)
        _derive_interface_tags_many(jobs)

        with metrics.timer("event_manager.apply_server_fingerprinting"):
            # The active grouping config was put into the event in the
            # normalize step before.  We now also make sure that the
            # fingerprint was set to `'{{ default }}'` just in case someone
            # removed it from the payload.  The call to get_hashes will then
            # look at `grouping_config` to pick the right parameters.
            job["data"]["fingerprint"] = job["data"].get("fingerprint") or ["{{ default }}"]
            apply_server_fingerprinting(job["data"], get_fingerprinting_config_for_project(project))

        with metrics.timer("event_manager.event.get_hashes"):
            # Here we try to use the grouping config that was requested in the
            # event.  If that config has since been deleted (because it was an
            # experimental grouping config) we fall back to the default.
            try:
                hashes = job["event"].get_hashes()
            except GroupingConfigNotFound:
                job["data"]["grouping_config"] = get_grouping_config_dict_for_project(project)
                hashes = job["event"].get_hashes()

        job["data"]["hashes"] = hashes

        _materialize_metadata_many(jobs)

        # The group gets the same metadata as the event when it's flushed but
        # additionally the `last_received` key is set.  This key is used by
        # _save_aggregate.
        group_metadata = dict(job["materialized_metadata"])
        group_metadata["last_received"] = job["received_timestamp"]
        kwargs = {
            "platform": job["platform"],
            "message": job["event"].search_message,
            "culprit": job["culprit"],
            "logger": job["logger_name"],
            "level": LOG_LEVELS_MAP.get(job["level"]),
            "last_seen": job["event"].datetime,
            "first_seen": job["event"].datetime,
            "active_at": job["event"].datetime,
            "data": group_metadata,
        }

        if job["release"]:
            kwargs["first_release"] = job["release"]

        # Load attachments first, but persist them at the very last after
        # posting to eventstream to make sure all counters and eventstream are
        # incremented for sure. Also wait for grouping to remove attachments
        # based on the group counter.
        with metrics.timer("event_manager.get_attachments"):
            attachments = get_attachments(cache_key, job)

        try:
            job["group"], job["is_new"], job["is_regression"] = _save_aggregate(
                event=job["event"], hashes=hashes, release=job["release"], **kwargs
            )
        except HashDiscarded:
            discard_event(job, attachments)
            raise

        job["event"].group = job["group"]

        # store a reference to the group id to guarantee validation of isolation
        # XXX(markus): No clue what this does
        job["event"].data.bind_ref(job["event"])

        _get_or_create_environment_many(jobs, projects)

        if job["group"]:
            group_environment, job["is_new_group_environment"] = GroupEnvironment.get_or_create(
                group_id=job["group"].id,
                environment_id=job["environment"].id,
                defaults={"first_release": job["release"] or None},
            )
        else:
            job["is_new_group_environment"] = False

        _get_or_create_release_associated_models(jobs, projects)

        if job["release"] and job["group"]:
            job["grouprelease"] = GroupRelease.get_or_create(
                group=job["group"],
                release=job["release"],
                environment=job["environment"],
                datetime=job["event"].datetime,
            )

        _tsdb_record_all_metrics(jobs)

        if job["group"]:
            UserReport.objects.filter(project=project, event_id=job["event"].event_id).update(
                group=job["group"], environment=job["environment"]
            )

        with metrics.timer("event_manager.filter_attachments_for_group"):
            attachments = filter_attachments_for_group(attachments, job)

        # XXX: DO NOT MUTATE THE EVENT PAYLOAD AFTER THIS POINT
        _materialize_event_metrics(jobs)

        for attachment in attachments:
            key = "bytes.stored.%s" % (attachment.type,)
            old_bytes = job["event_metrics"].get(key) or 0
            job["event_metrics"][key] = old_bytes + attachment.size

        _nodestore_save_many(jobs)
        save_unprocessed_event(project, event_id=job["event"].event_id)

        if job["release"]:
            if job["is_new"]:
                buffer.incr(
                    ReleaseProject,
                    {"new_groups": 1},
                    {"release_id": job["release"].id, "project_id": project.id},
                )
            if job["is_new_group_environment"]:
                buffer.incr(
                    ReleaseProjectEnvironment,
                    {"new_issues_count": 1},
                    {
                        "project_id": project.id,
                        "release_id": job["release"].id,
                        "environment_id": job["environment"].id,
                    },
                )

        if not raw:
            if not project.first_event:
                project.update(first_event=job["event"].datetime)
                first_event_received.send_robust(
                    project=project, event=job["event"], sender=Project
                )

        _eventstream_insert_many(jobs)

        # Do this last to ensure signals get emitted even if connection to the
        # file store breaks temporarily.
        with metrics.timer("event_manager.save_attachments"):
            save_attachments(cache_key, attachments, job)

        metric_tags = {"from_relay": "_relay_processed" in job["data"]}

        metrics.timing(
            "events.latency",
            job["received_timestamp"] - job["recorded_timestamp"],
            tags=metric_tags,
        )
        metrics.timing("events.size.data.post_save", job["event"].size, tags=metric_tags)
        metrics.incr(
            "events.post_save.normalize.errors",
            amount=len(job["data"].get("errors") or ()),
            tags=metric_tags,
        )

        _track_outcome_accepted_many(jobs)

        self._data = job["event"].data.data
        return job["event"]
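
Compared with the older save() variants above, this version is organized as `_step_many(jobs, projects)` passes over a list of job dicts. Reduced to a sketch (step names hypothetical), the pipeline style looks like this:

def _set_platform_many(jobs, projects):
    # Hypothetical step: each pass mutates every job dict in place, so later
    # steps can read whatever earlier ones attached (event, release, ...).
    for job in jobs:
        job['platform'] = job['data'].get('platform') or 'other'


def run_save_pipeline(jobs, projects, steps):
    for step in steps:
        step(jobs, projects)
    return jobs


jobs = [{'data': {'platform': None}}]
run_save_pipeline(jobs, {}, [_set_platform_many])
assert jobs[0]['platform'] == 'other'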
Example #13
    def save(self, project_id, raw=False, assume_normalized=False):
        # Normalize if needed
        if not self._normalized:
            if not assume_normalized:
                self.normalize()
            self._normalized = True

        data = self._data

        project = Project.objects.get_from_cache(id=project_id)
        project._organization_cache = Organization.objects.get_from_cache(
            id=project.organization_id)

        # Check to make sure we're not about to do a bunch of work that's
        # already been done if we've processed an event with this ID. (This
        # isn't a perfect solution -- this doesn't handle ``EventMapping`` and
        # there's a race condition between here and when the event is actually
        # saved, but it's an improvement. See GH-7677.)
        try:
            event = Event.objects.get(
                project_id=project.id,
                event_id=data['event_id'],
            )
        except Event.DoesNotExist:
            pass
        else:
            # Make sure we cache on the project before returning
            event._project_cache = project
            logger.info(
                'duplicate.found',
                exc_info=True,
                extra={
                    'event_uuid': data['event_id'],
                    'project_id': project.id,
                    'model': Event.__name__,
                }
            )
            return event

        # Pull out the culprit
        culprit = self.get_culprit()

        # Pull the toplevel data we're interested in
        level = data.get('level')

        # TODO(mitsuhiko): this code path should be gone by July 2018.
        # This is going to be fine because no code actually still depends
        # on integers here.  When we need an integer it will be converted
        # into one later.  Old workers used to send integers here.
        if level is not None and isinstance(level, six.integer_types):
            level = LOG_LEVELS[level]

        transaction_name = data.get('transaction')
        logger_name = data.get('logger')
        release = data.get('release')
        dist = data.get('dist')
        environment = data.get('environment')
        recorded_timestamp = data.get('timestamp')

        # We need to swap out the data with the one internal to the newly
        # created event object
        event = self._get_event_instance(project_id=project_id)
        self._data = data = event.data.data

        event._project_cache = project

        date = event.datetime
        platform = event.platform
        event_id = event.event_id

        if transaction_name:
            transaction_name = force_text(transaction_name)

        # Some of the data that are toplevel attributes are duplicated
        # into tags (logger, level, environment, transaction).  These are
        # different from legacy attributes which are normalized into tags
        # ahead of time (site, server_name).
        setdefault_path(data, 'tags', value=[])
        set_tag(data, 'level', level)
        if logger_name:
            set_tag(data, 'logger', logger_name)
        if environment:
            set_tag(data, 'environment', environment)
        if transaction_name:
            set_tag(data, 'transaction', transaction_name)

        if release:
            # don't allow a conflicting 'release' tag
            pop_tag(data, 'release')
            release = Release.get_or_create(
                project=project,
                version=release,
                date_added=date,
            )
            set_tag(data, 'sentry:release', release.version)

        if dist and release:
            dist = release.add_dist(dist, date)
            # don't allow a conflicting 'dist' tag
            pop_tag(data, 'dist')
            set_tag(data, 'sentry:dist', dist.name)
        else:
            dist = None

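        # Resolve the event's user context into an EventUser and mirror it
        # into the reserved 'sentry:user' tag.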
        event_user = self._get_event_user(project, data)
        if event_user:
            # don't allow a conflicting 'user' tag
            pop_tag(data, 'user')
            set_tag(data, 'sentry:user', event_user.tag_value)

        # At this point we want to normalize the in_app values in case the
        # clients did not set this appropriately so far.
        grouping_config = load_grouping_config(
            get_grouping_config_dict_for_event_data(data, project))
        normalize_stacktraces_for_grouping(data, grouping_config)

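        # Give project plugins a chance to contribute additional tags.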
        for plugin in plugins.for_project(project, version=None):
            added_tags = safe_execute(plugin.get_tags, event, _with_transaction=False)
            if added_tags:
                # plugins should not override user-provided tags
                for key, value in added_tags:
                    if get_tag(data, key) is None:
                        set_tag(data, key, value)

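        # Promote tags exposed by the event's interfaces and strip
        # ephemeral interface data from the stored payload.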
        for path, iface in six.iteritems(event.interfaces):
            for k, v in iface.iter_tags():
                set_tag(data, k, v)
            # Get rid of ephemeral interface data
            if iface.ephemeral:
                data.pop(iface.path, None)

        # The active grouping config was put into the event in the
        # normalize step before.  We now also make sure that the
        # fingerprint was set to '{{ default }}' just in case someone
        # removed it from the payload.  The call to get_hashes will then
        # look at `grouping_config` to pick the right parameters.
        data['fingerprint'] = data.get('fingerprint') or ['{{ default }}']
        apply_server_fingerprinting(data, get_fingerprinting_config_for_project(project))
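        # Compute the grouping hashes that determine which issue (group)
        # this event aggregates into; the first hash is also forwarded to
        # the eventstream as primary_hash below.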
        hashes = event.get_hashes()
        data['hashes'] = hashes

        # We want to freeze not just the metadata and type, but also the
        # derived attributes.  The reason for this is that we push this
        # data into Kafka for Snuba processing, and our post-processing
        # picks up the data right from the Snuba topic.  For most usage,
        # however, the data is dynamically overridden by Event.title and
        # Event.location (see Event.as_dict).
        materialized_metadata = self.materialize_metadata()
        event_metadata = materialized_metadata['metadata']
        data.update(materialized_metadata)
        data['culprit'] = culprit

        # index components into ``Event.message``
        # See GH-3248
        event.message = self.get_search_message(event_metadata, culprit)
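        # Prefer the ingest-time 'received' timestamp; fall back to the
        # event datetime converted to epoch seconds.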
        received_timestamp = event.data.get('received') or float(event.datetime.strftime('%s'))

        # The group gets the same metadata as the event when it's flushed but
        # additionally the `last_received` key is set.  This key is used by
        # _save_aggregate.
        group_metadata = dict(materialized_metadata)
        group_metadata['last_received'] = received_timestamp
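        # Field defaults used by _save_aggregate if it has to create a
        # brand-new group for this event.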
        kwargs = {
            'platform': platform,
            'message': event.message,
            'culprit': culprit,
            'logger': logger_name,
            'level': LOG_LEVELS_MAP.get(level),
            'last_seen': date,
            'first_seen': date,
            'active_at': date,
            'data': group_metadata,
        }

        if release:
            kwargs['first_release'] = release

        try:
            group, is_new, is_regression, is_sample = self._save_aggregate(
                event=event, hashes=hashes, release=release, **kwargs
            )
        except HashDiscarded:
            event_discarded.send_robust(
                project=project,
                sender=EventManager,
            )

            metrics.incr(
                'events.discarded',
                skip_internal=True,
                tags={
                    'organization_id': project.organization_id,
                    'platform': platform,
                },
            )
            raise
        else:
            event_saved.send_robust(
                project=project,
                event_size=event.size,
                sender=EventManager,
            )

        event.group = group
        # store a reference to the group id to guarantee validation of isolation
        event.data.bind_ref(event)

        # When an event is sampled, the canonical source of truth is the
        # EventMapping table, since we aren't going to write out an actual
        # Event row.  Otherwise, if the event isn't being sampled, we can
        # safely rely on the Event table itself as the source of truth and
        # ignore EventMapping, since it's redundant information.
        if is_sample:
            try:
                with transaction.atomic(using=router.db_for_write(EventMapping)):
                    EventMapping.objects.create(project=project, group=group, event_id=event_id)
            except IntegrityError:
                logger.info(
                    'duplicate.found',
                    exc_info=True,
                    extra={
                        'event_uuid': event_id,
                        'project_id': project.id,
                        'group_id': group.id,
                        'model': EventMapping.__name__,
                    }
                )
                return event

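        # Resolve the environment name to an Environment model and link it
        # to the group, creating records on first use.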
        environment = Environment.get_or_create(
            project=project,
            name=environment,
        )

        group_environment, is_new_group_environment = GroupEnvironment.get_or_create(
            group_id=group.id,
            environment_id=environment.id,
            defaults={
                'first_release': release if release else None,
            },
        )

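        # Maintain the join records between the release, the environment,
        # the project and the group for this event.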
        if release:
            ReleaseEnvironment.get_or_create(
                project=project,
                release=release,
                environment=environment,
                datetime=date,
            )

            ReleaseProjectEnvironment.get_or_create(
                project=project,
                release=release,
                environment=environment,
                datetime=date,
            )

            grouprelease = GroupRelease.get_or_create(
                group=group,
                release=release,
                environment=environment,
                datetime=date,
            )

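        # Bump time-series event counters for the group, the project and,
        # if present, the release.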
        counters = [
            (tsdb.models.group, group.id),
            (tsdb.models.project, project.id),
        ]

        if release:
            counters.append((tsdb.models.release, release.id))

        tsdb.incr_multi(counters, timestamp=event.datetime, environment_id=environment.id)

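        # Record frequency rollups, e.g. how often this group is seen in
        # each environment (and, below, in each release).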
        frequencies = [
            # (tsdb.models.frequent_projects_by_organization, {
            #     project.organization_id: {
            #         project.id: 1,
            #     },
            # }),
            # (tsdb.models.frequent_issues_by_project, {
            #     project.id: {
            #         group.id: 1,
            #     },
            # })
            (tsdb.models.frequent_environments_by_group, {
                group.id: {
                    environment.id: 1,
                },
            })
        ]

        if release:
            frequencies.append(
                (tsdb.models.frequent_releases_by_group, {
                    group.id: {
                        grouprelease.id: 1,
                    },
                })
            )

        tsdb.record_frequency_multi(frequencies, timestamp=event.datetime)

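        # Backfill any user feedback reports filed against this event ID
        # with the resolved group and environment.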
        UserReport.objects.filter(
            project=project,
            event_id=event_id,
        ).update(
            group=group,
            environment=environment,
        )

        # save the event unless it's been sampled
        if not is_sample:
            try:
                with transaction.atomic(using=router.db_for_write(Event)):
                    event.save()
            except IntegrityError:
                logger.info(
                    'duplicate.found',
                    exc_info=True,
                    extra={
                        'event_uuid': event_id,
                        'project_id': project.id,
                        'group_id': group.id,
                        'model': Event.__name__,
                    }
                )
                return event

            tagstore.delay_index_event_tags(
                organization_id=project.organization_id,
                project_id=project.id,
                group_id=group.id,
                environment_id=environment.id,
                event_id=event.id,
                tags=event.tags,
                date_added=event.datetime,
            )

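        # Track the set of distinct users affected per group and per
        # project.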
        if event_user:
            tsdb.record_multi(
                (
                    (tsdb.models.users_affected_by_group, group.id, (event_user.tag_value, )),
                    (tsdb.models.users_affected_by_project, project.id, (event_user.tag_value, )),
                ),
                timestamp=event.datetime,
                environment_id=environment.id,
            )
        if release:
            if is_new:
                buffer.incr(
                    ReleaseProject, {'new_groups': 1}, {
                        'release_id': release.id,
                        'project_id': project.id,
                    }
                )
            if is_new_group_environment:
                buffer.incr(
                    ReleaseProjectEnvironment, {'new_issues_count': 1}, {
                        'project_id': project.id,
                        'release_id': release.id,
                        'environment_id': environment.id,
                    }
                )

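        # Write the event's tags into the group's tag storage; failures
        # here are swallowed by safe_execute rather than aborting the save.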
        safe_execute(
            Group.objects.add_tags,
            group,
            environment,
            event.get_tags(),
            _with_transaction=False)

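        # For non-raw events, record the project's very first event and
        # fire the corresponding signal the first time.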
        if not raw:
            if not project.first_event:
                project.update(first_event=date)
                first_event_received.send_robust(project=project, group=group, sender=Project)

        eventstream.insert(
            group=group,
            event=event,
            is_new=is_new,
            is_sample=is_sample,
            is_regression=is_regression,
            is_new_group_environment=is_new_group_environment,
            primary_hash=hashes[0],
            # We are choosing to skip consuming the event back
            # in the eventstream if it's flagged as raw.
            # This means that we want to publish the event
            # through the event stream, but we don't care
            # about post processing and handling the commit.
            skip_consume=raw,
        )

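        # Emit ingestion metrics: end-to-end latency (from the event's own
        # timestamp to receipt) and post-save payload size.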
        metrics.timing(
            'events.latency',
            received_timestamp - recorded_timestamp,
            tags={
                'project_id': project.id,
            },
        )

        metrics.timing(
            'events.size.data.post_save',
            event.size,
            tags={'project_id': project.id}
        )

        return event
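
# Usage sketch (hypothetical, not part of the original listing): the typical
# ingestion path builds an EventManager from the raw payload, normalizes it,
# and then persists it via save().
#
#   manager = EventManager(raw_payload)  # raw_payload: event dict from an SDK
#   manager.normalize()
#   event = manager.save(project_id=project.id)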