Пример #1
0
def add_metrics(resource_id, request_params, dataset, proto_version):
    current_time = datetime.utcnow().replace(second=0, microsecond=0)
    try:
        resource = ApplicationService.by_id_cached()(resource_id)
        resource = DBSession.merge(resource, load=False)
        es_docs = []
        rows = []
        for metric in dataset:
            tags = dict(metric["tags"])
            server_n = tags.get("server_name", metric["server_name"]).lower()
            tags["server_name"] = server_n or "unknown"
            new_metric = Metric(
                timestamp=metric["timestamp"],
                resource_id=resource.resource_id,
                namespace=metric["namespace"],
                tags=tags,
            )
            rows.append(new_metric)
            es_docs.append(new_metric.es_doc())
        session = DBSession()
        session.bulk_save_objects(rows)
        session.flush()

        action = "METRICS"
        metrics_msg = "%s: %s, metrics: %s, proto:%s" % (
            action,
            str(resource),
            len(dataset),
            proto_version,
        )
        log.info(metrics_msg)

        mark_changed(session)
        redis_pipeline = Datastores.redis.pipeline(transaction=False)
        key = REDIS_KEYS["counters"]["metrics_per_minute"].format(current_time)
        redis_pipeline.incr(key, len(rows))
        redis_pipeline.expire(key, 3600 * 24)
        key = REDIS_KEYS["counters"]["events_per_minute_per_user"].format(
            resource.owner_user_id, current_time)
        redis_pipeline.incr(key, len(rows))
        redis_pipeline.expire(key, 3600)
        key = REDIS_KEYS["counters"]["metrics_per_hour_per_app"].format(
            resource_id, current_time.replace(minute=0))
        redis_pipeline.incr(key, len(rows))
        redis_pipeline.expire(key, 3600 * 24 * 7)
        redis_pipeline.sadd(
            REDIS_KEYS["apps_that_got_new_data_per_hour"].format(
                current_time.replace(minute=0)),
            resource_id,
        )
        redis_pipeline.execute()
        add_metrics_es(es_docs)
        return True
    except Exception as exc:
        print_traceback(log)
        if celery.conf["CELERY_EAGER_PROPAGATES_EXCEPTIONS"]:
            raise
        add_metrics.retry(exc=exc)
Пример #2
0
def check_user_report_notifications(resource_id):
    since_when = datetime.utcnow()
    try:
        request = get_current_request()
        application = ApplicationService.by_id(resource_id)
        if not application:
            return
        error_key = REDIS_KEYS["reports_to_notify_per_type_per_app"].format(
            ReportType.error, resource_id)
        slow_key = REDIS_KEYS["reports_to_notify_per_type_per_app"].format(
            ReportType.slow, resource_id)
        error_group_ids = Datastores.redis.smembers(error_key)
        slow_group_ids = Datastores.redis.smembers(slow_key)
        Datastores.redis.delete(error_key)
        Datastores.redis.delete(slow_key)
        err_gids = [int(g_id) for g_id in error_group_ids]
        slow_gids = [int(g_id) for g_id in list(slow_group_ids)]
        group_ids = err_gids + slow_gids
        occurence_dict = {}
        for g_id in group_ids:
            key = REDIS_KEYS["counters"]["report_group_occurences"].format(
                g_id)
            val = Datastores.redis.get(key)
            Datastores.redis.delete(key)
            if val:
                occurence_dict[g_id] = int(val)
            else:
                occurence_dict[g_id] = 1
        report_groups = ReportGroupService.by_ids(group_ids)
        report_groups.options(sa.orm.joinedload(ReportGroup.last_report_ref))

        ApplicationService.check_for_groups_alert(
            application,
            "alert",
            report_groups=report_groups,
            occurence_dict=occurence_dict,
        )
        users = set([
            p.user
            for p in ResourceService.users_for_perm(application, "view")
        ])
        report_groups = report_groups.all()
        for user in users:
            UserService.report_notify(
                user,
                request,
                application,
                report_groups=report_groups,
                occurence_dict=occurence_dict,
            )
        for group in report_groups:
            # marks report_groups as notified
            if not group.notified:
                group.notified = True
    except Exception as exc:
        print_traceback(log)
        raise
Пример #3
0
def add_metrics(resource_id, request_params, dataset, proto_version):
    current_time = datetime.utcnow().replace(second=0, microsecond=0)
    try:
        resource = ApplicationService.by_id_cached()(resource_id)
        resource = DBSession.merge(resource, load=False)
        es_docs = []
        rows = []
        for metric in dataset:
            tags = dict(metric['tags'])
            server_n = tags.get('server_name', metric['server_name']).lower()
            tags['server_name'] = server_n or 'unknown'
            new_metric = Metric(timestamp=metric['timestamp'],
                                resource_id=resource.resource_id,
                                namespace=metric['namespace'],
                                tags=tags)
            rows.append(new_metric)
            es_docs.append(new_metric.es_doc())
        session = DBSession()
        session.bulk_save_objects(rows)
        session.flush()

        action = 'METRICS'
        metrics_msg = '%s: %s, metrics: %s, proto:%s' % (
            action, str(resource), len(dataset), proto_version)
        log.info(metrics_msg)

        mark_changed(session)
        redis_pipeline = Datastores.redis.pipeline(transaction=False)
        key = REDIS_KEYS['counters']['metrics_per_minute'].format(current_time)
        redis_pipeline.incr(key, len(rows))
        redis_pipeline.expire(key, 3600 * 24)
        key = REDIS_KEYS['counters']['events_per_minute_per_user'].format(
            resource.owner_user_id, current_time)
        redis_pipeline.incr(key, len(rows))
        redis_pipeline.expire(key, 3600)
        key = REDIS_KEYS['counters']['metrics_per_hour_per_app'].format(
            resource_id, current_time.replace(minute=0))
        redis_pipeline.incr(key, len(rows))
        redis_pipeline.expire(key, 3600 * 24 * 7)
        redis_pipeline.sadd(
            REDIS_KEYS['apps_that_got_new_data_per_hour'].format(
                current_time.replace(minute=0)), resource_id)
        redis_pipeline.execute()
        add_metrics_es(es_docs)
        return True
    except Exception as exc:
        print_traceback(log)
        add_metrics.retry(exc=exc)
Пример #4
0
def add_uptime_stats(params, metric):
    proto_version = parse_proto(params.get("protocol_version"))
    try:
        application = ApplicationService.by_id_cached()(metric["resource_id"])
        application = DBSession.merge(application, load=False)
        if not application:
            return
        start_interval = convert_date(metric["timestamp"])
        start_interval = start_interval.replace(second=0, microsecond=0)
        new_metric = UptimeMetric(
            start_interval=start_interval,
            response_time=metric["response_time"],
            status_code=metric["status_code"],
            is_ok=metric["is_ok"],
            location=metric.get("location", 1),
            tries=metric["tries"],
            resource_id=application.resource_id,
            owner_user_id=application.owner_user_id,
        )
        DBSession.add(new_metric)
        DBSession.flush()
        add_metrics_uptime([new_metric.es_doc()])
        if metric["is_ok"]:
            event_types = [Event.types["uptime_alert"]]
            statuses = [Event.statuses["active"]]
            # get events older than 5 min
            events = EventService.by_type_and_status(
                event_types,
                statuses,
                older_than=(datetime.utcnow() - timedelta(minutes=6)),
                app_ids=[application.resource_id],
            )
            for event in events:
                event.close()
        else:
            UptimeMetricService.check_for_alert(application, metric=metric)
        action = "METRICS UPTIME"
        metrics_msg = "%s: %s, proto:%s" % (action, str(application),
                                            proto_version)
        log.info(metrics_msg)
        session = DBSession()
        mark_changed(session)
        return True
    except Exception as exc:
        print_traceback(log)
        add_uptime_stats.retry(exc=exc)
Пример #5
0
def update_tag_counter(tag_name, tag_value, count):
    try:
        query = DBSession.query(Tag).filter(Tag.name == tag_name).filter(
            sa.cast(Tag.value, sa.types.TEXT) == sa.cast(
                json.dumps(tag_value), sa.types.TEXT))
        query.update(
            {
                'times_seen': Tag.times_seen + count,
                'last_timestamp': datetime.utcnow()
            },
            synchronize_session=False)
        session = DBSession()
        mark_changed(session)
        return True
    except Exception as exc:
        print_traceback(log)
        update_tag_counter.retry(exc=exc)
Пример #6
0
def check_alerts(resource_id):
    since_when = datetime.utcnow()
    try:
        request = get_current_request()
        application = ApplicationService.by_id(resource_id)
        if not application:
            return
        error_key = REDIS_KEYS[
            "reports_to_notify_per_type_per_app_alerting"].format(
                ReportType.error, resource_id)
        slow_key = REDIS_KEYS[
            "reports_to_notify_per_type_per_app_alerting"].format(
                ReportType.slow, resource_id)
        error_group_ids = Datastores.redis.smembers(error_key)
        slow_group_ids = Datastores.redis.smembers(slow_key)
        Datastores.redis.delete(error_key)
        Datastores.redis.delete(slow_key)
        err_gids = [int(g_id) for g_id in error_group_ids]
        slow_gids = [int(g_id) for g_id in list(slow_group_ids)]
        group_ids = err_gids + slow_gids
        occurence_dict = {}
        for g_id in group_ids:
            key = REDIS_KEYS["counters"][
                "report_group_occurences_alerting"].format(g_id)
            val = Datastores.redis.get(key)
            Datastores.redis.delete(key)
            if val:
                occurence_dict[g_id] = int(val)
            else:
                occurence_dict[g_id] = 1
        report_groups = ReportGroupService.by_ids(group_ids)
        report_groups.options(sa.orm.joinedload(ReportGroup.last_report_ref))

        ApplicationService.check_for_groups_alert(
            application,
            "alert",
            report_groups=report_groups,
            occurence_dict=occurence_dict,
            since_when=since_when,
        )
    except Exception as exc:
        print_traceback(log)
        raise
Пример #7
0
def update_tag_counter(tag_name, tag_value, count):
    try:
        query = (DBSession.query(Tag).filter(Tag.name == tag_name).filter(
            sa.cast(Tag.value, sa.types.TEXT) == sa.cast(
                json.dumps(tag_value), sa.types.TEXT)))
        query.update(
            {
                "times_seen": Tag.times_seen + count,
                "last_timestamp": datetime.utcnow()
            },
            synchronize_session=False,
        )
        session = DBSession()
        mark_changed(session)
        return True
    except Exception as exc:
        print_traceback(log)
        if celery.conf["CELERY_EAGER_PROPAGATES_EXCEPTIONS"]:
            raise
        update_tag_counter.retry(exc=exc)
Пример #8
0
def close_alerts():
    log.warning("Checking alerts")
    since_when = datetime.utcnow()
    try:
        event_types = [
            Event.types["error_report_alert"],
            Event.types["slow_report_alert"],
        ]
        statuses = [Event.statuses["active"]]
        # get events older than 5 min
        events = EventService.by_type_and_status(
            event_types,
            statuses,
            older_than=(since_when - timedelta(minutes=5)))
        for event in events:
            # see if we can close them
            event.validate_or_close(since_when=(since_when -
                                                timedelta(minutes=1)))
    except Exception as exc:
        print_traceback(log)
        raise
Пример #9
0
def add_reports(resource_id, request_params, dataset, **kwargs):
    proto_version = parse_proto(request_params.get("protocol_version", ""))
    current_time = datetime.utcnow().replace(second=0, microsecond=0)
    try:
        # we will store solr docs here for single insert
        es_report_docs = {}
        es_report_group_docs = {}
        resource = ApplicationService.by_id(resource_id)

        tags = []
        es_slow_calls_docs = {}
        es_reports_stats_rows = {}
        for report_data in dataset:
            # build report details for later
            added_details = 0
            report = Report()
            report.set_data(report_data, resource, proto_version)
            report._skip_ft_index = True

            # find latest group in this months partition
            report_group = ReportGroupService.by_hash_and_resource(
                report.resource_id,
                report.grouping_hash,
                since_when=datetime.utcnow().date().replace(day=1),
            )
            occurences = report_data.get("occurences", 1)
            if not report_group:
                # total reports will be +1 moment later
                report_group = ReportGroup(
                    grouping_hash=report.grouping_hash,
                    occurences=0,
                    total_reports=0,
                    last_report=0,
                    priority=report.priority,
                    error=report.error,
                    first_timestamp=report.start_time,
                )
                report_group._skip_ft_index = True
                report_group.report_type = report.report_type
            report.report_group_time = report_group.first_timestamp
            add_sample = pick_sample(report_group.occurences,
                                     report_type=report_group.report_type)
            if add_sample:
                resource.report_groups.append(report_group)
                report_group.reports.append(report)
                added_details += 1
                DBSession.flush()
                if report.partition_id not in es_report_docs:
                    es_report_docs[report.partition_id] = []
                es_report_docs[report.partition_id].append(report.es_doc())
                tags.extend(list(report.tags.items()))
                slow_calls = report.add_slow_calls(report_data, report_group)
                DBSession.flush()
                for s_call in slow_calls:
                    if s_call.partition_id not in es_slow_calls_docs:
                        es_slow_calls_docs[s_call.partition_id] = []
                    es_slow_calls_docs[s_call.partition_id].append(
                        s_call.es_doc())
                    # try generating new stat rows if needed
            else:
                # required for postprocessing to not fail later
                report.report_group = report_group

            stat_row = ReportService.generate_stat_rows(
                report, resource, report_group)
            if stat_row.partition_id not in es_reports_stats_rows:
                es_reports_stats_rows[stat_row.partition_id] = []
            es_reports_stats_rows[stat_row.partition_id].append(
                stat_row.es_doc())

            # see if we should mark 10th occurence of report
            last_occurences_10 = int(math.floor(report_group.occurences / 10))
            curr_occurences_10 = int(
                math.floor((report_group.occurences + report.occurences) / 10))
            last_occurences_100 = int(math.floor(report_group.occurences /
                                                 100))
            curr_occurences_100 = int(
                math.floor(
                    (report_group.occurences + report.occurences) / 100))
            notify_occurences_10 = last_occurences_10 != curr_occurences_10
            notify_occurences_100 = last_occurences_100 != curr_occurences_100
            report_group.occurences = ReportGroup.occurences + occurences
            report_group.last_timestamp = report.start_time
            report_group.summed_duration = ReportGroup.summed_duration + report.duration
            summed_duration = ReportGroup.summed_duration + report.duration
            summed_occurences = ReportGroup.occurences + occurences
            report_group.average_duration = summed_duration / summed_occurences
            report_group.run_postprocessing(report)
            if added_details:
                report_group.total_reports = ReportGroup.total_reports + 1
                report_group.last_report = report.id
            report_group.set_notification_info(
                notify_10=notify_occurences_10,
                notify_100=notify_occurences_100)
            DBSession.flush()
            report_group.get_report().notify_channel(report_group)
            if report_group.partition_id not in es_report_group_docs:
                es_report_group_docs[report_group.partition_id] = []
            es_report_group_docs[report_group.partition_id].append(
                report_group.es_doc())

            action = "REPORT"
            log_msg = "%s: %s %s, client: %s, proto: %s" % (
                action,
                report_data.get("http_status", "unknown"),
                str(resource),
                report_data.get("client"),
                proto_version,
            )
            log.info(log_msg)
        total_reports = len(dataset)
        redis_pipeline = Datastores.redis.pipeline(transaction=False)
        key = REDIS_KEYS["counters"]["reports_per_minute"].format(current_time)
        redis_pipeline.incr(key, total_reports)
        redis_pipeline.expire(key, 3600 * 24)
        key = REDIS_KEYS["counters"]["events_per_minute_per_user"].format(
            resource.owner_user_id, current_time)
        redis_pipeline.incr(key, total_reports)
        redis_pipeline.expire(key, 3600)
        key = REDIS_KEYS["counters"]["reports_per_hour_per_app"].format(
            resource_id, current_time.replace(minute=0))
        redis_pipeline.incr(key, total_reports)
        redis_pipeline.expire(key, 3600 * 24 * 7)
        redis_pipeline.sadd(
            REDIS_KEYS["apps_that_got_new_data_per_hour"].format(
                current_time.replace(minute=0)),
            resource_id,
        )
        redis_pipeline.execute()

        add_reports_es(es_report_group_docs, es_report_docs)
        add_reports_slow_calls_es(es_slow_calls_docs)
        add_reports_stats_rows_es(es_reports_stats_rows)
        return True
    except Exception as exc:
        print_traceback(log)
        if celery.conf["CELERY_EAGER_PROPAGATES_EXCEPTIONS"]:
            raise
        add_reports.retry(exc=exc)
Пример #10
0
def add_logs(resource_id, request_params, dataset, **kwargs):
    proto_version = request_params.get("protocol_version")
    current_time = datetime.utcnow().replace(second=0, microsecond=0)

    try:
        es_docs = collections.defaultdict(list)
        resource = ApplicationService.by_id_cached()(resource_id)
        resource = DBSession.merge(resource, load=False)
        ns_pairs = []
        for entry in dataset:
            # gather pk and ns so we can remove older versions of row later
            if entry["primary_key"] is not None:
                ns_pairs.append({
                    "pk": entry["primary_key"],
                    "ns": entry["namespace"]
                })
            log_entry = Log()
            log_entry.set_data(entry, resource=resource)
            log_entry._skip_ft_index = True
            resource.logs.append(log_entry)
            DBSession.flush()
            # insert non pk rows first
            if entry["primary_key"] is None:
                es_docs[log_entry.partition_id].append(log_entry.es_doc())

        # 2nd pass to delete all log entries from db for same pk/ns pair
        if ns_pairs:
            ids_to_delete = []
            es_docs = collections.defaultdict(list)
            es_docs_to_delete = collections.defaultdict(list)
            found_pkey_logs = LogService.query_by_primary_key_and_namespace(
                list_of_pairs=ns_pairs)
            log_dict = {}
            for log_entry in found_pkey_logs:
                log_key = (log_entry.primary_key, log_entry.namespace)
                if log_key not in log_dict:
                    log_dict[log_key] = []
                log_dict[log_key].append(log_entry)

            for ns, entry_list in log_dict.items():
                entry_list = sorted(entry_list, key=lambda x: x.timestamp)
                # newest row needs to be indexed in es
                log_entry = entry_list[-1]
                # delete everything from pg and ES, leave the last row in pg
                for e in entry_list[:-1]:
                    ids_to_delete.append(e.log_id)
                    es_docs_to_delete[e.partition_id].append(e.delete_hash)

                es_docs_to_delete[log_entry.partition_id].append(
                    log_entry.delete_hash)

                es_docs[log_entry.partition_id].append(log_entry.es_doc())

            if ids_to_delete:
                query = DBSession.query(Log).filter(
                    Log.log_id.in_(ids_to_delete))
                query.delete(synchronize_session=False)
            if es_docs_to_delete:
                # batch this to avoid problems with default ES bulk limits
                for es_index in es_docs_to_delete.keys():
                    for batch in in_batches(es_docs_to_delete[es_index], 20):
                        query = {"query": {"terms": {"delete_hash": batch}}}

                        try:
                            Datastores.es.delete_by_query(
                                index=es_index,
                                doc_type="log",
                                body=query,
                                conflicts="proceed",
                            )
                        except elasticsearch.exceptions.NotFoundError as exc:
                            msg = "skipping index {}".format(es_index)
                            log.info(msg)

        total_logs = len(dataset)

        log_msg = "LOG_NEW: %s, entries: %s, proto:%s" % (
            str(resource),
            total_logs,
            proto_version,
        )
        log.info(log_msg)
        # mark_changed(session)
        redis_pipeline = Datastores.redis.pipeline(transaction=False)
        key = REDIS_KEYS["counters"]["logs_per_minute"].format(current_time)
        redis_pipeline.incr(key, total_logs)
        redis_pipeline.expire(key, 3600 * 24)
        key = REDIS_KEYS["counters"]["events_per_minute_per_user"].format(
            resource.owner_user_id, current_time)
        redis_pipeline.incr(key, total_logs)
        redis_pipeline.expire(key, 3600)
        key = REDIS_KEYS["counters"]["logs_per_hour_per_app"].format(
            resource_id, current_time.replace(minute=0))
        redis_pipeline.incr(key, total_logs)
        redis_pipeline.expire(key, 3600 * 24 * 7)
        redis_pipeline.sadd(
            REDIS_KEYS["apps_that_got_new_data_per_hour"].format(
                current_time.replace(minute=0)),
            resource_id,
        )
        redis_pipeline.execute()
        add_logs_es(es_docs)
        return True
    except Exception as exc:
        print_traceback(log)
        if celery.conf["CELERY_EAGER_PROPAGATES_EXCEPTIONS"]:
            raise
        add_logs.retry(exc=exc)