def add_metrics(resource_id, request_params, dataset, proto_version):
    """Store metric rows in the database, bump Redis counters and index the documents in Elasticsearch."""
    current_time = datetime.utcnow().replace(second=0, microsecond=0)
    try:
        resource = ApplicationService.by_id_cached()(resource_id)
        resource = DBSession.merge(resource, load=False)
        es_docs = []
        rows = []
        for metric in dataset:
            tags = dict(metric["tags"])
            server_n = tags.get("server_name", metric["server_name"]).lower()
            tags["server_name"] = server_n or "unknown"
            new_metric = Metric(
                timestamp=metric["timestamp"],
                resource_id=resource.resource_id,
                namespace=metric["namespace"],
                tags=tags,
            )
            rows.append(new_metric)
            es_docs.append(new_metric.es_doc())
        session = DBSession()
        session.bulk_save_objects(rows)
        session.flush()

        action = "METRICS"
        metrics_msg = "%s: %s, metrics: %s, proto:%s" % (
            action,
            str(resource),
            len(dataset),
            proto_version,
        )
        log.info(metrics_msg)

        mark_changed(session)
        redis_pipeline = Datastores.redis.pipeline(transaction=False)
        key = REDIS_KEYS["counters"]["metrics_per_minute"].format(current_time)
        redis_pipeline.incr(key, len(rows))
        redis_pipeline.expire(key, 3600 * 24)
        key = REDIS_KEYS["counters"]["events_per_minute_per_user"].format(
            resource.owner_user_id, current_time
        )
        redis_pipeline.incr(key, len(rows))
        redis_pipeline.expire(key, 3600)
        key = REDIS_KEYS["counters"]["metrics_per_hour_per_app"].format(
            resource_id, current_time.replace(minute=0)
        )
        redis_pipeline.incr(key, len(rows))
        redis_pipeline.expire(key, 3600 * 24 * 7)
        redis_pipeline.sadd(
            REDIS_KEYS["apps_that_got_new_data_per_hour"].format(
                current_time.replace(minute=0)
            ),
            resource_id,
        )
        redis_pipeline.execute()
        add_metrics_es(es_docs)
        return True
    except Exception as exc:
        print_traceback(log)
        if celery.conf["CELERY_EAGER_PROPAGATES_EXCEPTIONS"]:
            raise
        add_metrics.retry(exc=exc)


def check_user_report_notifications(resource_id):
    """Collect pending report group ids from Redis and notify users who can view the application."""
    since_when = datetime.utcnow()
    try:
        request = get_current_request()
        application = ApplicationService.by_id(resource_id)
        if not application:
            return
        error_key = REDIS_KEYS["reports_to_notify_per_type_per_app"].format(
            ReportType.error, resource_id
        )
        slow_key = REDIS_KEYS["reports_to_notify_per_type_per_app"].format(
            ReportType.slow, resource_id
        )
        error_group_ids = Datastores.redis.smembers(error_key)
        slow_group_ids = Datastores.redis.smembers(slow_key)
        Datastores.redis.delete(error_key)
        Datastores.redis.delete(slow_key)
        err_gids = [int(g_id) for g_id in error_group_ids]
        slow_gids = [int(g_id) for g_id in list(slow_group_ids)]
        group_ids = err_gids + slow_gids
        occurence_dict = {}
        for g_id in group_ids:
            key = REDIS_KEYS["counters"]["report_group_occurences"].format(g_id)
            val = Datastores.redis.get(key)
            Datastores.redis.delete(key)
            if val:
                occurence_dict[g_id] = int(val)
            else:
                occurence_dict[g_id] = 1
        report_groups = ReportGroupService.by_ids(group_ids)
        report_groups.options(sa.orm.joinedload(ReportGroup.last_report_ref))

        ApplicationService.check_for_groups_alert(
            application,
            "alert",
            report_groups=report_groups,
            occurence_dict=occurence_dict,
        )
        users = set(
            [p.user for p in ResourceService.users_for_perm(application, "view")]
        )
        report_groups = report_groups.all()
        for user in users:
            UserService.report_notify(
                user,
                request,
                application,
                report_groups=report_groups,
                occurence_dict=occurence_dict,
            )
        for group in report_groups:
            # mark the report group as notified
            if not group.notified:
                group.notified = True
    except Exception as exc:
        print_traceback(log)
        raise


def add_uptime_stats(params, metric):
    """Persist a single uptime metric, close stale uptime alert events or trigger a new alert check."""
    proto_version = parse_proto(params.get("protocol_version"))
    try:
        application = ApplicationService.by_id_cached()(metric["resource_id"])
        application = DBSession.merge(application, load=False)
        if not application:
            return
        start_interval = convert_date(metric["timestamp"])
        start_interval = start_interval.replace(second=0, microsecond=0)
        new_metric = UptimeMetric(
            start_interval=start_interval,
            response_time=metric["response_time"],
            status_code=metric["status_code"],
            is_ok=metric["is_ok"],
            location=metric.get("location", 1),
            tries=metric["tries"],
            resource_id=application.resource_id,
            owner_user_id=application.owner_user_id,
        )
        DBSession.add(new_metric)
        DBSession.flush()
        add_metrics_uptime([new_metric.es_doc()])
        if metric["is_ok"]:
            event_types = [Event.types["uptime_alert"]]
            statuses = [Event.statuses["active"]]
            # get active uptime alert events older than a few minutes and close them
            events = EventService.by_type_and_status(
                event_types,
                statuses,
                older_than=(datetime.utcnow() - timedelta(minutes=6)),
                app_ids=[application.resource_id],
            )
            for event in events:
                event.close()
        else:
            UptimeMetricService.check_for_alert(application, metric=metric)
        action = "METRICS UPTIME"
        metrics_msg = "%s: %s, proto:%s" % (action, str(application), proto_version)
        log.info(metrics_msg)
        session = DBSession()
        mark_changed(session)
        return True
    except Exception as exc:
        print_traceback(log)
        add_uptime_stats.retry(exc=exc)


def check_alerts(resource_id):
    """Collect pending alerting report group ids from Redis and run the group alert check for the application."""
    since_when = datetime.utcnow()
    try:
        request = get_current_request()
        application = ApplicationService.by_id(resource_id)
        if not application:
            return
        error_key = REDIS_KEYS["reports_to_notify_per_type_per_app_alerting"].format(
            ReportType.error, resource_id
        )
        slow_key = REDIS_KEYS["reports_to_notify_per_type_per_app_alerting"].format(
            ReportType.slow, resource_id
        )
        error_group_ids = Datastores.redis.smembers(error_key)
        slow_group_ids = Datastores.redis.smembers(slow_key)
        Datastores.redis.delete(error_key)
        Datastores.redis.delete(slow_key)
        err_gids = [int(g_id) for g_id in error_group_ids]
        slow_gids = [int(g_id) for g_id in list(slow_group_ids)]
        group_ids = err_gids + slow_gids
        occurence_dict = {}
        for g_id in group_ids:
            key = REDIS_KEYS["counters"]["report_group_occurences_alerting"].format(
                g_id
            )
            val = Datastores.redis.get(key)
            Datastores.redis.delete(key)
            if val:
                occurence_dict[g_id] = int(val)
            else:
                occurence_dict[g_id] = 1
        report_groups = ReportGroupService.by_ids(group_ids)
        report_groups.options(sa.orm.joinedload(ReportGroup.last_report_ref))

        ApplicationService.check_for_groups_alert(
            application,
            "alert",
            report_groups=report_groups,
            occurence_dict=occurence_dict,
            since_when=since_when,
        )
    except Exception as exc:
        print_traceback(log)
        raise


def update_tag_counter(tag_name, tag_value, count):
    """Increment the times_seen counter and refresh last_timestamp for a tag/value pair."""
    try:
        query = (
            DBSession.query(Tag)
            .filter(Tag.name == tag_name)
            .filter(
                sa.cast(Tag.value, sa.types.TEXT)
                == sa.cast(json.dumps(tag_value), sa.types.TEXT)
            )
        )
        query.update(
            {
                "times_seen": Tag.times_seen + count,
                "last_timestamp": datetime.utcnow(),
            },
            synchronize_session=False,
        )
        session = DBSession()
        mark_changed(session)
        return True
    except Exception as exc:
        print_traceback(log)
        if celery.conf["CELERY_EAGER_PROPAGATES_EXCEPTIONS"]:
            raise
        update_tag_counter.retry(exc=exc)


def close_alerts():
    """Close active report alert events that are no longer valid."""
    log.warning("Checking alerts")
    since_when = datetime.utcnow()
    try:
        event_types = [
            Event.types["error_report_alert"],
            Event.types["slow_report_alert"],
        ]
        statuses = [Event.statuses["active"]]
        # get events older than 5 min
        events = EventService.by_type_and_status(
            event_types, statuses, older_than=(since_when - timedelta(minutes=5))
        )
        for event in events:
            # see if we can close them
            event.validate_or_close(since_when=(since_when - timedelta(minutes=1)))
    except Exception as exc:
        print_traceback(log)
        raise


def add_reports(resource_id, request_params, dataset, **kwargs):
    """Store incoming reports, group them, update counters and stats, and index the documents in Elasticsearch."""
    proto_version = parse_proto(request_params.get("protocol_version", ""))
    current_time = datetime.utcnow().replace(second=0, microsecond=0)
    try:
        # we will store es docs here for a single bulk insert
        es_report_docs = {}
        es_report_group_docs = {}
        resource = ApplicationService.by_id(resource_id)

        tags = []
        es_slow_calls_docs = {}
        es_reports_stats_rows = {}
        for report_data in dataset:
            # build report details for later
            added_details = 0
            report = Report()
            report.set_data(report_data, resource, proto_version)
            report._skip_ft_index = True

            # find latest group in this months partition
            report_group = ReportGroupService.by_hash_and_resource(
                report.resource_id,
                report.grouping_hash,
                since_when=datetime.utcnow().date().replace(day=1),
            )
            occurences = report_data.get("occurences", 1)
            if not report_group:
                # total reports will be +1 moment later
                report_group = ReportGroup(
                    grouping_hash=report.grouping_hash,
                    occurences=0,
                    total_reports=0,
                    last_report=0,
                    priority=report.priority,
                    error=report.error,
                    first_timestamp=report.start_time,
                )
                report_group._skip_ft_index = True
                report_group.report_type = report.report_type
            report.report_group_time = report_group.first_timestamp
            add_sample = pick_sample(
                report_group.occurences, report_type=report_group.report_type
            )
            if add_sample:
                resource.report_groups.append(report_group)
                report_group.reports.append(report)
                added_details += 1
                DBSession.flush()
                if report.partition_id not in es_report_docs:
                    es_report_docs[report.partition_id] = []
                es_report_docs[report.partition_id].append(report.es_doc())
                tags.extend(list(report.tags.items()))
                slow_calls = report.add_slow_calls(report_data, report_group)
                DBSession.flush()
                for s_call in slow_calls:
                    if s_call.partition_id not in es_slow_calls_docs:
                        es_slow_calls_docs[s_call.partition_id] = []
                    es_slow_calls_docs[s_call.partition_id].append(s_call.es_doc())
                # try generating new stat rows if needed
            else:
                # required for postprocessing to not fail later
                report.report_group = report_group

            stat_row = ReportService.generate_stat_rows(report, resource, report_group)
            if stat_row.partition_id not in es_reports_stats_rows:
                es_reports_stats_rows[stat_row.partition_id] = []
            es_reports_stats_rows[stat_row.partition_id].append(stat_row.es_doc())

            # see if we crossed a 10th / 100th occurence threshold for this group
            last_occurences_10 = int(math.floor(report_group.occurences / 10))
            curr_occurences_10 = int(
                math.floor((report_group.occurences + report.occurences) / 10)
            )
            last_occurences_100 = int(math.floor(report_group.occurences / 100))
            curr_occurences_100 = int(
                math.floor((report_group.occurences + report.occurences) / 100)
            )
            notify_occurences_10 = last_occurences_10 != curr_occurences_10
            notify_occurences_100 = last_occurences_100 != curr_occurences_100
            report_group.occurences = ReportGroup.occurences + occurences
            report_group.last_timestamp = report.start_time
            report_group.summed_duration = ReportGroup.summed_duration + report.duration
            summed_duration = ReportGroup.summed_duration + report.duration
            summed_occurences = ReportGroup.occurences + occurences
            report_group.average_duration = summed_duration / summed_occurences
            report_group.run_postprocessing(report)
            if added_details:
                report_group.total_reports = ReportGroup.total_reports + 1
                report_group.last_report = report.id
            report_group.set_notification_info(
                notify_10=notify_occurences_10, notify_100=notify_occurences_100
            )
            DBSession.flush()
            report_group.get_report().notify_channel(report_group)
            if report_group.partition_id not in es_report_group_docs:
                es_report_group_docs[report_group.partition_id] = []
            es_report_group_docs[report_group.partition_id].append(
                report_group.es_doc()
            )

            action = "REPORT"
            log_msg = "%s: %s %s, client: %s, proto: %s" % (
                action,
                report_data.get("http_status", "unknown"),
                str(resource),
                report_data.get("client"),
                proto_version,
            )
            log.info(log_msg)
        total_reports = len(dataset)
        redis_pipeline = Datastores.redis.pipeline(transaction=False)
        key = REDIS_KEYS["counters"]["reports_per_minute"].format(current_time)
        redis_pipeline.incr(key, total_reports)
        redis_pipeline.expire(key, 3600 * 24)
        key = REDIS_KEYS["counters"]["events_per_minute_per_user"].format(
            resource.owner_user_id, current_time
        )
        redis_pipeline.incr(key, total_reports)
        redis_pipeline.expire(key, 3600)
        key = REDIS_KEYS["counters"]["reports_per_hour_per_app"].format(
            resource_id, current_time.replace(minute=0)
        )
        redis_pipeline.incr(key, total_reports)
        redis_pipeline.expire(key, 3600 * 24 * 7)
        redis_pipeline.sadd(
            REDIS_KEYS["apps_that_got_new_data_per_hour"].format(
                current_time.replace(minute=0)
            ),
            resource_id,
        )
        redis_pipeline.execute()

        add_reports_es(es_report_group_docs, es_report_docs)
        add_reports_slow_calls_es(es_slow_calls_docs)
        add_reports_stats_rows_es(es_reports_stats_rows)
        return True
    except Exception as exc:
        print_traceback(log)
        if celery.conf["CELERY_EAGER_PROPAGATES_EXCEPTIONS"]:
            raise
        add_reports.retry(exc=exc)


def add_logs(resource_id, request_params, dataset, **kwargs):
    """Store incoming log rows, de-duplicate primary-key logs, bump Redis counters and index the documents in Elasticsearch."""
    proto_version = request_params.get("protocol_version")
    current_time = datetime.utcnow().replace(second=0, microsecond=0)

    try:
        es_docs = collections.defaultdict(list)
        resource = ApplicationService.by_id_cached()(resource_id)
        resource = DBSession.merge(resource, load=False)
        ns_pairs = []
        for entry in dataset:
            # gather pk and ns so we can remove older versions of row later
            if entry["primary_key"] is not None:
                ns_pairs.append({"pk": entry["primary_key"], "ns": entry["namespace"]})
            log_entry = Log()
            log_entry.set_data(entry, resource=resource)
            log_entry._skip_ft_index = True
            resource.logs.append(log_entry)
            DBSession.flush()
            # insert non pk rows first
            if entry["primary_key"] is None:
                es_docs[log_entry.partition_id].append(log_entry.es_doc())

        # 2nd pass to delete all log entries from db for same pk/ns pair
        if ns_pairs:
            ids_to_delete = []
            es_docs = collections.defaultdict(list)
            es_docs_to_delete = collections.defaultdict(list)
            found_pkey_logs = LogService.query_by_primary_key_and_namespace(
                list_of_pairs=ns_pairs
            )
            log_dict = {}
            for log_entry in found_pkey_logs:
                log_key = (log_entry.primary_key, log_entry.namespace)
                if log_key not in log_dict:
                    log_dict[log_key] = []
                log_dict[log_key].append(log_entry)

            for ns, entry_list in log_dict.items():
                entry_list = sorted(entry_list, key=lambda x: x.timestamp)
                # newest row needs to be indexed in es
                log_entry = entry_list[-1]
                # delete everything from pg and ES, leave the last row in pg
                for e in entry_list[:-1]:
                    ids_to_delete.append(e.log_id)
                    es_docs_to_delete[e.partition_id].append(e.delete_hash)

                es_docs_to_delete[log_entry.partition_id].append(log_entry.delete_hash)
                es_docs[log_entry.partition_id].append(log_entry.es_doc())

            if ids_to_delete:
                query = DBSession.query(Log).filter(Log.log_id.in_(ids_to_delete))
                query.delete(synchronize_session=False)
            if es_docs_to_delete:
                # batch this to avoid problems with default ES bulk limits
                for es_index in es_docs_to_delete.keys():
                    for batch in in_batches(es_docs_to_delete[es_index], 20):
                        query = {"query": {"terms": {"delete_hash": batch}}}
                        try:
                            Datastores.es.delete_by_query(
                                index=es_index,
                                doc_type="log",
                                body=query,
                                conflicts="proceed",
                            )
                        except elasticsearch.exceptions.NotFoundError as exc:
                            msg = "skipping index {}".format(es_index)
                            log.info(msg)

        total_logs = len(dataset)

        log_msg = "LOG_NEW: %s, entries: %s, proto:%s" % (
            str(resource),
            total_logs,
            proto_version,
        )
        log.info(log_msg)
        # mark_changed(session)
        redis_pipeline = Datastores.redis.pipeline(transaction=False)
        key = REDIS_KEYS["counters"]["logs_per_minute"].format(current_time)
        redis_pipeline.incr(key, total_logs)
        redis_pipeline.expire(key, 3600 * 24)
        key = REDIS_KEYS["counters"]["events_per_minute_per_user"].format(
            resource.owner_user_id, current_time
        )
        redis_pipeline.incr(key, total_logs)
        redis_pipeline.expire(key, 3600)
        key = REDIS_KEYS["counters"]["logs_per_hour_per_app"].format(
            resource_id, current_time.replace(minute=0)
        )
        redis_pipeline.incr(key, total_logs)
        redis_pipeline.expire(key, 3600 * 24 * 7)
        redis_pipeline.sadd(
            REDIS_KEYS["apps_that_got_new_data_per_hour"].format(
                current_time.replace(minute=0)
            ),
            resource_id,
        )
        redis_pipeline.execute()
        add_logs_es(es_docs)
        return True
    except Exception as exc:
        print_traceback(log)
        if celery.conf["CELERY_EAGER_PROPAGATES_EXCEPTIONS"]:
            raise
        add_logs.retry(exc=exc)