def test_zerofill(self):
    results = zerofill(
        {}, datetime(2019, 1, 2, 0, 0), datetime(2019, 1, 9, 23, 59, 59), 86400, "time"
    )
    results_desc = zerofill(
        {}, datetime(2019, 1, 2, 0, 0), datetime(2019, 1, 9, 23, 59, 59), 86400, "-time"
    )
    assert results == list(reversed(results_desc))

    # One daily bucket for each of the 2nd through the 9th
    assert len(results) == 8
    assert results[0]["time"] == 1546387200
    assert results[7]["time"] == 1546992000
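# `zerofill` itself is defined elsewhere in the codebase. As a hedged illustration
# only (not the project's actual implementation), a minimal zerofill along the lines
# the test above exercises could look like this: floor start/end to rollup-aligned
# epoch buckets, keep existing rows, fill gaps with bare {"time": bucket} rows, and
# reverse the output when ordering by "-time". The name `zerofill_sketch` is
# deliberately different to avoid implying this is the real helper.
import calendar


def zerofill_sketch(data, start, end, rollup, orderby):
    # Floor the start/end datetimes (assumed naive UTC) to rollup boundaries.
    start_ts = (calendar.timegm(start.timetuple()) // rollup) * rollup
    end_ts = (calendar.timegm(end.timetuple()) // rollup) * rollup + rollup

    # Index the incoming rows by their bucket timestamp so real data survives.
    by_time = {}
    for row in data:
        by_time.setdefault(row["time"], []).append(row)

    rows = []
    for ts in range(start_ts, end_ts, rollup):
        # Emit the real rows where present, otherwise an empty placeholder bucket.
        rows.extend(by_time.get(ts, [{"time": ts}]))

    # A leading "-" on the order field means descending time order.
    return list(reversed(rows)) if orderby == "-time" else rows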
def calculate_incident_start(query, projects, groups):
    """
    Attempts to automatically calculate the date that an incident began,
    based on the events related to the incident.
    """
    params = {}
    if groups:
        params["issue.id"] = [g.id for g in groups]
        end = max(g.last_seen for g in groups) + timedelta(seconds=1)
    else:
        end = timezone.now()

    params["start"] = end - INCIDENT_START_PERIOD
    params["end"] = end

    if projects:
        params["project_id"] = [p.id for p in projects]

    filter = get_filter(query, params)
    rollup = int(INCIDENT_START_ROLLUP.total_seconds())

    result = raw_query(
        aggregations=[("count()", "", "count"), ("min", "timestamp", "first_seen")],
        orderby="time",
        groupby=["time"],
        rollup=rollup,
        referrer="incidents.calculate_incident_start",
        limit=10000,
        start=filter.start,
        end=filter.end,
        conditions=filter.conditions,
        filter_keys=filter.filter_keys,
    )["data"]
    # TODO: Start could be the period before the first period we find
    result = zerofill(result, params["start"], params["end"], rollup, "time")

    # We want to linearly scale scores from a 100% value at the most recent bucket
    # down to 50% at the oldest. This gives a bias towards newer results.
    negative_weight = (1.0 / len(result)) / 2
    multiplier = 1.0

    # State for the spike we are currently walking through (newest to oldest)
    # and for the best candidate found so far.
    cur_spike_max_count = -1
    cur_spike_start = None
    cur_spike_end = None
    max_height = 0
    incident_start = None
    cur_height = 0
    prev_count = 0

    def get_row_first_seen(row, default=None):
        first_seen = default
        if "first_seen" in row:
            first_seen = parse_date(row["first_seen"]).replace(tzinfo=pytz.utc)
        return first_seen

    def calculate_start(spike_start, spike_end):
        """
        We arbitrarily choose a date about 1/3 into the incident period. We could
        potentially improve this if we want by analyzing the period in more detail
        and choosing a date that most closely fits with being 1/3 up the spike.
        """
        spike_length = spike_end - spike_start
        return spike_start + (spike_length / 3)

    # Walk the buckets from newest to oldest.
    for row in reversed(result):
        cur_count = row.get("count", 0)
        if cur_count < prev_count or cur_count > 0 and cur_count == prev_count:
            cur_height = cur_spike_max_count - cur_count
        elif cur_count > 0 or prev_count > 0 or cur_height > 0:
            # Now we've got the height of the current spike, compare it to the
            # current max. We decrease the value by `multiplier` so that we
            # favour newer results.
            cur_height *= multiplier
            if cur_height > max_height:
                # If we detect that we have a new highest peak, then set a new
                # incident start date.
                incident_start = calculate_start(cur_spike_start, cur_spike_end)
                max_height = cur_height

            cur_height = 0
            cur_spike_max_count = cur_count
            cur_spike_end = get_row_first_seen(row)

        # We attempt to get the first_seen value from the row here. If the row
        # doesn't have it (because it's a zerofilled row), then just use the
        # previous value. This allows us to have the start of a spike always be
        # a bucket that contains at least one element.
        cur_spike_start = get_row_first_seen(row, cur_spike_start)
        prev_count = cur_count
        multiplier -= negative_weight

    if (cur_height > max_height or not incident_start) and cur_spike_start:
        incident_start = calculate_start(cur_spike_start, cur_spike_end)

    if not incident_start:
        incident_start = timezone.now()

    return incident_start
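# The loop above biases spike heights towards recent buckets. As a hedged,
# standalone illustration (not part of the real module), the multiplier applied
# to a spike ending at each bucket, newest first, falls off linearly from 1.0
# towards roughly 0.5 at the oldest bucket:
def recency_weights_sketch(num_buckets):
    # Mirrors how `multiplier` is decremented by `negative_weight` once per
    # bucket in calculate_incident_start.
    negative_weight = (1.0 / num_buckets) / 2
    return [1.0 - i * negative_weight for i in range(num_buckets)]


# For example, recency_weights_sketch(4) == [1.0, 0.875, 0.75, 0.625], so two
# spikes of equal raw height resolve in favour of the more recent one.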
def timeseries_query(selected_columns, query, params, rollup, reference_event=None, referrer=None):
    """
    High-level API for doing arbitrary user timeseries queries against events.

    This function operates on the public event schema; virtual fields and
    aggregate functions are supported for selected columns and conditions.

    This function is intended to only get timeseries-based results and thus
    requires the `rollup` parameter.

    Returns a SnubaTSResult object that has been zerofilled in case of gaps.

    selected_columns (Sequence[str]) List of public aliases to fetch.
    query (str) Filter query string to create conditions from.
    params (Dict[str, str]) Filtering parameters with start, end, project_id, environment.
    rollup (int) The bucket width in seconds.
    reference_event (ReferenceEvent) A reference event object. Used to generate additional
                    conditions based on the provided reference.
    referrer (str|None) A referrer string to help locate the origin of this query.
    """
    snuba_filter = get_filter(query, params)
    snuba_args = {
        "start": snuba_filter.start,
        "end": snuba_filter.end,
        "conditions": snuba_filter.conditions,
        "filter_keys": snuba_filter.filter_keys,
    }
    if not snuba_args["start"] and not snuba_args["end"]:
        raise InvalidSearchQuery("Cannot get timeseries result without a start and end.")

    snuba_args.update(resolve_field_list(selected_columns, snuba_args, auto_fields=False))
    if reference_event:
        ref_conditions = create_reference_event_conditions(reference_event)
        if ref_conditions:
            snuba_args["conditions"].extend(ref_conditions)

    # Resolve the public aliases into the discover dataset names.
    snuba_args, _ = resolve_discover_aliases(snuba_args)
    if not snuba_args["aggregations"]:
        raise InvalidSearchQuery("Cannot get timeseries result with no aggregation.")

    result = raw_query(
        aggregations=snuba_args.get("aggregations"),
        conditions=snuba_args.get("conditions"),
        filter_keys=snuba_args.get("filter_keys"),
        start=snuba_args.get("start"),
        end=snuba_args.get("end"),
        rollup=rollup,
        orderby="time",
        groupby=["time"],
        dataset=Dataset.Discover,
        limit=10000,
        referrer=referrer,
    )
    result = zerofill(result["data"], snuba_args["start"], snuba_args["end"], rollup, "time")

    return SnubaTSResult(result, snuba_filter.start, snuba_filter.end, rollup)
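# A hedged usage sketch for timeseries_query. The project id, query string,
# column list and referrer below are illustrative assumptions rather than values
# taken from the codebase, and the call is wrapped in a helper that receives the
# query function so this block stays self-contained instead of guessing at
# import paths.
from datetime import datetime, timedelta

import pytz


def example_hourly_error_counts(timeseries_query_fn):
    """Fetch hourly error counts for the last day via a timeseries_query-style callable."""
    end = datetime.now(pytz.utc)
    params = {
        "project_id": [1],  # hypothetical example project id
        "start": end - timedelta(days=1),
        "end": end,
    }
    # Returns a SnubaTSResult whose buckets have been zerofilled, one per
    # hour-wide rollup between start and end.
    return timeseries_query_fn(
        selected_columns=["count()"],
        query="event.type:error",
        params=params,
        rollup=3600,
        referrer="api.example.timeseries",
    )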