def get_scalar_metrics_average_per_iter(
    self,
    company_id: str,
    task_id: str,
    samples: int,
    key: ScalarKeyEnum,
    metric_variants: MetricVariants = None,
) -> dict:
    """
    Get scalar metric histograms per metric and variant.
    The number of points in each histogram will not exceed the requested samples.
    """
    event_type = EventType.metrics_scalar
    if check_empty_data(self.es, company_id=company_id, event_type=event_type):
        return {}

    return self._get_scalar_average_per_iter_core(
        task_id=task_id,
        company_id=company_id,
        event_type=event_type,
        samples=samples,
        key=ScalarKey.resolve(key),
        metric_variants=metric_variants,
    )

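# A hypothetical usage sketch (not from the source): assuming `bll` is an
# instance of this class wired to an Elasticsearch cluster, a histogram
# request could look like the following; the ids and sample count are made up.
#
#     hist = bll.get_scalar_metrics_average_per_iter(
#         company_id="company-1",
#         task_id="task-1",
#         samples=500,
#         key=ScalarKeyEnum.iter,
#     )
#     # The result is nested by metric and variant, with at most `samples`
#     # points per variant, e.g. {"loss": {"train": {...series data...}}}
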
def get_vector_metrics_per_iter(self, company_id, task_id, metric, variant):
    event_type = EventType.metrics_vector
    if check_empty_data(self.es, company_id=company_id, event_type=event_type):
        return [], []

    es_req = {
        "size": 10000,
        "query": {
            "bool": {
                "must": [
                    {"term": {"task": task_id}},
                    {"term": {"metric": metric}},
                    {"term": {"variant": variant}},
                ]
            }
        },
        "_source": ["iter", "value"],
        "sort": ["iter"],
    }
    with translate_errors_context(), TimingContext("es", "task_stats_vector"):
        es_res = search_company_events(
            self.es, company_id=company_id, event_type=event_type, body=es_req
        )

    vectors = []
    iterations = []
    for hit in es_res["hits"]["hits"]:
        vectors.append(hit["_source"]["value"])
        iterations.append(hit["_source"]["iter"])

    return iterations, vectors

def get_last_iters(
    self, company_id: str, event_type: EventType, task_id: str, iters: int
):
    if check_empty_data(self.es, company_id=company_id, event_type=event_type):
        return []

    es_req: dict = {
        "size": 0,
        "aggs": {
            "iters": {
                "terms": {
                    "field": "iter",
                    "size": iters,
                    "order": {"_key": "desc"},
                }
            }
        },
        "query": {"bool": {"must": [{"term": {"task": task_id}}]}},
    }
    with translate_errors_context(), TimingContext("es", "task_last_iter"):
        es_res = search_company_events(
            self.es, company_id=company_id, event_type=event_type, body=es_req
        )

    if "aggregations" not in es_res:
        return []

    return [b["key"] for b in es_res["aggregations"]["iters"]["buckets"]]

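# Usage sketch (hypothetical): fetching the last reported iterations of a
# single task; the result is a list of iteration numbers, newest first.
#
#     last_iters = bll.get_last_iters(
#         company_id="company-1",
#         event_type=EventType.metrics_scalar,
#         task_id="task-1",
#         iters=5,
#     )
#     # e.g. [1042, 1041, 1040, 1039, 1038]
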
def get_plot_image_urls(
    self, company_id: str, task_id: str, scroll_id: Optional[str]
) -> Tuple[Sequence[dict], Optional[str]]:
    if scroll_id == self.empty_scroll:
        return [], None

    if scroll_id:
        es_res = self.es.scroll(scroll_id=scroll_id, scroll="10m")
    else:
        if check_empty_data(self.es, company_id, EventType.metrics_plot):
            return [], None

        es_req = {
            "size": 1000,
            "_source": [PlotFields.source_urls],
            "query": {
                "bool": {
                    "must": [
                        {"term": {"task": task_id}},
                        {"exists": {"field": PlotFields.source_urls}},
                    ]
                }
            },
        }
        es_res = search_company_events(
            self.es,
            company_id=company_id,
            event_type=EventType.metrics_plot,
            body=es_req,
            scroll="10m",
        )

    events, _, next_scroll_id = self._get_events_from_es_res(es_res)
    return events, next_scroll_id

def get_task_events(
    self,
    event_type: EventType,
    company_id: str,
    task_id: str,
    batch_size: int,
    navigate_earlier: bool = True,
    from_key_value: Optional[Any] = None,
    metric_variants: MetricVariants = None,
    key: ScalarKeyEnum = ScalarKeyEnum.timestamp,
    **kwargs,
) -> TaskEventsResult:
    if check_empty_data(self.es, company_id, event_type):
        return TaskEventsResult()

    # `from_timestamp` is accepted as a legacy alias for `from_key_value`
    from_key_value = kwargs.pop("from_timestamp", from_key_value)

    res = TaskEventsResult()
    res.events, res.total_events = self._get_events(
        event_type=event_type,
        company_id=company_id,
        task_id=task_id,
        batch_size=batch_size,
        navigate_earlier=navigate_earlier,
        from_key_value=from_key_value,
        metric_variants=metric_variants,
        key=ScalarKey.resolve(key),
    )
    return res

def get_last_iterations_per_event_metric_variant(
    self,
    company_id: str,
    task_id: str,
    num_last_iterations: int,
    event_type: EventType,
    metric_variants: MetricVariants = None,
):
    if check_empty_data(self.es, company_id=company_id, event_type=event_type):
        return []

    # Optionally narrow the aggregation to the requested metrics/variants
    # (the parameter is required by the call in get_task_plots below)
    must = [{"term": {"task": task_id}}]
    if metric_variants:
        must.append(get_metric_variants_condition(metric_variants))

    es_req: dict = {
        "size": 0,
        "aggs": {
            "metrics": {
                "terms": {
                    "field": "metric",
                    "size": EventSettings.max_metrics_count,
                    "order": {"_key": "asc"},
                },
                "aggs": {
                    "variants": {
                        "terms": {
                            "field": "variant",
                            "size": EventSettings.max_variants_count,
                            "order": {"_key": "asc"},
                        },
                        "aggs": {
                            "iters": {
                                "terms": {
                                    "field": "iter",
                                    "size": num_last_iterations,
                                    "order": {"_key": "desc"},
                                }
                            }
                        },
                    }
                },
            }
        },
        "query": {"bool": {"must": must}},
    }
    with translate_errors_context(), TimingContext(
        "es", "task_last_iter_metric_variant"
    ):
        es_res = search_company_events(
            self.es, company_id=company_id, event_type=event_type, body=es_req
        )

    if "aggregations" not in es_res:
        return []

    return [
        (metric["key"], variant["key"], iteration["key"])
        for metric in es_res["aggregations"]["metrics"]["buckets"]
        for variant in metric["variants"]["buckets"]
        for iteration in variant["iters"]["buckets"]
    ]

def get_metrics_and_variants(
    self, company_id: str, task_id: str, event_type: EventType
):
    if check_empty_data(self.es, company_id=company_id, event_type=event_type):
        return {}

    es_req = {
        "size": 0,
        "aggs": {
            "metrics": {
                "terms": {
                    "field": "metric",
                    "size": EventSettings.max_metrics_count,
                    "order": {"_key": "asc"},
                },
                "aggs": {
                    "variants": {
                        "terms": {
                            "field": "variant",
                            "size": EventSettings.max_variants_count,
                            "order": {"_key": "asc"},
                        }
                    }
                },
            }
        },
        "query": {"bool": {"must": [{"term": {"task": task_id}}]}},
    }
    with translate_errors_context(), TimingContext(
        "es", "events_get_metrics_and_variants"
    ):
        es_res = search_company_events(
            self.es, company_id=company_id, event_type=event_type, body=es_req
        )

    metrics = {}
    for metric_bucket in es_res["aggregations"]["metrics"].get("buckets"):
        metric = metric_bucket["key"]
        metrics[metric] = [b["key"] for b in metric_bucket["variants"].get("buckets")]

    return metrics

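# Usage sketch (hypothetical): the metric/variant terms aggregations above
# flatten into a plain mapping of metric name to variant names.
#
#     bll.get_metrics_and_variants(
#         company_id="company-1",
#         task_id="task-1",
#         event_type=EventType.metrics_scalar,
#     )
#     # -> {"accuracy": ["top1"], "loss": ["train", "validation"]}
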
def scroll_task_events(
    self,
    company_id: str,
    task_id: str,
    order: str,
    event_type: EventType,
    batch_size=10000,
    scroll_id=None,
):
    if scroll_id == self.empty_scroll:
        return [], scroll_id, 0

    if scroll_id:
        with translate_errors_context(), TimingContext("es", "task_log_events"):
            es_res = self.es.scroll(scroll_id=scroll_id, scroll="1h")
    else:
        size = min(batch_size, 10000)
        if check_empty_data(self.es, company_id=company_id, event_type=event_type):
            return [], None, 0

        es_req = {
            "size": size,
            "sort": {"timestamp": {"order": order}},
            "query": {"bool": {"must": [{"term": {"task": task_id}}]}},
        }
        with translate_errors_context(), TimingContext("es", "scroll_task_events"):
            es_res = search_company_events(
                self.es,
                company_id=company_id,
                event_type=event_type,
                body=es_req,
                scroll="1h",
            )

    events, total_events, next_scroll_id = self._get_events_from_es_res(es_res)
    return events, next_scroll_id, total_events

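# Usage sketch (hypothetical): draining a task log via the ES scroll API.
# Each call returns the next batch and a scroll id to pass back in; an
# exhausted scroll yields an empty batch. `handle_batch` is a placeholder.
#
#     events, sid, total = bll.scroll_task_events(
#         company_id="company-1",
#         task_id="task-1",
#         order="asc",
#         event_type=EventType.task_log,
#     )
#     while events:
#         handle_batch(events)
#         events, sid, _ = bll.scroll_task_events(
#             company_id="company-1",
#             task_id="task-1",
#             order="asc",
#             event_type=EventType.task_log,
#             scroll_id=sid,
#         )
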
def get_task_events(
    self,
    company_id: str,
    metrics: Sequence[Tuple[str, str]],
    iter_count: int,
    navigate_earlier: bool = True,
    refresh: bool = False,
    state_id: str = None,
) -> DebugImagesResult:
    if check_empty_data(self.es, company_id, self.EVENT_TYPE):
        return DebugImagesResult()

    def init_state(state_: DebugImageEventsScrollState):
        unique_metrics = set(metrics)
        state_.metrics = self._init_metric_states(company_id, list(unique_metrics))

    def validate_state(state_: DebugImageEventsScrollState):
        """
        Validate that the metrics stored in the state are the same
        as requested in the current call.
        Refresh the state if requested.
        """
        state_metrics = set((m.task, m.name) for m in state_.metrics)
        if state_metrics != set(metrics):
            raise errors.bad_request.InvalidScrollId(
                "Task metrics stored in the state do not match the passed ones",
                scroll_id=state_.id,
            )
        if refresh:
            self._reinit_outdated_metric_states(company_id, state_)
            for metric_state in state_.metrics:
                metric_state.reset()

    with self.cache_manager.get_or_create_state(
        state_id=state_id, init_state=init_state, validate_state=validate_state
    ) as state:
        res = DebugImagesResult(next_scroll_id=state.id)
        with ThreadPoolExecutor(EventSettings.max_workers) as pool:
            res.metric_events = list(
                pool.map(
                    partial(
                        self._get_task_metric_events,
                        company_id=company_id,
                        iter_count=iter_count,
                        navigate_earlier=navigate_earlier,
                    ),
                    state.metrics,
                )
            )

    return res

def get_task_events(
    self,
    company_id: str,
    task_metrics: Mapping[str, dict],
    iter_count: int,
    navigate_earlier: bool = True,
    refresh: bool = False,
    state_id: str = None,
) -> DebugImagesResult:
    if check_empty_data(self.es, company_id, self.EVENT_TYPE):
        return DebugImagesResult()

    def init_state(state_: DebugImageEventsScrollState):
        state_.tasks = self._init_task_states(company_id, task_metrics)

    def validate_state(state_: DebugImageEventsScrollState):
        """
        Refresh the outdated task states if requested.
        """
        if refresh:
            self._reinit_outdated_task_states(company_id, state_, task_metrics)

    with self.cache_manager.get_or_create_state(
        state_id=state_id, init_state=init_state, validate_state=validate_state
    ) as state:
        res = DebugImagesResult(next_scroll_id=state.id)
        with ThreadPoolExecutor(EventSettings.max_workers) as pool:
            res.metric_events = list(
                pool.map(
                    partial(
                        self._get_task_metric_events,
                        company_id=company_id,
                        iter_count=iter_count,
                        navigate_earlier=navigate_earlier,
                    ),
                    state.tasks,
                )
            )

    return res

def get_tasks_metrics(
    self, company_id, task_ids: Sequence, event_type: EventType
) -> Sequence:
    """
    For the requested tasks return all the metrics that
    reported events of the requested type.
    """
    if check_empty_data(self.es, company_id, event_type):
        return []

    with ThreadPoolExecutor(EventSettings.max_workers) as pool:
        res = pool.map(
            partial(
                self._get_task_metrics,
                company_id=company_id,
                event_type=event_type,
            ),
            task_ids,
        )
    return list(zip(task_ids, res))

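# Usage sketch (hypothetical): the per-task metric lookups run in a thread
# pool and the result preserves the order of the requested task ids.
#
#     bll.get_tasks_metrics(
#         "company-1",
#         task_ids=["task-1", "task-2"],
#         event_type=EventType.metrics_plot,
#     )
#     # -> [("task-1", ["loss"]), ("task-2", ["loss", "accuracy"])]
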
def get_last_iters(
    self,
    company_id: str,
    event_type: EventType,
    task_id: Union[str, Sequence[str]],
    iters: int,
) -> Mapping[str, Sequence]:
    if check_empty_data(self.es, company_id=company_id, event_type=event_type):
        return {}

    task_ids = [task_id] if isinstance(task_id, str) else task_id
    es_req: dict = {
        "size": 0,
        "aggs": {
            "tasks": {
                # return a bucket for every requested task
                # (the ES terms aggregation defaults to 10 buckets)
                "terms": {"field": "task", "size": len(task_ids)},
                "aggs": {
                    "iters": {
                        "terms": {
                            "field": "iter",
                            "size": iters,
                            "order": {"_key": "desc"},
                        }
                    }
                },
            }
        },
        "query": {"bool": {"must": [{"terms": {"task": task_ids}}]}},
    }
    with translate_errors_context(), TimingContext("es", "task_last_iter"):
        es_res = search_company_events(
            self.es,
            company_id=company_id,
            event_type=event_type,
            body=es_req,
        )

    if "aggregations" not in es_res:
        return {}

    return {
        tb["key"]: [ib["key"] for ib in tb["iters"]["buckets"]]
        for tb in es_res["aggregations"]["tasks"]["buckets"]
    }

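# Usage sketch (hypothetical): the multi-task variant keys its result by
# task id instead of returning a flat list.
#
#     bll.get_last_iters(
#         company_id="company-1",
#         event_type=EventType.metrics_scalar,
#         task_id=["task-1", "task-2"],
#         iters=3,
#     )
#     # -> {"task-1": [120, 119, 118], "task-2": [95, 94, 93]}
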
def get_task_events(
    self,
    company_id: str,
    task_id: str,
    batch_size: int,
    navigate_earlier: bool = True,
    from_timestamp: Optional[int] = None,
) -> TaskEventsResult:
    if check_empty_data(self.es, company_id, self.EVENT_TYPE):
        return TaskEventsResult()

    res = TaskEventsResult()
    res.events, res.total_events = self._get_events(
        company_id=company_id,
        task_id=task_id,
        batch_size=batch_size,
        navigate_earlier=navigate_earlier,
        from_timestamp=from_timestamp,
    )
    return res

def get_task_latest_scalar_values(
    self, company_id, task_id
) -> Tuple[Sequence[dict], int]:
    event_type = EventType.metrics_scalar
    if check_empty_data(self.es, company_id=company_id, event_type=event_type):
        return [], 0

    query = {
        "bool": {
            "must": [
                {"query_string": {"query": "value:>0"}},
                {"term": {"task": task_id}},
            ]
        }
    }
    es_req = {
        "size": 0,
        "query": query,
        "aggs": {
            "metrics": {
                "terms": {
                    "field": "metric",
                    "size": EventSettings.max_metrics_count,
                    "order": {"_key": "asc"},
                },
                "aggs": {
                    "variants": {
                        "terms": {
                            "field": "variant",
                            "size": EventSettings.max_variants_count,
                            "order": {"_key": "asc"},
                        },
                        "aggs": {
                            "last_value": {
                                "top_hits": {
                                    "docvalue_fields": ["value"],
                                    "_source": "value",
                                    "size": 1,
                                    "sort": [{"iter": {"order": "desc"}}],
                                }
                            },
                            "last_timestamp": {"max": {"field": "@timestamp"}},
                            "last_10_value": {
                                "top_hits": {
                                    "docvalue_fields": ["value"],
                                    "_source": "value",
                                    "size": 10,
                                    "sort": [{"iter": {"order": "desc"}}],
                                }
                            },
                        },
                    }
                },
            }
        },
        "_source": {"excludes": []},
    }
    with translate_errors_context(), TimingContext(
        "es", "events_get_task_latest_scalar_values"
    ):
        es_res = search_company_events(
            self.es, company_id=company_id, event_type=event_type, body=es_req
        )

    metrics = []
    max_timestamp = 0
    for metric_bucket in es_res["aggregations"]["metrics"].get("buckets"):
        metric_summary = dict(name=metric_bucket["key"], variants=[])
        for variant_bucket in metric_bucket["variants"].get("buckets"):
            variant_name = variant_bucket["key"]
            last_value = variant_bucket["last_value"]["hits"]["hits"][0][
                "fields"
            ]["value"][0]
            last_10_value = variant_bucket["last_10_value"]["hits"]["hits"][0][
                "fields"
            ]["value"][0]
            timestamp = variant_bucket["last_timestamp"]["value"]
            max_timestamp = max(timestamp, max_timestamp)
            metric_summary["variants"].append(
                dict(
                    name=variant_name,
                    last_value=last_value,
                    last_10_value=last_10_value,
                )
            )
        metrics.append(metric_summary)

    return metrics, max_timestamp

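# Usage sketch (hypothetical): the aggregation above reduces to one summary
# entry per metric, plus the maximum event timestamp seen across variants.
# The values shown are made up.
#
#     metrics, max_ts = bll.get_task_latest_scalar_values("company-1", "task-1")
#     # metrics -> [
#     #     {"name": "loss",
#     #      "variants": [{"name": "train",
#     #                    "last_value": 0.12,
#     #                    "last_10_value": 0.125}]},
#     # ]
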
def get_task_events(
    self,
    company_id: str,
    task_id: str,
    event_type: EventType,
    metric=None,
    variant=None,
    last_iter_count=None,
    sort=None,
    size=500,
    scroll_id=None,
    no_scroll=False,
) -> TaskEventsResult:
    if scroll_id == self.empty_scroll:
        return TaskEventsResult()

    if scroll_id:
        with translate_errors_context(), TimingContext("es", "get_task_events"):
            es_res = self.es.scroll(scroll_id=scroll_id, scroll="1h")
    else:
        if check_empty_data(self.es, company_id=company_id, event_type=event_type):
            return TaskEventsResult()

        task_ids = [task_id] if isinstance(task_id, str) else task_id
        must = []
        if metric:
            must.append({"term": {"metric": metric}})
        if variant:
            must.append({"term": {"variant": variant}})

        if last_iter_count is None:
            must.append({"terms": {"task": task_ids}})
        else:
            tasks_iters = self.get_last_iters(
                company_id=company_id,
                event_type=event_type,
                task_id=task_ids,
                iters=last_iter_count,
            )
            should = [
                {
                    "bool": {
                        "must": [
                            {"term": {"task": task}},
                            {"terms": {"iter": last_iters}},
                        ]
                    }
                }
                for task, last_iters in tasks_iters.items()
                if last_iters
            ]
            if not should:
                return TaskEventsResult()
            must.append({"bool": {"should": should}})

        if sort is None:
            sort = [{"timestamp": {"order": "asc"}}]

        es_req = {
            "sort": sort,
            "size": min(size, 10000),
            "query": {"bool": {"must": must}},
        }
        with translate_errors_context(), TimingContext("es", "get_task_events"):
            es_res = search_company_events(
                self.es,
                company_id=company_id,
                event_type=event_type,
                body=es_req,
                ignore=404,
                **({} if no_scroll else {"scroll": "1h"}),
            )

    events, total_events, next_scroll_id = self._get_events_from_es_res(es_res)
    if event_type in (EventType.metrics_plot, EventType.all):
        self.uncompress_plots(events)

    return TaskEventsResult(
        events=events, next_scroll_id=next_scroll_id, total_events=total_events
    )

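# Usage sketch (hypothetical): paging through a task's scalar events; the
# first call opens the scroll and later calls resume it. Passing
# no_scroll=True skips scroll creation for one-shot reads.
# `handle_batch` is a placeholder.
#
#     res = bll.get_task_events(
#         company_id="company-1",
#         task_id="task-1",
#         event_type=EventType.metrics_scalar,
#         size=1000,
#     )
#     while res.events:
#         handle_batch(res.events)
#         res = bll.get_task_events(
#             company_id="company-1",
#             task_id="task-1",
#             event_type=EventType.metrics_scalar,
#             scroll_id=res.next_scroll_id,
#         )
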
def get_task_events(
    self,
    company_id: str,
    task_id: str,
    event_type: EventType,
    metric=None,
    variant=None,
    last_iter_count=None,
    sort=None,
    size=500,
    scroll_id=None,
):
    # return the same result type on every path (the original early return
    # produced a tuple while all other paths produce a TaskEventsResult)
    if scroll_id == self.empty_scroll:
        return TaskEventsResult()

    if scroll_id:
        with translate_errors_context(), TimingContext("es", "get_task_events"):
            es_res = self.es.scroll(scroll_id=scroll_id, scroll="1h")
    else:
        task_ids = [task_id] if isinstance(task_id, str) else task_id
        if check_empty_data(self.es, company_id=company_id, event_type=event_type):
            return TaskEventsResult()

        must = []
        if metric:
            must.append({"term": {"metric": metric}})
        if variant:
            must.append({"term": {"variant": variant}})

        if last_iter_count is None:
            must.append({"terms": {"task": task_ids}})
        else:
            should = []
            for task in task_ids:
                last_iters = self.get_last_iters(
                    company_id=company_id,
                    event_type=event_type,
                    task_id=task,
                    iters=last_iter_count,
                )
                if not last_iters:
                    continue
                should.append(
                    {
                        "bool": {
                            "must": [
                                {"term": {"task": task}},
                                {"terms": {"iter": last_iters}},
                            ]
                        }
                    }
                )
            if not should:
                return TaskEventsResult()
            must.append({"bool": {"should": should}})

        if sort is None:
            sort = [{"timestamp": {"order": "asc"}}]

        es_req = {
            "sort": sort,
            "size": min(size, 10000),
            "query": {"bool": {"must": must}},
        }
        with translate_errors_context(), TimingContext("es", "get_task_events"):
            es_res = search_company_events(
                self.es,
                company_id=company_id,
                event_type=event_type,
                body=es_req,
                ignore=404,
                scroll="1h",
            )

    events, total_events, next_scroll_id = self._get_events_from_es_res(es_res)
    return TaskEventsResult(
        events=events, next_scroll_id=next_scroll_id, total_events=total_events
    )

def compare_scalar_metrics_average_per_iter(
    self,
    company_id,
    task_ids: Sequence[str],
    samples,
    key: ScalarKeyEnum,
    allow_public=True,
):
    """
    Compare scalar metrics of different tasks per metric and variant.
    The number of points in each histogram will not exceed the requested samples.
    """
    with translate_errors_context():
        task_objs = Task.get_many(
            company=company_id,
            query=Q(id__in=task_ids),
            allow_public=allow_public,
            override_projection=("id", "name", "company", "company_origin"),
            return_dicts=False,
        )
        if len(task_objs) < len(task_ids):
            invalid = tuple(set(task_ids) - set(r.id for r in task_objs))
            raise errors.bad_request.InvalidTaskId(company=company_id, ids=invalid)
        task_name_by_id = {t.id: t.name for t in task_objs}

    companies = {t.get_index_company() for t in task_objs}
    if len(companies) > 1:
        raise errors.bad_request.InvalidTaskId(
            "only tasks from the same company are supported"
        )

    event_type = EventType.metrics_scalar
    company_id = next(iter(companies))
    if check_empty_data(self.es, company_id=company_id, event_type=event_type):
        return {}

    get_scalar_average_per_iter = partial(
        self._get_scalar_average_per_iter_core,
        company_id=company_id,
        event_type=event_type,
        samples=samples,
        key=ScalarKey.resolve(key),
        run_parallel=False,
    )
    with ThreadPoolExecutor(max_workers=EventSettings.max_workers) as pool:
        task_metrics = zip(
            task_ids, pool.map(get_scalar_average_per_iter, task_ids)
        )

    res = defaultdict(lambda: defaultdict(dict))
    for task_id, task_data in task_metrics:
        task_name = task_name_by_id[task_id]
        for metric_key, metric_data in task_data.items():
            for variant_key, variant_data in metric_data.items():
                variant_data["name"] = task_name
                res[metric_key][variant_key][task_id] = variant_data

    return res

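# Usage sketch (hypothetical): comparing two tasks' scalars yields a
# metric -> variant -> task id mapping, with each series carrying the
# task name for display.
#
#     res = bll.compare_scalar_metrics_average_per_iter(
#         company_id="company-1",
#         task_ids=["task-1", "task-2"],
#         samples=200,
#         key=ScalarKeyEnum.iter,
#     )
#     # res["loss"]["train"]["task-1"]["name"] == "<name of task-1>"
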
def get_task_plots(
    self,
    company_id: str,
    tasks: Sequence[str],
    last_iterations_per_plot: int = None,
    sort=None,
    size: int = 500,
    scroll_id: str = None,
    no_scroll: bool = False,
    metric_variants: MetricVariants = None,
):
    if scroll_id == self.empty_scroll:
        return TaskEventsResult()

    if scroll_id:
        with translate_errors_context(), TimingContext("es", "get_task_events"):
            es_res = self.es.scroll(scroll_id=scroll_id, scroll="1h")
    else:
        event_type = EventType.metrics_plot
        if check_empty_data(self.es, company_id=company_id, event_type=event_type):
            return TaskEventsResult()

        # match plots that are either explicitly valid or predate the validity flag
        plot_valid_condition = {
            "bool": {
                "should": [
                    {"term": {PlotFields.valid_plot: True}},
                    {
                        "bool": {
                            "must_not": {"exists": {"field": PlotFields.valid_plot}}
                        }
                    },
                ]
            }
        }
        must = [plot_valid_condition]

        if last_iterations_per_plot is None:
            must.append({"terms": {"task": tasks}})
            if metric_variants:
                must.append(get_metric_variants_condition(metric_variants))
        else:
            should = []
            for task_id in tasks:
                last_iters = self.get_last_iterations_per_event_metric_variant(
                    company_id=company_id,
                    task_id=task_id,
                    num_last_iterations=last_iterations_per_plot,
                    event_type=event_type,
                    metric_variants=metric_variants,
                )
                if not last_iters:
                    continue

                for metric, variant, iteration in last_iters:
                    should.append(
                        {
                            "bool": {
                                "must": [
                                    {"term": {"task": task_id}},
                                    {"term": {"metric": metric}},
                                    {"term": {"variant": variant}},
                                    {"term": {"iter": iteration}},
                                ]
                            }
                        }
                    )
            if not should:
                return TaskEventsResult()
            must.append({"bool": {"should": should}})

        if sort is None:
            sort = [{"timestamp": {"order": "asc"}}]

        es_req = {
            "sort": sort,
            "size": min(size, 10000),
            "query": {"bool": {"must": must}},
        }
        with translate_errors_context(), TimingContext("es", "get_task_plots"):
            es_res = search_company_events(
                self.es,
                company_id=company_id,
                event_type=event_type,
                body=es_req,
                ignore=404,
                **({} if no_scroll else {"scroll": "1h"}),
            )

    events, total_events, next_scroll_id = self._get_events_from_es_res(es_res)
    self.uncompress_plots(events)

    return TaskEventsResult(
        events=events, next_scroll_id=next_scroll_id, total_events=total_events
    )

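# Usage sketch (hypothetical): fetching only the latest plot of every
# metric/variant reported by two tasks, without opening a scroll.
#
#     res = bll.get_task_plots(
#         company_id="company-1",
#         tasks=["task-1", "task-2"],
#         last_iterations_per_plot=1,
#         no_scroll=True,
#     )
#     # res.events holds the decompressed plot events
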