Example #1
    def log_queue_metrics_to_es(self, company_id: str,
                                queues: Sequence[Queue]) -> bool:
        """
        Calculate and write queue statistics (avg waiting time and queue length) to Elastic
        :return: True if the write to es was successful, false otherwise
        """
        es_index = (self._queue_metrics_prefix_for_company(company_id) +
                    self._get_es_index_suffix())

        timestamp = es_factory.get_timestamp_millis()

        def make_doc(queue: Queue) -> dict:
            entries = [e for e in queue.entries if e.added]
            return dict(
                _index=es_index,
                _source={
                    self.EsKeys.TIMESTAMP_FIELD: timestamp,
                    self.EsKeys.QUEUE_FIELD: queue.id,
                    self.EsKeys.WAITING_TIME_FIELD: self._calc_avg_waiting_time(entries),
                    self.EsKeys.QUEUE_LENGTH_FIELD: len(entries),
                },
            )

        actions = list(map(make_doc, queues))

        es_res = elasticsearch.helpers.bulk(self.es, actions)
        added, errors = es_res[:2]
        return (added == len(actions)) and not errors
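
The snippet relies on a `_calc_avg_waiting_time` helper that is not shown. A minimal sketch of what such a helper might look like, assuming each queue entry carries a timezone-aware `added` datetime (hypothetical; shown standalone here, and the real implementation may differ):

from datetime import datetime, timezone
from typing import Sequence

def _calc_avg_waiting_time(entries: Sequence) -> float:
    # Hypothetical sketch: mean number of seconds each entry has spent
    # waiting, measured from its `added` timestamp (assumed UTC-aware).
    if not entries:
        return 0.0
    now = datetime.now(timezone.utc)
    return sum((now - e.added).total_seconds() for e in entries) / len(entries)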
Example #2
    def test_task_logs(self):
        task = self._temp_task()
        timestamp = es_factory.get_timestamp_millis()
        events = [
            self._create_task_event(
                "log",
                task=task,
                iteration=iter_,
                timestamp=timestamp + iter_ * 1000,
                msg=f"This is a log message from test task iter {iter_}",
            )
            for iter_ in range(10)
        ]
        self.send_batch(events)

        # test forward navigation
        ftime, ltime = None, None
        for page in range(2):
            ftime, ltime = self._assert_log_events(
                task=task, timestamp=ltime, expected_page=page
            )

        # test backwards navigation
        self._assert_log_events(task=task, timestamp=ftime, navigate_earlier=False)

        # test order
        self._assert_log_events(task=task, order="asc")
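
The test exercises cursor-style paging: `_assert_log_events` returns the first and last timestamps of the page it fetched, and the last timestamp is fed back as the cursor for the next page. A generic sketch of that pattern (the `fetch_page` callable here is illustrative, not part of the project's API):

def paginate(fetch_page, cursor=None):
    # Illustrative cursor pagination: fetch_page(cursor) returns
    # (items, first_ts, last_ts); the last timestamp seeds the next call.
    while True:
        items, first_ts, last_ts = fetch_page(cursor)
        if not items:
            return
        yield from items
        cursor = last_ts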
Example #3
def _create_task_event(type_, task, iteration, **kwargs):
    return {
        "worker": "test",
        "type": type_,
        "task": task,
        "iter": iteration,
        "timestamp": kwargs.get("timestamp") or es_factory.get_timestamp_millis(),
        **kwargs,
    }
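
Note the ordering: `**kwargs` is spread after the defaults, so a caller-supplied `timestamp` (or `worker`) overrides the generated value. For example (the task id is a placeholder):

event = _create_task_event(
    "log", task="<task-id>", iteration=3,
    msg="hello", timestamp=1_700_000_000_000,
)
assert event["timestamp"] == 1_700_000_000_000  # kwargs win over the default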
Example #4
    def create_event(self, task, type_, iteration, **kwargs) -> dict:
        return {
            "worker": "test",
            "type": type_,
            "task": task,
            "iter": iteration,
            "timestamp": es_factory.get_timestamp_millis(),
            "metric": "Metric1",
            "variant": "Variant1",
            **kwargs,
        }
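
Same pattern as Example #3, with default `metric`/`variant` values that callers can override through `**kwargs`. A usage sketch (assuming `factory` is an instance of the enclosing test helper class; the task id and event type strings are illustrative):

event = factory.create_event(
    task="<task-id>", type_="training_stats_scalar", iteration=0,
    metric="loss", value=0.42,
)
assert event["metric"] == "loss"        # kwargs override the "Metric1" default
assert event["variant"] == "Variant1"   # default retained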
Example #5
    def add_events(
        self, company_id, events, worker, allow_locked_tasks=False
    ) -> Tuple[int, int, dict]:
        actions: List[dict] = []
        task_ids = set()
        task_iteration = defaultdict(lambda: 0)
        task_last_scalar_events = nested_dict(
            3, dict
        )  # task_id -> metric_hash -> variant_hash -> MetricEvent
        task_last_events = nested_dict(
            3, dict
        )  # task_id -> metric_hash -> event_type -> MetricEvent
        errors_per_type = defaultdict(int)
        invalid_iteration_error = f"Iteration number should not exceed {MAX_LONG}"
        valid_tasks = self._get_valid_tasks(
            company_id,
            task_ids={
                event["task"] for event in events if event.get("task") is not None
            },
            allow_locked_tasks=allow_locked_tasks,
        )

        for event in events:
            event_type = event.get("type")
            if event_type is None:
                errors_per_type["Event must have a 'type' field"] += 1
                continue

            # normalize the event type: replace spaces with underscores
            event_type = event_type.replace(" ", "_")
            if event_type not in EVENT_TYPES:
                errors_per_type[f"Invalid event type {event_type}"] += 1
                continue

            task_id = event.get("task")
            if task_id is None:
                errors_per_type["Event must have a 'task' field"] += 1
                continue

            if task_id not in valid_tasks:
                errors_per_type["Invalid task id"] += 1
                continue

            event["type"] = event_type

            # @timestamp indicates the time the event is written, not when it happened
            event["@timestamp"] = es_factory.get_es_timestamp_str()

            # for backward compatibility with clients that send "ts"
            if "ts" in event:
                event["timestamp"] = event.pop("ts")

            # set timestamp and worker if not sent
            if "timestamp" not in event:
                event["timestamp"] = es_factory.get_timestamp_millis()

            if "worker" not in event:
                event["worker"] = worker

            # coerce iter to a long int (iter_ avoids shadowing the builtin)
            iter_ = event.get("iter")
            if iter_ is not None:
                iter_ = int(iter_)
                if iter_ > MAX_LONG or iter_ < MIN_LONG:
                    errors_per_type[invalid_iteration_error] += 1
                    continue
                event["iter"] = iter_

            # older clients sent "values" to indicate an array; normalize to "value"
            if "values" in event:
                event["value"] = event.pop("values")

            event["metric"] = event.get("metric") or ""
            event["variant"] = event.get("variant") or ""

            index_name = get_index_name(company_id, event_type)
            es_action = {
                "_op_type": "index",  # overwrite if exists with same ID
                "_index": index_name,
                "_source": event,
            }

            # for "log" events, don't assing custom _id - whatever is sent, is written (not overwritten)
            if event_type != EventType.task_log.value:
                es_action["_id"] = self._get_event_id(event)
            else:
                es_action["_id"] = dbutils.id()

            task_ids.add(task_id)
            if (
                iter_ is not None
                and event.get("metric") not in self._skip_iteration_for_metric
            ):
                task_iteration[task_id] = max(iter_, task_iteration[task_id])

            self._update_last_metric_events_for_task(
                last_events=task_last_events[task_id], event=event,
            )
            if event_type == EventType.metrics_scalar.value:
                self._update_last_scalar_events_for_task(
                    last_events=task_last_scalar_events[task_id], event=event
                )

            actions.append(es_action)

        plot_actions = [
            action["_source"]
            for action in actions
            if action["_source"]["type"] == EventType.metrics_plot.value
        ]
        if plot_actions:
            self.validate_and_compress_plots(
                plot_actions,
                validate_json=config.get("services.events.validate_plot_str", False),
                compression_threshold=config.get(
                    "services.events.plot_compression_threshold", 100_000
                ),
            )

        added = 0
        with translate_errors_context():
            if actions:
                chunk_size = 500
                with TimingContext("es", "events_add_batch"):
                    # TODO: replace it with helpers.parallel_bulk in the future once the parallel pool leak is fixed
                    with closing(
                        helpers.streaming_bulk(
                            self.es,
                            actions,
                            chunk_size=chunk_size,
                            # thread_count=8,
                            refresh=True,
                        )
                    ) as it:
                        for success, info in it:
                            if success:
                                added += 1
                            else:
                                errors_per_type["Error when indexing events batch"] += 1

                    remaining_tasks = set()
                    now = datetime.utcnow()
                    for task_id in task_ids:
                        # Update all related tasks. For performance reasons we update
                        # all of them, not only those whose events were indexed successfully
                        updated = self._update_task(
                            company_id=company_id,
                            task_id=task_id,
                            now=now,
                            iter_max=task_iteration.get(task_id),
                            last_scalar_events=task_last_scalar_events.get(task_id),
                            last_events=task_last_events.get(task_id),
                        )

                        if not updated:
                            remaining_tasks.add(task_id)

                    if remaining_tasks:
                        TaskBLL.set_last_update(
                            remaining_tasks, company_id, last_update=now
                        )

            # for backwards compatibility with streaming_bulk, which used to raise an exception on these
            invalid_iterations_count = errors_per_type.get(invalid_iteration_error)
            if invalid_iterations_count:
                raise BulkIndexError(
                    f"{invalid_iterations_count} document(s) failed to index.",
                    [invalid_iteration_error],
                )

        if not added:
            raise errors.bad_request.EventsNotAdded(**errors_per_type)

        errors_count = sum(errors_per_type.values())
        return added, errors_count, errors_per_type
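
The `_id` handling in Example #5 is what makes non-log events idempotent: `_get_event_id` (not shown) presumably derives a deterministic ID from the fields that identify an event, so re-sending the same event overwrites the existing document instead of creating a duplicate, while log events always get a fresh random ID. A hypothetical sketch of such a function:

import hashlib

def _get_event_id(event: dict) -> str:
    # Hypothetical: hash the identifying fields so the same logical event
    # always maps to the same Elasticsearch document ID.
    key = "_".join(
        str(event.get(field, ""))
        for field in ("task", "type", "metric", "variant", "iter")
    )
    return hashlib.md5(key.encode("utf8")).hexdigest()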