# imports assumed by this snippet; patch may equally come from the external mock package
from unittest.mock import patch

from elasticsearch.helpers import BulkIndexError


def test_remove_failure_bulk(self):
    """ the remove operation should fail """
    doc_id = 'test_id'
    error = {'delete': {
        'status': 500,
        '_index': 'test_index',
        '_version': 1,
        'found': True,
        '_id': doc_id
    }}
    # simulate the bulk helper reporting an item-level delete failure
    with patch('search.elastic.bulk', side_effect=BulkIndexError('Simulated error', [error])):
        with self.assertRaises(BulkIndexError):
            self.searcher.remove(["test_id"])
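
# --- Added companion sketch (not part of the original test suite): the success
# path for the same remove() call. It assumes the searcher delegates deletes to
# search.elastic.bulk and that, like elasticsearch.helpers.bulk, the helper
# returns a (success_count, errors) tuple when everything goes through.
def test_remove_success_bulk(self):
    """ the remove operation should succeed when bulk reports no failures """
    with patch('search.elastic.bulk', return_value=(1, [])) as mock_bulk:
        self.searcher.remove(["test_id"])
        # the delete is expected to be routed through the mocked bulk helper
        self.assertTrue(mock_bulk.called)
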
# imports assumed by this snippet
from elasticsearch import helpers
from elasticsearch.helpers import BulkIndexError


def flush(self):
    if len(self.buffer) > 0:
        # drain the buffer in a single parallel bulk call
        results = helpers.parallel_bulk(
            client=self.client,
            actions=self.buffer,
            index=self.index_name,
            doc_type=self.experiment_name)
        # collect the item-level responses that parallel_bulk flags as failed
        errors = [status for success, status in results if not success]
        if errors:
            raise BulkIndexError(
                "{} document(s) failed to index.".format(len(errors)), errors)
        # only clear the buffer once every buffered action was indexed
        self.buffer.clear()
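
# --- Added usage sketch (illustrative; the writer object, its fields and the
# document shape are assumptions). It shows how the flush() above is typically
# driven: actions accumulate in the buffer and item-level failures surface as a
# single BulkIndexError whose .errors attribute holds the raw responses
# collected in flush().
def flush_with_error_report(writer):
    # writer is assumed to expose the buffer and the flush() defined above
    writer.buffer.append({"metric": "loss", "value": 0.42, "step": 1})
    try:
        writer.flush()
    except BulkIndexError as exc:
        for err in exc.errors:
            print("bulk item failed:", err)
        raise
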
# imports assumed by this snippet (elasticsearch-py 7.x-era helpers)
from operator import methodcaller

from elasticsearch import TransportError
from elasticsearch.helpers import BulkIndexError


async def _process_bulk_chunk(client, bulk_actions, bulk_data,
                              raise_on_exception=True, raise_on_error=True,
                              *args, **kwargs):
    """
    Send a bulk request to elasticsearch and process the output.
    """
    # if raise on error is set, we need to collect errors per chunk before raising them
    errors = []

    try:
        # send the actual request
        resp = await client.bulk('\n'.join(bulk_actions) + '\n', *args, **kwargs)
    except TransportError as e:
        # default behavior - just propagate exception
        if raise_on_exception:
            raise e

        # if we are not propagating, mark all actions in current chunk as failed
        err_message = str(e)
        exc_errors = []

        for data in bulk_data:
            # collect all the information about failed actions
            op_type, action = data[0].copy().popitem()
            info = {"error": err_message, "status": e.status_code, "exception": e}
            if op_type != 'delete':
                info['data'] = data[1]
            info.update(action)
            exc_errors.append({op_type: info})

        # emulate standard behavior for failed actions
        if raise_on_error:
            raise BulkIndexError(
                '%i document(s) failed to index.' % len(exc_errors), exc_errors)
        else:
            for err in exc_errors:
                yield False, err
            return

    # go through request-response pairs and detect failures
    for data, (op_type, item) in zip(
            bulk_data, map(methodcaller('popitem'), resp['items'])):
        ok = 200 <= item.get('status', 500) < 300
        if not ok and raise_on_error:
            # include original document source
            if len(data) > 1:
                item['data'] = data[1]
            errors.append({op_type: item})

        if ok or not errors:
            # if we are not just recording all errors to be able to raise
            # them all at once, yield items individually
            yield ok, {op_type: item}

    if errors:
        raise BulkIndexError('%i document(s) failed to index.' % len(errors), errors)
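
# --- Added driver sketch (assumptions: an async elasticsearch client `es` that
# supports the 7.x-style bulk(body) call used above, and a hypothetical index
# name). It serializes index actions into the (action, source) pairs and
# newline-delimited lines that _process_bulk_chunk expects, then counts the
# items that came back with a 2xx status.
import json

async def index_chunk(es, docs, index="my-index"):
    bulk_data = []
    bulk_actions = []
    for i, doc in enumerate(docs):
        action = {"index": {"_index": index, "_id": str(i)}}
        bulk_data.append((action, doc))          # pairs used for error reporting
        bulk_actions.append(json.dumps(action))  # action metadata line
        bulk_actions.append(json.dumps(doc))     # document source line
    ok_count = 0
    async for ok, item in _process_bulk_chunk(
            es, bulk_actions, bulk_data, raise_on_error=False):
        ok_count += int(ok)
    return ok_count
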
def add_events(
    self, company_id, events, worker, allow_locked_tasks=False
) -> Tuple[int, int, dict]:
    # helpers such as nested_dict, es_factory, EventType, EVENT_TYPES, get_index_name,
    # config, TimingContext, translate_errors_context, TaskBLL, dbutils, errors,
    # MAX_LONG and MIN_LONG come from the surrounding project
    actions: List[dict] = []
    task_ids = set()
    task_iteration = defaultdict(lambda: 0)
    task_last_scalar_events = nested_dict(
        3, dict
    )  # task_id -> metric_hash -> variant_hash -> MetricEvent
    task_last_events = nested_dict(
        3, dict
    )  # task_id -> metric_hash -> event_type -> MetricEvent
    errors_per_type = defaultdict(int)
    invalid_iteration_error = f"Iteration number should not exceed {MAX_LONG}"
    valid_tasks = self._get_valid_tasks(
        company_id,
        task_ids={
            event["task"] for event in events if event.get("task") is not None
        },
        allow_locked_tasks=allow_locked_tasks,
    )

    for event in events:
        # remove spaces from event type
        event_type = event.get("type")
        if event_type is None:
            errors_per_type["Event must have a 'type' field"] += 1
            continue

        event_type = event_type.replace(" ", "_")
        if event_type not in EVENT_TYPES:
            errors_per_type[f"Invalid event type {event_type}"] += 1
            continue

        task_id = event.get("task")
        if task_id is None:
            errors_per_type["Event must have a 'task' field"] += 1
            continue

        if task_id not in valid_tasks:
            errors_per_type["Invalid task id"] += 1
            continue

        event["type"] = event_type

        # @timestamp indicates the time the event is written, not when it happened
        event["@timestamp"] = es_factory.get_es_timestamp_str()

        # for backward compatibility: accept "ts" as an alias for "timestamp"
        if "ts" in event:
            event["timestamp"] = event.pop("ts")

        # set timestamp and worker if not sent
        if "timestamp" not in event:
            event["timestamp"] = es_factory.get_timestamp_millis()

        if "worker" not in event:
            event["worker"] = worker

        # force iter to be a long int
        iter = event.get("iter")
        if iter is not None:
            iter = int(iter)
            if iter > MAX_LONG or iter < MIN_LONG:
                errors_per_type[invalid_iteration_error] += 1
                continue
            event["iter"] = iter

        # used to have "values" to indicate array. no need anymore
        if "values" in event:
            event["value"] = event["values"]
            del event["values"]

        event["metric"] = event.get("metric") or ""
        event["variant"] = event.get("variant") or ""

        index_name = get_index_name(company_id, event_type)
        es_action = {
            "_op_type": "index",  # overwrite if exists with same ID
            "_index": index_name,
            "_source": event,
        }

        # for "log" events, don't assign custom _id - whatever is sent, is written (not overwritten)
        if event_type != EventType.task_log.value:
            es_action["_id"] = self._get_event_id(event)
        else:
            es_action["_id"] = dbutils.id()

        task_ids.add(task_id)
        if (
            iter is not None
            and event.get("metric") not in self._skip_iteration_for_metric
        ):
            task_iteration[task_id] = max(iter, task_iteration[task_id])

        self._update_last_metric_events_for_task(
            last_events=task_last_events[task_id], event=event,
        )
        if event_type == EventType.metrics_scalar.value:
            self._update_last_scalar_events_for_task(
                last_events=task_last_scalar_events[task_id], event=event
            )

        actions.append(es_action)

    plot_actions = [
        action["_source"]
        for action in actions
        if action["_source"]["type"] == EventType.metrics_plot.value
    ]
    if plot_actions:
        self.validate_and_compress_plots(
            plot_actions,
            validate_json=config.get("services.events.validate_plot_str", False),
            compression_threshold=config.get(
                "services.events.plot_compression_threshold", 100_000
            ),
        )

    added = 0
    with translate_errors_context():
        if actions:
            chunk_size = 500
            with TimingContext("es", "events_add_batch"):
                # TODO: replace it with helpers.parallel_bulk in the future once the parallel pool leak is fixed
                with closing(
                    helpers.streaming_bulk(
                        self.es,
                        actions,
                        chunk_size=chunk_size,
                        # thread_count=8,
                        refresh=True,
                    )
                ) as it:
                    for success, info in it:
                        if success:
                            added += 1
                        else:
                            errors_per_type["Error when indexing events batch"] += 1

            remaining_tasks = set()
            now = datetime.utcnow()
            for task_id in task_ids:
                # Update related tasks. For reasons of performance, we prefer to update
                # all of them and not only those whose events were successful
                updated = self._update_task(
                    company_id=company_id,
                    task_id=task_id,
                    now=now,
                    iter_max=task_iteration.get(task_id),
                    last_scalar_events=task_last_scalar_events.get(task_id),
                    last_events=task_last_events.get(task_id),
                )

                if not updated:
                    remaining_tasks.add(task_id)
                    continue

            if remaining_tasks:
                TaskBLL.set_last_update(
                    remaining_tasks, company_id, last_update=now
                )

    # this is for backwards compatibility with streaming bulk throwing exception on those
    invalid_iterations_count = errors_per_type.get(invalid_iteration_error)
    if invalid_iterations_count:
        raise BulkIndexError(
            f"{invalid_iterations_count} document(s) failed to index.",
            [invalid_iteration_error],
        )

    if not added:
        raise errors.bad_request.EventsNotAdded(**errors_per_type)

    errors_count = sum(errors_per_type.values())
    return added, errors_count, errors_per_type
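
# --- Added caller sketch (not from the original project; the handler name and
# printing are illustrative). It shows how a caller of add_events() above might
# react to the two failure modes the method exposes: the BulkIndexError raised
# for out-of-range iteration values, and the per-type error counts returned for
# events that were skipped or failed to index.
from elasticsearch.helpers import BulkIndexError

def add_events_with_reporting(event_bll, company_id, events, worker):
    try:
        added, errors_count, errors_per_type = event_bll.add_events(
            company_id, events, worker
        )
    except BulkIndexError as exc:
        # exc.errors carries the error description(s) built inside add_events()
        print("bulk indexing failed:", exc.errors)
        raise
    if errors_count:
        print(f"partially added {added} event(s); errors: {dict(errors_per_type)}")
    return added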