import json
from collections import defaultdict
from time import perf_counter
from typing import List, Optional, Tuple

from elasticsearch import Elasticsearch
from elasticsearch_dsl import Q as ES_Q, Search

# NOTE: logger, format_log, chunks, filter_query, delete_query, _is_allowed_key_field_type,
# ES_AWARDS_UNIQUE_KEY_FIELD and ES_TRANSACTIONS_UNIQUE_KEY_FIELD are project-internal names defined elsewhere in
# the codebase; the standalone sketches of some of them below are illustrative assumptions, not the project's
# actual implementations.


def get_deleted_award_ids(client: Elasticsearch, id_list: list, config: dict, index: Optional[str] = None) -> list:
    """
    id_list = [
        {key: 'key1', col: 'transaction_id'},
        {key: 'key2', col: 'generated_unique_transaction_id'},
        ...
    ]
    """
    if index is None:
        index = f"{config['query_alias_prefix']}-*"
    col_to_items_dict = defaultdict(list)
    for l in id_list:
        col_to_items_dict[l["col"]].append(l["key"])
    awards = []
    for column, values in col_to_items_dict.items():
        values_generator = chunks(values, 1000)
        for v in values_generator:
            body = filter_query(column, v)
            response = client.search(index=index, body=json.dumps(body), size=config["max_query_size"])
            if response["hits"]["total"]["value"] != 0:
                # Accumulate across chunks and columns rather than overwriting earlier results
                awards += [x["_source"]["generated_unique_award_id"] for x in response["hits"]["hits"]]
    return awards
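# The ``chunks`` helper used throughout this module is not shown in this excerpt. A minimal sketch, assuming it
# simply yields successive fixed-size slices of the given list (an illustration only, not the project's helper):
def chunks(items: list, size: int):
    """Yield successive ``size``-length slices of ``items``."""
    for i in range(0, len(items), size):
        yield items[i : i + size]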
def delete_docs_by_unique_key(client: Elasticsearch, key: str, value_list: list, task_id: str, index) -> int:
    """
    Bulk delete a batch of documents whose field identified by ``key`` matches any value provided in the
    ``values_list``.

    Args:
        client (Elasticsearch): elasticsearch-dsl client for making calls to an ES cluster
        key (str): name of field in the targeted elasticsearch index that should have a unique value for every doc
            in the index. Ideally the field or sub-field provided is of ``keyword`` type.
        value_list (list): if key field has these values, the document will be deleted
        task_id (str): name of ES ETL job being run, used in logging
        index (str): name of index (or alias) to target for the ``_delete_by_query`` ES operation.

            NOTE: This delete routine looks at just the index name given. If there are duplicate records across
            multiple indexes, an alias or wildcard should be provided for the ``index`` param that covers multiple
            indices, or this will need to be run once per index.

    Returns: Number of ES documents deleted
    """
    start = perf_counter()
    if len(value_list) == 0:
        logger.info(format_log("Nothing to delete", action="Delete", name=task_id))
        return 0

    logger.info(format_log(f"Deleting up to {len(value_list):,} document(s)", action="Delete", name=task_id))
    if not index:
        raise RuntimeError("index name must be provided")

    deleted = 0
    is_error = False
    try:
        # 65,536 is the max number of terms that can be added to an ES terms filter query
        values_generator = chunks(value_list, 50000)
        for chunk_of_values in values_generator:
            # Creates an Elasticsearch query criteria for the _delete_by_query call
            q = ES_Q("terms", **{key: chunk_of_values})
            # Invoking _delete_by_query as per the elasticsearch-dsl docs:
            # https://elasticsearch-dsl.readthedocs.io/en/latest/search_dsl.html#delete-by-query
            response = Search(using=client, index=index).filter(q).delete()
            chunk_deletes = response["deleted"]
            deleted += chunk_deletes
    except Exception:
        is_error = True
        logger.exception(format_log("", name=task_id, action="Delete"))
        raise SystemExit(1)
    finally:
        error_text = " before encountering an error" if is_error else ""
        duration = perf_counter() - start
        docs = f"document{'s' if deleted != 1 else ''}"
        msg = f"Delete operation took {duration:.2f}s. Removed {deleted:,} {docs}{error_text}"
        logger.info(format_log(msg, action="Delete", name=task_id))

    return deleted
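# Hedged usage sketch for delete_docs_by_unique_key above. The cluster URL, index alias, and id values are
# illustrative assumptions, not values taken from this codebase:
def _example_bulk_delete() -> int:  # hypothetical helper, for illustration only
    es_client = Elasticsearch(["http://localhost:9200"])  # assumed local cluster
    return delete_docs_by_unique_key(
        client=es_client,
        key="generated_unique_transaction_id",  # a keyword-type unique key field
        value_list=["CONT_TX_EXAMPLE_1", "CONT_TX_EXAMPLE_2"],  # illustrative values
        task_id="example-delete-job",
        index="transaction-query-alias",  # assumed alias name
    )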
def _lookup_deleted_award_ids(client: Elasticsearch, id_list: list, config: dict, index: Optional[str] = None) -> list:
    """Lookup deleted transactions to derive parent awards to be deleted

    This fetches a list of all unique award keys compiled from the ``ES_AWARDS_UNIQUE_KEY_FIELD`` field of any
    document in the transaction index that matches the query, which looks up deleted transaction ES documents by
    their ``ES_TRANSACTIONS_UNIQUE_KEY_FIELD`` field.

    Args:
        client (Elasticsearch): elasticsearch-dsl client for making calls to an ES cluster
        id_list (list): A list of dictionaries, each having two keys, in this format::

            id_list = [
                {key: '<value_of_col>', col: '<unique_key_col_name>'},
                {key: '<value_of_col>', col: '<unique_key_col_name>'},
                ...,
            ]

        config (dict): collection of key-value pairs that encapsulates runtime arguments for this ES management task
        index (str): Optional name, alias, or pattern of index this query will target. Looks up via config if not
            provided

    Returns: list of values from the ``ES_AWARDS_UNIQUE_KEY_FIELD`` field of the matched documents
    """
    if index is None:
        index = f"{config['query_alias_prefix']}-*"
    col_to_items_dict = defaultdict(list)
    for l in id_list:
        col_to_items_dict[l["col"]].append(l["key"])
    awards = []
    for column, values in col_to_items_dict.items():
        values_generator = chunks(values, 1000)
        for v in values_generator:
            body = filter_query(column, v)
            response = client.search(index=index, body=json.dumps(body), size=config["max_query_size"])
            if response["hits"]["total"]["value"] != 0:
                awards += [x["_source"][ES_AWARDS_UNIQUE_KEY_FIELD] for x in response["hits"]["hits"]]
    return awards
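# ``filter_query`` (used above and in delete_from_es below) is not included in this excerpt. A minimal sketch,
# assuming it builds a simple terms-style query body over the given column and values; the project's real helper
# may construct the query differently:
def filter_query(column: str, values: list) -> dict:
    """Build a query body matching docs whose ``column`` field holds any of ``values``."""
    return {"query": {"terms": {column: values}}}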
def delete_from_es(
    client: Elasticsearch,
    id_list: List[dict],
    index: str,
    max_query_size: int,
    use_aliases: bool = False,
    task_id: Optional[Tuple[int, str]] = None,
) -> None:
    """
    id_list = [
        {key: 'key1', col: 'transaction_id'},
        {key: 'key2', col: 'generated_unique_transaction_id'},
        ...
    ]
    - or -
    id_list = [
        {key: 'key1', col: 'award_id'},
        {key: 'key2', col: 'generated_unique_award_id'},
        ...
    ]
    """
    start = perf_counter()
    msg = f"Deleting up to {len(id_list):,} document{'s' if len(id_list) != 1 else ''}"
    logger.info(format_log(msg, name=task_id, action="Delete"))

    if use_aliases:
        index = f"{index}-*"
    start_ = client.count(index=index)["count"]
    logger.info(format_log(f"Starting document count ----- {start_:,}", name=task_id, action="Delete"))

    col_to_items_dict = defaultdict(list)
    for l in id_list:
        col_to_items_dict[l["col"]].append(l["key"])

    for column, values in col_to_items_dict.items():
        logger.info(format_log(f"Deleting {len(values):,} of '{column}'", name=task_id, action="Delete"))
        values_generator = chunks(values, 1000)
        for v in values_generator:
            # IMPORTANT: This delete routine looks at just 1 index at a time. If there are duplicate records across
            # multiple indexes, those duplicates will not be caught by this routine. It is left as-is because, at
            # the time of this comment, we are migrating to using a single index.
            body = filter_query(column, v)
            response = client.search(index=index, body=json.dumps(body), size=max_query_size)
            delete_body = delete_query(response)
            try:
                client.delete_by_query(index=index, body=json.dumps(delete_body), refresh=True, size=max_query_size)
            except Exception:
                logger.exception(format_log("", name=task_id, action="Delete"))
                raise SystemExit(1)

    end_ = client.count(index=index)["count"]
    record_count = start_ - end_
    duration = perf_counter() - start
    msg = f"Delete operation took {duration:.2f}s. Removed {record_count:,} document{'s' if record_count != 1 else ''}"
    logger.info(format_log(msg, name=task_id, action="Delete"))
    return
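# ``delete_query`` (used by delete_from_es above) is also not included in this excerpt. A minimal sketch, assuming
# it turns the hits of a prior search response into an ids query body for _delete_by_query; the project's real
# helper may differ:
def delete_query(response: dict) -> dict:
    """Build a _delete_by_query body targeting the ``_id`` of each hit in ``response``."""
    return {"query": {"ids": {"values": [hit["_id"] for hit in response["hits"]["hits"]]}}}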
def delete_docs_by_unique_key(
    client: Elasticsearch,
    key: str,
    value_list: list,
    task_id: str,
    index,
    refresh_after: bool = True,
    delete_chunk_size: int = 1000,
) -> int:
    """
    Bulk delete a batch of documents whose field identified by ``key`` matches any value provided in the
    ``values_list``.

    NOTE: This delete routine looks at just the index name given. If there are duplicate records across multiple
    indexes, an alias or wildcard should be provided for the ``index`` param that covers multiple indices, or this
    will need to be run once per index.

    Args:
        client (Elasticsearch): elasticsearch-dsl client for making calls to an ES cluster
        key (str): name of field in the targeted elasticsearch index that should have a unique value for every doc
            in the index. The field or sub-field provided MUST be of ``keyword`` type (or the ``_id`` meta field)
        value_list (list): if key field has these values, the document will be deleted
        task_id (str): name of ES ETL job being run, used in logging
        index (str): name of index (or alias) to target for the ``_delete_by_query`` ES operation.
        refresh_after (bool): Whether to call ``_refresh`` on the index when all of the provided values in
            ``value_list`` have been processed for delete; defaults to ``True``. If many small deletes happen at a
            rapid rate, it may be best to set this ``False`` and await a deferred refresh afterward in the calling
            code. NOTE: This param will be ignored and a refresh will be attempted if this function errors-out
            during execution, in order to not leave un-refreshed deletes in the index.
        delete_chunk_size (int): the batch-size of the terms value-array given to each _delete_by_query call. Needs
            to be less than 65536 (max values for any terms query), and less than the index.max_result_window
            setting. Ideally use ``config["partition_size"]`` (derived from --partition-size) to set this to a
            calibrated value. If not provided, uses 1000 as a safe default (10,000 resulted in some timeouts on a
            busy cluster).

    Returns: Number of ES documents deleted
    """
    start = perf_counter()

    if len(value_list) == 0:
        logger.info(format_log("Nothing to delete", action="Delete", name=task_id))
        return 0

    logger.info(format_log(f"Deleting up to {len(value_list):,} document(s)", action="Delete", name=task_id))
    if not index:
        raise RuntimeError("index name must be provided")

    if not _is_allowed_key_field_type(client, key, index):
        msg = (
            f'Cannot perform deletes in index "{index}" by key field "{key}" because its type is not one of '
            f"the allowed field types, or the field was not found in that index."
        )
        logger.error(format_log(msg=msg, action="Delete", name=task_id))
        raise RuntimeError(msg)

    if delete_chunk_size > 65536:
        # 65,536 is the max number of terms that can be added to an ES terms filter query
        msg = (
            f"{delete_chunk_size} is greater than 65,536, which is the max number of terms that can be added to an "
            f"ES terms filter query"
        )
        logger.error(format_log(msg=msg, action="Delete", name=task_id))
        raise RuntimeError(msg)

    chunks_processed = 0
    deleted = 0
    is_error = False
    try:
        values_generator = chunks(value_list, delete_chunk_size)
        for chunk_of_values in values_generator:
            # Invoking _delete_by_query as per the elasticsearch-dsl docs:
            # https://elasticsearch-dsl.readthedocs.io/en/latest/search_dsl.html#delete-by-query
            # _refresh is deferred until the end of chunk processing
            q = Search(using=client, index=index).filter("terms", **{key: chunk_of_values})  # type: Search
            # params:
            # conflicts="proceed": Ignores version conflict errors if a doc delete is attempted more than once
            # slices="auto": Will create parallel delete batches per shard
            q = q.params(conflicts="proceed", slices="auto")
            response = q.delete()

            # Some subtle errors come back on the response
            if response["timed_out"]:
                msg = f"Delete request timed out on cluster after {int(response['took']) / 1000:.2f}s"
                logger.error(format_log(msg=msg, action="Delete", name=task_id))
                raise RuntimeError(msg)
            if response["failures"]:
                fail_snippet = "\n\t\t" + "\n\t\t".join(map(str, response["failures"][0:4])) + "\n\t\t" + "..."
                msg = f"Some docs failed to delete on cluster:{fail_snippet}"
                logger.error(format_log(msg=msg, action="Delete", name=task_id))
                raise RuntimeError(msg)

            logger.info(
                format_log(
                    f"Deleted {response['deleted']:,} docs in ES from chunk of size {len(chunk_of_values):,} "
                    f"in {int(response['took']) / 1000:.2f}s, "
                    f"and ignored {response['version_conflicts']:,} version conflicts",
                    action="Delete",
                    name=task_id,
                )
            )
            deleted += response["deleted"]
            chunks_processed += 1
    except Exception:
        is_error = True
        logger.exception(format_log("", name=task_id, action="Delete"))
        raise
    finally:
        if deleted > 0 and (refresh_after or is_error):
            if not is_error:
                refresh_msg = "Refreshing index so deletes take effect"
            else:
                refresh_msg = "Attempting index refresh while handling error so deletes take effect"
            logger.info(format_log(refresh_msg, action="Delete", name=task_id))
            client.indices.refresh(index=index)
        if chunks_processed > 1 or is_error:
            # This summary log is redundant unless it sums deletes across multiple chunks (or reports an error)
            error_text = " before encountering an error" if is_error else ""
            duration = perf_counter() - start
            docs = f"document{'s' if deleted != 1 else ''}"
            msg = f"Delete operation took {duration:.2f}s. Removed {deleted:,} total {docs}{error_text}"
            logger.info(format_log(msg, action="Delete", name=task_id))

    return deleted
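# ``_is_allowed_key_field_type`` (used above and in _lookup_deleted_award_keys below) is referenced but not defined
# in this excerpt. A hedged sketch of one plausible implementation: accept the ``_id`` meta field, otherwise
# require a ``keyword`` type in the index mapping. The allowed types and mapping traversal are assumptions, not
# the project's actual rules:
def _is_allowed_key_field_type(client: Elasticsearch, key_field: str, index: str) -> bool:
    if key_field == "_id":
        # The _id meta field is always usable in a terms query
        return True
    mapping = client.indices.get_field_mapping(fields=key_field, index=index)
    for index_mapping in mapping.values():
        field_info = index_mapping.get("mappings", {}).get(key_field)
        if not field_info:
            # Field not found in this index's mapping
            return False
        leaf_name = key_field.split(".")[-1]
        if field_info["mapping"][leaf_name].get("type") != "keyword":
            return False
    return True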
def _lookup_deleted_award_keys(
    client: Elasticsearch,
    lookup_key: str,
    value_list: list,
    config: dict,
    index: Optional[str] = None,
    lookup_chunk_size: int = 50000,
) -> list:
    """Derive a list of award keys given a target index, lookup field, and lookup values

    This returns a list of all unique award keys, which are compiled from the ``ES_AWARDS_UNIQUE_KEY_FIELD`` field
    of any document in the given ``index`` that matches the query. The matching query is a terms query that will
    return the doc if its ``lookup_key`` field has any value provided in ``value_list``.

    Args:
        client (Elasticsearch): elasticsearch-dsl client for making calls to an ES cluster
        lookup_key (str): name of field in the targeted elasticsearch index by which we are looking up docs. The
            field or sub-field provided MUST be of ``keyword`` type (or the ``_id`` meta field)
        value_list (list): if the lookup_key field has any of these values, the document will be returned from the
            lookup
        config (dict): collection of key-value pairs that encapsulates runtime arguments for this ES management task
        index (str): Optional name, alias, or pattern of index this query will target. Looks up via config if not
            provided
        lookup_chunk_size (int): the batch-size of the terms value-array to be looked up. Needs to be less than
            65536 (max values for any terms query), and less than config["max_query_size"]

    Returns: list of values from the ES_AWARDS_UNIQUE_KEY_FIELD field in the looked-up documents
    """
    if index is None:
        index = f"{config['query_alias_prefix']}-*"

    if not _is_allowed_key_field_type(client, lookup_key, index):
        msg = (
            f'Cannot perform lookups in index "{index}" with key field "{lookup_key}" because its type is not one of '
            f"the allowed field types, or the field was not found in that index."
        )
        logger.error(format_log(msg=msg, action="Delete"))
        raise RuntimeError(msg)

    if lookup_chunk_size > 65536:
        # 65,536 is the max number of terms that can be added to an ES terms filter query
        msg = (
            f"{lookup_chunk_size} is greater than 65,536, which is the max number of terms that can be added to an "
            f"ES terms filter query"
        )
        logger.error(format_log(msg=msg, action="Delete"))
        raise RuntimeError(msg)

    if lookup_chunk_size > config["max_query_size"]:
        # Some keys would be left undiscovered if our chunk was cut short by the query only returning a lesser subset
        msg = (
            f"{lookup_chunk_size} is greater than {config['max_query_size']}, which is the max number of query "
            f"results returnable from this index. Use a smaller chunk or increase max_result_window for this index."
        )
        logger.error(format_log(msg=msg, action="Delete"))
        raise RuntimeError(msg)

    award_key_list = []
    values_generator = chunks(value_list, lookup_chunk_size)
    for chunk_of_values in values_generator:
        q = Search(using=client, index=index).filter("terms", **{lookup_key: chunk_of_values})  # type: Search
        q.update_from_dict({"size": config["max_query_size"]})
        response = q.execute()
        if response["hits"]["total"]["value"] != 0:
            award_key_list += [x["_source"][ES_AWARDS_UNIQUE_KEY_FIELD] for x in response["hits"]["hits"]]
    return award_key_list
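# Hedged end-to-end sketch tying the pieces together: look up the award keys of deleted transactions, then delete
# the corresponding award docs with the newer delete_docs_by_unique_key. The transaction ids, award alias name, and
# config keys used here are illustrative assumptions, not values from this codebase:
def _example_delete_awards_for_deleted_transactions(client: Elasticsearch, config: dict, task_id: str) -> int:
    deleted_transaction_keys = ["CONT_TX_EXAMPLE_1", "CONT_TX_EXAMPLE_2"]  # illustrative values
    award_keys = _lookup_deleted_award_keys(
        client,
        lookup_key=ES_TRANSACTIONS_UNIQUE_KEY_FIELD,
        value_list=deleted_transaction_keys,
        config=config,
    )
    return delete_docs_by_unique_key(
        client,
        key=ES_AWARDS_UNIQUE_KEY_FIELD,
        value_list=award_keys,
        task_id=task_id,
        index="award-query-alias",  # assumed alias name
        delete_chunk_size=config.get("partition_size", 1000),
    )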