def test_get_id_range_for_partition_with_one_over():
    """Verify partition ID bounds when exactly one ID spills into the final partition.

    IDs 1..101 with a partition size of 20 leave a single ID for the last
    partition, so that partition's lower and upper bound must both be 101.
    This scenario was buggy in the past.
    """
    min_id, max_id = 1, 101
    partition_size = 20
    # Count of distinct IDs in the contiguous range [min_id, max_id]
    id_range_item_count = max_id - min_id + 1
    # Guard the test setup itself: the range overshoots full partitions by exactly one ID
    assert id_range_item_count % partition_size == 1

    ctrl = Controller({"partition_size": partition_size})
    ctrl.min_id = min_id
    ctrl.max_id = max_id
    ctrl.record_count = id_range_item_count  # pretend a record exists for every ID in the range
    ctrl.config["partitions"] = ctrl.determine_partitions()
    assert ctrl.config["partitions"] == ceil(id_range_item_count / partition_size)
    partition_range = range(0, ctrl.config["partitions"])

    # The first two partitions are full: each covers exactly partition_size consecutive IDs
    for position in (0, 1):
        lower, upper = ctrl.get_id_range_for_partition(partition_range[position])
        assert lower == min_id + position * partition_size
        assert upper == lower + (partition_size - 1)

    # The last partition must start where the full ones end and stop at max_id
    final_partition = partition_range[-1]
    lower, upper = ctrl.get_id_range_for_partition(final_partition)
    assert lower == (min_id + (partition_size * final_partition)) == 101
    assert upper == max_id == 101

    # No ID of the range may be left over once the helper has processed the partitions
    every_id = set(range(min_id, max_id + 1))
    assert _remove_seen_ids(ctrl, every_id) == set()
def test_get_id_range_for_partition_with_evenly_divisible():
    """Verify partition ID bounds when the ID range splits into completely full partitions.

    IDs 1..100 with a partition size of 20 divide evenly, so every partition,
    including the last one, spans exactly ``partition_size`` IDs.
    """
    min_id, max_id = 1, 100
    partition_size = 20
    # Count of distinct IDs in the contiguous range [min_id, max_id]
    id_range_item_count = max_id - min_id + 1
    # Guard the test setup itself: the range must divide evenly into partitions
    assert id_range_item_count % partition_size == 0

    ctrl = Controller({"partition_size": partition_size})
    ctrl.min_id = min_id
    ctrl.max_id = max_id
    ctrl.record_count = id_range_item_count  # pretend a record exists for every ID in the range
    ctrl.config["partitions"] = ctrl.determine_partitions()
    assert ctrl.config["partitions"] == ceil(id_range_item_count / partition_size)
    partition_range = range(0, ctrl.config["partitions"])

    # The first two partitions each cover exactly partition_size consecutive IDs
    for position in (0, 1):
        lower, upper = ctrl.get_id_range_for_partition(partition_range[position])
        assert lower == min_id + position * partition_size
        assert upper == lower + (partition_size - 1)

    # The last partition is full as well and must end exactly on max_id
    final_partition = partition_range[-1]
    lower, upper = ctrl.get_id_range_for_partition(final_partition)
    assert lower == (max_id - partition_size + 1) == (min_id + (partition_size * final_partition))
    assert upper == max_id

    # No ID of the range may be left over once the helper has processed the partitions
    every_id = set(range(min_id, max_id + 1))
    assert _remove_seen_ids(ctrl, every_id) == set()
def test_get_id_range_for_partition_with_empty_partitions():
    """Verify partition ID bounds when records are sparse within the ID range.

    Only eight record IDs exist between 1 and 100, so with a partition size of
    20 some partition ranges contain no records at all. The number of
    partitions and their bounds are still expected to follow the ID range,
    not the record count.
    """
    min_id, max_id = 1, 100
    partition_size = 20
    # Count of distinct IDs in the contiguous range [min_id, max_id]
    id_range_item_count = max_id - min_id + 1
    # Sparse IDs: nothing falls in 21..40 or 61..80
    record_ids = {1, 5, 7, 15, 19, 20, 41, 100}

    ctrl = Controller({"partition_size": partition_size})
    ctrl.min_id = min_id
    ctrl.max_id = max_id
    ctrl.record_count = len(record_ids)
    ctrl.config["partitions"] = ctrl.determine_partitions()
    assert ctrl.config["partitions"] == ceil(id_range_item_count / partition_size)
    partition_range = range(0, ctrl.config["partitions"])

    # The first two partitions each cover exactly partition_size consecutive IDs
    for position in (0, 1):
        lower, upper = ctrl.get_id_range_for_partition(partition_range[position])
        assert lower == min_id + position * partition_size
        assert upper == lower + (partition_size - 1)

    # The last partition must reach all the way up to max_id
    final_partition = partition_range[-1]
    lower, upper = ctrl.get_id_range_for_partition(final_partition)
    assert lower == (min_id + (partition_size * final_partition))
    assert upper == max_id

    # Every existing record ID must be accounted for by the helper
    assert _remove_seen_ids(ctrl, record_ids) == set()
def test_es_award_loader_class(award_data_fixture, elasticsearch_award_index, monkeypatch):
    """Run all load steps through a ``Controller`` and check the award index exists afterwards.

    The index named in ``award_config`` is deleted at the end of the test.
    """
    # Swap the SQL executor used by the loader helpers for the test mock
    monkeypatch.setattr(
        "usaspending_api.etl.elasticsearch_loader_helpers.utilities.execute_sql_statement",
        mock_execute_sql,
    )
    es_client = instantiate_elasticsearch_client()
    controller = Controller(award_config, es_client)
    assert controller.__class__.__name__ == "Controller"

    controller.run_load_steps()

    assert es_client.indices.exists(award_config["index_name"])
    # Clean up; ignore_unavailable=False makes the delete fail loudly if the index is missing
    es_client.indices.delete(index=award_config["index_name"], ignore_unavailable=False)
def handle(self, *args, **options):
    """Execute the Elasticsearch load described by the command options.

    Parses ``options`` into a config, makes sure the SQL view exists, then
    optionally runs deletes and the ETL through a ``Controller``. A banner
    with the script duration is always logged at the end.

    Raises:
        SystemExit(1): if any exception occurs while running deletes or the ETL.
        SystemExit(3): if the run succeeded but ``config["raise_status_code_3"]``
            is set, signalling that the job needs attention.
    """
    es_client = instantiate_elasticsearch_client()
    config = parse_cli_args(options, es_client)

    started_at = perf_counter()
    logger.info(format_log(f"Starting script\n{'=' * 56}"))
    start_msg = "target index: {index_name} | Starting from: {starting_date}"
    logger.info(format_log(start_msg.format(**config)))

    ensure_view_exists(config["sql_view"], force=True)
    error_addition = ""
    loader = Controller(config)

    if config["is_incremental_load"]:
        # Refresh is switched back on at the end of the process
        toggle_refresh_off(es_client, config["index_name"])

    try:
        if config["process_deletes"]:
            loader.run_deletes()

        if not config["deletes_only"]:
            loader.prepare_for_etl()
            loader.dispatch_tasks()
    except Exception as e:
        logger.error(f"{str(e)}")
        # Picked up by the duration banner logged in the finally clause
        error_addition = "before encountering a problem during execution.... "
        raise SystemExit(1)
    else:
        loader.complete_process()
        if config["drop_db_view"]:
            logger.info(format_log(f"Dropping SQL view '{config['sql_view']}'"))
            drop_etl_view(config["sql_view"], True)
    finally:
        elapsed = perf_counter() - started_at
        msg = f"Script duration was {elapsed:.2f}s {error_addition}|"
        headers = f"{'-' * (len(msg) - 2)} |"
        # Frame the duration message between two rule lines
        for banner_line in (headers, msg, headers):
            logger.info(format_log(banner_line))

    # Lets the pipeline tell apart a run that passed but needs attention
    if config["raise_status_code_3"]:
        raise SystemExit(3)
def test_create_and_load_new_award_index(award_data_fixture, elasticsearch_award_index, monkeypatch):
    """Create a brand-new awards index via the ETL ``Controller`` and load it from the DB.

    The index must not exist beforehand, and afterwards it must hold exactly
    as many documents as there are ``Award`` rows in the database.
    """
    client = elasticsearch_award_index.client  # type: Elasticsearch
    index_name = elasticsearch_award_index.index_name

    # Precondition: the index has not been created yet
    assert not client.indices.exists(index_name)
    expected_award_count = Award.objects.count()

    # Tell the ETL to create the index as part of this run
    elasticsearch_award_index.etl_config["create_new_index"] = True
    es_etl_config = _process_es_etl_test_config(client, elasticsearch_award_index)

    # The mock SQL function lets ETL code share the test DB connection and transaction.
    # It is patched on the module that imports it, not on the module that defines it.
    monkeypatch.setattr(
        "usaspending_api.etl.elasticsearch_loader_helpers.extract_data.execute_sql_statement",
        mock_execute_sql,
    )
    # The config object carries its own reference to the SQL function; override that too
    es_etl_config["execute_sql_func"] = mock_execute_sql

    controller = Controller(es_etl_config)
    assert controller.__class__.__name__ == "Controller"
    controller.prepare_for_etl()
    controller.dispatch_tasks()

    # Among other things this refreshes the index so the loaded docs become visible
    set_final_index_config(client, index_name)

    assert client.indices.exists(index_name)
    loaded_doc_count = client.count(index=index_name)["count"]
    assert loaded_doc_count == expected_award_count
def test_get_id_range_for_partition_one_records():
    """Verify partition ID bounds when the whole ID range is a single ID.

    With ``min_id == max_id == 1`` the first and the last partition lookup must
    both return the bounds ``(1, 1)``.
    """
    min_id = max_id = 1
    # Count of distinct IDs in the contiguous range [min_id, max_id]
    id_range_item_count = max_id - min_id + 1

    ctrl = Controller({"partition_size": 10000})
    ctrl.min_id = min_id
    ctrl.max_id = max_id
    ctrl.record_count = id_range_item_count  # pretend a record exists for every ID in the range
    ctrl.config["partitions"] = ctrl.determine_partitions()
    partition_range = range(0, ctrl.config["partitions"])

    # Look up the first and then the last partition; both must span just the one ID
    for position in (0, -1):
        lower, upper = ctrl.get_id_range_for_partition(partition_range[position])
        assert lower == min_id
        assert upper == max_id

    # No ID of the range may be left over once the helper has processed the partitions
    every_id = set(range(min_id, max_id + 1))
    assert _remove_seen_ids(ctrl, every_id) == set()
def test_incremental_load_into_award_index(award_data_fixture, elasticsearch_award_index, monkeypatch):
    """Incrementally load a modified award into an already populated awards index.

    After one ``Award`` row is changed in the DB and the ETL ``Controller`` is
    run, the index must still hold the same number of documents, and the
    document for that award must carry the new ``total_obligation``.
    """
    expected_award_count = Award.objects.count()
    elasticsearch_award_index.update_index()
    client = elasticsearch_award_index.client  # type: Elasticsearch
    index_name = elasticsearch_award_index.index_name

    # Precondition: the index exists and mirrors the DB
    assert client.indices.exists(index_name)
    doc_count = client.count(index=index_name)["count"]
    assert doc_count == expected_award_count

    # Suppress delete processing so only the incremental load is exercised
    elasticsearch_award_index.etl_config["process_deletes"] = False
    elasticsearch_award_index.etl_config["start_datetime"] = datetime.now(timezone.utc)
    es_etl_config = _process_es_etl_test_config(client, elasticsearch_award_index)

    # Change one DB row after the start datetime was captured
    awd = Award.objects.first()  # type: Award
    awd.total_obligation = 9999
    awd.save()

    # The mock SQL function lets ETL code share the test DB connection and transaction.
    # It is patched on the module that imports it, not on the module that defines it.
    monkeypatch.setattr(
        "usaspending_api.etl.elasticsearch_loader_helpers.extract_data.execute_sql_statement",
        mock_execute_sql,
    )
    # The config object carries its own reference to the SQL function; override that too
    es_etl_config["execute_sql_func"] = mock_execute_sql

    ensure_view_exists(es_etl_config["sql_view"], force=True)
    controller = Controller(es_etl_config)
    assert controller.__class__.__name__ == "Controller"
    controller.prepare_for_etl()
    controller.dispatch_tasks()
    client.indices.refresh(index_name)

    # The existing document was overwritten, so the count is unchanged
    assert client.indices.exists(index_name)
    doc_count = client.count(index=index_name)["count"]
    assert doc_count == expected_award_count

    es_awards = client.search(index=index_name)
    matching_hits = [hit for hit in es_awards["hits"]["hits"] if hit["_source"]["award_id"] == awd.id]
    updated_award = matching_hits[0]
    assert int(updated_award["_source"]["total_obligation"]) == 9999