def test_delete_obsolete_rows(operations_fixture):
    assert execute_sql("select id1, id2 from t2 order by id1") == [(1, 2), (4, 5), (9, 9)]
    operations.delete_obsolete_rows(ETLTable("t1"), ETLTable("t2"))
    assert execute_sql("select id1, id2 from t2 order by id1") == [(1, 2), (4, 5)]
def test_stage_table(operations_fixture):
    # Just copies the table.
    operations.stage_table(ETLTable("t1"), ETLTable("t2"), ETLTemporaryTable("t3"))
    assert execute_sql("select id1, id2 from t3 order by id1") == [(1, 2), (4, 5), (8, 8)]
def test_insert_missing_rows(operations_fixture):
    assert execute_sql("select id1, id2 from t2 order by id1") == [(1, 2), (4, 5), (9, 9)]
    operations.insert_missing_rows(ETLTable("t1"), ETLTable("t2"))
    assert execute_sql("select id1, id2 from t2 order by id1") == [(1, 2), (4, 5), (8, 8), (9, 9)]
def test_update_changed_rows(operations_fixture):
    assert execute_sql("select id1, id2, name, description from t2 order by id1") == [
        (1, 2, "three", "four"),
        (4, 5, "not six", "not seven"),
        (9, 9, "nine", "nine"),
    ]
    operations.update_changed_rows(ETLTable("t1"), ETLTable("t2"))
    assert execute_sql("select id1, id2, name, description from t2 order by id1") == [
        (1, 2, "three", "four"),
        (4, 5, "six", "seven"),
        (9, 9, "nine", "nine"),
    ]
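# The operations tests in this file all lean on an `operations_fixture` that is not part
# of this excerpt. Below is a minimal sketch of what it would have to seed, reconstructed
# from the assertions: t1 holds the desired state, t2 the current state with one obsolete
# row (9, 9) and one changed row (4, 5), while t1's (8, 8) row is missing from t2. The
# exact DDL, the pytest-django `db` fixture, the (8, 8) text values, and `execute_sql`
# accepting multi-statement DDL are all assumptions.
import pytest


@pytest.fixture
def operations_fixture(db):
    execute_sql(
        """
        create table t1 (id1 int, id2 int, name text, description text, primary key (id1, id2));
        create table t2 (id1 int, id2 int, name text, description text, primary key (id1, id2));
        insert into t1 values (1, 2, 'three', 'four'), (4, 5, 'six', 'seven'), (8, 8, 'eight', 'eight');
        insert into t2 values (1, 2, 'three', 'four'), (4, 5, 'not six', 'not seven'), (9, 9, 'nine', 'nine');
        """
    )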
def _perform_load(self): overrides = { "insert_overrides": { "create_date": SQL("now()"), "update_date": SQL("now()") }, "update_overrides": { "update_date": SQL("now()") }, } def_code_table = ETLTable("disaster_emergency_fund_code", **overrides) temp_def_code_table = ETLTemporaryTable( "temp_load_disaster_emergency_fund_codes") self._execute_dml_sql( CREATE_TEMP_TABLE, "Create disaster_emergency_fund_code temp table") self._execute_function_and_log(self._read_raw_def_code_csv, "Read raw DEF Code csv") self._execute_function(self._validate_raw_def_codes, "Validate raw DEF Codes") self._execute_function_and_log(self._import_def_codes, "Import DEF Codes") self._execute_function_and_log(update_changed_rows, "Update changed DEF Codes", temp_def_code_table, def_code_table) self._execute_function_and_log(insert_missing_rows, "Insert missing DEF Codes", temp_def_code_table, def_code_table)
def _perform_load(self):
    self._create_staging_table()
    self._import_input_file()

    natural_key_columns = ["feature_id", "state_alpha", "county_sequence", "county_numeric"]
    destination_table = ETLTable(self.destination_table_name, key_overrides=natural_key_columns)
    source_table = ETLTemporaryTable(self.staging_table_name)

    with Timer("Delete obsolete rows"):
        deleted_count = operations.delete_obsolete_rows(source_table, destination_table)
    logger.info(f"{deleted_count:,} rows deleted")

    with Timer("Update changed rows"):
        updated_count = operations.update_changed_rows(source_table, destination_table)
    logger.info(f"{updated_count:,} rows updated")

    with Timer("Insert missing rows"):
        inserted_count = operations.insert_missing_rows(source_table, destination_table)
    logger.info(f"{inserted_count:,} rows inserted")

    with Timer("Remove staging table"):
        execute_dml_sql(f'drop table if exists "{self.staging_table_name}"')

    change_count = deleted_count + updated_count + inserted_count
    current_record_count = CityCountyStateCode.objects.count()
    max_change_threshold = int(current_record_count * MAX_CHANGE_PERCENT / 100.0)
    if current_record_count > 0 and change_count > max_change_threshold and not self.force:
        raise RuntimeError(
            f"Exceeded maximum number of allowed changes {max_change_threshold:,} ({MAX_CHANGE_PERCENT}%). "
            f"Use --force switch if this was intentional."
        )

    return change_count
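# To make the change guard above concrete (the numbers here are hypothetical; the real
# MAX_CHANGE_PERCENT constant lives elsewhere in the loader module):
current_record_count = 60_000
MAX_CHANGE_PERCENT = 20
max_change_threshold = int(current_record_count * MAX_CHANGE_PERCENT / 100.0)  # 12,000
# A run whose deletes + updates + inserts exceed 12,000 rows combined raises
# RuntimeError unless the --force switch was supplied.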
def copy_broker_table_data(self, source_tablename, dest_tablename, primary_key): """Loop through the batches of IDs and load using the ETL tables""" destination = ETLTable(dest_tablename) source = ETLDBLinkTable(source_tablename, "broker_server", destination.data_types) transactions_remaining_count = self.total_ids_to_process for id_list in read_file_for_database_ids(str(self.file_path), self.chunk_size, is_numeric=False): with Timer(message="Batch upsert", success_logger=logger.info, failure_logger=logger.error): if len(id_list) != 0: predicate = self.extra_predicate + [{"field": primary_key, "op": "IN", "values": tuple(id_list)}] record_count = operations.upsert_records_with_predicate(source, destination, predicate, primary_key) else: logger.warning("No records to load. Please check parameters and settings to confirm accuracy") record_count = 0 transactions_remaining_count -= len(id_list) logger.info(f"{record_count:,} successful upserts, {transactions_remaining_count:,} remaining.") self.upsert_records += record_count
def _perform_load(self): overrides = { "insert_overrides": { "create_date": SQL("now()"), "update_date": SQL("now()") }, "update_overrides": { "update_date": SQL("now()") }, "key_overrides": ["object_class", "direct_reimbursable"], } object_class_table = ETLTable("object_class", **overrides) temp_object_class_table = ETLTemporaryTable("temp_load_object_classes") self._execute_dml_sql(CREATE_TEMP_TABLE, "Create object_class temp table") self._execute_function_and_log(self._read_raw_object_classes_csv, "Read raw object class csv") self._execute_function(self._validate_raw_object_classes, "Validate raw object classes") self._execute_function(self._add_unknown_object_classes, 'Add "unknown" object classes') self._execute_function(self._derive_remaining_fields, "Derive remaining fields") self._execute_function_and_log(self._import_object_classes, "Import object classes") # NOT deleting object classes is intentional for historical reasons. self._execute_function_and_log(update_changed_rows, "Update changed object classes", temp_object_class_table, object_class_table) self._execute_function_and_log(insert_missing_rows, "Insert missing object classes", temp_object_class_table, object_class_table)
def copy_broker_table_data(self, source_tablename, dest_tablename, primary_key): """Loop through the batches of IDs and load using the ETL tables""" destination = ETLTable(dest_tablename) source = ETLDBLinkTable(source_tablename, settings.DATA_BROKER_DBLINK_NAME, destination.data_types) transactions_remaining_count = self.total_ids_to_process for id_list in read_file_for_database_ids(str(self.file_path), self.chunk_size, is_numeric=False): with Timer(message=f"Upsert {len(id_list):,} records"): if len(id_list) != 0: predicate = self.extra_predicate + [ { "field": primary_key, "op": "IN", "values": tuple(id_list) } ] record_count = operations.upsert_records_with_predicate( source, destination, predicate, primary_key) else: logger.warning( "No records to load. Please check parameters and settings to confirm accuracy" ) record_count = 0 if transactions_remaining_count > len(id_list): transactions_remaining_count -= len(id_list) else: transactions_remaining_count = 0 self.upsert_records += record_count percentage = self.upsert_records * 100 / self.total_ids_to_process if self.total_ids_to_process != 0 else 0 logger.info(f"{self.upsert_records:,} successful upserts, " f"{transactions_remaining_count:,} remaining. " f"[{percentage:.2f}%]")
def test_identify_new_or_updated(operations_fixture):
    operations.identify_new_or_updated(ETLTable("t1"), ETLTable("t2"), ETLTemporaryTable("t3"))
    assert execute_sql("select id1, id2 from t3 order by id1") == [(4, 5), (8, 8)]
def test_etl_table(make_a_table):
    expected_data_types = OrderedDict(
        (
            ("id1", primatives.ColumnDefinition("id1", "integer", True)),
            ("id2", primatives.ColumnDefinition("id2", "integer", True)),
            ("name", primatives.ColumnDefinition("name", "text", False)),
            ("description", primatives.ColumnDefinition("description", "text", False)),
        )
    )

    # Happy path.
    table = ETLTable("totally_tubular_testing_table_for_two", "public")
    assert table.columns == ["id1", "id2", "name", "description"]
    assert dict(table.data_types) == expected_data_types
    assert table.update_overrides == {}
    assert table.insert_overrides == {}
    assert table.key_columns == [
        primatives.ColumnDefinition("id1", "integer", True),
        primatives.ColumnDefinition("id2", "integer", True),
    ]
    assert cc(table.object_representation) == '"public"."totally_tubular_testing_table_for_two"'

    # No schema.
    table = ETLTable("totally_tubular_testing_table_for_two")
    assert cc(table.object_representation) == '"public"."totally_tubular_testing_table_for_two"'

    # Happy overrides.
    table = ETLTable(
        "totally_tubular_testing_table_for_two",
        "public",
        key_overrides=["id1"],
        insert_overrides={"name": Literal("hello")},
        update_overrides={"description": Literal("o hi")},
    )
    assert table.columns == ["id1", "id2", "name", "description"]
    assert dict(table.data_types) == expected_data_types
    assert table.insert_overrides == {"name": Literal("hello")}
    assert table.update_overrides == {"description": Literal("o hi")}
    assert table.key_columns == [primatives.ColumnDefinition("id1", "integer", True)]
    assert cc(table.object_representation) == '"public"."totally_tubular_testing_table_for_two"'

    # Nonexistent overrides.
    table = ETLTable(
        "totally_tubular_testing_table_for_two",
        "public",
        key_overrides=["bogus"],
        insert_overrides={"bogus": SQL("now()")},
        update_overrides={"bogus": SQL("now()")},
    )
    with pytest.raises(RuntimeError):
        assert table.update_overrides is not None
    with pytest.raises(RuntimeError):
        assert table.insert_overrides is not None
    with pytest.raises(RuntimeError):
        assert table.key_columns is not None

    # Attempt to override primary keys.
    table = ETLTable(
        "totally_tubular_testing_table_for_two",
        "public",
        insert_overrides={"id1": SQL("now()")},
        update_overrides={"id1": SQL("now()")},
    )
    with pytest.raises(RuntimeError):
        assert table.update_overrides is not None
    with pytest.raises(RuntimeError):
        assert table.insert_overrides is not None

    # Everything for ETLTemporaryTable is the same as ETLTable just no schema.
    table = ETLTemporaryTable("totally_tubular_testing_table_for_two")
    assert cc(table.object_representation) == '"totally_tubular_testing_table_for_two"'
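# Neither the `make_a_table` fixture nor the `cc` helper appears in this excerpt. From
# expected_data_types above, the fixture would need roughly the table below (DDL is
# reconstructed from the assertions; `cc` is assumed to collapse whitespace in the
# composed SQL so string comparisons stay stable):
import pytest


@pytest.fixture
def make_a_table(db):
    execute_sql(
        """
        create table public.totally_tubular_testing_table_for_two (
            id1 integer,
            id2 integer,
            name text,
            description text,
            primary key (id1, id2)
        )
        """
    )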
def _perform_load(self): """ Grab the Broker subaward table and use it to update ours. """ broker_subaward = ETLTable("broker_subaward") subaward = ETLTable("subaward") remote_subaward = ETLDBLinkTable("subaward", settings.DATA_BROKER_DBLINK_NAME, broker_subaward.data_types) temp_broker_subaward = ETLTemporaryTable( "temp_load_subawards_broker_subaward") temp_new_or_updated = ETLTemporaryTable( "temp_load_subawards_new_or_updated") temp_subaward = ETLTemporaryTable("temp_load_subawards_subaward") if self.full_reload: logger.info( "--full-reload switch provided. Emptying subaward tables.") self._execute_dml_sql("delete from broker_subaward", "Empty broker_subaward table") self._execute_dml_sql("delete from subaward", "Empty subaward table") self._execute_function_and_log( operations.stage_table, "Copy Broker subaward table", source=remote_subaward, destination=broker_subaward, staging=temp_broker_subaward, ) # To help performance. self._execute_dml_sql( "create index idx_temp_load_subawards_broker_subaward on temp_load_subawards_broker_subaward(id)", "Create temporary index", ) # Create a list of new or updated subawards that we can use to filter down # subsequent operations. self._execute_function_and_log( operations.identify_new_or_updated, "Identify new/updated rows", source=temp_broker_subaward, destination=broker_subaward, staging=temp_new_or_updated, ) self._delete_update_insert_rows("broker_subawards", temp_broker_subaward, broker_subaward) # The Broker subaward table takes up a good bit of space so let's explicitly free # it up before continuing. self._execute_dml_sql( "drop table if exists temp_load_subawards_broker_subaward", "Drop staging table") self._execute_etl_dml_sql_directory_file("030_frame_out_subawards") self._execute_etl_dml_sql_directory_file( "040_enhance_with_awards_data") self._execute_etl_dml_sql_directory_file( "050_enhance_with_transaction_fpds_data.sql") self._execute_etl_dml_sql_directory_file( "051_enhance_with_transaction_fabs_data.sql") self._execute_etl_dml_sql_directory_file("060_enhance_with_cfda_data") self._execute_etl_dml_sql_directory_file( "070_enhance_with_awarding_agency") self._execute_etl_dml_sql_directory_file( "080_enhance_with_funding_agency") with Timer("Enhance with city and county information"): update_subaward_city_county("temp_load_subawards_subaward") self._execute_etl_dml_sql_directory_file( "110_enhance_with_pop_country") self._execute_etl_dml_sql_directory_file( "130_enhance_with_recipient_location_country") # Delete from broker_subaward because temp_subaward only has new/updated rows. self._execute_function_and_log(operations.delete_obsolete_rows, "Delete obsolete subawards", broker_subaward, subaward) # But update and insert from temp_subaward. self._execute_function_and_log(operations.update_changed_rows, "Update changed subawards", temp_subaward, subaward) self._execute_function_and_log(operations.insert_missing_rows, "Insert missing subawards", temp_subaward, subaward) self._execute_etl_dml_sql_directory_file("140_link_awards") self._execute_etl_dml_sql_directory_file("150_update_awards") # This is for unit tests. Temporary tables go away on their own once the transaction/session # drops, but because unit tests run in a transaction, our temporary tables do not go away # between loads... which is problematic. self._execute_dml_sql( "drop table if exists temp_load_subawards_new_or_updated", "Drop new/updated temp table") self._execute_dml_sql( "drop table if exists temp_load_subawards_subaward", "Drop subaward temp table")
def _perform_load(self): overrides = { "insert_overrides": { "create_date": SQL("now()"), "update_date": SQL("now()") }, "update_overrides": { "update_date": SQL("now()") }, } agency_table = ETLTable( "agency", key_overrides=["toptier_agency_id", "subtier_agency_id"], **overrides) cgac_table = ETLTable("cgac", key_overrides=["cgac_code"]) frec_table = ETLTable("frec", key_overrides=["frec_code"]) subtier_agency_table = ETLTable("subtier_agency", key_overrides=["subtier_code"], **overrides) toptier_agency_table = ETLTable("toptier_agency", key_overrides=["toptier_code"], **overrides) agency_query = ETLQueryFile(self.etl_dml_sql_directory / "agency_query.sql", temp_table=TEMP_TABLE_NAME) cgac_query = ETLQueryFile(self.etl_dml_sql_directory / "cgac_query.sql", temp_table=TEMP_TABLE_NAME) frec_query = ETLQueryFile(self.etl_dml_sql_directory / "frec_query.sql", temp_table=TEMP_TABLE_NAME) subtier_agency_query = ETLQueryFile(self.etl_dml_sql_directory / "subtier_agency_query.sql", temp_table=TEMP_TABLE_NAME) toptier_agency_query = ETLQueryFile( self.etl_dml_sql_directory / "toptier_agency_query.sql", temp_table=TEMP_TABLE_NAME, dod_subsumed=DOD_SUBSUMED_AIDS, ) path = self._get_sql_directory_file_path( "raw_agency_create_temp_table") sql = path.read_text().format(temp_table=TEMP_TABLE_NAME) self._execute_dml_sql(sql, "Create raw agency temp table") self._execute_function_and_log(self._read_raw_agencies_csv, "Read raw agencies csv") self._execute_function_and_log(self._import_raw_agencies, "Import raw agencies") self._execute_function(self._perform_validations, "Perform validations") rows_affected = 0 rows_affected += self._delete_update_insert_rows( "CGACs", cgac_query, cgac_table) rows_affected += self._delete_update_insert_rows( "FRECs", frec_query, frec_table) rows_affected += self._delete_update_insert_rows( "toptier agencies", toptier_agency_query, toptier_agency_table) rows_affected += self._delete_update_insert_rows( "subtier agencies", subtier_agency_query, subtier_agency_table) rows_affected += self._delete_update_insert_rows( "agencies", agency_query, agency_table) if rows_affected > MAX_CHANGES and not self.force: raise RuntimeError( f"Exceeded maximum number of allowed changes ({MAX_CHANGES:,}). Use --force switch if this " f"was intentional.") elif rows_affected > 0 or self.force: self._execute_function_and_log( update_treasury_appropriation_account_agencies, "Update treasury appropriation accounts") self._execute_function_and_log(update_federal_account_agency, "Update federal accounts") self._execute_etl_dml_sql_directory_file( "transaction_normalized_update", "Update transactions") self._execute_etl_dml_sql_directory_file("award_update", "Update awards") self._execute_etl_dml_sql_directory_file("subaward_update", "Update subawards") else: logger.info( "Skipping treasury_appropriation_account, transaction_normalized, " "awards, and subaward updates since there were no agency changes." )
from treasury_appropriation_account
order by
    agency_id,
    main_account_code,
    beginning_period_of_availability desc nulls last,
    ending_period_of_availability desc nulls last,
    sub_account_code,
    allocation_transfer_agency_id,
    treasury_account_identifier desc
"""

source_federal_account_query = ETLQuery(FEDERAL_ACCOUNTS_FROM_TREASURY_ACCOUNTS_SQL)
destination_federal_account_table = ETLTable(
    "federal_account", key_overrides=["agency_identifier", "main_account_code"]
)


def remove_empty_federal_accounts():
    """
    Removes federal accounts that are no longer attached to a TAS.

    Returns:
        Number of rows deleted
    """
    return delete_obsolete_rows(source_federal_account_query, destination_federal_account_table)


def update_federal_accounts():
    """