Example #1
def test_delete_obsolete_rows(operations_fixture):
    assert execute_sql("select id1, id2 from t2 order by id1") == [(1, 2),
                                                                   (4, 5),
                                                                   (9, 9)]
    operations.delete_obsolete_rows(ETLTable("t1"), ETLTable("t2"))
    assert execute_sql("select id1, id2 from t2 order by id1") == [(1, 2),
                                                                   (4, 5)]
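
delete_obsolete_rows drops destination rows whose keys no longer appear in
the source, which is why (9, 9) disappears from t2. A minimal sketch of the
SQL it corresponds to, assuming id1/id2 are the key columns (the real
statement is composed from the ETLTable metadata):

# Illustrative only; the actual SQL is built from the tables' metadata.
DELETE_OBSOLETE_SQL = """
    delete from t2
    where not exists (
        select 1
        from t1
        where t1.id1 = t2.id1 and t1.id2 = t2.id2
    )
"""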
Example #2
def test_stage_table(operations_fixture):
    # Just copies the table.
    operations.stage_table(ETLTable("t1"), ETLTable("t2"),
                           ETLTemporaryTable("t3"))
    assert execute_sql("select id1, id2 from t3 order by id1") == [(1, 2),
                                                                   (4, 5),
                                                                   (8, 8)]
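
stage_table copies the source into a staging table shaped like the
destination. A rough sketch of the equivalent SQL under the same assumed
schema (the real operation builds the staging table from the destination's
column definitions):

# Illustrative sketch only.
STAGE_TABLE_SQL = """
    create temporary table t3 as
    select * from t1
"""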
Example #3
def test_insert_missing_rows(operations_fixture):
    assert execute_sql("select id1, id2 from t2 order by id1") == [(1, 2),
                                                                   (4, 5),
                                                                   (9, 9)]
    operations.insert_missing_rows(ETLTable("t1"), ETLTable("t2"))
    assert execute_sql("select id1, id2 from t2 order by id1") == [(1, 2),
                                                                   (4, 5),
                                                                   (8, 8),
                                                                   (9, 9)]
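
insert_missing_rows adds only source rows whose keys are absent from the
destination, which is why (8, 8) is inserted while (9, 9) is left alone. A
sketch of the equivalent SQL, with the column list abbreviated to the
assumed keys:

# Illustrative only; the actual statement lists every shared column.
INSERT_MISSING_SQL = """
    insert into t2 (id1, id2)
    select s.id1, s.id2
    from t1 as s
    where not exists (
        select 1
        from t2
        where t2.id1 = s.id1 and t2.id2 = s.id2
    )
"""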
Example #4
def test_update_changed_rows(operations_fixture):
    assert execute_sql(
        "select id1, id2, name, description from t2 order by id1") == [
            (1, 2, "three", "four"),
            (4, 5, "not six", "not seven"),
            (9, 9, "nine", "nine"),
        ]
    operations.update_changed_rows(ETLTable("t1"), ETLTable("t2"))
    assert execute_sql(
        "select id1, id2, name, description from t2 order by id1") == [
            (1, 2, "three", "four"),
            (4, 5, "six", "seven"),
            (9, 9, "nine", "nine"),
        ]
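
update_changed_rows overwrites non-key columns for key-matched rows whose
values differ, which is how (4, 5, "not six", "not seven") becomes
(4, 5, "six", "seven"). A hypothetical sketch of the underlying SQL:

# Illustrative only; column and key lists come from ETLTable metadata.
UPDATE_CHANGED_SQL = """
    update t2
    set name = t1.name, description = t1.description
    from t1
    where t1.id1 = t2.id1
      and t1.id2 = t2.id2
      and (t2.name, t2.description)
          is distinct from (t1.name, t1.description)
"""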
Example #5
    def _perform_load(self):

        overrides = {
            "insert_overrides": {
                "create_date": SQL("now()"),
                "update_date": SQL("now()")
            },
            "update_overrides": {
                "update_date": SQL("now()")
            },
        }

        def_code_table = ETLTable("disaster_emergency_fund_code", **overrides)
        temp_def_code_table = ETLTemporaryTable(
            "temp_load_disaster_emergency_fund_codes")

        self._execute_dml_sql(
            CREATE_TEMP_TABLE,
            "Create disaster_emergency_fund_code temp table")
        self._execute_function_and_log(self._read_raw_def_code_csv,
                                       "Read raw DEF Code csv")
        self._execute_function(self._validate_raw_def_codes,
                               "Validate raw DEF Codes")
        self._execute_function_and_log(self._import_def_codes,
                                       "Import DEF Codes")

        self._execute_function_and_log(update_changed_rows,
                                       "Update changed DEF Codes",
                                       temp_def_code_table, def_code_table)
        self._execute_function_and_log(insert_missing_rows,
                                       "Insert missing DEF Codes",
                                       temp_def_code_table, def_code_table)
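
The insert_overrides/update_overrides above substitute SQL expressions for
the staged values of those columns. A hypothetical illustration of the
effect on the generated insert (column names here are assumptions, not the
table's actual definition):

# Illustrative: with the overrides above, the generated insert selects
# now() in place of the staged create_date/update_date values.
INSERT_WITH_OVERRIDES_SQL = """
    insert into disaster_emergency_fund_code (code, title, create_date, update_date)
    select s.code, s.title, now(), now()
    from temp_load_disaster_emergency_fund_codes as s
    where not exists (
        select 1 from disaster_emergency_fund_code as d where d.code = s.code
    )
"""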
Example #6
    def _perform_load(self):
        self._create_staging_table()
        self._import_input_file()

        natural_key_columns = ["feature_id", "state_alpha", "county_sequence", "county_numeric"]
        destination_table = ETLTable(self.destination_table_name, key_overrides=natural_key_columns)
        source_table = ETLTemporaryTable(self.staging_table_name)

        with Timer("Delete obsolete rows"):
            deleted_count = operations.delete_obsolete_rows(source_table, destination_table)
            logger.info(f"{deleted_count:,} rows deleted")

        with Timer("Update changed rows"):
            updated_count = operations.update_changed_rows(source_table, destination_table)
            logger.info(f"{updated_count:,} rows updated")

        with Timer("Insert missing rows"):
            inserted_count = operations.insert_missing_rows(source_table, destination_table)
            logger.info(f"{inserted_count:,} rows inserted")

        with Timer("Remove staging table"):
            execute_dml_sql(f'drop table if exists "{self.staging_table_name}"')

        change_count = deleted_count + updated_count + inserted_count
        current_record_count = CityCountyStateCode.objects.count()
        max_change_threshold = int(current_record_count * MAX_CHANGE_PERCENT / 100.0)
        if current_record_count > 0 and change_count > max_change_threshold and not self.force:
            raise RuntimeError(
                f"Exceeded maximum number of allowed changes {max_change_threshold:,} ({MAX_CHANGE_PERCENT}%).  "
                f"Use --force switch if this was intentional."
            )

        return change_count
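
As a worked example of the guardrail above, with a hypothetical
MAX_CHANGE_PERCENT of 20 against 1,000 existing records:

# Hypothetical numbers: 20% of 1,000 records allows at most 200 combined
# deletes, updates, and inserts before the --force switch is required.
max_change_threshold = int(1000 * 20 / 100.0)  # 200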
Example #7
    def copy_broker_table_data(self, source_tablename, dest_tablename, primary_key):
        """Loop through the batches of IDs and load using the ETL tables"""
        destination = ETLTable(dest_tablename)
        source = ETLDBLinkTable(source_tablename, "broker_server", destination.data_types)
        transactions_remaining_count = self.total_ids_to_process

        for id_list in read_file_for_database_ids(str(self.file_path), self.chunk_size, is_numeric=False):
            with Timer(message="Batch upsert", success_logger=logger.info, failure_logger=logger.error):
                if len(id_list) != 0:
                    predicate = self.extra_predicate + [{"field": primary_key, "op": "IN", "values": tuple(id_list)}]
                    record_count = operations.upsert_records_with_predicate(source, destination, predicate, primary_key)
                else:
                    logger.warning("No records to load. Please check parameters and settings to confirm accuracy")
                    record_count = 0

            transactions_remaining_count -= len(id_list)
            logger.info(f"{record_count:,} successful upserts, {transactions_remaining_count:,} remaining.")
            self.upsert_records += record_count
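
Each batch builds its predicate as a list of filter dictionaries in the
{"field", "op", "values"} shape shown above, with any extra_predicate
entries prepended. A hypothetical three-ID batch (field name and values
are illustrative):

# Hypothetical batch predicate for upsert_records_with_predicate.
predicate = [
    {"field": "piid", "op": "IN", "values": ("A1", "B2", "C3")},
]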
Example #8
    def _perform_load(self):

        overrides = {
            "insert_overrides": {
                "create_date": SQL("now()"),
                "update_date": SQL("now()")
            },
            "update_overrides": {
                "update_date": SQL("now()")
            },
            "key_overrides": ["object_class", "direct_reimbursable"],
        }

        object_class_table = ETLTable("object_class", **overrides)
        temp_object_class_table = ETLTemporaryTable("temp_load_object_classes")

        self._execute_dml_sql(CREATE_TEMP_TABLE,
                              "Create object_class temp table")
        self._execute_function_and_log(self._read_raw_object_classes_csv,
                                       "Read raw object class csv")
        self._execute_function(self._validate_raw_object_classes,
                               "Validate raw object classes")
        self._execute_function(self._add_unknown_object_classes,
                               'Add "unknown" object classes')
        self._execute_function(self._derive_remaining_fields,
                               "Derive remaining fields")
        self._execute_function_and_log(self._import_object_classes,
                                       "Import object classes")

        # NOT deleting object classes is intentional for historical reasons.
        self._execute_function_and_log(update_changed_rows,
                                       "Update changed object classes",
                                       temp_object_class_table,
                                       object_class_table)
        self._execute_function_and_log(insert_missing_rows,
                                       "Insert missing object classes",
                                       temp_object_class_table,
                                       object_class_table)
Example #9
    def copy_broker_table_data(self, source_tablename, dest_tablename,
                               primary_key):
        """Loop through the batches of IDs and load using the ETL tables"""
        destination = ETLTable(dest_tablename)
        source = ETLDBLinkTable(source_tablename,
                                settings.DATA_BROKER_DBLINK_NAME,
                                destination.data_types)
        transactions_remaining_count = self.total_ids_to_process

        for id_list in read_file_for_database_ids(str(self.file_path),
                                                  self.chunk_size,
                                                  is_numeric=False):
            with Timer(message=f"Upsert {len(id_list):,} records"):
                if len(id_list) != 0:
                    predicate = self.extra_predicate + [
                        {
                            "field": primary_key,
                            "op": "IN",
                            "values": tuple(id_list)
                        }
                    ]
                    record_count = operations.upsert_records_with_predicate(
                        source, destination, predicate, primary_key)
                else:
                    logger.warning(
                        "No records to load. Please check parameters and settings to confirm accuracy"
                    )
                    record_count = 0

            if transactions_remaining_count > len(id_list):
                transactions_remaining_count -= len(id_list)
            else:
                transactions_remaining_count = 0
            self.upsert_records += record_count
            percentage = self.upsert_records * 100 / self.total_ids_to_process if self.total_ids_to_process != 0 else 0
            logger.info(f"{self.upsert_records:,} successful upserts, "
                        f"{transactions_remaining_count:,} remaining. "
                        f"[{percentage:.2f}%]")
Example #10
def test_identify_new_or_updated(operations_fixture):
    operations.identify_new_or_updated(ETLTable("t1"), ETLTable("t2"),
                                       ETLTemporaryTable("t3"))
    assert execute_sql("select id1, id2 from t3 order by id1") == [(4, 5),
                                                                   (8, 8)]
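
identify_new_or_updated stages the source rows that are new to the
destination or differ from it, which is why t3 ends up with (4, 5)
(changed) and (8, 8) (new). A rough sketch under the same assumed schema:

# Illustrative only; the real comparison covers every shared column.
IDENTIFY_NEW_OR_UPDATED_SQL = """
    create temporary table t3 as
    select s.*
    from t1 as s
    left join t2 on t2.id1 = s.id1 and t2.id2 = s.id2
    where t2.id1 is null
       or (s.name, s.description) is distinct from (t2.name, t2.description)
"""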
Example #11
def test_etl_table(make_a_table):

    expected_data_types = OrderedDict((
        ("id1", primatives.ColumnDefinition("id1", "integer", True)),
        ("id2", primatives.ColumnDefinition("id2", "integer", True)),
        ("name", primatives.ColumnDefinition("name", "text", False)),
        ("description",
         primatives.ColumnDefinition("description", "text", False)),
    ))

    # Happy path.
    table = ETLTable("totally_tubular_testing_table_for_two", "public")
    assert table.columns == ["id1", "id2", "name", "description"]
    assert dict(table.data_types) == expected_data_types
    assert table.update_overrides == {}
    assert table.insert_overrides == {}
    assert table.key_columns == [
        primatives.ColumnDefinition("id1", "integer", True),
        primatives.ColumnDefinition("id2", "integer", True),
    ]
    assert cc(table.object_representation
              ) == '"public"."totally_tubular_testing_table_for_two"'

    # No schema.
    table = ETLTable("totally_tubular_testing_table_for_two")
    assert cc(table.object_representation
              ) == '"public"."totally_tubular_testing_table_for_two"'

    # Happy overrides.
    table = ETLTable(
        "totally_tubular_testing_table_for_two",
        "public",
        key_overrides=["id1"],
        insert_overrides={"name": Literal("hello")},
        update_overrides={"description": Literal("o hi")},
    )
    assert table.columns == ["id1", "id2", "name", "description"]
    assert dict(table.data_types) == expected_data_types
    assert table.insert_overrides == {"name": Literal("hello")}
    assert table.update_overrides == {"description": Literal("o hi")}
    assert table.key_columns == [
        primatives.ColumnDefinition("id1", "integer", True)
    ]
    assert cc(table.object_representation
              ) == '"public"."totally_tubular_testing_table_for_two"'

    # Nonexistent overrides.
    table = ETLTable(
        "totally_tubular_testing_table_for_two",
        "public",
        key_overrides=["bogus"],
        insert_overrides={"bogus": SQL("now()")},
        update_overrides={"bogus": SQL("now()")},
    )
    with pytest.raises(RuntimeError):
        assert table.update_overrides is not None
    with pytest.raises(RuntimeError):
        assert table.insert_overrides is not None
    with pytest.raises(RuntimeError):
        assert table.key_columns is not None

    # Attempt to override primary keys.
    table = ETLTable(
        "totally_tubular_testing_table_for_two",
        "public",
        insert_overrides={"id1": SQL("now()")},
        update_overrides={"id1": SQL("now()")},
    )
    with pytest.raises(RuntimeError):
        assert table.update_overrides is not None
    with pytest.raises(RuntimeError):
        assert table.insert_overrides is not None

    # Everything for ETLTemporaryTable is the same as ETLTable, just with
    # no schema.
    table = ETLTemporaryTable("totally_tubular_testing_table_for_two")
    assert cc(table.object_representation
              ) == '"totally_tubular_testing_table_for_two"'
Example #12
    def _perform_load(self):
        """ Grab the Broker subaward table and use it to update ours. """

        broker_subaward = ETLTable("broker_subaward")
        subaward = ETLTable("subaward")

        remote_subaward = ETLDBLinkTable("subaward",
                                         settings.DATA_BROKER_DBLINK_NAME,
                                         broker_subaward.data_types)

        temp_broker_subaward = ETLTemporaryTable(
            "temp_load_subawards_broker_subaward")
        temp_new_or_updated = ETLTemporaryTable(
            "temp_load_subawards_new_or_updated")
        temp_subaward = ETLTemporaryTable("temp_load_subawards_subaward")

        if self.full_reload:
            logger.info(
                "--full-reload switch provided.  Emptying subaward tables.")
            self._execute_dml_sql("delete from broker_subaward",
                                  "Empty broker_subaward table")
            self._execute_dml_sql("delete from subaward",
                                  "Empty subaward table")

        self._execute_function_and_log(
            operations.stage_table,
            "Copy Broker subaward table",
            source=remote_subaward,
            destination=broker_subaward,
            staging=temp_broker_subaward,
        )

        # To help performance.
        self._execute_dml_sql(
            "create index idx_temp_load_subawards_broker_subaward on temp_load_subawards_broker_subaward(id)",
            "Create temporary index",
        )

        # Create a list of new or updated subawards that we can use to filter down
        # subsequent operations.
        self._execute_function_and_log(
            operations.identify_new_or_updated,
            "Identify new/updated rows",
            source=temp_broker_subaward,
            destination=broker_subaward,
            staging=temp_new_or_updated,
        )

        self._delete_update_insert_rows("broker_subawards",
                                        temp_broker_subaward, broker_subaward)

        # The Broker subaward table takes up a good bit of space so let's explicitly free
        # it up before continuing.
        self._execute_dml_sql(
            "drop table if exists temp_load_subawards_broker_subaward",
            "Drop staging table")

        self._execute_etl_dml_sql_directory_file("030_frame_out_subawards")
        self._execute_etl_dml_sql_directory_file(
            "040_enhance_with_awards_data")
        self._execute_etl_dml_sql_directory_file(
            "050_enhance_with_transaction_fpds_data")
        self._execute_etl_dml_sql_directory_file(
            "051_enhance_with_transaction_fabs_data")
        self._execute_etl_dml_sql_directory_file("060_enhance_with_cfda_data")
        self._execute_etl_dml_sql_directory_file(
            "070_enhance_with_awarding_agency")
        self._execute_etl_dml_sql_directory_file(
            "080_enhance_with_funding_agency")

        with Timer("Enhance with city and county information"):
            update_subaward_city_county("temp_load_subawards_subaward")

        self._execute_etl_dml_sql_directory_file(
            "110_enhance_with_pop_country")
        self._execute_etl_dml_sql_directory_file(
            "130_enhance_with_recipient_location_country")

        # Delete from broker_subaward because temp_subaward only has new/updated rows.
        self._execute_function_and_log(operations.delete_obsolete_rows,
                                       "Delete obsolete subawards",
                                       broker_subaward, subaward)

        # But update and insert from temp_subaward.
        self._execute_function_and_log(operations.update_changed_rows,
                                       "Update changed subawards",
                                       temp_subaward, subaward)
        self._execute_function_and_log(operations.insert_missing_rows,
                                       "Insert missing subawards",
                                       temp_subaward, subaward)

        self._execute_etl_dml_sql_directory_file("140_link_awards")
        self._execute_etl_dml_sql_directory_file("150_update_awards")

        # This is for unit tests.  Temporary tables go away on their own once the transaction/session
        # drops, but because unit tests run in a transaction, our temporary tables do not go away
        # between loads... which is problematic.
        self._execute_dml_sql(
            "drop table if exists temp_load_subawards_new_or_updated",
            "Drop new/updated temp table")
        self._execute_dml_sql(
            "drop table if exists temp_load_subawards_subaward",
            "Drop subaward temp table")
Example #13
    def _perform_load(self):

        overrides = {
            "insert_overrides": {
                "create_date": SQL("now()"),
                "update_date": SQL("now()")
            },
            "update_overrides": {
                "update_date": SQL("now()")
            },
        }

        agency_table = ETLTable(
            "agency",
            key_overrides=["toptier_agency_id", "subtier_agency_id"],
            **overrides)
        cgac_table = ETLTable("cgac", key_overrides=["cgac_code"])
        frec_table = ETLTable("frec", key_overrides=["frec_code"])
        subtier_agency_table = ETLTable("subtier_agency",
                                        key_overrides=["subtier_code"],
                                        **overrides)
        toptier_agency_table = ETLTable("toptier_agency",
                                        key_overrides=["toptier_code"],
                                        **overrides)

        agency_query = ETLQueryFile(self.etl_dml_sql_directory /
                                    "agency_query.sql",
                                    temp_table=TEMP_TABLE_NAME)
        cgac_query = ETLQueryFile(self.etl_dml_sql_directory /
                                  "cgac_query.sql",
                                  temp_table=TEMP_TABLE_NAME)
        frec_query = ETLQueryFile(self.etl_dml_sql_directory /
                                  "frec_query.sql",
                                  temp_table=TEMP_TABLE_NAME)
        subtier_agency_query = ETLQueryFile(self.etl_dml_sql_directory /
                                            "subtier_agency_query.sql",
                                            temp_table=TEMP_TABLE_NAME)
        toptier_agency_query = ETLQueryFile(
            self.etl_dml_sql_directory / "toptier_agency_query.sql",
            temp_table=TEMP_TABLE_NAME,
            dod_subsumed=DOD_SUBSUMED_AIDS,
        )

        path = self._get_sql_directory_file_path(
            "raw_agency_create_temp_table")
        sql = path.read_text().format(temp_table=TEMP_TABLE_NAME)
        self._execute_dml_sql(sql, "Create raw agency temp table")
        self._execute_function_and_log(self._read_raw_agencies_csv,
                                       "Read raw agencies csv")
        self._execute_function_and_log(self._import_raw_agencies,
                                       "Import raw agencies")
        self._execute_function(self._perform_validations,
                               "Perform validations")

        rows_affected = 0

        rows_affected += self._delete_update_insert_rows(
            "CGACs", cgac_query, cgac_table)
        rows_affected += self._delete_update_insert_rows(
            "FRECs", frec_query, frec_table)

        rows_affected += self._delete_update_insert_rows(
            "toptier agencies", toptier_agency_query, toptier_agency_table)
        rows_affected += self._delete_update_insert_rows(
            "subtier agencies", subtier_agency_query, subtier_agency_table)
        rows_affected += self._delete_update_insert_rows(
            "agencies", agency_query, agency_table)

        if rows_affected > MAX_CHANGES and not self.force:
            raise RuntimeError(
                f"Exceeded maximum number of allowed changes ({MAX_CHANGES:,}).  Use --force switch if this "
                f"was intentional.")

        elif rows_affected > 0 or self.force:
            self._execute_function_and_log(
                update_treasury_appropriation_account_agencies,
                "Update treasury appropriation accounts")
            self._execute_function_and_log(update_federal_account_agency,
                                           "Update federal accounts")
            self._execute_etl_dml_sql_directory_file(
                "transaction_normalized_update", "Update transactions")
            self._execute_etl_dml_sql_directory_file("award_update",
                                                     "Update awards")
            self._execute_etl_dml_sql_directory_file("subaward_update",
                                                     "Update subawards")

        else:
            logger.info(
                "Skipping treasury_appropriation_account, transaction_normalized, "
                "awards, and subaward updates since there were no agency changes."
            )
Example #14
    from
        treasury_appropriation_account
    order by
        agency_id,
        main_account_code,
        beginning_period_of_availability desc nulls last,
        ending_period_of_availability desc nulls last,
        sub_account_code,
        allocation_transfer_agency_id,
        treasury_account_identifier desc
"""


source_federal_account_query = ETLQuery(FEDERAL_ACCOUNTS_FROM_TREASURY_ACCOUNTS_SQL)
destination_federal_account_table = ETLTable(
    "federal_account", key_overrides=["agency_identifier", "main_account_code"]
)


def remove_empty_federal_accounts():
    """
    Removes federal accounts that are no longer attached to a TAS.

    Returns:
        Number of rows updated
    """
    return delete_obsolete_rows(source_federal_account_query, destination_federal_account_table)


def update_federal_accounts():
    """