def _perform_load(self):
        """Load disaster emergency fund (DEF) codes and upsert them into
        the disaster_emergency_fund_code table.

        The raw CSV is staged in a temp table, validated, then applied as
        an update-changed / insert-missing pair of operations.  The
        create/update timestamp columns are written by the database via
        now() overrides.
        """
        # Timestamps are maintained database-side at write time.
        def_code_table = ETLTable(
            "disaster_emergency_fund_code",
            insert_overrides={
                "create_date": SQL("now()"),
                "update_date": SQL("now()"),
            },
            update_overrides={"update_date": SQL("now()")},
        )
        temp_def_code_table = ETLTemporaryTable("temp_load_disaster_emergency_fund_codes")

        # Stage and validate the incoming CSV rows.
        self._execute_dml_sql(CREATE_TEMP_TABLE, "Create disaster_emergency_fund_code temp table")
        self._execute_function_and_log(self._read_raw_def_code_csv, "Read raw DEF Code csv")
        self._execute_function(self._validate_raw_def_codes, "Validate raw DEF Codes")
        self._execute_function_and_log(self._import_def_codes, "Import DEF Codes")

        # Apply staged rows to the destination table.
        self._execute_function_and_log(update_changed_rows, "Update changed DEF Codes", temp_def_code_table, def_code_table)
        self._execute_function_and_log(insert_missing_rows, "Insert missing DEF Codes", temp_def_code_table, def_code_table)
# Example #2
def test_stage_table(operations_fixture):
    """stage_table should leave an exact copy of the source table in t3."""
    source = ETLTable("t1")
    destination = ETLTable("t2")
    staging = ETLTemporaryTable("t3")
    operations.stage_table(source, destination, staging)

    expected = [(1, 2), (4, 5), (8, 8)]
    assert execute_sql("select id1, id2 from t3 order by id1") == expected
    def _perform_load(self):
        """Refresh the city/county/state code table from the imported file.

        Returns the total number of rows changed (deleted + updated +
        inserted).  Raises RuntimeError when the change volume exceeds
        MAX_CHANGE_PERCENT of the current row count and --force was not
        supplied.
        """
        self._create_staging_table()
        self._import_input_file()

        # Match rows on the natural key rather than the surrogate id.
        destination_table = ETLTable(
            self.destination_table_name,
            key_overrides=["feature_id", "state_alpha", "county_sequence", "county_numeric"],
        )
        source_table = ETLTemporaryTable(self.staging_table_name)

        with Timer("Delete obsolete rows"):
            deleted_count = operations.delete_obsolete_rows(source_table, destination_table)
            logger.info(f"{deleted_count:,} rows deleted")

        with Timer("Update changed rows"):
            updated_count = operations.update_changed_rows(source_table, destination_table)
            logger.info(f"{updated_count:,} rows updated")

        with Timer("Insert missing rows"):
            inserted_count = operations.insert_missing_rows(source_table, destination_table)
            logger.info(f"{inserted_count:,} rows inserted")

        with Timer("Remove staging table"):
            execute_dml_sql(f'drop table if exists "{self.staging_table_name}"')

        # Sanity check: refuse a surprisingly large change set unless forced.
        change_count = deleted_count + updated_count + inserted_count
        current_record_count = CityCountyStateCode.objects.count()
        max_change_threshold = int(current_record_count * MAX_CHANGE_PERCENT / 100.0)
        exceeded = (
            current_record_count > 0
            and change_count > max_change_threshold
            and not self.force
        )
        if exceeded:
            raise RuntimeError(
                f"Exceeded maximum number of allowed changes {max_change_threshold:,} ({MAX_CHANGE_PERCENT}%).  "
                f"Use --force switch if this was intentional."
            )

        return change_count
# Example #4
    def _perform_load(self):
        """Load object classes from CSV and upsert them into object_class.

        Rows are matched on (object_class, direct_reimbursable).  The
        create/update timestamp columns are written by the database via
        now() overrides.  Obsolete rows are deliberately never deleted.
        """
        object_class_table = ETLTable(
            "object_class",
            insert_overrides={
                "create_date": SQL("now()"),
                "update_date": SQL("now()"),
            },
            update_overrides={"update_date": SQL("now()")},
            key_overrides=["object_class", "direct_reimbursable"],
        )
        temp_object_class_table = ETLTemporaryTable("temp_load_object_classes")

        # Stage, validate, and enrich the incoming CSV rows.
        self._execute_dml_sql(CREATE_TEMP_TABLE, "Create object_class temp table")
        self._execute_function_and_log(self._read_raw_object_classes_csv, "Read raw object class csv")
        self._execute_function(self._validate_raw_object_classes, "Validate raw object classes")
        self._execute_function(self._add_unknown_object_classes, 'Add "unknown" object classes')
        self._execute_function(self._derive_remaining_fields, "Derive remaining fields")
        self._execute_function_and_log(self._import_object_classes, "Import object classes")

        # NOT deleting object classes is intentional for historical reasons.
        self._execute_function_and_log(update_changed_rows, "Update changed object classes", temp_object_class_table, object_class_table)
        self._execute_function_and_log(insert_missing_rows, "Insert missing object classes", temp_object_class_table, object_class_table)
# Example #5
def test_identify_new_or_updated(operations_fixture):
    """identify_new_or_updated should stage only the new/changed rows in t3."""
    operations.identify_new_or_updated(
        ETLTable("t1"), ETLTable("t2"), ETLTemporaryTable("t3")
    )
    rows = execute_sql("select id1, id2 from t3 order by id1")
    assert rows == [(4, 5), (8, 8)]
# Example #6
def test_etl_table(make_a_table):
    """Exercise ETLTable/ETLTemporaryTable introspection and overrides.

    Covers: column/data-type discovery, default key columns, object
    representation (with and without an explicit schema), valid overrides,
    overrides naming nonexistent columns (expected to raise RuntimeError
    lazily on property access), and attempts to override primary keys.
    """

    # NOTE(review): "primatives" [sic] is the project module's actual name.
    expected_data_types = OrderedDict((
        ("id1", primatives.ColumnDefinition("id1", "integer", True)),
        ("id2", primatives.ColumnDefinition("id2", "integer", True)),
        ("name", primatives.ColumnDefinition("name", "text", False)),
        ("description",
         primatives.ColumnDefinition("description", "text", False)),
    ))

    # Happy path.
    table = ETLTable("totally_tubular_testing_table_for_two", "public")
    assert table.columns == ["id1", "id2", "name", "description"]
    assert dict(table.data_types) == expected_data_types
    assert table.update_overrides == {}
    assert table.insert_overrides == {}
    assert table.key_columns == [
        primatives.ColumnDefinition("id1", "integer", True),
        primatives.ColumnDefinition("id2", "integer", True),
    ]
    assert cc(table.object_representation
              ) == '"public"."totally_tubular_testing_table_for_two"'

    # No schema.  The representation still defaults to the "public" schema.
    table = ETLTable("totally_tubular_testing_table_for_two")
    assert cc(table.object_representation
              ) == '"public"."totally_tubular_testing_table_for_two"'

    # Happy overrides.
    table = ETLTable(
        "totally_tubular_testing_table_for_two",
        "public",
        key_overrides=["id1"],
        insert_overrides={"name": Literal("hello")},
        update_overrides={"description": Literal("o hi")},
    )
    assert table.columns == ["id1", "id2", "name", "description"]
    assert dict(table.data_types) == expected_data_types
    assert table.insert_overrides == {"name": Literal("hello")}
    assert table.update_overrides == {"description": Literal("o hi")}
    assert table.key_columns == [
        primatives.ColumnDefinition("id1", "integer", True)
    ]
    assert cc(table.object_representation
              ) == '"public"."totally_tubular_testing_table_for_two"'

    # Nonexistent overrides.  Construction succeeds; the error surfaces
    # only when the affected property is accessed.
    table = ETLTable(
        "totally_tubular_testing_table_for_two",
        "public",
        key_overrides=["bogus"],
        insert_overrides={"bogus": SQL("now()")},
        update_overrides={"bogus": SQL("now()")},
    )
    with pytest.raises(RuntimeError):
        assert table.update_overrides is not None
    with pytest.raises(RuntimeError):
        assert table.insert_overrides is not None
    with pytest.raises(RuntimeError):
        assert table.key_columns is not None

    # Attempt to override primary keys.  Also raises lazily on access.
    table = ETLTable(
        "totally_tubular_testing_table_for_two",
        "public",
        insert_overrides={"id1": SQL("now()")},
        update_overrides={"id1": SQL("now()")},
    )
    with pytest.raises(RuntimeError):
        assert table.update_overrides is not None
    with pytest.raises(RuntimeError):
        assert table.insert_overrides is not None

    # Everything for ETLTemporaryTable is the same as ETLTable just no schema.
    table = ETLTemporaryTable("totally_tubular_testing_table_for_two")
    assert cc(table.object_representation
              ) == '"totally_tubular_testing_table_for_two"'
# Example #7
    def _perform_load(self):
        """ Grab the Broker subaward table and use it to update ours.

        Pipeline: copy the remote Broker table over dblink into a staging
        table, identify new/updated rows, sync broker_subaward, then run a
        series of SQL enhancement steps to build temp_load_subawards_subaward
        and sync it into subaward.
        """

        broker_subaward = ETLTable("broker_subaward")
        subaward = ETLTable("subaward")

        # Remote (dblink) view of Broker's subaward table, typed to match
        # our local broker_subaward copy.
        remote_subaward = ETLDBLinkTable("subaward",
                                         settings.DATA_BROKER_DBLINK_NAME,
                                         broker_subaward.data_types)

        temp_broker_subaward = ETLTemporaryTable(
            "temp_load_subawards_broker_subaward")
        temp_new_or_updated = ETLTemporaryTable(
            "temp_load_subawards_new_or_updated")
        temp_subaward = ETLTemporaryTable("temp_load_subawards_subaward")

        if self.full_reload:
            logger.info(
                "--full-reload switch provided.  Emptying subaward tables.")
            self._execute_dml_sql("delete from broker_subaward",
                                  "Empty broker_subaward table")
            self._execute_dml_sql("delete from subaward",
                                  "Empty subaward table")

        self._execute_function_and_log(
            operations.stage_table,
            "Copy Broker subaward table",
            source=remote_subaward,
            destination=broker_subaward,
            staging=temp_broker_subaward,
        )

        # To help performance.
        self._execute_dml_sql(
            "create index idx_temp_load_subawards_broker_subaward on temp_load_subawards_broker_subaward(id)",
            "Create temporary index",
        )

        # Create a list of new or updated subawards that we can use to filter down
        # subsequent operations.
        self._execute_function_and_log(
            operations.identify_new_or_updated,
            "Identify new/updated rows",
            source=temp_broker_subaward,
            destination=broker_subaward,
            staging=temp_new_or_updated,
        )

        # NOTE(review): label reads "broker_subawards" (plural) though the
        # table is broker_subaward — presumably just a log label; confirm.
        self._delete_update_insert_rows("broker_subawards",
                                        temp_broker_subaward, broker_subaward)

        # The Broker subaward table takes up a good bit of space so let's explicitly free
        # it up before continuing.
        self._execute_dml_sql(
            "drop table if exists temp_load_subawards_broker_subaward",
            "Drop staging table")

        # NOTE(review): two of these file names carry a ".sql" suffix while
        # the rest do not — presumably the helper tolerates both; confirm.
        self._execute_etl_dml_sql_directory_file("030_frame_out_subawards")
        self._execute_etl_dml_sql_directory_file(
            "040_enhance_with_awards_data")
        self._execute_etl_dml_sql_directory_file(
            "050_enhance_with_transaction_fpds_data.sql")
        self._execute_etl_dml_sql_directory_file(
            "051_enhance_with_transaction_fabs_data.sql")
        self._execute_etl_dml_sql_directory_file("060_enhance_with_cfda_data")
        self._execute_etl_dml_sql_directory_file(
            "070_enhance_with_awarding_agency")
        self._execute_etl_dml_sql_directory_file(
            "080_enhance_with_funding_agency")

        with Timer("Enhance with city and county information"):
            update_subaward_city_county("temp_load_subawards_subaward")

        self._execute_etl_dml_sql_directory_file(
            "110_enhance_with_pop_country")
        self._execute_etl_dml_sql_directory_file(
            "130_enhance_with_recipient_location_country")

        # Delete from broker_subaward because temp_subaward only has new/updated rows.
        self._execute_function_and_log(operations.delete_obsolete_rows,
                                       "Delete obsolete subawards",
                                       broker_subaward, subaward)

        # But update and insert from temp_subaward.
        self._execute_function_and_log(operations.update_changed_rows,
                                       "Update changed subawards",
                                       temp_subaward, subaward)
        self._execute_function_and_log(operations.insert_missing_rows,
                                       "Insert missing subawards",
                                       temp_subaward, subaward)

        self._execute_etl_dml_sql_directory_file("140_link_awards")
        self._execute_etl_dml_sql_directory_file("150_update_awards")

        # This is for unit tests.  Temporary tables go away on their own once the transaction/session
        # drops, but because unit tests run in a transaction, our temporary tables do not go away
        # between loads... which is problematic.
        self._execute_dml_sql(
            "drop table if exists temp_load_subawards_new_or_updated",
            "Drop new/updated temp table")
        self._execute_dml_sql(
            "drop table if exists temp_load_subawards_subaward",
            "Drop subaward temp table")