Example #1
def test_sqlalchemy_source_limit(sqlitedb_engine):
    df1 = pd.DataFrame({
        "col_1": [1, 2, 3, 4, 5],
        "col_2": ["a", "b", "c", "d", "e"]
    })
    df2 = pd.DataFrame({
        "col_1": [0, 1, 2, 3, 4],
        "col_2": ["b", "c", "d", "e", "f"]
    })
    df1.to_sql("table_1", con=sqlitedb_engine, index=True)
    df2.to_sql("table_2", con=sqlitedb_engine, index=True, schema="main")
    datasource = SqlAlchemyDatasource("SqlAlchemy", engine=sqlitedb_engine)
    limited_batch = datasource.get_batch({
        "table": "table_1",
        "limit": 1,
        "offset": 2
    })
    assert isinstance(limited_batch, Batch)
    limited_dataset = Validator(
        limited_batch,
        expectation_suite=ExpectationSuite("test"),
        expectation_engine=SqlAlchemyDataset,
    ).get_dataset()
    assert limited_dataset._table.name.startswith("ge_tmp_")  # we have generated a temporary table
    assert len(limited_dataset.head(10)) == 1  # and it is only one row long
    assert limited_dataset.head(10)["col_1"][0] == 3  # offset should have been applied
Example #3
def test_to_make_sure_splitter_and_sampler_methods_are_optional(
    test_cases_for_sql_data_connector_sqlite_execution_engine,
):
    execution_engine = test_cases_for_sql_data_connector_sqlite_execution_engine

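    # Case 1: sampler only; keep rows where id % 10 == 8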
    batch_data, batch_markers = execution_engine.get_batch_data_and_markers(
        batch_spec=SqlAlchemyDatasourceBatchSpec(
            {
                "table_name": "table_partitioned_by_date_column__A",
                "batch_identifiers": {},
                "sampling_method": "_sample_using_mod",
                "sampling_kwargs": {
                    "column_name": "id",
                    "mod": 10,
                    "value": 8,
                },
            }
        )
    )
    execution_engine.load_batch_data("__", batch_data)
    validator = Validator(execution_engine)
    assert len(validator.head(fetch_all=True)) == 12

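    # Case 2: neither splitter nor sampler; the full 120-row table comes back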
    batch_data, batch_markers = execution_engine.get_batch_data_and_markers(
        batch_spec=SqlAlchemyDatasourceBatchSpec(
            {
                "table_name": "table_partitioned_by_date_column__A",
                "batch_identifiers": {},
            }
        )
    )
    execution_engine.load_batch_data("__", batch_data)
    validator = Validator(execution_engine)
    assert len(validator.head(fetch_all=True)) == 120

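    # Case 3: splitter only; _split_on_whole_table also returns the full table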
    batch_data, batch_markers = execution_engine.get_batch_data_and_markers(
        batch_spec=SqlAlchemyDatasourceBatchSpec(
            {
                "table_name": "table_partitioned_by_date_column__A",
                "batch_identifiers": {},
                "splitter_method": "_split_on_whole_table",
                "splitter_kwargs": {},
            }
        )
    )

    execution_engine.load_batch_data("__", batch_data)
    validator = Validator(execution_engine)
    assert len(validator.head(fetch_all=True)) == 120
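Conceptually, the mod sampler just adds a WHERE clause to the table query. A minimal SQLAlchemy sketch of the equivalent filter (the column definitions here are assumed for illustration, not read from the test database):

import sqlalchemy as sa

metadata = sa.MetaData()
table = sa.Table(
    "table_partitioned_by_date_column__A",
    metadata,
    sa.Column("id", sa.Integer),
    sa.Column("date", sa.String),
)

# Roughly what _sample_using_mod(column_name="id", mod=10, value=8) keeps:
sampled = table.select().where(table.c.id % 10 == 8)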
Example #4
def test_sampling_method__limit(
    test_cases_for_sql_data_connector_sqlite_execution_engine,
):
    execution_engine = test_cases_for_sql_data_connector_sqlite_execution_engine

    batch_data, batch_markers = execution_engine.get_batch_data_and_markers(
        batch_spec=SqlAlchemyDatasourceBatchSpec(
            {
                "table_name": "table_partitioned_by_date_column__A",
                "batch_identifiers": {},
                "splitter_method": "_split_on_whole_table",
                "splitter_kwargs": {},
                "sampling_method": "_sample_using_limit",
                "sampling_kwargs": {"n": 20},
            }
        )
    )

    batch = Batch(data=batch_data)

    validator = Validator(execution_engine, batches=[batch])
    assert len(validator.head(fetch_all=True)) == 20

    assert not validator.expect_column_values_to_be_in_set(
        "date", value_set=["2020-01-02"]
    ).success
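The limit sampler caps the batch at 20 rows, roughly a LIMIT 20 on the underlying query. The sampled rows do not all fall on "2020-01-02", so the single-value set expectation fails; the final assert checks exactly that.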
Example #5
def test_instantiation_via_url_and_retrieve_data_with_other_dialect(sa):
    """Ensure that we can still retrieve data when the dialect is not recognized."""

    # 1. Create engine with sqlite db
    db_file = file_relative_path(
        __file__,
        os.path.join("..", "test_sets", "test_cases_for_sql_data_connector.db"),
    )
    my_execution_engine = SqlAlchemyExecutionEngine(url="sqlite:///" + db_file)
    assert my_execution_engine.connection_string is None
    assert my_execution_engine.credentials is None
    assert my_execution_engine.url[-36:] == "test_cases_for_sql_data_connector.db"

    # 2. Change dialect to one not listed in GESqlDialect
    my_execution_engine.engine.dialect.name = "other_dialect"

    # 3. Get data
    num_rows_in_sample: int = 10
    batch_data, _ = my_execution_engine.get_batch_data_and_markers(
        batch_spec=SqlAlchemyDatasourceBatchSpec(
            table_name="table_partitioned_by_date_column__A",
            sampling_method="_sample_using_limit",
            sampling_kwargs={"n": num_rows_in_sample},
        )
    )

    # 4. Assert dialect and data are as expected

    assert batch_data.dialect == GESqlDialect.OTHER

    my_execution_engine.load_batch_data("__", batch_data)
    validator = Validator(my_execution_engine)
    assert len(validator.head(fetch_all=True)) == num_rows_in_sample
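Step 2 overwrites the engine's dialect name with a value GESqlDialect does not list, and the resulting batch data reports GESqlDialect.OTHER. The point of the test is that limit-based sampling still works even when the dialect is unrecognized.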
Example #6
def test_sampling_method__limit(
    test_cases_for_sql_data_connector_sqlite_execution_engine,
):
    execution_engine = test_cases_for_sql_data_connector_sqlite_execution_engine

    batch_data, batch_markers = execution_engine.get_batch_data_and_markers(
        batch_spec=BatchSpec(
            {
                "table_name": "table_partitioned_by_date_column__A",
                "partition_definition": {},
                "splitter_method": "_split_on_whole_table",
                "splitter_kwargs": {},
                "sampling_method": "_sample_using_limit",
                "sampling_kwargs": {"n": 20},
            }
        )
    )
    execution_engine.load_batch_data("__", batch_data)
    validator = Validator(execution_engine)
    assert len(validator.head(fetch_all=True)) == 20

    assert not validator.expect_column_values_to_be_in_set(
        "date", value_set=["2020-01-02"]
    ).success
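This is an older variant of Example #4: it passes a plain BatchSpec carrying a partition_definition key where the newer API uses SqlAlchemyDatasourceBatchSpec with batch_identifiers, and it loads the batch data into the engine instead of wrapping it in a Batch. The behavior under test is the same.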
Example #7
def test_sampling_method__a_list(
    test_cases_for_sql_data_connector_sqlite_execution_engine,
):
    execution_engine = test_cases_for_sql_data_connector_sqlite_execution_engine

    batch_data, batch_markers = execution_engine.get_batch_data_and_markers(
        batch_spec=BatchSpec(
            {
                "table_name": "table_partitioned_by_date_column__A",
                "partition_definition": {},
                "splitter_method": "_split_on_whole_table",
                "splitter_kwargs": {},
                "sampling_method": "_sample_using_a_list",
                "sampling_kwargs": {
                    "column_name": "id",
                    "value_list": [10, 20, 30, 40],
                },
            }
        )
    )
    execution_engine.load_batch_data("__", batch_data)
    validator = Validator(execution_engine)
    assert len(validator.head(fetch_all=True)) == 4
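The list sampler keeps only rows whose id appears in value_list, which is why exactly four rows survive. In SQL terms it is an IN filter; reusing the assumed table definition from the sketch after Example #3:

# Roughly what _sample_using_a_list(column_name="id", value_list=[10, 20, 30, 40]) keeps:
sampled = table.select().where(table.c.id.in_([10, 20, 30, 40]))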