Exemplo n.º 1
0
def test_get_compute_domain_with_column_pair(sa):
    engine = _build_sa_engine(
        pd.DataFrame({
            "a": [1, 2, 3, 4],
            "b": [2, 3, 4, None]
        }))

    # Fetching data, compute_domain_kwargs, accessor_kwargs
    data, compute_kwargs, accessor_kwargs = engine.get_compute_domain(
        domain_kwargs={
            "column_A": "a",
            "column_B": "b"
        },
        domain_type="column_pair")

    # Seeing if raw data is the same as the data after condition has been applied - checking post computation data
    raw_data = engine.engine.execute(
        sa.select(["*"]).select_from(
            engine.active_batch_data.selectable)).fetchall()
    domain_data = engine.engine.execute(sa.select(
        ["*"]).select_from(data)).fetchall()

    # Ensuring that with no domain nothing happens to the data itself
    assert raw_data == domain_data, "Data does not match after getting compute domain"
    assert ("column_A" not in compute_kwargs.keys() and "column_B"
            not in compute_kwargs.keys()), "domain kwargs should be existent"
    assert accessor_kwargs == {
        "column_A": "a",
        "column_B": "b",
    }, "Accessor kwargs have been modified"

    # Building new engine so that values still found
    engine = _build_sa_engine(
        pd.DataFrame({
            "a": [1, 2, 3, 4],
            "b": [2, 3, 4, None]
        }))
    data2, compute_kwargs, accessor_kwargs = engine.get_compute_domain(
        domain_kwargs={
            "column_A": "a",
            "column_B": "b"
        },
        domain_type="identity")

    # Seeing if raw data is the same as the data after condition has been applied - checking post computation data
    raw_data = engine.engine.execute(
        sa.select([sa.column("a"), sa.column("b")]).select_from(
            engine.active_batch_data.selectable)).fetchall()
    domain_data = engine.engine.execute(sa.select(
        ["*"]).select_from(data2)).fetchall()

    # Ensuring that with no domain nothing happens to the data itself
    assert raw_data == domain_data, "Data does not match after getting compute domain"
    assert compute_kwargs == {
        "column_A": "a",
        "column_B": "b",
    }, "Compute domain kwargs should be existent"
    assert accessor_kwargs == {}, "Accessor kwargs have been modified"
Exemplo n.º 2
0
def test_get_compute_domain_with_column_domain(sa):
    engine = _build_sa_engine(
        pd.DataFrame({
            "a": [1, 2, 3, 4],
            "b": [2, 3, 4, None]
        }))

    # Loading batch data
    data, compute_kwargs, accessor_kwargs = engine.get_compute_domain(
        domain_kwargs={"column": "a"}, domain_type=MetricDomainTypes.COLUMN)

    # Seeing if raw data is the same as the data after condition has been applied - checking post computation data
    raw_data = engine.engine.execute(
        sa.select(["*"]).select_from(
            engine.active_batch_data.selectable)).fetchall()
    domain_data = engine.engine.execute(sa.select(
        ["*"]).select_from(data)).fetchall()

    # Ensuring that column domain is now an accessor kwarg, and data remains unmodified
    assert raw_data == domain_data, "Data does not match after getting compute domain"
    assert compute_kwargs == {}, "Compute domain kwargs should be existent"
    assert accessor_kwargs == {
        "column": "a"
    }, "Accessor kwargs have been modified"

    # Testing for identity
    engine = _build_sa_engine(
        pd.DataFrame({
            "a": [1, 2, 3, 4],
            "b": [2, 3, 4, None]
        }))

    # Loading batch data
    data, compute_kwargs, accessor_kwargs = engine.get_compute_domain(
        domain_kwargs={"column": "a"}, domain_type=MetricDomainTypes.IDENTITY)

    # Seeing if raw data is the same as the data after condition has been applied - checking post computation data
    raw_data = engine.engine.execute(
        sa.select([sa.column("a")]).select_from(
            engine.active_batch_data.selectable)).fetchall()
    domain_data = engine.engine.execute(sa.select(
        ["*"]).select_from(data)).fetchall()

    # Ensuring that column domain is now an accessor kwarg, and data remains unmodified
    assert raw_data == domain_data, "Data does not match after getting compute domain"
    assert compute_kwargs == {
        "column": "a"
    }, "Compute domain kwargs should be existent"
    assert accessor_kwargs == {}, "Accessor kwargs have been modified"
Exemplo n.º 3
0
def test_get_compute_domain_with_unmeetable_row_condition(sa):
    engine = _build_sa_engine(
        pd.DataFrame({
            "a": [1, 2, 3, 4],
            "b": [2, 3, 4, None]
        }))

    data, compute_kwargs, accessor_kwargs = engine.get_compute_domain(
        domain_kwargs={
            "row_condition": 'col("b") > 24',
            "condition_parser": "great_expectations__experimental__",
        },
        domain_type="identity",
    )

    # Seeing if raw data is the same as the data after condition has been applied - checking post computation data
    raw_data = engine.engine.execute(
        sa.select(["*"
                   ]).select_from(engine.active_batch_data.selectable).where(
                       sa.column("b") > 24)).fetchall()
    domain_data = engine.engine.execute(sa.select(
        ["*"]).select_from(data)).fetchall()

    # Ensuring that column domain is now an accessor kwarg, and data remains unmodified
    assert raw_data == domain_data, "Data does not match after getting compute domain"
    # Ensuring compute kwargs have not been modified
    assert ("row_condition" in compute_kwargs.keys()
            ), "Row condition should be located within compute kwargs"
    assert accessor_kwargs == {}, "Accessor kwargs have been modified"
Exemplo n.º 4
0
def test_get_compute_domain_with_nonexistent_condition_parser(sa):
    engine = _build_sa_engine(
        pd.DataFrame({
            "a": [1, 2, 3, 4],
            "b": [2, 3, 4, None]
        }))

    # Expect GreatExpectationsError because parser doesn't exist
    with pytest.raises(GreatExpectationsError) as e:
        data, compute_kwargs, accessor_kwargs = engine.get_compute_domain(
            domain_kwargs={
                "row_condition": "b > 24",
                "condition_parser": "nonexistent",
            },
            domain_type=MetricDomainTypes.TABLE,
        )
def test_get_compute_domain_with_multicolumn(sa):
    engine = _build_sa_engine(
        pd.DataFrame({
            "a": [1, 2, 3, 4],
            "b": [2, 3, 4, None],
            "c": [1, 2, 3, None]
        }),
        sa,
    )

    # Obtaining compute domain
    data, compute_kwargs, accessor_kwargs = engine.get_compute_domain(
        domain_kwargs={"columns": ["a", "b", "c"]}, domain_type="multicolumn")

    # Seeing if raw data is the same as the data after condition has been applied - checking post computation data
    raw_data = engine.engine.execute(
        sa.select(["*"]).select_from(
            engine.active_batch_data.selectable)).fetchall()
    domain_data = engine.engine.execute(sa.select(
        ["*"]).select_from(data)).fetchall()

    # Ensuring that with no domain nothing happens to the data itself
    assert raw_data == domain_data, "Data does not match after getting compute domain"
    assert compute_kwargs is not None, "Compute domain kwargs should be existent"
    assert accessor_kwargs == {
        "columns": ["a", "b", "c"]
    }, "Accessor kwargs have been modified"

    # Checking for identity
    data, compute_kwargs, accessor_kwargs = engine.get_compute_domain(
        domain_kwargs={"columns": ["a", "b", "c"]}, domain_type="identity")

    # Seeing if raw data is the same as the data after condition has been applied - checking post computation data
    raw_data = engine.engine.execute(
        sa.select([sa.column("a"),
                   sa.column("b"),
                   sa.column("c")]).select_from(
                       engine.active_batch_data.selectable)).fetchall()
    domain_data = engine.engine.execute(sa.select(
        ["*"]).select_from(data)).fetchall()

    # Ensuring that with no domain nothing happens to the data itself
    assert raw_data == domain_data, "Data does not match after getting compute domain"
    assert compute_kwargs == {
        "columns": ["a", "b", "c"]
    }, "Compute domain kwargs should be existent"
    assert accessor_kwargs == {}, "Accessor kwargs have been modified"
Exemplo n.º 6
0
def test_get_compute_domain_with_no_domain_kwargs(sa):
    engine = _build_sa_engine(
        pd.DataFrame({
            "a": [1, 2, 3, 4],
            "b": [2, 3, 4, None]
        }))

    data, compute_kwargs, accessor_kwargs = engine.get_compute_domain(
        domain_kwargs={}, domain_type="table")

    # Seeing if raw data is the same as the data after condition has been applied - checking post computation data
    raw_data = engine.engine.execute(
        sa.select(["*"]).select_from(
            engine.active_batch_data.selectable)).fetchall()
    domain_data = engine.engine.execute(sa.select(
        ["*"]).select_from(data)).fetchall()

    # Ensuring that with no domain nothing happens to the data itself
    assert raw_data == domain_data, "Data does not match after getting compute domain"
    assert compute_kwargs == {}, "Compute domain kwargs should be existent"
    assert accessor_kwargs == {}, "Accessor kwargs have been modified"
Exemplo n.º 7
0
def test_resolve_metric_bundle_with_nonexistent_metric(sa):
    engine = _build_sa_engine(
        pd.DataFrame({
            "a": [1, 2, 1, 2, 3, 3],
            "b": [4, 4, 4, 4, 4, 4]
        }))

    desired_metric_1 = MetricConfiguration(
        metric_name="column_values.unique",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs=dict(),
    )
    desired_metric_2 = MetricConfiguration(
        metric_name="column.min",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs=dict(),
    )
    desired_metric_3 = MetricConfiguration(
        metric_name="column.max",
        metric_domain_kwargs={"column": "b"},
        metric_value_kwargs=dict(),
    )
    desired_metric_4 = MetricConfiguration(
        metric_name="column.does_not_exist",
        metric_domain_kwargs={"column": "b"},
        metric_value_kwargs=dict(),
    )

    # Ensuring a metric provider error is raised if metric does not exist
    with pytest.raises(MetricProviderError) as e:
        res = engine.resolve_metrics(metrics_to_resolve=(
            desired_metric_1,
            desired_metric_2,
            desired_metric_3,
            desired_metric_4,
        ))
        print(e)
Exemplo n.º 8
0
def test_sa_batch_aggregate_metrics(caplog, sa):
    import datetime

    engine = _build_sa_engine(
        pd.DataFrame({
            "a": [1, 2, 1, 2, 3, 3],
            "b": [4, 4, 4, 4, 4, 4]
        }))

    desired_metric_1 = MetricConfiguration(
        metric_name="column.max.aggregate_fn",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs=dict(),
    )
    desired_metric_2 = MetricConfiguration(
        metric_name="column.min.aggregate_fn",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs=dict(),
    )
    desired_metric_3 = MetricConfiguration(
        metric_name="column.max.aggregate_fn",
        metric_domain_kwargs={"column": "b"},
        metric_value_kwargs=dict(),
    )
    desired_metric_4 = MetricConfiguration(
        metric_name="column.min.aggregate_fn",
        metric_domain_kwargs={"column": "b"},
        metric_value_kwargs=dict(),
    )
    metrics = engine.resolve_metrics(metrics_to_resolve=(
        desired_metric_1,
        desired_metric_2,
        desired_metric_3,
        desired_metric_4,
    ))
    desired_metric_1 = MetricConfiguration(
        metric_name="column.max",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs=dict(),
        metric_dependencies={"metric_partial_fn": desired_metric_1},
    )
    desired_metric_2 = MetricConfiguration(
        metric_name="column.min",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs=dict(),
        metric_dependencies={"metric_partial_fn": desired_metric_2},
    )
    desired_metric_3 = MetricConfiguration(
        metric_name="column.max",
        metric_domain_kwargs={"column": "b"},
        metric_value_kwargs=dict(),
        metric_dependencies={"metric_partial_fn": desired_metric_3},
    )
    desired_metric_4 = MetricConfiguration(
        metric_name="column.min",
        metric_domain_kwargs={"column": "b"},
        metric_value_kwargs=dict(),
        metric_dependencies={"metric_partial_fn": desired_metric_4},
    )
    caplog.clear()
    caplog.set_level(logging.DEBUG, logger="great_expectations")
    start = datetime.datetime.now()
    res = engine.resolve_metrics(
        metrics_to_resolve=(
            desired_metric_1,
            desired_metric_2,
            desired_metric_3,
            desired_metric_4,
        ),
        metrics=metrics,
    )
    end = datetime.datetime.now()
    print("t1")
    print(end - start)
    assert res[desired_metric_1.id] == 3
    assert res[desired_metric_2.id] == 1
    assert res[desired_metric_3.id] == 4
    assert res[desired_metric_4.id] == 4

    # Check that all four of these metrics were computed on a single domain
    found_message = False
    for record in caplog.records:
        if (record.message ==
                "SqlAlchemyExecutionEngine computed 4 metrics on domain_id ()"
            ):
            found_message = True
    assert found_message