def test_get_compute_domain_with_multicolumn_domain():
    engine = PandasExecutionEngine()
    df = pd.DataFrame({
        "a": [1, 2, 3, 4],
        "b": [2, 3, 4, None],
        "c": [1, 2, 2, 3],
        "d": [2, 7, 9, 2]
    })
    expected_identity = df.drop(columns=["d"])

    # Loading batch data
    engine.load_batch_data(batch_data=df, batch_id="1234")
    data, compute_kwargs, accessor_kwargs = engine.get_compute_domain(
        domain_kwargs={"columns": ["a", "b", "c"]}, domain_type="multicolumn")
    assert data.equals(df), "Data does not match after getting compute domain"
    assert compute_kwargs == {}, "Compute domain kwargs should be existent"
    assert accessor_kwargs == {
        "columns": ["a", "b", "c"]
    }, "Accessor kwargs have been modified"

    # Trying same test with enum form of table domain - should work the same way
    data, compute_kwargs, accessor_kwargs = engine.get_compute_domain(
        domain_kwargs={"columns": ["a", "b", "c"]}, domain_type="identity")
    assert data.equals(
        expected_identity), "Data does not match after getting compute domain"
    assert compute_kwargs == {
        "columns": ["a", "b", "c"]
    }, "Compute domain kwargs should be existent"
    assert accessor_kwargs == {}, "Accessor kwargs have been modified"
Exemplo n.º 2
0
def test_get_compute_domain_with_unmeetable_row_condition():
    engine = PandasExecutionEngine()
    df = pd.DataFrame({"a": [1, 2, 3, 4], "b": [2, 3, 4, None]})
    expected_df = df[df["b"] > 24]

    # Loading batch data
    engine.load_batch_data(batch_data=df, batch_id="1234")

    data, compute_kwargs, accessor_kwargs = engine.get_compute_domain(
        domain_kwargs={
            "column": "a",
            "row_condition": "b > 24",
            "condition_parser": "pandas",
        },
        domain_type="column",
    )
    # Ensuring data has been properly queried
    assert data["b"].equals(
        expected_df["b"]), "Data does not match after getting compute domain"

    # Ensuring compute kwargs have not been modified
    assert ("row_condition" in compute_kwargs.keys()
            ), "Row condition should be located within compute kwargs"
    assert accessor_kwargs == {
        "column": "a"
    }, "Accessor kwargs have been modified"
def test_get_compute_domain_with_column_domain():
    engine = PandasExecutionEngine()
    df = pd.DataFrame({"a": [1, 2, 3, 4], "b": [2, 3, 4, None]})
    expected_identity = df.drop(columns=["b"])

    # Loading batch data
    engine.load_batch_data(batch_data=df, batch_id="1234")
    data, compute_kwargs, accessor_kwargs = engine.get_compute_domain(
        domain_kwargs={"column": "a"}, domain_type=MetricDomainTypes.COLUMN)
    assert data.equals(df), "Data does not match after getting compute domain"
    assert compute_kwargs == {}, "Compute domain kwargs should be existent"
    assert accessor_kwargs == {
        "column": "a"
    }, "Accessor kwargs have been modified"

    # Doing this using identity domain should yield different results
    data, compute_kwargs, accessor_kwargs = engine.get_compute_domain(
        domain_kwargs={"column": "a"}, domain_type=MetricDomainTypes.IDENTITY)

    assert data.equals(
        expected_identity), "Data does not match after getting compute domain"
    assert compute_kwargs == {
        "column": "a"
    }, "Compute domain kwargs should be existent"
    assert accessor_kwargs == {}, "Accessor kwargs have been modified"
def test_dataframe_property_given_loaded_batch():
    engine = PandasExecutionEngine()
    df = pd.DataFrame({"a": [1, 2, 3, 4]})

    # Loading batch data
    engine.load_batch_data(batch_data=df, batch_id="1234")

    # Ensuring Data not distorted
    assert engine.dataframe.equals(df)
Exemplo n.º 5
0
def test_get_compute_domain_with_column_domain():
    engine = PandasExecutionEngine()
    df = pd.DataFrame({"a": [1, 2, 3, 4], "b": [2, 3, 4, None]})

    # Loading batch data
    engine.load_batch_data(batch_data=df, batch_id="1234")
    data, compute_kwargs, accessor_kwargs = engine.get_compute_domain(
        domain_kwargs={"column": "a"}, domain_type=MetricDomainTypes.COLUMN)
    assert data.equals(df), "Data does not match after getting compute domain"
    assert compute_kwargs == {}, "Compute domain kwargs should be existent"
    assert accessor_kwargs == {
        "column": "a"
    }, "Accessor kwargs have been modified"
def test_get_compute_domain_with_no_domain_kwargs():
    engine = PandasExecutionEngine()
    df = pd.DataFrame({"a": [1, 2, 3, 4], "b": [2, 3, 4, None]})

    # Loading batch data
    engine.load_batch_data(batch_data=df, batch_id="1234")
    data, compute_kwargs, accessor_kwargs = engine.get_compute_domain(
        domain_kwargs={}, domain_type="identity")
    assert data.equals(df), "Data does not match after getting compute domain"
    assert compute_kwargs == {}, "Compute domain kwargs should be existent"
    assert accessor_kwargs == {}, "Accessor kwargs have been modified"

    # Trying same test with enum form of table domain - should work the same way
    data, compute_kwargs, accessor_kwargs = engine.get_compute_domain(
        domain_kwargs={}, domain_type=MetricDomainTypes.TABLE)
    assert data.equals(df), "Data does not match after getting compute domain"
    assert compute_kwargs == {}, "Compute domain kwargs should be existent"
    assert accessor_kwargs == {}, "Accessor kwargs have been modified"
Exemplo n.º 7
0
def test_get_domain_records_with_column_domain():
    engine = PandasExecutionEngine()
    df = pd.DataFrame({
        "a": [1, 2, 3, 4, 5],
        "b": [2, 3, 4, 5, None],
        "c": [1, 2, 3, 4, None]
    })
    # Loading batch data
    engine.load_batch_data(batch_data=df, batch_id="1234")

    data = engine.get_domain_records(domain_kwargs={
        "column": "a",
        "row_condition": "b<5",
        "condition_parser": "pandas",
    })

    expected_column_df = df.iloc[:3]

    assert data.equals(
        expected_column_df
    ), "Data does not match after getting full access compute domain"
Exemplo n.º 8
0
def test_get_compute_domain_with_column_pair_domain():
    engine = PandasExecutionEngine()
    df = pd.DataFrame({
        "a": [1, 2, 3, 4],
        "b": [2, 3, 4, 5],
        "c": [1, 2, 3, 4]
    })
    expected_column_pair_df = df.drop(columns=["c"])

    # Loading batch data
    engine.load_batch_data(batch_data=df, batch_id="1234")
    data, compute_kwargs, accessor_kwargs = engine.get_compute_domain(
        domain_kwargs={
            "column_A": "a",
            "column_B": "b"
        },
        domain_type="column_pair")
    assert data.equals(df), "Data does not match after getting compute domain"
    assert compute_kwargs == {}, "Compute domain kwargs should be existent"
    assert accessor_kwargs == {
        "column_A": "a",
        "column_B": "b",
    }, "Accessor kwargs have been modified"
Exemplo n.º 9
0
def test_get_domain_records_with_multicolumn_domain():
    engine = PandasExecutionEngine()
    df = pd.DataFrame({
        "a": [1, 2, 3, 4, None, 5],
        "b": [2, 3, 4, 5, 6, 7],
        "c": [1, 2, 3, 4, None, 6],
    })
    # Loading batch data
    engine.load_batch_data(batch_data=df, batch_id="1234")

    data = engine.get_domain_records(
        domain_kwargs={
            "column_list": ["a", "c"],
            "row_condition": "b>2",
            "condition_parser": "pandas",
            "ignore_row_if": "all_values_are_missing",
        })
    data = data.astype(int)

    expected_multicolumn_df = pd.DataFrame(
        {
            "a": [2, 3, 4, 5],
            "b": [3, 4, 5, 7],
            "c": [2, 3, 4, 6]
        },
        index=[1, 2, 3, 5])

    assert data.equals(
        expected_multicolumn_df
    ), "Data does not match after getting full access compute domain"

    data = engine.get_domain_records(
        domain_kwargs={
            "column_list": ["b", "c"],
            "row_condition": "a<5",
            "condition_parser": "pandas",
            "ignore_row_if": "any_value_is_missing",
        })
    data = data.astype(int)

    expected_multicolumn_df = pd.DataFrame(
        {
            "a": [1, 2, 3, 4],
            "b": [2, 3, 4, 5],
            "c": [1, 2, 3, 4]
        },
        index=[0, 1, 2, 3])

    assert data.equals(
        expected_multicolumn_df
    ), "Data does not match after getting full access compute domain"

    engine = PandasExecutionEngine()
    df = pd.DataFrame({
        "a": [1, 2, 3, 4, None, 5],
        "b": [2, 3, 4, 5, 6, 7],
        "c": [1, 2, 3, 4, None, 6],
    })
    # Loading batch data
    engine.load_batch_data(batch_data=df, batch_id="1234")

    data = engine.get_domain_records(domain_kwargs={
        "column_list": ["b", "c"],
        "ignore_row_if": "never",
    })

    expected_multicolumn_df = pd.DataFrame(
        {
            "a": [1, 2, 3, 4, None, 5],
            "b": [2, 3, 4, 5, 6, 7],
            "c": [1, 2, 3, 4, None, 6],
        },
        index=[0, 1, 2, 3, 4, 5],
    )

    assert data.equals(
        expected_multicolumn_df
    ), "Data does not match after getting full access compute domain"
Exemplo n.º 10
0
def test_get_domain_records_with_column_pair_domain():
    engine = PandasExecutionEngine()
    df = pd.DataFrame({
        "a": [1, 2, 3, 4, 5, 6],
        "b": [2, 3, 4, 5, None, 6],
        "c": [1, 2, 3, 4, 5, None],
    })
    # Loading batch data
    engine.load_batch_data(batch_data=df, batch_id="1234")

    data = engine.get_domain_records(
        domain_kwargs={
            "column_A": "a",
            "column_B": "b",
            "row_condition": "b>2",
            "condition_parser": "pandas",
            "ignore_row_if": "both_values_are_missing",
        })

    expected_column_pair_df = pd.DataFrame(
        {
            "a": [2, 3, 4, 6],
            "b": [3.0, 4.0, 5.0, 6.0],
            "c": [2.0, 3.0, 4.0, None],
        },
        index=[1, 2, 3, 5],
    )
    assert data.equals(
        expected_column_pair_df
    ), "Data does not match after getting full access compute domain"

    data = engine.get_domain_records(
        domain_kwargs={
            "column_A": "b",
            "column_B": "c",
            "row_condition": "b>2",
            "condition_parser": "pandas",
            "ignore_row_if": "either_value_is_missing",
        })
    data = data.astype(int)

    expected_column_pair_df = pd.DataFrame(
        {
            "a": [2, 3, 4],
            "b": [3, 4, 5],
            "c": [2, 3, 4]
        }, index=[1, 2, 3])

    assert data.equals(
        expected_column_pair_df
    ), "Data does not match after getting full access compute domain"

    data = engine.get_domain_records(
        domain_kwargs={
            "column_A": "b",
            "column_B": "c",
            "row_condition": "a<6",
            "condition_parser": "pandas",
            "ignore_row_if": "neither",
        })

    expected_column_pair_df = pd.DataFrame({
        "a": [1, 2, 3, 4, 5],
        "b": [2.0, 3.0, 4.0, 5.0, None],
        "c": [1.0, 2.0, 3.0, 4.0, 5.0],
    })

    assert data.equals(
        expected_column_pair_df
    ), "Data does not match after getting full access compute domain"