def test_get_compute_domain_with_multicolumn_domain():
    """Contrast the "multicolumn" and "identity" domain types for a column list.

    For ``domain_type="multicolumn"`` the test expects the whole batch back,
    empty compute kwargs, and the column list in the accessor kwargs. For
    ``domain_type="identity"`` it expects column ``d`` (not in the list) to be
    dropped, the column list in the compute kwargs, and empty accessor kwargs.
    """
    execution_engine = PandasExecutionEngine()
    batch_df = pd.DataFrame(
        {
            "a": [1, 2, 3, 4],
            "b": [2, 3, 4, None],
            "c": [1, 2, 2, 3],
            "d": [2, 7, 9, 2],
        }
    )
    # Frame expected from the identity domain: only the listed columns remain.
    without_d = batch_df.drop(columns=["d"])

    execution_engine.load_batch_data(batch_data=batch_df, batch_id="1234")

    # --- multicolumn domain -------------------------------------------------
    multicolumn_kwargs = {"columns": ["a", "b", "c"]}
    domain_data, compute_part, accessor_part = execution_engine.get_compute_domain(
        domain_kwargs=multicolumn_kwargs,
        domain_type="multicolumn",
    )
    assert domain_data.equals(
        batch_df
    ), "Data does not match after getting compute domain"
    assert compute_part == {}, "Compute domain kwargs should be existent"
    assert accessor_part == {
        "columns": ["a", "b", "c"]
    }, "Accessor kwargs have been modified"

    # --- identity domain (string form); the split of kwargs is reversed -----
    identity_kwargs = {"columns": ["a", "b", "c"]}
    domain_data, compute_part, accessor_part = execution_engine.get_compute_domain(
        domain_kwargs=identity_kwargs,
        domain_type="identity",
    )
    assert domain_data.equals(
        without_d
    ), "Data does not match after getting compute domain"
    assert compute_part == {
        "columns": ["a", "b", "c"]
    }, "Compute domain kwargs should be existent"
    assert accessor_part == {}, "Accessor kwargs have been modified"
def test_get_compute_domain_with_unmeetable_row_condition():
    """Check a column domain whose row condition matches no rows.

    The condition ``b > 24`` is false for every row of the fixture, so the
    returned data's ``b`` column is compared against the same filter applied
    directly with pandas. The test also expects ``row_condition`` to appear in
    the compute kwargs and ``column`` to be the only accessor kwarg.
    """
    execution_engine = PandasExecutionEngine()
    batch_df = pd.DataFrame({"a": [1, 2, 3, 4], "b": [2, 3, 4, None]})

    # Reference result built with plain boolean indexing.
    exceeds_24 = batch_df["b"] > 24
    filtered_reference = batch_df[exceeds_24]

    execution_engine.load_batch_data(batch_data=batch_df, batch_id="1234")

    conditioned_kwargs = {
        "column": "a",
        "row_condition": "b > 24",
        "condition_parser": "pandas",
    }
    domain_data, compute_part, accessor_part = execution_engine.get_compute_domain(
        domain_kwargs=conditioned_kwargs,
        domain_type="column",
    )

    # The engine's filtering must agree with the reference filter.
    assert domain_data["b"].equals(
        filtered_reference["b"]
    ), "Data does not match after getting compute domain"

    # The row condition stays with the compute kwargs; the column goes to the accessor.
    assert (
        "row_condition" in compute_part
    ), "Row condition should be located within compute kwargs"
    assert accessor_part == {"column": "a"}, "Accessor kwargs have been modified"
def test_get_compute_domain_with_column_domain_vs_identity_domain():
    """Contrast the COLUMN and IDENTITY domain types for a single column.

    This test is deliberately named differently from
    ``test_get_compute_domain_with_column_domain``: a second function with that
    exact name exists in this module, and two module-level functions sharing a
    name means only the later one is collected by pytest.

    For ``MetricDomainTypes.COLUMN`` the test expects the whole batch back,
    empty compute kwargs, and ``{"column": "a"}`` as accessor kwargs. For
    ``MetricDomainTypes.IDENTITY`` it expects column ``b`` to be dropped,
    ``{"column": "a"}`` as compute kwargs, and empty accessor kwargs.
    """
    engine = PandasExecutionEngine()
    df = pd.DataFrame({"a": [1, 2, 3, 4], "b": [2, 3, 4, None]})
    # Frame expected from the identity domain: only column "a" remains.
    expected_identity = df.drop(columns=["b"])

    # Loading batch data
    engine.load_batch_data(batch_data=df, batch_id="1234")

    data, compute_kwargs, accessor_kwargs = engine.get_compute_domain(
        domain_kwargs={"column": "a"},
        domain_type=MetricDomainTypes.COLUMN,
    )
    assert data.equals(df), "Data does not match after getting compute domain"
    assert compute_kwargs == {}, "Compute domain kwargs should be existent"
    assert accessor_kwargs == {"column": "a"}, "Accessor kwargs have been modified"

    # Doing this using identity domain should yield different results
    data, compute_kwargs, accessor_kwargs = engine.get_compute_domain(
        domain_kwargs={"column": "a"},
        domain_type=MetricDomainTypes.IDENTITY,
    )
    assert data.equals(
        expected_identity
    ), "Data does not match after getting compute domain"
    assert compute_kwargs == {
        "column": "a"
    }, "Compute domain kwargs should be existent"
    assert accessor_kwargs == {}, "Accessor kwargs have been modified"
def test_dataframe_property_given_loaded_batch():
    """Check that ``engine.dataframe`` equals the frame that was loaded as a batch."""
    execution_engine = PandasExecutionEngine()
    loaded_df = pd.DataFrame({"a": [1, 2, 3, 4]})

    execution_engine.load_batch_data(batch_data=loaded_df, batch_id="1234")

    # The property must hand back data equal to what went in.
    exposed_df = execution_engine.dataframe
    assert exposed_df.equals(loaded_df)
def test_get_compute_domain_with_column_domain():
    """Check the compute domain returned for ``MetricDomainTypes.COLUMN``.

    The test expects the whole batch back, empty compute kwargs, and
    ``{"column": "a"}`` as the accessor kwargs.
    """
    execution_engine = PandasExecutionEngine()
    batch_df = pd.DataFrame({"a": [1, 2, 3, 4], "b": [2, 3, 4, None]})

    execution_engine.load_batch_data(batch_data=batch_df, batch_id="1234")

    column_kwargs = {"column": "a"}
    domain_data, compute_part, accessor_part = execution_engine.get_compute_domain(
        domain_kwargs=column_kwargs,
        domain_type=MetricDomainTypes.COLUMN,
    )

    assert domain_data.equals(
        batch_df
    ), "Data does not match after getting compute domain"
    assert compute_part == {}, "Compute domain kwargs should be existent"
    assert accessor_part == {"column": "a"}, "Accessor kwargs have been modified"
def test_get_compute_domain_with_no_domain_kwargs():
    """Check compute domains requested with empty domain kwargs.

    Both the string ``"identity"`` and ``MetricDomainTypes.TABLE`` are expected
    to return the whole batch together with empty compute and accessor kwargs.
    """
    execution_engine = PandasExecutionEngine()
    batch_df = pd.DataFrame({"a": [1, 2, 3, 4], "b": [2, 3, 4, None]})

    execution_engine.load_batch_data(batch_data=batch_df, batch_id="1234")

    # First with the string domain type "identity".
    domain_data, compute_part, accessor_part = execution_engine.get_compute_domain(
        domain_kwargs={},
        domain_type="identity",
    )
    assert domain_data.equals(
        batch_df
    ), "Data does not match after getting compute domain"
    assert compute_part == {}, "Compute domain kwargs should be existent"
    assert accessor_part == {}, "Accessor kwargs have been modified"

    # Then with the enum member MetricDomainTypes.TABLE; the outcome should match.
    domain_data, compute_part, accessor_part = execution_engine.get_compute_domain(
        domain_kwargs={},
        domain_type=MetricDomainTypes.TABLE,
    )
    assert domain_data.equals(
        batch_df
    ), "Data does not match after getting compute domain"
    assert compute_part == {}, "Compute domain kwargs should be existent"
    assert accessor_part == {}, "Accessor kwargs have been modified"
def test_get_domain_records_with_column_domain():
    """Check ``get_domain_records`` for a column domain with a row condition.

    With the pandas condition ``b<5`` the test expects exactly the first three
    rows of the batch, with all columns present.
    """
    execution_engine = PandasExecutionEngine()
    batch_df = pd.DataFrame(
        {
            "a": [1, 2, 3, 4, 5],
            "b": [2, 3, 4, 5, None],
            "c": [1, 2, 3, 4, None],
        }
    )

    execution_engine.load_batch_data(batch_data=batch_df, batch_id="1234")

    conditioned_kwargs = {
        "column": "a",
        "row_condition": "b<5",
        "condition_parser": "pandas",
    }
    records = execution_engine.get_domain_records(domain_kwargs=conditioned_kwargs)

    # Only b = 2, 3, 4 satisfy the condition, i.e. the first three rows.
    first_three_rows = batch_df.iloc[:3]
    assert records.equals(
        first_three_rows
    ), "Data does not match after getting full access compute domain"
def test_get_compute_domain_with_column_pair_domain():
    """Check the compute domain returned for the "column_pair" domain type.

    The test expects the whole batch back (column ``c`` included), empty
    compute kwargs, and both ``column_A`` and ``column_B`` in the accessor
    kwargs.
    """
    engine = PandasExecutionEngine()
    df = pd.DataFrame({"a": [1, 2, 3, 4], "b": [2, 3, 4, 5], "c": [1, 2, 3, 4]})

    # Loading batch data
    engine.load_batch_data(batch_data=df, batch_id="1234")

    data, compute_kwargs, accessor_kwargs = engine.get_compute_domain(
        domain_kwargs={"column_A": "a", "column_B": "b"},
        domain_type="column_pair",
    )
    assert data.equals(df), "Data does not match after getting compute domain"
    assert compute_kwargs == {}, "Compute domain kwargs should be existent"
    assert accessor_kwargs == {
        "column_A": "a",
        "column_B": "b",
    }, "Accessor kwargs have been modified"
def test_get_domain_records_with_multicolumn_domain():
    """Check ``get_domain_records`` for multicolumn domains and ``ignore_row_if``.

    Three scenarios are covered:

    * columns ``a``/``c``, condition ``b>2``, ``all_values_are_missing``:
      rows 1, 2, 3 and 5 are expected;
    * columns ``b``/``c``, condition ``a<5``, ``any_value_is_missing``:
      rows 0 through 3 are expected;
    * columns ``b``/``c``, no row condition, ``never``: every row is expected,
      missing values included.
    """
    execution_engine = PandasExecutionEngine()
    batch_df = pd.DataFrame(
        {
            "a": [1, 2, 3, 4, None, 5],
            "b": [2, 3, 4, 5, 6, 7],
            "c": [1, 2, 3, 4, None, 6],
        }
    )
    execution_engine.load_batch_data(batch_data=batch_df, batch_id="1234")

    # Scenario 1: skip rows only when every listed column is missing.
    all_missing_kwargs = {
        "column_list": ["a", "c"],
        "row_condition": "b>2",
        "condition_parser": "pandas",
        "ignore_row_if": "all_values_are_missing",
    }
    records = execution_engine.get_domain_records(domain_kwargs=all_missing_kwargs)
    # Cast to int so dtypes line up with the integer-only expected frame.
    records = records.astype(int)
    expected_records = pd.DataFrame(
        {"a": [2, 3, 4, 5], "b": [3, 4, 5, 7], "c": [2, 3, 4, 6]},
        index=[1, 2, 3, 5],
    )
    assert records.equals(
        expected_records
    ), "Data does not match after getting full access compute domain"

    # Scenario 2: skip rows when any listed column is missing.
    any_missing_kwargs = {
        "column_list": ["b", "c"],
        "row_condition": "a<5",
        "condition_parser": "pandas",
        "ignore_row_if": "any_value_is_missing",
    }
    records = execution_engine.get_domain_records(domain_kwargs=any_missing_kwargs)
    records = records.astype(int)
    expected_records = pd.DataFrame(
        {"a": [1, 2, 3, 4], "b": [2, 3, 4, 5], "c": [1, 2, 3, 4]},
        index=[0, 1, 2, 3],
    )
    assert records.equals(
        expected_records
    ), "Data does not match after getting full access compute domain"

    # Scenario 3: fresh engine and batch; "never" keeps every row.
    execution_engine = PandasExecutionEngine()
    batch_df = pd.DataFrame(
        {
            "a": [1, 2, 3, 4, None, 5],
            "b": [2, 3, 4, 5, 6, 7],
            "c": [1, 2, 3, 4, None, 6],
        }
    )
    execution_engine.load_batch_data(batch_data=batch_df, batch_id="1234")

    never_ignore_kwargs = {
        "column_list": ["b", "c"],
        "ignore_row_if": "never",
    }
    records = execution_engine.get_domain_records(domain_kwargs=never_ignore_kwargs)
    expected_records = pd.DataFrame(
        {
            "a": [1, 2, 3, 4, None, 5],
            "b": [2, 3, 4, 5, 6, 7],
            "c": [1, 2, 3, 4, None, 6],
        },
        index=[0, 1, 2, 3, 4, 5],
    )
    assert records.equals(
        expected_records
    ), "Data does not match after getting full access compute domain"
def test_get_domain_records_with_column_pair_domain():
    """Check ``get_domain_records`` for column-pair domains and ``ignore_row_if``.

    Three scenarios run against the same loaded batch:

    * pair ``a``/``b``, condition ``b>2``, ``both_values_are_missing``:
      rows 1, 2, 3 and 5 are expected (row 5 keeps its missing ``c``);
    * pair ``b``/``c``, condition ``b>2``, ``either_value_is_missing``:
      rows 1, 2 and 3 are expected;
    * pair ``b``/``c``, condition ``a<6``, ``neither``: rows 0 through 4 are
      expected, including the missing ``b`` in row 4.
    """
    execution_engine = PandasExecutionEngine()
    batch_df = pd.DataFrame(
        {
            "a": [1, 2, 3, 4, 5, 6],
            "b": [2, 3, 4, 5, None, 6],
            "c": [1, 2, 3, 4, 5, None],
        }
    )
    execution_engine.load_batch_data(batch_data=batch_df, batch_id="1234")

    # Scenario 1: skip rows only when both columns of the pair are missing.
    both_missing_kwargs = {
        "column_A": "a",
        "column_B": "b",
        "row_condition": "b>2",
        "condition_parser": "pandas",
        "ignore_row_if": "both_values_are_missing",
    }
    records = execution_engine.get_domain_records(domain_kwargs=both_missing_kwargs)
    expected_records = pd.DataFrame(
        {
            "a": [2, 3, 4, 6],
            "b": [3.0, 4.0, 5.0, 6.0],
            "c": [2.0, 3.0, 4.0, None],
        },
        index=[1, 2, 3, 5],
    )
    assert records.equals(
        expected_records
    ), "Data does not match after getting full access compute domain"

    # Scenario 2: skip rows when either column of the pair is missing.
    either_missing_kwargs = {
        "column_A": "b",
        "column_B": "c",
        "row_condition": "b>2",
        "condition_parser": "pandas",
        "ignore_row_if": "either_value_is_missing",
    }
    records = execution_engine.get_domain_records(domain_kwargs=either_missing_kwargs)
    # Cast to int so dtypes line up with the integer-only expected frame.
    records = records.astype(int)
    expected_records = pd.DataFrame(
        {"a": [2, 3, 4], "b": [3, 4, 5], "c": [2, 3, 4]},
        index=[1, 2, 3],
    )
    assert records.equals(
        expected_records
    ), "Data does not match after getting full access compute domain"

    # Scenario 3: "neither" keeps rows regardless of missing values.
    keep_all_kwargs = {
        "column_A": "b",
        "column_B": "c",
        "row_condition": "a<6",
        "condition_parser": "pandas",
        "ignore_row_if": "neither",
    }
    records = execution_engine.get_domain_records(domain_kwargs=keep_all_kwargs)
    expected_records = pd.DataFrame(
        {
            "a": [1, 2, 3, 4, 5],
            "b": [2.0, 3.0, 4.0, 5.0, None],
            "c": [1.0, 2.0, 3.0, 4.0, 5.0],
        }
    )
    assert records.equals(
        expected_records
    ), "Data does not match after getting full access compute domain"