def test_get_batch_with_split_on_divided_integer_and_sample_on_list(test_df):
    """Split on id // 10 == 5 (ids 50-59), then sample rows where id % 5 == 4."""
    batch_spec = RuntimeDataBatchSpec(
        batch_data=test_df,
        splitter_method="_split_on_divided_integer",
        splitter_kwargs={
            "column_name": "id",
            "divisor": 10,
            "batch_identifiers": {"id": 5},
        },
        sampling_method="_sample_using_mod",
        sampling_kwargs={
            "column_name": "id",
            "mod": 5,
            "value": 4,
        },
    )
    sampled = PandasExecutionEngine().get_batch_data(batch_spec)
    # Only ids 54 and 59 satisfy both the split and the mod-5 sample.
    assert sampled.dataframe.shape == (2, 10)
    assert sampled.dataframe.id.min() == 54
    assert sampled.dataframe.id.max() == 59
def test_get_compute_domain_with_no_domain_kwargs():
    """With empty domain kwargs, the compute domain is the whole table and
    both compute and accessor kwargs come back empty — for the string form
    ("identity") and the enum form (MetricDomainTypes.TABLE) alike.
    """
    engine = PandasExecutionEngine()
    df = pd.DataFrame({"a": [1, 2, 3, 4], "b": [2, 3, 4, None]})

    # Loading batch data
    engine.load_batch_data(batch_data=df, batch_id="1234")

    data, compute_kwargs, accessor_kwargs = engine.get_compute_domain(
        domain_kwargs={}, domain_type="identity")
    assert data.equals(df), "Data does not match after getting compute domain"
    # Fixed: the message previously read "should be existent", contradicting
    # the assertion that the kwargs are empty.
    assert compute_kwargs == {}, "Compute domain kwargs should be empty"
    assert accessor_kwargs == {}, "Accessor kwargs should be empty"

    # Trying same test with enum form of table domain - should work the same way
    data, compute_kwargs, accessor_kwargs = engine.get_compute_domain(
        domain_kwargs={}, domain_type=MetricDomainTypes.TABLE)
    assert data.equals(df), "Data does not match after getting compute domain"
    assert compute_kwargs == {}, "Compute domain kwargs should be empty"
    assert accessor_kwargs == {}, "Accessor kwargs should be empty"
def test_reader_fn():
    """_get_reader_fn maps file extensions (and an explicit reader_method)
    to the matching pandas reader function."""
    engine = PandasExecutionEngine()

    # Excel files resolve to pandas.read_excel.
    excel_reader = engine._get_reader_fn(path="myfile.xlsx")
    assert "<function read_excel" in str(excel_reader)

    # Both SAS on-disk formats resolve to pandas.read_sas.
    sas7bdat_reader = engine._get_reader_fn(path="myfile.sas7bdat")
    assert "<function read_sas" in str(sas7bdat_reader)
    xpt_reader = engine._get_reader_fn(path="myfile.xpt")
    assert "<function read_sas" in str(xpt_reader)

    # An explicit reader_method resolves even without any path.
    csv_reader = engine._get_reader_fn(reader_method="read_csv")
    assert "<function" in str(csv_reader)
def test_get_compute_domain_with_column_pair_domain():
    """For a column_pair domain the full table is returned and the pair is
    routed through accessor kwargs; for an identity domain the data is
    narrowed to the pair columns and the pair becomes the compute kwargs.
    """
    engine = PandasExecutionEngine()
    df = pd.DataFrame({
        "a": [1, 2, 3, 4],
        "b": [2, 3, 4, 5],
        "c": [1, 2, 3, 4]
    })
    expected_identity = df.drop(columns=["c"])

    # Loading batch data
    engine.load_batch_data(batch_data=df, batch_id="1234")

    data, compute_kwargs, accessor_kwargs = engine.get_compute_domain(
        domain_kwargs={
            "column_A": "a",
            "column_B": "b"
        },
        domain_type="column_pair")
    assert data.equals(df), "Data does not match after getting compute domain"
    # Fixed: the message previously read "should be existent", contradicting
    # the assertion that the kwargs are empty.
    assert compute_kwargs == {}, "Compute domain kwargs should be empty"
    assert accessor_kwargs == {
        "column_A": "a",
        "column_B": "b",
    }, "Accessor kwargs have been modified"

    # Trying same test with enum form of table domain - should work the same way
    data, compute_kwargs, accessor_kwargs = engine.get_compute_domain(
        domain_kwargs={
            "column_A": "a",
            "column_B": "b"
        },
        domain_type="identity")
    assert data.equals(
        expected_identity), "Data does not match after getting compute domain"
    assert compute_kwargs == {
        "column_A": "a",
        "column_B": "b",
    }, "Compute domain kwargs should be existent"
    assert accessor_kwargs == {}, "Accessor kwargs have been modified"
def test_get_batch_with_split_on_whole_table_s3():
    """Whole-table split reads a CSV straight from a (mocked) S3 bucket; an
    engine with its S3 client removed raises ExecutionEngineError instead."""
    region_name: str = "us-east-1"
    bucket: str = "test_bucket"
    conn = boto3.resource("s3", region_name=region_name)
    conn.create_bucket(Bucket=bucket)
    client = boto3.client("s3", region_name=region_name)

    source_df: pd.DataFrame = pd.DataFrame(data={"col1": [1, 2], "col2": [3, 4]})
    csv_body = source_df.to_csv(index=False).encode("utf-8")
    keys: List[str] = [
        "path/A-100.csv",
        "path/A-101.csv",
        "directory/B-1.csv",
        "directory/B-2.csv",
    ]
    for key in keys:
        client.put_object(Bucket=bucket, Body=csv_body, Key=key)

    path = "path/A-100.csv"
    full_path = f"s3a://{os.path.join(bucket, path)}"

    batch = PandasExecutionEngine().get_batch_data(batch_spec=S3BatchSpec(
        path=full_path,
        reader_method="read_csv",
        splitter_method="_split_on_whole_table",
    ))
    assert batch.dataframe.shape == (2, 2)

    # if S3 was not configured
    execution_engine_no_s3 = PandasExecutionEngine()
    execution_engine_no_s3._s3 = None
    with pytest.raises(ge_exceptions.ExecutionEngineError):
        execution_engine_no_s3.get_batch_data(batch_spec=S3BatchSpec(
            path=full_path,
            reader_method="read_csv",
            splitter_method="_split_on_whole_table",
        ))
def ge_validator_pandas() -> Validator:
    """Fixture: a Validator backed by a bare PandasExecutionEngine."""
    return Validator(execution_engine=PandasExecutionEngine())
def test_sample_using_random(test_df):
    """With a fixed seed, _sample_using_random keeps a deterministic subset."""
    random.seed(1)
    batch_spec = RuntimeDataBatchSpec(
        batch_data=test_df,
        sampling_method="_sample_using_random",
    )
    sampled = PandasExecutionEngine().get_batch_data(batch_spec)
    # Seed 1 happens to retain exactly 13 of the 120 rows.
    assert sampled.dataframe.shape == (13, 10)
def test_get_batch_with_split_on_whole_table(test_df):
    """_split_on_whole_table is a no-op split: every row comes back."""
    batch_spec = RuntimeDataBatchSpec(
        batch_data=test_df,
        splitter_method="_split_on_whole_table",
    )
    batch = PandasExecutionEngine().get_batch_data(batch_spec)
    assert batch.dataframe.shape == (120, 10)
def test_get_batch_with_split_on_whole_table_s3_batch_spec_fixture(
        batch_with_split_on_whole_table_s3, test_df_small):
    """Whole-table S3 split using the fixture-provided batch spec.

    Renamed: a test named ``test_get_batch_with_split_on_whole_table_s3`` is
    already defined earlier in this module, so this later definition silently
    shadowed it and pytest collected only one of the two.
    """
    df = PandasExecutionEngine().get_batch_data(
        batch_spec=batch_with_split_on_whole_table_s3)
    assert df.dataframe.shape == test_df_small.shape
def test_get_domain_records_with_multicolumn_domain():
    """get_domain_records honors a row_condition combined with each of the
    multicolumn ignore_row_if modes (all-missing, any-missing, never)."""
    engine = PandasExecutionEngine()
    df = pd.DataFrame({
        "a": [1, 2, 3, 4, None, 5],
        "b": [2, 3, 4, 5, 6, 7],
        "c": [1, 2, 3, 4, None, 6],
    })
    # Loading batch data
    engine.load_batch_data(batch_data=df, batch_id="1234")

    # all_values_are_missing: row 4 (a and c both NaN) is dropped; b>2 drops row 0.
    records = engine.get_domain_records(
        domain_kwargs={
            "column_list": ["a", "c"],
            "row_condition": "b>2",
            "condition_parser": "pandas",
            "ignore_row_if": "all_values_are_missing",
        }).astype(int)
    expected = pd.DataFrame(
        {"a": [2, 3, 4, 5], "b": [3, 4, 5, 7], "c": [2, 3, 4, 6]},
        index=[1, 2, 3, 5],
    )
    assert records.equals(
        expected), "Data does not match after getting full access compute domain"

    # any_value_is_missing: row 4 has NaNs in the list and a<5 drops row 5.
    records = engine.get_domain_records(
        domain_kwargs={
            "column_list": ["b", "c"],
            "row_condition": "a<5",
            "condition_parser": "pandas",
            "ignore_row_if": "any_value_is_missing",
        }).astype(int)
    expected = pd.DataFrame(
        {"a": [1, 2, 3, 4], "b": [2, 3, 4, 5], "c": [1, 2, 3, 4]},
        index=[0, 1, 2, 3],
    )
    assert records.equals(
        expected), "Data does not match after getting full access compute domain"

    # "never": a fresh engine/batch; every row is retained untouched.
    engine = PandasExecutionEngine()
    df = pd.DataFrame({
        "a": [1, 2, 3, 4, None, 5],
        "b": [2, 3, 4, 5, 6, 7],
        "c": [1, 2, 3, 4, None, 6],
    })
    # Loading batch data
    engine.load_batch_data(batch_data=df, batch_id="1234")

    records = engine.get_domain_records(domain_kwargs={
        "column_list": ["b", "c"],
        "ignore_row_if": "never",
    })
    expected = pd.DataFrame(
        {
            "a": [1, 2, 3, 4, None, 5],
            "b": [2, 3, 4, 5, 6, 7],
            "c": [1, 2, 3, 4, None, 6],
        },
        index=[0, 1, 2, 3, 4, 5],
    )
    assert records.equals(
        expected), "Data does not match after getting full access compute domain"
def test_get_domain_records_with_column_pair_domain():
    """get_domain_records honors a row_condition combined with each of the
    column-pair ignore_row_if modes (both-missing, either-missing, neither)."""
    engine = PandasExecutionEngine()
    df = pd.DataFrame({
        "a": [1, 2, 3, 4, 5, 6],
        "b": [2, 3, 4, 5, None, 6],
        "c": [1, 2, 3, 4, 5, None],
    })
    # Loading batch data
    engine.load_batch_data(batch_data=df, batch_id="1234")

    # both_values_are_missing: only rows where a AND b are NaN are dropped;
    # b>2 filters the rest.
    records = engine.get_domain_records(
        domain_kwargs={
            "column_A": "a",
            "column_B": "b",
            "row_condition": "b>2",
            "condition_parser": "pandas",
            "ignore_row_if": "both_values_are_missing",
        })
    expected = pd.DataFrame(
        {
            "a": [2, 3, 4, 6],
            "b": [3.0, 4.0, 5.0, 6.0],
            "c": [2.0, 3.0, 4.0, None],
        },
        index=[1, 2, 3, 5],
    )
    assert records.equals(
        expected), "Data does not match after getting full access compute domain"

    # either_value_is_missing: a NaN in b or c drops the row.
    records = engine.get_domain_records(
        domain_kwargs={
            "column_A": "b",
            "column_B": "c",
            "row_condition": "b>2",
            "condition_parser": "pandas",
            "ignore_row_if": "either_value_is_missing",
        }).astype(int)
    expected = pd.DataFrame(
        {"a": [2, 3, 4], "b": [3, 4, 5], "c": [2, 3, 4]},
        index=[1, 2, 3],
    )
    assert records.equals(
        expected), "Data does not match after getting full access compute domain"

    # "neither": NaNs are kept; only the row_condition filters.
    records = engine.get_domain_records(
        domain_kwargs={
            "column_A": "b",
            "column_B": "c",
            "row_condition": "a<6",
            "condition_parser": "pandas",
            "ignore_row_if": "neither",
        })
    expected = pd.DataFrame({
        "a": [1, 2, 3, 4, 5],
        "b": [2.0, 3.0, 4.0, 5.0, None],
        "c": [1.0, 2.0, 3.0, 4.0, 5.0],
    })
    assert records.equals(
        expected), "Data does not match after getting full access compute domain"
def test_get_batch_with_gcs_misconfigured(gcs_batch_spec):
    """An engine without GCS access surfaces ExecutionEngineError for a GCS batch spec."""
    # The fixture's batch spec points at data this bare engine cannot reach.
    engine_without_gcs = PandasExecutionEngine()
    with pytest.raises(ge_exceptions.ExecutionEngineError):
        engine_without_gcs.get_batch_data(batch_spec=gcs_batch_spec)