def test_get_batch_with_no_s3_configured_via_fixture_spec(
    batch_with_split_on_whole_table_s3,
):
    """An engine whose S3 client is missing raises ExecutionEngineError.

    Renamed from ``test_get_batch_with_no_s3_configured``: a later function in
    this module reuses that exact name (flake8 F811), so this earlier test was
    silently shadowed and never collected by pytest. The unique name restores
    it to the test run without affecting anything else.

    Args:
        batch_with_split_on_whole_table_s3: fixture-provided S3 batch spec.
    """
    # if S3 was not configured
    execution_engine_no_s3 = PandasExecutionEngine()
    # Simulate the unconfigured state by removing the boto3 client handle.
    execution_engine_no_s3._s3 = None
    with pytest.raises(ge_exceptions.ExecutionEngineError):
        execution_engine_no_s3.get_batch_data(
            batch_spec=batch_with_split_on_whole_table_s3
        )
def test_get_batch_with_no_azure_configured(azure_batch_spec):
    """Fetching Azure batch data without a BlobServiceClient must fail.

    Args:
        azure_batch_spec: fixture-provided Azure batch spec.
    """
    engine = PandasExecutionEngine()
    # Simulate an engine on which the Azure BlobServiceClient was never set up.
    engine._azure = None
    # The absent connection object surfaces as an ExecutionEngineError.
    with pytest.raises(ge_exceptions.ExecutionEngineError):
        engine.get_batch_data(batch_spec=azure_batch_spec)
def test_get_batch_with_no_s3_configured():
    """get_batch_data against an S3 path fails when no S3 client exists."""
    spec = S3BatchSpec(
        path="s3a://i_dont_exist",
        reader_method="read_csv",
        splitter_method="_split_on_whole_table",
    )
    # A freshly constructed engine has no S3 client configured.
    engine = PandasExecutionEngine()
    with pytest.raises(ge_exceptions.ExecutionEngineError):
        engine.get_batch_data(batch_spec=spec)
def test_get_batch_with_split_on_whole_table_s3():
    """Round-trip a CSV through S3 and load it via PandasExecutionEngine.

    Uploads a small 2x2 DataFrame to several keys of a test bucket, reads one
    key back through an S3BatchSpec with the whole-table splitter, and then
    verifies that an engine whose S3 client has been removed raises
    ExecutionEngineError for the same spec.

    NOTE(review): this creates buckets/objects with plain boto3 calls; it
    presumably runs under a moto ``@mock_s3`` decorator or fixture applied
    elsewhere in the module — confirm, otherwise it would hit real AWS.
    """
    region_name: str = "us-east-1"
    bucket: str = "test_bucket"
    conn = boto3.resource("s3", region_name=region_name)
    conn.create_bucket(Bucket=bucket)
    client = boto3.client("s3", region_name=region_name)
    test_df: pd.DataFrame = pd.DataFrame(data={"col1": [1, 2], "col2": [3, 4]})
    # Several keys so the bucket resembles a partitioned data layout; only
    # "path/A-100.csv" is read back below.
    keys: List[str] = [
        "path/A-100.csv",
        "path/A-101.csv",
        "directory/B-1.csv",
        "directory/B-2.csv",
    ]
    for key in keys:
        client.put_object(
            Bucket=bucket, Body=test_df.to_csv(index=False).encode("utf-8"), Key=key
        )
    path = "path/A-100.csv"
    # NOTE(review): os.path.join would yield backslashes on Windows; assumes a
    # POSIX test runner — confirm if Windows CI is a target.
    full_path = f"s3a://{os.path.join(bucket, path)}"
    # `test_df` is rebound here from the source DataFrame to the batch-data
    # wrapper returned by the engine (hence `.dataframe` in the assert below).
    test_df = PandasExecutionEngine().get_batch_data(
        batch_spec=S3BatchSpec(
            path=full_path,
            reader_method="read_csv",
            splitter_method="_split_on_whole_table",
        )
    )
    # Whole-table split: the entire 2x2 CSV comes back unchanged.
    assert test_df.dataframe.shape == (2, 2)
    # if S3 was not configured
    execution_engine_no_s3 = PandasExecutionEngine()
    execution_engine_no_s3._s3 = None
    with pytest.raises(ge_exceptions.ExecutionEngineError):
        execution_engine_no_s3.get_batch_data(
            batch_spec=S3BatchSpec(
                path=full_path,
                reader_method="read_csv",
                splitter_method="_split_on_whole_table",
            )
        )
def test_get_batch_with_gcs_misconfigured(gcs_batch_spec):
    """A GCS batch spec the engine cannot access raises ExecutionEngineError.

    Args:
        gcs_batch_spec: fixture-provided spec pointing at data this
            ExecutionEngine has no access to.
    """
    engine = PandasExecutionEngine()
    # Raises because the batch_spec triggers an ExecutionEngine failure.
    with pytest.raises(ge_exceptions.ExecutionEngineError):
        engine.get_batch_data(batch_spec=gcs_batch_spec)