def __init__(
    self,
    name: str,
    datasource_name: str,
    bucket: str,
    execution_engine: Optional[ExecutionEngine] = None,
    default_regex: Optional[dict] = None,
    sorters: Optional[list] = None,
    prefix: str = "",
    delimiter: str = "/",
    max_keys: int = 1000,
    boto3_options: Optional[dict] = None,
    batch_spec_passthrough: Optional[dict] = None,
) -> None:
    """
    InferredAssetS3DataConnector for connecting to S3.

    Args:
        name (str): required name for data_connector
        datasource_name (str): required name for datasource
        bucket (str): bucket for S3
        execution_engine (ExecutionEngine): optional reference to ExecutionEngine
        default_regex (dict): optional regex configuration for filtering data_references
        sorters (list): optional list of sorters for sorting data_references
        prefix (str): S3 prefix
        delimiter (str): S3 delimiter
        max_keys (int): S3 max_keys (default is 1000)
        boto3_options (dict): optional boto3 options
        batch_spec_passthrough (dict): dictionary with keys that will be added directly to batch_spec
    """
    logger.debug(f'Constructing InferredAssetS3DataConnector "{name}".')

    super().__init__(
        name=name,
        datasource_name=datasource_name,
        execution_engine=execution_engine,
        default_regex=default_regex,
        sorters=sorters,
        batch_spec_passthrough=batch_spec_passthrough,
    )

    self._bucket = bucket
    self._prefix = ConfiguredAssetS3DataConnector.sanitize_prefix_for_s3(prefix)
    self._delimiter = delimiter
    self._max_keys = max_keys

    if boto3_options is None:
        boto3_options = {}

    try:
        self._s3 = boto3.client("s3", **boto3_options)
    except (TypeError, AttributeError):
        raise ImportError(
            "Unable to load boto3 (it is required for InferredAssetS3DataConnector)."
        )
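# --- Illustrative usage sketch (not part of the original source) --------------
# Shows how the constructor documented above might be called. The import paths
# assume a Great Expectations 0.13-style layout, and the bucket name and key
# layout ("data/<asset>/<filename>.csv") are hypothetical placeholders.
def _example_build_inferred_asset_s3_data_connector():
    from great_expectations.datasource.data_connector import (
        InferredAssetS3DataConnector,
    )
    from great_expectations.execution_engine import PandasExecutionEngine

    return InferredAssetS3DataConnector(
        name="my_inferred_asset_data_connector",
        datasource_name="my_datasource",
        bucket="my_bucket",  # hypothetical bucket
        execution_engine=PandasExecutionEngine(),
        default_regex={
            # The first capture group is used as the inferred data_asset_name.
            "pattern": r"data/(.*)/(.*)\.csv",
            "group_names": ["data_asset_name", "filename"],
        },
        prefix="data/",
        delimiter="/",
    )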
def test_get_batch_with_split_on_whole_table_s3_with_configured_asset_s3_data_connector(
    test_s3_files, test_df_small
):
    bucket, _keys = test_s3_files
    expected_df = test_df_small

    execution_engine: ExecutionEngine = PandasExecutionEngine()

    my_data_connector = ConfiguredAssetS3DataConnector(
        name="my_data_connector",
        datasource_name="FAKE_DATASOURCE_NAME",
        bucket=bucket,
        execution_engine=execution_engine,
        prefix="",
        assets={"alpha": {}},
        default_regex={
            "pattern": "alpha-(.*)\\.csv",
            "group_names": ["index"],
        },
    )
    batch_def: BatchDefinition = BatchDefinition(
        datasource_name="FAKE_DATASOURCE_NAME",
        data_connector_name="my_data_connector",
        data_asset_name="alpha",
        batch_identifiers=IDDict(index=1),
        batch_spec_passthrough={
            "reader_method": "read_csv",
            "splitter_method": "_split_on_whole_table",
        },
    )
    test_df = execution_engine.get_batch_data(
        batch_spec=my_data_connector.build_batch_spec(batch_definition=batch_def)
    )
    assert test_df.dataframe.shape == expected_df.shape

    # if key does not exist
    batch_def_no_key = BatchDefinition(
        datasource_name="FAKE_DATASOURCE_NAME",
        data_connector_name="my_data_connector",
        data_asset_name="alpha",
        batch_identifiers=IDDict(index=9),
        batch_spec_passthrough={
            "reader_method": "read_csv",
            "splitter_method": "_split_on_whole_table",
        },
    )
    with pytest.raises(ge_exceptions.ExecutionEngineError):
        execution_engine.get_batch_data(
            batch_spec=my_data_connector.build_batch_spec(
                batch_definition=batch_def_no_key
            )
        )
def check_sameness(prefix, expected_output):
    s3_sanitized = ConfiguredAssetS3DataConnector.sanitize_prefix_for_s3(prefix)
    file_system_sanitized = FilePathDataConnector.sanitize_prefix(prefix)
    if os.sep == "\\":  # Fix to ensure tests work on Windows
        file_system_sanitized = file_system_sanitized.replace("\\", "/")

    assert file_system_sanitized == expected_output, (
        f"Expected output does not match original sanitization behavior, got "
        f"{file_system_sanitized} instead of {expected_output}"
    )
    assert (
        s3_sanitized == expected_output == file_system_sanitized
    ), f'S3 sanitized result is incorrect, "{s3_sanitized}" instead of "{expected_output}"'
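# Example calls (a sketch, not from the original test). Both sanitizers are
# assumed to leave a prefix that already ends in "/" unchanged, so these
# assertions should hold under that assumption; the prefixes are hypothetical.
def _example_check_sameness_usage():
    check_sameness("foo/", "foo/")
    check_sameness("a/b/c/", "a/b/c/")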
def test_get_batch_with_split_on_whole_table_s3_with_configured_asset_s3_data_connector():
    region_name: str = "us-east-1"
    bucket: str = "test_bucket"
    conn = boto3.resource("s3", region_name=region_name)
    conn.create_bucket(Bucket=bucket)
    client = boto3.client("s3", region_name=region_name)

    test_df: pd.DataFrame = pd.DataFrame(data={"col1": [1, 2], "col2": [3, 4]})
    keys: List[str] = [
        "path/A-100.csv",
        "path/A-101.csv",
        "directory/B-1.csv",
        "directory/B-2.csv",
    ]
    for key in keys:
        client.put_object(
            Bucket=bucket, Body=test_df.to_csv(index=False).encode("utf-8"), Key=key
        )

    path = "path/A-100.csv"
    full_path = f"s3a://{os.path.join(bucket, path)}"

    my_data_connector = ConfiguredAssetS3DataConnector(
        name="my_data_connector",
        datasource_name="FAKE_DATASOURCE_NAME",
        default_regex={
            "pattern": "alpha-(.*)\\.csv",
            "group_names": ["index"],
        },
        bucket=bucket,
        prefix="",
        assets={"alpha": {}},
    )

    test_df = PandasExecutionEngine().get_batch_data(
        batch_spec=S3BatchSpec(
            path=full_path,
            reader_method="read_csv",
            splitter_method="_split_on_whole_table",
        )
    )
    assert test_df.dataframe.shape == (2, 2)
def test_basic_instantiation():
    region_name: str = "us-east-1"
    bucket: str = "test_bucket"
    conn = boto3.resource("s3", region_name=region_name)
    conn.create_bucket(Bucket=bucket)
    client = boto3.client("s3", region_name=region_name)

    test_df: pd.DataFrame = pd.DataFrame(data={"col1": [1, 2], "col2": [3, 4]})
    keys: List[str] = [
        "alpha-1.csv",
        "alpha-2.csv",
        "alpha-3.csv",
    ]
    for key in keys:
        client.put_object(
            Bucket=bucket, Body=test_df.to_csv(index=False).encode("utf-8"), Key=key
        )

    my_data_connector = ConfiguredAssetS3DataConnector(
        name="my_data_connector",
        datasource_name="FAKE_DATASOURCE_NAME",
        default_regex={
            "pattern": "alpha-(.*)\\.csv",
            "group_names": ["index"],
        },
        bucket=bucket,
        prefix="",
        assets={"alpha": {}},
    )
    assert my_data_connector.self_check() == {
        "class_name": "ConfiguredAssetS3DataConnector",
        "data_asset_count": 1,
        "example_data_asset_names": ["alpha"],
        "data_assets": {
            "alpha": {
                "example_data_references": [
                    "alpha-1.csv",
                    "alpha-2.csv",
                    "alpha-3.csv",
                ],
                "batch_definition_count": 3,
            },
        },
        "example_unmatched_data_references": [],
        "unmatched_data_reference_count": 0,
        "example_data_reference": {},
    }

    # noinspection PyProtectedMember
    my_data_connector._refresh_data_references_cache()
    assert my_data_connector.get_data_reference_list_count() == 3
    assert my_data_connector.get_unmatched_data_references() == []

    # Illegal execution environment name
    with pytest.raises(ValueError):
        print(
            my_data_connector.get_batch_definition_list_from_batch_request(
                BatchRequest(
                    datasource_name="something",
                    data_connector_name="my_data_connector",
                    data_asset_name="something",
                )
            )
        )