def test_reader_fn_parameters(
    spark_session, basic_spark_df_execution_engine, tmp_path_factory
):
    base_directory = str(tmp_path_factory.mktemp("test_csv"))
    create_files_in_directory(
        directory=base_directory,
        file_name_list=[
            "test-A.csv",
        ],
    )
    test_df_small_csv_path = base_directory + "/test-A.csv"
    engine = basic_spark_df_execution_engine
    fn = engine._get_reader_fn(reader=spark_session.read, path=test_df_small_csv_path)
    assert "<bound method DataFrameReader.csv" in str(fn)

    test_sparkdf_with_header_param = basic_spark_df_execution_engine.get_batch_data(
        PathBatchSpec(
            path=test_df_small_csv_path,
            data_asset_name="DATA_ASSET",
            reader_options={"header": True},
        )
    ).dataframe
    assert test_sparkdf_with_header_param.head() == Row(x="1", y="2")

    test_sparkdf_with_no_header_param = basic_spark_df_execution_engine.get_batch_data(
        PathBatchSpec(path=test_df_small_csv_path, data_asset_name="DATA_ASSET")
    ).dataframe
    assert test_sparkdf_with_no_header_param.head() == Row(_c0="x", _c1="y")
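# The assertions above imply a two-column CSV whose header row is "x,y" and
# whose first data row is "1,2". A minimal, hypothetical stand-in for the
# create_files_in_directory helper (a test utility defined elsewhere; its real
# implementation may differ) that would satisfy them:
import os


def create_files_in_directory(directory: str, file_name_list: list) -> None:
    # Write a 6-line CSV (header + 5 data rows) into each requested file, so
    # reading with header=True yields Row(x="1", y="2") and reading without
    # the option yields Row(_c0="x", _c1="y").
    for file_name in file_name_list:
        with open(os.path.join(directory, file_name), "w") as f:
            f.write("x,y\n1,2\n2,3\n3,4\n4,5\n5,6\n")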
def test_get_batch_with_split_on_whole_table_filesystem(
    test_folder_connection_path_csv,
):
    test_df = PandasExecutionEngine().get_batch_data(
        PathBatchSpec(
            path=os.path.join(test_folder_connection_path_csv, "test.csv"),
            reader_method="read_csv",
            splitter_method="_split_on_whole_table",
        )
    )
    assert test_df.dataframe.shape == (5, 2)
def test_get_batch_with_split_on_whole_table_filesystem(
    test_folder_connection_path_csv, spark_session, basic_spark_df_execution_engine
):
    # reader_method is not configured because Spark will configure its own reader by default
    test_sparkdf = basic_spark_df_execution_engine.get_batch_data(
        PathBatchSpec(
            path=os.path.join(test_folder_connection_path_csv, "test.csv"),
            splitter_method="_split_on_whole_table",
        )
    ).dataframe
    assert test_sparkdf.count() == 6
    assert len(test_sparkdf.columns) == 2
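# Why 6 rows here but 5 in the pandas test above: without reader_options
# {"header": True}, Spark treats the header line "x,y" as a data row, while
# pandas.read_csv consumes the header by default. A minimal sketch, assuming
# a local SparkSession and that csv_path points at the test.csv fixture:
df_raw = spark_session.read.csv(csv_path)  # 6 rows; columns _c0, _c1
df_with_header = spark_session.read.csv(csv_path, header=True)  # 5 rows; columns x, y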
def test_get_batch_empty_splitter_parquet(
    test_folder_connection_path_parquet, spark_session, basic_spark_df_execution_engine
):
    # Note: reader_method and reader_options are not needed, because
    # SparkDFExecutionEngine automatically determines the file type as well as
    # the schema of the Parquet file.
    test_sparkdf = basic_spark_df_execution_engine.get_batch_data(
        PathBatchSpec(
            path=os.path.join(test_folder_connection_path_parquet, "test.parquet"),
            splitter_method=None,
        )
    ).dataframe
    assert test_sparkdf.count() == 5
    assert len(test_sparkdf.columns) == 2
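# For reference, a minimal sketch of how a matching 5-row, two-column Parquet
# fixture could be produced (the actual test_folder_connection_path_parquet
# fixture is defined elsewhere in the test suite; this assumes pandas with a
# Parquet engine such as pyarrow is available):
import pandas as pd


def write_test_parquet(directory: str) -> None:
    # Hypothetical fixture body: Parquet embeds the schema, which is why the
    # test above needs neither reader_method nor reader_options.
    df = pd.DataFrame({"x": [1, 2, 3, 4, 5], "y": [2, 3, 4, 5, 6]})
    df.to_parquet(f"{directory}/test.parquet")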
def test_get_batch_empty_splitter(
    test_folder_connection_path_csv, spark_session, basic_spark_df_execution_engine
):
    # reader_method is not configured because Spark will configure its own reader by default;
    # reader_options are needed to specify that the first line of the test file is the header
    test_sparkdf = basic_spark_df_execution_engine.get_batch_data(
        PathBatchSpec(
            path=os.path.join(test_folder_connection_path_csv, "test.csv"),
            reader_options={"header": True},
            splitter_method=None,
        )
    ).dataframe
    assert test_sparkdf.count() == 5
    assert len(test_sparkdf.columns) == 2
def build_batch_spec(self, batch_definition: BatchDefinition) -> PathBatchSpec:
    """
    Build BatchSpec from batch_definition by calling DataConnector's build_batch_spec function.

    Args:
        batch_definition (BatchDefinition): to be used to build batch_spec

    Returns:
        BatchSpec built from batch_definition
    """
    batch_spec: BatchSpec = super().build_batch_spec(
        batch_definition=batch_definition
    )
    return PathBatchSpec(batch_spec)
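# A minimal usage sketch, assuming BatchSpec subclasses are dict-like (as the
# kwargs and indexing usage elsewhere in this section suggests). Wrapping in
# PathBatchSpec simply re-types the spec for path-based execution engines:
spec = PathBatchSpec(path="/tmp/test.csv", reader_options={"header": True})
assert spec["path"] == "/tmp/test.csv"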
def test_get_batch_empty_splitter_tsv(
    test_folder_connection_path_tsv, basic_spark_df_execution_engine
):
    # reader_method is not configured because Spark will configure its own reader by default;
    # reader_options are needed to specify that the first line of the test file is the header
    # and to specify the separator (otherwise, comma is used as the default separator)
    test_sparkdf = basic_spark_df_execution_engine.get_batch_data(
        PathBatchSpec(
            path=os.path.join(test_folder_connection_path_tsv, "test.tsv"),
            reader_options={"header": True, "sep": "\t"},
            splitter_method=None,
        )
    ).dataframe
    assert test_sparkdf.count() == 5
    assert len(test_sparkdf.columns) == 2
def build_batch_spec(
    self,
    batch_definition: BatchDefinition,
    runtime_parameters: dict,
) -> Union[RuntimeDataBatchSpec, RuntimeQueryBatchSpec, S3BatchSpec, PathBatchSpec]:
    self._validate_runtime_parameters(runtime_parameters=runtime_parameters)
    batch_spec: BatchSpec = super().build_batch_spec(
        batch_definition=batch_definition
    )
    if runtime_parameters.get("batch_data") is not None:
        batch_spec["batch_data"] = runtime_parameters.get("batch_data")
        return RuntimeDataBatchSpec(batch_spec)
    elif runtime_parameters.get("query"):
        batch_spec["query"] = runtime_parameters.get("query")
        return RuntimeQueryBatchSpec(batch_spec)
    elif runtime_parameters.get("path"):
        path = runtime_parameters.get("path")
        batch_spec["path"] = path
        parsed_url = urlparse(path)
        if "s3" in parsed_url.scheme:
            return S3BatchSpec(batch_spec)
        else:
            return PathBatchSpec(batch_spec)
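# The urlparse-based branch above matches any scheme containing "s3"
# (s3, s3a, s3n). A quick illustration with the standard library:
from urllib.parse import urlparse

assert urlparse("s3://bucket/key.csv").scheme == "s3"
assert urlparse("s3a://bucket/key.csv").scheme == "s3a"  # also matched by the substring check
assert urlparse("/local/dir/key.csv").scheme == ""  # falls through to PathBatchSpec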
def build_batch_spec(
    self,
    batch_definition: BatchDefinition,
    runtime_parameters: dict,
) -> Union[
    RuntimeDataBatchSpec,
    RuntimeQueryBatchSpec,
    S3BatchSpec,
    AzureBatchSpec,
    PathBatchSpec,
]:
    self._validate_runtime_parameters(runtime_parameters=runtime_parameters)
    batch_spec: BatchSpec = super().build_batch_spec(
        batch_definition=batch_definition
    )
    if "batch_data" in runtime_parameters:
        batch_spec["batch_data"] = runtime_parameters.get("batch_data")
        return RuntimeDataBatchSpec(batch_spec)
    elif "query" in runtime_parameters:
        batch_spec["query"] = runtime_parameters.get("query")
        return RuntimeQueryBatchSpec(batch_spec)
    elif "path" in runtime_parameters:
        path: str = runtime_parameters["path"]
        batch_spec["path"] = path
        if "s3" in path:
            return S3BatchSpec(batch_spec)
        elif "blob.core.windows.net" in path:
            return AzureBatchSpec(batch_spec)
        else:
            return PathBatchSpec(batch_spec)
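# Unlike the previous version, this one inspects the raw path string rather
# than the parsed URL scheme. A hypothetical helper mirroring the branching,
# for illustration only (note the caveat: the "s3" substring check would also
# match a purely local path that happens to contain "s3"):
def classify_path(path: str) -> str:
    if "s3" in path:
        return "S3BatchSpec"
    elif "blob.core.windows.net" in path:
        return "AzureBatchSpec"
    return "PathBatchSpec"


assert classify_path("s3a://bucket/data.csv") == "S3BatchSpec"
assert classify_path("https://acct.blob.core.windows.net/container/data.csv") == "AzureBatchSpec"
assert classify_path("/tmp/data.csv") == "PathBatchSpec"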