def test_reader_fn_parameters(spark_session, basic_spark_df_execution_engine,
                              tmp_path_factory):
    base_directory = str(tmp_path_factory.mktemp("test_csv"))
    create_files_in_directory(
        directory=base_directory,
        file_name_list=[
            "test-A.csv",
        ],
    )
    test_df_small_csv_path = base_directory + "/test-A.csv"
    engine = basic_spark_df_execution_engine
    fn = engine._get_reader_fn(reader=spark_session.read,
                               path=test_df_small_csv_path)
    assert "<bound method DataFrameReader.csv" in str(fn)

    test_sparkdf_with_header_param = basic_spark_df_execution_engine.get_batch_data(
        PathBatchSpec(
            path=test_df_small_csv_path,
            data_asset_name="DATA_ASSET",
            reader_options={"header": True},
        )).dataframe
    assert test_sparkdf_with_header_param.head() == Row(x="1", y="2")

    test_sparkdf_with_no_header_param = basic_spark_df_execution_engine.get_batch_data(
        PathBatchSpec(path=test_df_small_csv_path,
                      data_asset_name="DATA_ASSET")).dataframe
    assert test_sparkdf_with_no_header_param.head() == Row(_c0="x", _c1="y")
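# The assertion on DataFrameReader.csv above relies on _get_reader_fn choosing a
# reader from the file extension. A minimal sketch of that kind of dispatch is
# below; the helper name pick_reader_fn and the extension map are illustrative
# assumptions, not Great Expectations' actual implementation.
import os


def pick_reader_fn(reader, path):
    # Map a file extension to the matching bound method on a Spark
    # DataFrameReader (reader.csv, reader.parquet, ...).
    extension_to_method = {".csv": "csv", ".tsv": "csv", ".parquet": "parquet"}
    suffix = os.path.splitext(path)[1].lower()
    return getattr(reader, extension_to_method[suffix])


# pick_reader_fn(spark_session.read, "test-A.csv") returns the bound
# DataFrameReader.csv method, matching the "<bound method DataFrameReader.csv"
# assertion in the test above.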
def test_get_batch_with_split_on_whole_table_filesystem(
    test_folder_connection_path_csv, ):
    test_df = PandasExecutionEngine().get_batch_data(
        PathBatchSpec(
            path=os.path.join(test_folder_connection_path_csv, "test.csv"),
            reader_method="read_csv",
            splitter_method="_split_on_whole_table",
        ))
    assert test_df.dataframe.shape == (5, 2)
def test_get_batch_with_split_on_whole_table_filesystem(
        test_folder_connection_path_csv, spark_session,
        basic_spark_df_execution_engine):
    # reader_method is not configured because Spark will configure its own reader by default
    test_sparkdf = basic_spark_df_execution_engine.get_batch_data(
        PathBatchSpec(
            path=os.path.join(test_folder_connection_path_csv, "test.csv"),
            splitter_method="_split_on_whole_table",
        )).dataframe
    assert test_sparkdf.count() == 6
    assert len(test_sparkdf.columns) == 2
Example #4
def test_get_batch_empty_splitter_parquet(
    test_folder_connection_path_parquet, spark_session, basic_spark_df_execution_engine
):
    # Note: reader_method and reader_options are not needed, because
    # SparkDFExecutionEngine automatically determines the file type as well as the schema of the Parquet file.
    test_sparkdf = basic_spark_df_execution_engine.get_batch_data(
        PathBatchSpec(
            path=os.path.join(test_folder_connection_path_parquet, "test.parquet"),
            splitter_method=None,
        )
    ).dataframe
    assert test_sparkdf.count() == 5
    assert len(test_sparkdf.columns) == 2
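# A minimal sketch of how a fixture like test_folder_connection_path_parquet could
# be built; the column names and values are assumptions, only the shape
# (5 rows, 2 columns) is implied by the assertions above. Requires pandas with a
# Parquet engine (pyarrow or fastparquet) installed.
import os

import pandas as pd


def _build_parquet_fixture(tmp_dir):
    df = pd.DataFrame({"x": [1, 2, 3, 4, 5], "y": [6, 7, 8, 9, 10]})
    df.to_parquet(os.path.join(tmp_dir, "test.parquet"))
    return tmp_dir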
def test_get_batch_empty_splitter(test_folder_connection_path_csv,
                                  spark_session,
                                  basic_spark_df_execution_engine):
    # reader_method is not configured because Spark will configure its own reader by default
    # reader_options are needed to specify that the first line of the test file is the header
    test_sparkdf = basic_spark_df_execution_engine.get_batch_data(
        PathBatchSpec(
            path=os.path.join(test_folder_connection_path_csv, "test.csv"),
            reader_options={"header": True},
            splitter_method=None,
        )).dataframe
    assert test_sparkdf.count() == 5
    assert len(test_sparkdf.columns) == 2
    def build_batch_spec(self,
                         batch_definition: BatchDefinition) -> PathBatchSpec:
        """
        Build BatchSpec from batch_definition by calling DataConnector's build_batch_spec function.

        Args:
            batch_definition (BatchDefinition): to be used to build batch_spec

        Returns:
            BatchSpec built from batch_definition
        """
        batch_spec: BatchSpec = super().build_batch_spec(
            batch_definition=batch_definition)
        return PathBatchSpec(batch_spec)
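# PathBatchSpec is dict-like: it can be built from keyword arguments, as in the
# tests above, or by re-wrapping an existing BatchSpec, as in build_batch_spec.
# A small usage sketch (the import path and values are assumptions based on
# recent Great Expectations releases):
from great_expectations.core.batch_spec import PathBatchSpec

spec = PathBatchSpec(path="/data/test.csv", reader_options={"header": True})
assert spec["path"] == "/data/test.csv"
rewrapped = PathBatchSpec(spec)  # re-wrapping a dict-like spec, as the method above does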
def test_get_batch_empty_splitter_tsv(
    test_folder_connection_path_tsv, basic_spark_df_execution_engine
):
    # reader_method is not configured because Spark will configure its own reader by default
    # reader_options are needed to specify that the first line of the test file is the header
    # reader_options are also needed to specify the tab separator (otherwise, a comma is used by default)
    test_sparkdf = basic_spark_df_execution_engine.get_batch_data(
        PathBatchSpec(
            path=os.path.join(test_folder_connection_path_tsv, "test.tsv"),
            reader_options={"header": True, "sep": "\t"},
            splitter_method=None,
        )
    ).dataframe
    assert test_sparkdf.count() == 5
    assert len(test_sparkdf.columns) == 2
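# The reader_options above map onto keyword arguments of Spark's DataFrameReader,
# so the engine call is roughly equivalent to the direct read sketched here
# (assuming an active SparkSession named spark):
def read_tsv_directly(spark, path):
    # header=True and sep="\t" mirror reader_options={"header": True, "sep": "\t"}
    return spark.read.csv(path, header=True, sep="\t")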
Example #8
    def build_batch_spec(
        self,
        batch_definition: BatchDefinition,
        runtime_parameters: dict,
    ) -> Union[RuntimeDataBatchSpec, RuntimeQueryBatchSpec, PathBatchSpec]:
        self._validate_runtime_parameters(
            runtime_parameters=runtime_parameters)
        batch_spec: BatchSpec = super().build_batch_spec(
            batch_definition=batch_definition)
        if runtime_parameters.get("batch_data") is not None:
            batch_spec["batch_data"] = runtime_parameters.get("batch_data")
            return RuntimeDataBatchSpec(batch_spec)
        elif runtime_parameters.get("query"):
            batch_spec["query"] = runtime_parameters.get("query")
            return RuntimeQueryBatchSpec(batch_spec)
        elif runtime_parameters.get("path"):
            path = runtime_parameters.get("path")
            batch_spec["path"] = path
            parsed_url = urlparse(path)
            if "s3" in parsed_url.scheme:
                return S3BatchSpec(batch_spec)
            else:
                return PathBatchSpec(batch_spec)
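# In the variant above, routing is driven by the parsed URL scheme: any scheme
# containing "s3" (s3, s3a, s3n) yields an S3BatchSpec, everything else a
# PathBatchSpec. A quick sketch of what urlparse reports (paths are illustrative):
from urllib.parse import urlparse

assert urlparse("s3a://my-bucket/data/test.csv").scheme == "s3a"  # -> S3BatchSpec
assert urlparse("/local/data/test.csv").scheme == ""  # -> PathBatchSpec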
    def build_batch_spec(
        self,
        batch_definition: BatchDefinition,
        runtime_parameters: dict,
    ) -> Union[RuntimeDataBatchSpec, RuntimeQueryBatchSpec, PathBatchSpec]:
        self._validate_runtime_parameters(
            runtime_parameters=runtime_parameters)
        batch_spec: BatchSpec = super().build_batch_spec(
            batch_definition=batch_definition)
        if "batch_data" in runtime_parameters:
            batch_spec["batch_data"] = runtime_parameters.get("batch_data")
            return RuntimeDataBatchSpec(batch_spec)
        elif "query" in runtime_parameters:
            batch_spec["query"] = runtime_parameters.get("query")
            return RuntimeQueryBatchSpec(batch_spec)
        elif "path" in runtime_parameters:
            path: str = runtime_parameters["path"]
            batch_spec["path"] = path
            if "s3" in path:
                return S3BatchSpec(batch_spec)
            elif "blob.core.windows.net" in path:
                return AzureBatchSpec(batch_spec)
            else:
                return PathBatchSpec(batch_spec)
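# This second variant checks key membership in runtime_parameters rather than
# truthiness, and routes on substrings of the path itself, adding Azure Blob
# Storage support. A minimal sketch of the same decision (helper name and URLs
# are illustrative, not library code); note that a bare "s3" substring check
# also matches local paths that merely contain "s3":
def classify_path(path: str) -> str:
    if "s3" in path:
        return "S3BatchSpec"
    elif "blob.core.windows.net" in path:
        return "AzureBatchSpec"
    return "PathBatchSpec"


assert classify_path("s3a://bucket/test.csv") == "S3BatchSpec"
assert classify_path("https://account.blob.core.windows.net/container/test.csv") == "AzureBatchSpec"
assert classify_path("/local/data/test.csv") == "PathBatchSpec"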