def test_get_batch_with_split_on_whole_table_gcs(
    spark_session, basic_spark_df_execution_engine
):
    """Check ``_split_on_whole_table`` on a GCS batch spec with a stubbed reader.

    The engine's ``_get_reader_fn`` is replaced by a factory whose reader
    ignores its arguments and always returns a fixed Spark DataFrame with
    four rows and two columns (``a`` and ``b``). The test then asserts that
    the batch data returned for the GCS batch spec has those four rows and
    two columns.
    """

    def _nan_to_none(value):
        # Replace NaN cells with None before the rows are handed to Spark.
        if isinstance(value, (float, int)) and np.isnan(value):
            return None
        return value

    # noinspection PyUnusedLocal
    def mocked_get_reader_function(*args, **kwargs):
        # noinspection PyUnusedLocal,PyShadowingNames
        def mocked_reader_function(*args, **kwargs):
            source_frame = pd.DataFrame({"a": [1, 2, 3, 4], "b": [2, 3, 4, None]})
            rows = []
            for record in source_frame.to_records(index=False):
                rows.append(tuple(_nan_to_none(cell) for cell in record.tolist()))
            column_names = source_frame.columns.tolist()
            return spark_session.createDataFrame(rows, column_names)

        return mocked_reader_function

    engine = basic_spark_df_execution_engine
    # Swap in the stub so the reader returns the fixed frame built above.
    engine._get_reader_fn = mocked_get_reader_function

    batch_spec = GCSBatchSpec(
        path="gcs://bucket/test/test.csv",
        reader_method="csv",
        reader_options={"header": True},
        splitter_method="_split_on_whole_table",
    )
    batch_data = engine.get_batch_data(batch_spec)
    result_df = batch_data.dataframe

    assert result_df.count() == 4
    assert len(result_df.columns) == 2
    def build_batch_spec(self, batch_definition: BatchDefinition) -> GCSBatchSpec:
        """
        Produce a GCSBatchSpec for the given batch_definition.

        The parent class's build_batch_spec is called with the
        batch_definition, and its result is wrapped in a GCSBatchSpec.

        Args:
            batch_definition (BatchDefinition): definition the batch_spec is built from

        Returns:
            GCSBatchSpec constructed from the parent's batch_spec
        """
        parent_spec: PathBatchSpec = super().build_batch_spec(
            batch_definition=batch_definition
        )
        gcs_spec = GCSBatchSpec(parent_spec)
        return gcs_spec
示例#3
0
def gcs_batch_spec() -> GCSBatchSpec:
    """Build a GCSBatchSpec pointing at the first sample key in ``test_bucket``.

    Returns:
        A GCSBatchSpec with ``path="gs://test_bucket/path/A-100.csv"``,
        ``reader_method="read_csv"`` and
        ``splitter_method="_split_on_whole_table"``.
    """
    bucket = "test_bucket"
    keys: List[str] = [
        "path/A-100.csv",
        "path/A-101.csv",
        "directory/B-1.csv",
        "directory/B-2.csv",
        "alpha-1.csv",
        "alpha-2.csv",
    ]
    path = keys[0]
    # A gs:// URL always uses "/" as its separator. os.path.join would use the
    # host OS separator (a backslash on Windows) and produce a malformed URL,
    # so the pieces are joined explicitly instead.
    full_path = f"gs://{bucket}/{path}"

    batch_spec = GCSBatchSpec(
        path=full_path,
        reader_method="read_csv",
        splitter_method="_split_on_whole_table",
    )
    return batch_spec