import numpy as np
import pandas as pd

from great_expectations.core.batch_spec import GCSBatchSpec


def test_get_batch_with_split_on_whole_table_gcs(
    spark_session, basic_spark_df_execution_engine
):
    # Replace the engine's reader lookup with a mock so no real GCS access
    # (or Spark CSV read) happens; the mock returns a small in-memory frame.
    # noinspection PyUnusedLocal
    def mocked_get_reader_function(*args, **kwargs):
        # noinspection PyUnusedLocal,PyShadowingNames
        def mocked_reader_function(*args, **kwargs):
            pd_df = pd.DataFrame({"a": [1, 2, 3, 4], "b": [2, 3, 4, None]})
            df = spark_session.createDataFrame(
                [
                    tuple(
                        None if isinstance(x, (float, int)) and np.isnan(x) else x
                        for x in record.tolist()
                    )
                    for record in pd_df.to_records(index=False)
                ],
                pd_df.columns.tolist(),
            )
            return df

        return mocked_reader_function

    spark_engine = basic_spark_df_execution_engine
    spark_engine._get_reader_fn = mocked_get_reader_function

    # "_split_on_whole_table" should return the full table as a single batch.
    test_sparkdf = spark_engine.get_batch_data(
        GCSBatchSpec(
            path="gcs://bucket/test/test.csv",
            reader_method="csv",
            reader_options={"header": True},
            splitter_method="_split_on_whole_table",
        )
    ).dataframe
    assert test_sparkdf.count() == 4
    assert len(test_sparkdf.columns) == 2
def build_batch_spec(self, batch_definition: BatchDefinition) -> GCSBatchSpec:
    """
    Build BatchSpec from batch_definition by calling DataConnector's build_batch_spec function.

    Args:
        batch_definition (BatchDefinition): the batch definition used to build the batch_spec

    Returns:
        GCSBatchSpec built from batch_definition
    """
    batch_spec: PathBatchSpec = super().build_batch_spec(
        batch_definition=batch_definition
    )
    # Re-wrap the generic path-based spec as a GCS-specific spec.
    return GCSBatchSpec(batch_spec)
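# A minimal standalone sketch of the wrapping step above: BatchSpec classes are
# dict subclasses, so the PathBatchSpec returned by the parent class can be
# re-wrapped as a GCSBatchSpec without copying fields by hand. The import path
# and the example values below are assumptions for illustration, not taken from
# the method itself.
from great_expectations.core.batch_spec import GCSBatchSpec, PathBatchSpec

path_spec = PathBatchSpec(
    path="gs://test_bucket/path/A-100.csv",
    reader_method="read_csv",
)
gcs_spec = GCSBatchSpec(path_spec)  # same key/value pairs, GCS-specific type
assert gcs_spec["path"] == path_spec["path"]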
import os
from typing import List

from great_expectations.core.batch_spec import GCSBatchSpec


def gcs_batch_spec() -> GCSBatchSpec:
    bucket = "test_bucket"
    keys: List[str] = [
        "path/A-100.csv",
        "path/A-101.csv",
        "directory/B-1.csv",
        "directory/B-2.csv",
        "alpha-1.csv",
        "alpha-2.csv",
    ]
    # Only the first key is used for the spec's path; the rest illustrate the
    # bucket layout the surrounding tests work with.
    path = keys[0]
    full_path = os.path.join("gs://", bucket, path)
    batch_spec = GCSBatchSpec(
        path=full_path,
        reader_method="read_csv",
        splitter_method="_split_on_whole_table",
    )
    return batch_spec
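# If gcs_batch_spec is registered as a pytest fixture (an assumption here; the
# decorator is not shown above), a test can consume it by name. Because the
# spec is a dict subclass, its fields can be checked with item access. The
# expected path assumes POSIX-style joining of "gs://", the bucket, and keys[0].
def test_gcs_batch_spec_fields(gcs_batch_spec):
    assert gcs_batch_spec["path"] == "gs://test_bucket/path/A-100.csv"
    assert gcs_batch_spec["reader_method"] == "read_csv"
    assert gcs_batch_spec["splitter_method"] == "_split_on_whole_table"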