# NOTE: import paths below follow the Great Expectations test suite; exact
# module locations vary across great_expectations versions.
import os

import pytest

from great_expectations.core.batch import Batch
from great_expectations.core.expectation_suite import ExpectationSuite
from great_expectations.data_context import DataContext
from great_expectations.dataset import SparkDFDataset
from great_expectations.datasource import SparkDFDatasource
from great_expectations.exceptions import BatchKwargsError
from great_expectations.validator.validator import BridgeValidator, Validator


def test_standalone_spark_parquet_datasource(
    test_parquet_folder_connection_path, spark_session
):
    assert spark_session  # Ensure a SparkSession exists
    datasource = SparkDFDatasource(
        "SparkParquet",
        generators={
            "subdir_reader": {
                "class_name": "SubdirReaderBatchKwargsGenerator",
                "base_directory": test_parquet_folder_connection_path,
            }
        },
    )

    assert datasource.get_available_data_asset_names()["subdir_reader"]["names"] == [
        ("test", "file")
    ]
    batch = datasource.get_batch(
        batch_kwargs={
            "path": os.path.join(test_parquet_folder_connection_path, "test.parquet")
        }
    )
    assert isinstance(batch, Batch)
    # NOTE: below is a great example of CSV vs. Parquet typing: pandas reads content as string, spark as int
    assert batch.data.head()["col_1"] == 1
    assert batch.data.count() == 5

    # Limit should also work
    batch = datasource.get_batch(
        batch_kwargs={
            "path": os.path.join(test_parquet_folder_connection_path, "test.parquet"),
            "limit": 2,
        }
    )
    assert isinstance(batch, Batch)
    # NOTE: below is a great example of CSV vs. Parquet typing: pandas reads content as string, spark as int
    assert batch.data.head()["col_1"] == 1
    assert batch.data.count() == 2
def test_standalone_spark_parquet_datasource(
    test_parquet_folder_connection_path, spark_session
):
    assert spark_session  # Ensure a SparkSession exists
    datasource = SparkDFDatasource(
        "SparkParquet", base_directory=test_parquet_folder_connection_path
    )

    assert datasource.get_available_data_asset_names() == {"default": ["test"]}
    dataset = datasource.get_batch(
        "test",
        expectation_suite_name="default",
        batch_kwargs={
            "path": os.path.join(test_parquet_folder_connection_path, "test.parquet")
        },
    )
    assert isinstance(dataset, SparkDFDataset)
    # NOTE: below is a great example of CSV vs. Parquet typing: pandas reads content as string, spark as int
    assert dataset.spark_df.head()["col_1"] == 1
    assert dataset.spark_df.count() == 5

    # Limit should also work
    dataset = datasource.get_batch(
        "test",
        expectation_suite_name="default",
        batch_kwargs={
            "path": os.path.join(test_parquet_folder_connection_path, "test.parquet"),
            "limit": 2,
        },
    )
    assert isinstance(dataset, SparkDFDataset)
    # NOTE: below is a great example of CSV vs. Parquet typing: pandas reads content as string, spark as int
    assert dataset.spark_df.head()["col_1"] == 1
    assert dataset.spark_df.count() == 2
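# The parquet tests above rely on a test_parquet_folder_connection_path fixture
# that is not shown here (spark_session is likewise assumed to come from the
# project's conftest). Below is a minimal, hypothetical sketch of such a
# fixture -- not the project's actual fixture -- assuming pandas with a parquet
# engine (pyarrow or fastparquet) is installed. Its five integer col_1 values
# are chosen to satisfy the count() == 5 and head()["col_1"] == 1 assertions:
# parquet preserves the integer dtype, so Spark reads col_1 back as int.
import pandas as pd


@pytest.fixture(scope="module")
def test_parquet_folder_connection_path(tmp_path_factory):
    basepath = str(tmp_path_factory.mktemp("parquet_context"))
    df = pd.DataFrame({"col_1": [1, 2, 3, 4, 5], "col_2": ["a", "b", "c", "d", "e"]})
    df.to_parquet(os.path.join(basepath, "test.parquet"))
    return basepath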
def test_spark_datasource_processes_dataset_options(
    test_folder_connection_path_csv, test_backends, empty_data_context
):
    context: DataContext = empty_data_context
    if "SparkDFDataset" not in test_backends:
        pytest.skip("Spark has not been enabled, so this test must be skipped.")
    datasource = SparkDFDatasource(
        "PandasCSV",
        batch_kwargs_generators={
            "subdir_reader": {
                "class_name": "SubdirReaderBatchKwargsGenerator",
                "base_directory": test_folder_connection_path_csv,
            }
        },
    )
    batch_kwargs = datasource.build_batch_kwargs("subdir_reader", data_asset_name="test")
    batch_kwargs["dataset_options"] = {"caching": False, "persist": False}
    batch = datasource.get_batch(batch_kwargs)
    validator = BridgeValidator(
        batch, ExpectationSuite(expectation_suite_name="foo", data_context=context)
    )
    dataset = validator.get_dataset()
    assert dataset.caching is False
    assert dataset._persist is False
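# Several tests gate on a test_backends fixture before touching Spark. The
# real project derives the enabled backends from its test configuration; the
# sketch below is only a hypothetical stand-in showing the shape the skip
# checks assume: a list of backend names that contains "SparkDFDataset" when
# pyspark is importable.
@pytest.fixture
def test_backends():
    backends = ["PandasDataset"]
    try:
        import pyspark  # noqa: F401

        backends.append("SparkDFDataset")
    except ImportError:
        pass
    return backends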
def test_standalone_spark_csv_datasource(test_folder_connection_path_csv, test_backends):
    if "SparkDFDataset" not in test_backends:
        pytest.skip("Spark has not been enabled, so this test must be skipped.")
    datasource = SparkDFDatasource(
        "SparkParquet",
        batch_kwargs_generators={
            "subdir_reader": {
                "class_name": "SubdirReaderBatchKwargsGenerator",
                "base_directory": test_folder_connection_path_csv,
            }
        },
    )

    assert datasource.get_available_data_asset_names()["subdir_reader"]["names"] == [
        ("test", "file")
    ]
    batch = datasource.get_batch(
        batch_kwargs={
            "path": os.path.join(test_folder_connection_path_csv, "test.csv"),
            "reader_options": {"header": True},
        }
    )
    assert isinstance(batch, Batch)
    # NOTE: below is a great example of CSV vs. Parquet typing: pandas reads content as string, spark as int
    assert batch.data.head()["col_1"] == "1"
def test_invalid_reader_sparkdf_datasource(tmp_path_factory):
    pytest.importorskip("pyspark")
    basepath = str(tmp_path_factory.mktemp("test_invalid_reader_sparkdf_datasource"))
    datasource = SparkDFDatasource("mysparksource", base_directory=basepath)

    with open(
        os.path.join(basepath, "idonotlooklikeacsvbutiam.notrecognized"), "w"
    ) as newfile:
        newfile.write("a,b\n1,2\n3,4\n")

    with pytest.raises(BatchKwargsError) as exc:
        datasource.get_batch(
            "idonotlooklikeacsvbutiam.notrecognized",
            expectation_suite_name="default",
            batch_kwargs={
                "path": os.path.join(basepath, "idonotlooklikeacsvbutiam.notrecognized")
            },
        )
    assert "Unable to determine reader for path" in exc.value.message

    with pytest.raises(BatchKwargsError) as exc:
        datasource.get_batch(
            "idonotlooklikeacsvbutiam.notrecognized",
            expectation_suite_name="default",
            batch_kwargs={
                "path": os.path.join(basepath, "idonotlooklikeacsvbutiam.notrecognized")
            },
            reader_method="blarg",
        )
    assert "Unknown reader method: blarg" in exc.value.message

    with pytest.raises(BatchKwargsError) as exc:
        datasource.get_batch(
            "idonotlooklikeacsvbutiam.notrecognized",
            expectation_suite_name="default",
            batch_kwargs={
                "path": os.path.join(basepath, "idonotlooklikeacsvbutiam.notrecognized")
            },
            reader_method="excel",
        )
    assert "Unsupported reader: excel" in exc.value.message

    dataset = datasource.get_batch(
        "idonotlooklikeacsvbutiam.notrecognized",
        expectation_suite_name="default",
        batch_kwargs={
            "path": os.path.join(basepath, "idonotlooklikeacsvbutiam.notrecognized")
        },
        reader_method="csv",
        header=True,
    )
    assert dataset.spark_df.head()["a"] == "1"
def test_invalid_reader_sparkdf_datasource(tmp_path_factory):
    pytest.importorskip("pyspark")
    basepath = str(tmp_path_factory.mktemp("test_invalid_reader_sparkdf_datasource"))
    datasource = SparkDFDatasource(
        "mysparksource",
        batch_kwargs_generators={
            "subdir_reader": {
                "class_name": "SubdirReaderBatchKwargsGenerator",
                "base_directory": basepath,
            }
        },
    )

    with open(
        os.path.join(basepath, "idonotlooklikeacsvbutiam.notrecognized"), "w"
    ) as newfile:
        newfile.write("a,b\n1,2\n3,4\n")

    with pytest.raises(BatchKwargsError) as exc:
        datasource.get_batch(
            batch_kwargs={
                "path": os.path.join(basepath, "idonotlooklikeacsvbutiam.notrecognized")
            }
        )
    assert "Unable to determine reader for path" in exc.value.message

    with pytest.raises(BatchKwargsError) as exc:
        datasource.get_batch(
            batch_kwargs={
                "path": os.path.join(basepath, "idonotlooklikeacsvbutiam.notrecognized"),
                "reader_method": "blarg",
            }
        )
    assert "Unknown reader method: blarg" in exc.value.message

    with pytest.raises(BatchKwargsError) as exc:
        datasource.get_batch(
            batch_kwargs={
                "path": os.path.join(basepath, "idonotlooklikeacsvbutiam.notrecognized"),
                "reader_method": "excel",
            }
        )
    assert "Unknown reader: excel" in exc.value.message

    batch = datasource.get_batch(
        batch_kwargs={
            "path": os.path.join(basepath, "idonotlooklikeacsvbutiam.notrecognized"),
            "reader_method": "csv",
            "reader_options": {"header": True},
        }
    )
    assert batch.data.head()["a"] == "1"
def test_standalone_spark_csv_datasource(test_folder_connection_path):
    datasource = SparkDFDatasource(
        "SparkParquet", base_directory=test_folder_connection_path
    )

    assert datasource.get_available_data_asset_names() == {"default": {"test"}}
    dataset = datasource.get_batch("test", header=True)
    assert isinstance(dataset, SparkDFDataset)
    # NOTE: below is a great example of CSV vs. Parquet typing: pandas reads content as string, spark as int
    assert dataset.spark_df.head()["col_1"] == "1"
def test_invalid_reader_sparkdf_datasource(tmp_path_factory, test_backends):
    if "SparkDFDataset" not in test_backends:
        pytest.skip("Spark has not been enabled, so this test must be skipped.")
    basepath = str(tmp_path_factory.mktemp("test_invalid_reader_sparkdf_datasource"))
    datasource = SparkDFDatasource(
        "mysparksource",
        batch_kwargs_generators={
            "subdir_reader": {
                "class_name": "SubdirReaderBatchKwargsGenerator",
                "base_directory": basepath,
            }
        },
    )

    with open(
        os.path.join(basepath, "idonotlooklikeacsvbutiam.notrecognized"), "w"
    ) as newfile:
        newfile.write("a,b\n1,2\n3,4\n")

    with pytest.raises(BatchKwargsError) as exc:
        datasource.get_batch(
            batch_kwargs={
                "path": os.path.join(basepath, "idonotlooklikeacsvbutiam.notrecognized")
            }
        )
    assert "Unable to determine reader for path" in exc.value.message

    with pytest.raises(BatchKwargsError) as exc:
        datasource.get_batch(
            batch_kwargs={
                "path": os.path.join(basepath, "idonotlooklikeacsvbutiam.notrecognized"),
                "reader_method": "blarg",
            }
        )
    assert "Unknown reader method: blarg" in exc.value.message

    with pytest.raises(BatchKwargsError) as exc:
        datasource.get_batch(
            batch_kwargs={
                "path": os.path.join(basepath, "idonotlooklikeacsvbutiam.notrecognized"),
                "reader_method": "excel",
            }
        )
    assert "Unknown reader: excel" in exc.value.message

    batch = datasource.get_batch(
        batch_kwargs={
            "path": os.path.join(basepath, "idonotlooklikeacsvbutiam.notrecognized"),
            "reader_method": "csv",
            "reader_options": {"header": True},
        }
    )
    assert batch.data.head()["a"] == "1"
def test_pandas_datasource_processes_dataset_options(test_folder_connection_path):
    datasource = SparkDFDatasource(
        "PandasCSV",
        generators={
            "subdir_reader": {
                "class_name": "SubdirReaderBatchKwargsGenerator",
                "base_directory": test_folder_connection_path,
            }
        },
    )
    batch_kwargs = datasource.build_batch_kwargs("subdir_reader", name="test")
    batch_kwargs["dataset_options"] = {"caching": False, "persist": False}
    batch = datasource.get_batch(batch_kwargs)
    validator = Validator(batch, ExpectationSuite(expectation_suite_name="foo"))
    dataset = validator.get_dataset()
    assert dataset.caching is False
    assert dataset._persist is False
def test_standalone_spark_csv_datasource(test_folder_connection_path):
    pytest.importorskip("pyspark")
    datasource = SparkDFDatasource(
        "SparkParquet", base_directory=test_folder_connection_path
    )

    assert datasource.get_available_data_asset_names() == {"default": ["test"]}
    dataset = datasource.get_batch(
        "test",
        expectation_suite_name="default",
        batch_kwargs={"path": os.path.join(test_folder_connection_path, "test.csv")},
        reader_options={"header": True},
    )
    assert isinstance(dataset, SparkDFDataset)
    # NOTE: below is a great example of CSV vs. Parquet typing: pandas reads content as string, spark as int
    assert dataset.spark_df.head()["col_1"] == "1"
def test_pandas_datasource_processes_dataset_options(
    test_folder_connection_path, test_backends
):
    if "SparkDFDataset" not in test_backends:
        pytest.skip("Spark has not been enabled, so this test must be skipped.")
    datasource = SparkDFDatasource(
        "PandasCSV",
        batch_kwargs_generators={
            "subdir_reader": {
                "class_name": "SubdirReaderBatchKwargsGenerator",
                "base_directory": test_folder_connection_path,
            }
        },
    )
    batch_kwargs = datasource.build_batch_kwargs("subdir_reader", name="test")
    batch_kwargs["dataset_options"] = {"caching": False, "persist": False}
    batch = datasource.get_batch(batch_kwargs)
    validator = Validator(batch, ExpectationSuite(expectation_suite_name="foo"))
    dataset = validator.get_dataset()
    assert dataset.caching is False
    assert dataset._persist is False
def test_standalone_spark_csv_datasource(test_folder_connection_path):
    pytest.importorskip("pyspark")
    datasource = SparkDFDatasource(
        "SparkParquet",
        generators={
            "subdir_reader": {
                "class_name": "SubdirReaderBatchKwargsGenerator",
                "base_directory": test_folder_connection_path,
            }
        },
    )

    assert datasource.get_available_data_asset_names()["subdir_reader"]["names"] == [
        ("test", "file")
    ]
    batch = datasource.get_batch(
        batch_kwargs={
            "path": os.path.join(test_folder_connection_path, "test.csv"),
            "reader_options": {"header": True},
        }
    )
    assert isinstance(batch, Batch)
    # NOTE: below is a great example of CSV vs. Parquet typing: pandas reads content as string, spark as int
    assert batch.data.head()["col_1"] == "1"
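# The CSV variants above depend on test_folder_connection_path /
# test_folder_connection_path_csv fixtures that are not shown here. A minimal,
# hypothetical sketch under the same assumptions as the parquet fixture above
# (not the project's actual fixture): Spark's CSV reader does not infer schema
# by default, so col_1 comes back as the string "1", which is exactly what the
# CSV tests assert in contrast to the parquet tests' integer 1.
@pytest.fixture(scope="module")
def test_folder_connection_path_csv(tmp_path_factory):
    basepath = str(tmp_path_factory.mktemp("csv_context"))
    df = pd.DataFrame({"col_1": [1, 2, 3, 4, 5], "col_2": ["a", "b", "c", "d", "e"]})
    df.to_csv(os.path.join(basepath, "test.csv"), index=False)
    return basepath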