def test_standalone_spark_parquet_datasource(test_parquet_folder_connection_path, spark_session):
    assert spark_session  # Ensure a sparksession exists
    datasource = SparkDFDatasource('SparkParquet', base_directory=test_parquet_folder_connection_path)

    assert datasource.get_available_data_asset_names() == {
        "default": ['test']
    }
    dataset = datasource.get_batch('test',
                                   expectation_suite_name="default",
                                   batch_kwargs={
                                       "path": os.path.join(test_parquet_folder_connection_path, 'test.parquet')
                                   })
    assert isinstance(dataset, SparkDFDataset)
    # NOTE: below is a great example of CSV vs. Parquet typing: pandas reads content as string, spark as int
    assert dataset.spark_df.head()['col_1'] == 1
    assert dataset.spark_df.count() == 5

    # Limit should also work
    dataset = datasource.get_batch('test',
                                   expectation_suite_name="default",
                                   batch_kwargs={
                                       "path": os.path.join(test_parquet_folder_connection_path, 'test.parquet'),
                                       "limit": 2
                                   })
    assert isinstance(dataset, SparkDFDataset)
    # NOTE: below is a great example of CSV vs. Parquet typing: pandas reads content as string, spark as int
    assert dataset.spark_df.head()['col_1'] == 1
    assert dataset.spark_df.count() == 2
def test_standalone_spark_parquet_datasource(test_parquet_folder_connection_path, spark_session):
    assert spark_session  # Ensure a sparksession exists
    datasource = SparkDFDatasource(
        'SparkParquet',
        generators={
            "subdir_reader": {
                "class_name": "SubdirReaderBatchKwargsGenerator",
                "base_directory": test_parquet_folder_connection_path
            }
        }
    )

    assert datasource.get_available_data_asset_names()["subdir_reader"]["names"] == [('test', 'file')]
    batch = datasource.get_batch(batch_kwargs={
        "path": os.path.join(test_parquet_folder_connection_path, 'test.parquet')
    })
    assert isinstance(batch, Batch)
    # NOTE: below is a great example of CSV vs. Parquet typing: pandas reads content as string, spark as int
    assert batch.data.head()['col_1'] == 1
    assert batch.data.count() == 5

    # Limit should also work
    batch = datasource.get_batch(batch_kwargs={
        "path": os.path.join(test_parquet_folder_connection_path, 'test.parquet'),
        "limit": 2
    })
    assert isinstance(batch, Batch)
    # NOTE: below is a great example of CSV vs. Parquet typing: pandas reads content as string, spark as int
    assert batch.data.head()['col_1'] == 1
    assert batch.data.count() == 2
def test_standalone_spark_csv_datasource(test_folder_connection_path_csv, test_backends):
    if "SparkDFDataset" not in test_backends:
        pytest.skip("Spark has not been enabled, so this test must be skipped.")
    datasource = SparkDFDatasource(
        "SparkParquet",
        batch_kwargs_generators={
            "subdir_reader": {
                "class_name": "SubdirReaderBatchKwargsGenerator",
                "base_directory": test_folder_connection_path_csv,
            }
        },
    )

    assert datasource.get_available_data_asset_names()["subdir_reader"]["names"] == [
        ("test", "file")
    ]
    batch = datasource.get_batch(
        batch_kwargs={
            "path": os.path.join(test_folder_connection_path_csv, "test.csv"),
            "reader_options": {"header": True},
        }
    )
    assert isinstance(batch, Batch)
    # NOTE: below is a great example of CSV vs. Parquet typing: pandas reads content as string, spark as int
    assert batch.data.head()["col_1"] == "1"
def test_spark_datasource_processes_dataset_options(
    test_folder_connection_path_csv, test_backends, empty_data_context
):
    context: DataContext = empty_data_context
    if "SparkDFDataset" not in test_backends:
        pytest.skip("Spark has not been enabled, so this test must be skipped.")
    datasource = SparkDFDatasource(
        "PandasCSV",
        batch_kwargs_generators={
            "subdir_reader": {
                "class_name": "SubdirReaderBatchKwargsGenerator",
                "base_directory": test_folder_connection_path_csv,
            }
        },
    )
    batch_kwargs = datasource.build_batch_kwargs("subdir_reader", data_asset_name="test")
    batch_kwargs["dataset_options"] = {"caching": False, "persist": False}
    batch = datasource.get_batch(batch_kwargs)
    validator = BridgeValidator(
        batch, ExpectationSuite(expectation_suite_name="foo", data_context=context)
    )
    dataset = validator.get_dataset()
    assert dataset.caching is False
    assert dataset._persist is False
def test_standalone_spark_csv_datasource(test_folder_connection_path):
    datasource = SparkDFDatasource('SparkParquet', base_directory=test_folder_connection_path)

    assert datasource.get_available_data_asset_names() == {
        "default": set(['test'])
    }
    dataset = datasource.get_batch('test', header=True)
    assert isinstance(dataset, SparkDFDataset)
    # NOTE: below is a great example of CSV vs. Parquet typing: pandas reads content as string, spark as int
    assert dataset.spark_df.head()['col_1'] == '1'
def test_invalid_reader_sparkdf_datasource(tmp_path_factory):
    pytest.importorskip("pyspark")
    basepath = str(tmp_path_factory.mktemp("test_invalid_reader_sparkdf_datasource"))
    datasource = SparkDFDatasource('mysparksource', base_directory=basepath)

    with open(os.path.join(basepath, "idonotlooklikeacsvbutiam.notrecognized"), "w") as newfile:
        newfile.write("a,b\n1,2\n3,4\n")

    with pytest.raises(BatchKwargsError) as exc:
        datasource.get_data_asset("idonotlooklikeacsvbutiam.notrecognized",
                                  batch_kwargs={
                                      "path": os.path.join(basepath, "idonotlooklikeacsvbutiam.notrecognized")
                                  })
    assert "Unable to determine reader for path" in exc.value.message

    with pytest.raises(BatchKwargsError) as exc:
        datasource.get_data_asset("idonotlooklikeacsvbutiam.notrecognized",
                                  batch_kwargs={
                                      "path": os.path.join(basepath, "idonotlooklikeacsvbutiam.notrecognized")
                                  },
                                  reader_method="blarg")
    assert "Unknown reader method: blarg" in exc.value.message

    with pytest.raises(BatchKwargsError) as exc:
        datasource.get_data_asset("idonotlooklikeacsvbutiam.notrecognized",
                                  batch_kwargs={
                                      "path": os.path.join(basepath, "idonotlooklikeacsvbutiam.notrecognized")
                                  },
                                  reader_method="excel")
    assert "Unsupported reader: excel" in exc.value.message

    dataset = datasource.get_data_asset("idonotlooklikeacsvbutiam.notrecognized",
                                        batch_kwargs={
                                            "path": os.path.join(basepath, "idonotlooklikeacsvbutiam.notrecognized"),
                                        },
                                        reader_method="csv",
                                        reader_options={'header': True})
    assert dataset.spark_df.head()["a"] == "1"
def test_invalid_reader_sparkdf_datasource(tmp_path_factory):
    pytest.importorskip("pyspark")
    basepath = str(tmp_path_factory.mktemp("test_invalid_reader_sparkdf_datasource"))
    datasource = SparkDFDatasource('mysparksource',
                                   batch_kwargs_generators={
                                       "subdir_reader": {
                                           "class_name": "SubdirReaderBatchKwargsGenerator",
                                           "base_directory": basepath
                                       }
                                   })

    with open(os.path.join(basepath, "idonotlooklikeacsvbutiam.notrecognized"), "w") as newfile:
        newfile.write("a,b\n1,2\n3,4\n")

    with pytest.raises(BatchKwargsError) as exc:
        datasource.get_batch(batch_kwargs={
            "path": os.path.join(basepath, "idonotlooklikeacsvbutiam.notrecognized")
        })
    assert "Unable to determine reader for path" in exc.value.message

    with pytest.raises(BatchKwargsError) as exc:
        datasource.get_batch(batch_kwargs={
            "path": os.path.join(basepath, "idonotlooklikeacsvbutiam.notrecognized"),
            "reader_method": "blarg"
        })
    assert "Unknown reader method: blarg" in exc.value.message

    with pytest.raises(BatchKwargsError) as exc:
        datasource.get_batch(batch_kwargs={
            "path": os.path.join(basepath, "idonotlooklikeacsvbutiam.notrecognized"),
            "reader_method": "excel"
        })
    assert "Unknown reader: excel" in exc.value.message

    batch = datasource.get_batch(batch_kwargs={
        "path": os.path.join(basepath, "idonotlooklikeacsvbutiam.notrecognized"),
        "reader_method": "csv",
        "reader_options": {'header': True}
    })
    assert batch.data.head()["a"] == "1"
def test_invalid_reader_sparkdf_datasource(tmp_path_factory, test_backends):
    if "SparkDFDataset" not in test_backends:
        pytest.skip("Spark has not been enabled, so this test must be skipped.")
    basepath = str(tmp_path_factory.mktemp("test_invalid_reader_sparkdf_datasource"))
    datasource = SparkDFDatasource(
        "mysparksource",
        batch_kwargs_generators={
            "subdir_reader": {
                "class_name": "SubdirReaderBatchKwargsGenerator",
                "base_directory": basepath,
            }
        },
    )

    with open(
        os.path.join(basepath, "idonotlooklikeacsvbutiam.notrecognized"), "w"
    ) as newfile:
        newfile.write("a,b\n1,2\n3,4\n")

    with pytest.raises(BatchKwargsError) as exc:
        datasource.get_batch(
            batch_kwargs={
                "path": os.path.join(basepath, "idonotlooklikeacsvbutiam.notrecognized")
            }
        )
    assert "Unable to determine reader for path" in exc.value.message

    with pytest.raises(BatchKwargsError) as exc:
        datasource.get_batch(
            batch_kwargs={
                "path": os.path.join(
                    basepath, "idonotlooklikeacsvbutiam.notrecognized"
                ),
                "reader_method": "blarg",
            }
        )
    assert "Unknown reader method: blarg" in exc.value.message

    with pytest.raises(BatchKwargsError) as exc:
        datasource.get_batch(
            batch_kwargs={
                "path": os.path.join(
                    basepath, "idonotlooklikeacsvbutiam.notrecognized"
                ),
                "reader_method": "excel",
            }
        )
    assert "Unknown reader: excel" in exc.value.message

    batch = datasource.get_batch(
        batch_kwargs={
            "path": os.path.join(basepath, "idonotlooklikeacsvbutiam.notrecognized"),
            "reader_method": "csv",
            "reader_options": {"header": True},
        }
    )
    assert batch.data.head()["a"] == "1"
def test_pandas_datasource_processes_dataset_options(test_folder_connection_path):
    datasource = SparkDFDatasource(
        'PandasCSV',
        generators={
            "subdir_reader": {
                "class_name": "SubdirReaderBatchKwargsGenerator",
                "base_directory": test_folder_connection_path
            }
        }
    )
    batch_kwargs = datasource.build_batch_kwargs("subdir_reader", name="test")
    batch_kwargs["dataset_options"] = {"caching": False, "persist": False}
    batch = datasource.get_batch(batch_kwargs)
    validator = Validator(batch, ExpectationSuite(expectation_suite_name="foo"))
    dataset = validator.get_dataset()
    assert dataset.caching is False
    assert dataset._persist is False
def _add_spark_datasource(datasource_name: str, dataset: AbstractDataSet, ge_context: DataContext) -> str:
    from great_expectations.datasource import SparkDFDatasource

    path = str(dataset._filepath.parent)
    if path.startswith("./"):
        path = path[2:]

    configuration = SparkDFDatasource.build_configuration(
        batch_kwargs_generators={
            "subdir_reader": {
                "class_name": "SubdirReaderBatchKwargsGenerator",
                "base_directory": os.path.join("..", path),
            }
        }
    )
    configuration["class_name"] = "SparkDFDatasource"
    errors = DatasourceConfigSchema().validate(configuration)
    if len(errors) != 0:
        # errors is a dict of validation messages, so format it with the default presentation
        raise ge_exceptions.GreatExpectationsError(
            "Invalid Datasource configuration: {}".format(errors)
        )

    ge_context.add_datasource(name=datasource_name, **configuration)
    return datasource_name
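A minimal usage sketch for the helper above, assuming a Kedro project: the dataset path and datasource name below are hypothetical, and DataContext() presumes an already-initialized great_expectations/ directory.

# Hypothetical wiring (not from the source): register a SparkDFDatasource whose SubdirReader
# generator scans the folder backing a Kedro SparkDataSet.
from great_expectations.data_context import DataContext
from kedro.extras.datasets.spark import SparkDataSet

spark_dataset = SparkDataSet(filepath="data/01_raw/shuttles.parquet")  # hypothetical catalog entry
ge_context = DataContext()  # loads the project's great_expectations.yml
_add_spark_datasource("shuttles_spark_datasource", spark_dataset, ge_context)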
def _add_spark_datasource(context):
    path = click.prompt(
        msg_prompt_filesys_enter_base_path,
        # default='/data/',
        type=click.Path(exists=True, file_okay=False, dir_okay=True, readable=True),
        show_default=True
    )
    if path.startswith("./"):
        path = path[2:]

    if path.endswith("/"):
        path = path[:-1]
    default_data_source_name = os.path.basename(path) + "__dir"
    data_source_name = click.prompt(
        msg_prompt_datasource_name, default=default_data_source_name, show_default=True
    )

    configuration = SparkDFDatasource.build_configuration(base_directory=os.path.join("..", path))
    context.add_datasource(name=data_source_name, class_name='SparkDFDatasource', **configuration)
    return data_source_name
def test_standalone_spark_csv_datasource(test_folder_connection_path):
    pytest.importorskip("pyspark")
    datasource = SparkDFDatasource('SparkParquet', base_directory=test_folder_connection_path)

    assert datasource.get_available_data_asset_names() == {
        "default": ['test']
    }
    dataset = datasource.get_batch('test',
                                   expectation_suite_name="default",
                                   batch_kwargs={
                                       "path": os.path.join(test_folder_connection_path, 'test.csv')
                                   },
                                   reader_options={"header": True})
    assert isinstance(dataset, SparkDFDataset)
    # NOTE: below is a great example of CSV vs. Parquet typing: pandas reads content as string, spark as int
    assert dataset.spark_df.head()['col_1'] == '1'
def test_spark_config():
    source = SparkDFDatasource()
    conf = source.spark.sparkContext.getConf().getAll()
    # Without specifying any spark_config values we get defaults
    assert ("spark.app.name", "pyspark-shell") in conf

    source = SparkDFDatasource(spark_config={
        "spark.app.name": "great_expectations",
        "spark.sql.catalogImplementation": "hive",
        "spark.executor.memory": "128m"
    })

    # Test that our values were set
    conf = source.spark.sparkContext.getConf().getAll()
    assert ("spark.app.name", "great_expectations") in conf
    assert ("spark.sql.catalogImplementation", "hive") in conf
    assert ("spark.executor.memory", "128m") in conf
def test_pandas_datasource_processes_dataset_options(test_folder_connection_path, test_backends):
    if "SparkDFDataset" not in test_backends:
        pytest.skip("Spark has not been enabled, so this test must be skipped.")
    datasource = SparkDFDatasource(
        'PandasCSV',
        batch_kwargs_generators={
            "subdir_reader": {
                "class_name": "SubdirReaderBatchKwargsGenerator",
                "base_directory": test_folder_connection_path
            }
        }
    )
    batch_kwargs = datasource.build_batch_kwargs("subdir_reader", name="test")
    batch_kwargs["dataset_options"] = {"caching": False, "persist": False}
    batch = datasource.get_batch(batch_kwargs)
    validator = Validator(batch, ExpectationSuite(expectation_suite_name="foo"))
    dataset = validator.get_dataset()
    assert dataset.caching is False
    assert dataset._persist is False
def test_standalone_spark_csv_datasource(test_folder_connection_path): pyspark_skip = pytest.importorskip("pyspark") datasource = SparkDFDatasource('SparkParquet', generators={"subdir_reader": { "class_name": "SubdirReaderBatchKwargsGenerator", "base_directory": test_folder_connection_path } } ) assert datasource.get_available_data_asset_names()["subdir_reader"]["names"] == [('test', 'file')] batch = datasource.get_batch(batch_kwargs={ "path": os.path.join(test_folder_connection_path, 'test.csv'), "reader_options": {"header": True} }) assert isinstance(batch, Batch) # NOTE: below is a great example of CSV vs. Parquet typing: pandas reads content as string, spark as int assert batch.data.head()['col_1'] == '1'
def test_spark_config(test_backends):
    if "SparkDFDataset" not in test_backends:
        pytest.skip("Spark has not been enabled, so this test must be skipped.")
    source = SparkDFDatasource()
    conf = source.spark.sparkContext.getConf().getAll()
    # Without specifying any spark_config values we get defaults
    assert ("spark.app.name", "pyspark-shell") in conf

    source = SparkDFDatasource(spark_config={
        "spark.app.name": "great_expectations",
        "spark.sql.catalogImplementation": "hive",
        "spark.executor.memory": "128m"
    })

    # Test that our values were set
    conf = source.spark.sparkContext.getConf().getAll()
    assert ("spark.app.name", "great_expectations") in conf
    assert ("spark.sql.catalogImplementation", "hive") in conf
    assert ("spark.executor.memory", "128m") in conf
def test_spark_config_datasource(spark_session_v012):
    name: str = "great_expectations-ds-config"
    spark_config: Dict[str, str] = {
        "spark.app.name": name,
        "spark.sql.catalogImplementation": "hive",
        "spark.executor.memory": "768m",
        # "spark.driver.allowMultipleContexts": "true",  # This directive does not appear to have any effect.
    }
    source: SparkDFDatasource = SparkDFDatasource(spark_config=spark_config)
    spark_session: SparkSession = source.spark
    # noinspection PyProtectedMember
    sc_stopped: bool = spark_session.sparkContext._jsc.sc().isStopped()
    assert not sc_stopped

    # Test that our values were set
    conf: List[tuple] = source.spark.sparkContext.getConf().getAll()
    assert ("spark.app.name", name) in conf
    assert ("spark.sql.catalogImplementation", "hive") in conf
    assert ("spark.executor.memory", "768m") in conf
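A sketch (an assumption, not taken from the tests) of routing the same spark_config through SparkDFDatasource.build_configuration() so it can be persisted by a DataContext; the datasource name "my_spark_datasource" and the commented add_datasource call are placeholders.

from great_expectations.datasource import SparkDFDatasource

spark_config = {
    "spark.app.name": "great_expectations",
    "spark.sql.catalogImplementation": "hive",
    "spark.executor.memory": "768m",
}
# build_configuration returns a plain dict; class_name is added explicitly, mirroring the helpers above
configuration = SparkDFDatasource.build_configuration(spark_config=spark_config)
configuration["class_name"] = "SparkDFDatasource"
# context.add_datasource(name="my_spark_datasource", **configuration)  # with an existing DataContext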
def _add_spark_datasource(
    context, passthrough_generator_only=True, prompt_for_datasource_name=True
):
    toolkit.send_usage_message(
        data_context=context,
        event="cli.new_ds_choice",
        event_payload={"type": "spark"},
        success=True,
    )

    if not _verify_pyspark_dependent_modules():
        return None

    if passthrough_generator_only:
        datasource_name = "files_spark_datasource"

        # configuration = SparkDFDatasource.build_configuration(batch_kwargs_generators={
        #     "default": {
        #         "class_name": "PassthroughGenerator",
        #     }
        # }
        # )
        configuration = SparkDFDatasource.build_configuration()

    else:
        path = click.prompt(
            msg_prompt_filesys_enter_base_path,
            type=click.Path(exists=True, file_okay=False),
        ).strip()
        if path.startswith("./"):
            path = path[2:]

        if path.endswith("/"):
            basenamepath = path[:-1]
        else:
            basenamepath = path

        datasource_name = os.path.basename(basenamepath) + "__dir"
        if prompt_for_datasource_name:
            datasource_name = click.prompt(
                msg_prompt_datasource_name, default=datasource_name
            )

        configuration = SparkDFDatasource.build_configuration(
            batch_kwargs_generators={
                "subdir_reader": {
                    "class_name": "SubdirReaderBatchKwargsGenerator",
                    "base_directory": os.path.join("..", path),
                }
            }
        )

    configuration["class_name"] = "SparkDFDatasource"
    configuration["module_name"] = "great_expectations.datasource"
    errors = DatasourceConfigSchema().validate(configuration)
    if len(errors) != 0:
        # errors is a dict of validation messages, so format it with the default presentation
        raise ge_exceptions.GreatExpectationsError(
            "Invalid Datasource configuration: {}".format(errors)
        )

    cli_message(
        """
Great Expectations will now add a new Datasource '{:s}' to your deployment, by adding this entry to your great_expectations.yml:

{:s}
""".format(
            datasource_name,
            textwrap.indent(toolkit.yaml.dump({datasource_name: configuration}), "  "),
        )
    )
    toolkit.confirm_proceed_or_exit()

    context.add_datasource(name=datasource_name, **configuration)
    return datasource_name
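A usage sketch, stated as an assumption: this helper is normally reached through the interactive `great_expectations datasource new` CLI flow rather than called directly, but invoking it against an existing DataContext looks roughly like this. The project layout is assumed, and the non-passthrough branch will prompt on the terminal for a base directory.

from great_expectations.data_context import DataContext

context = DataContext()  # assumes an initialized great_expectations/ project
datasource_name = _add_spark_datasource(
    context,
    passthrough_generator_only=False,  # prompt for a path and attach a SubdirReaderBatchKwargsGenerator
    prompt_for_datasource_name=True,
)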
def basic_sparkdf_datasource():
    return SparkDFDatasource("basic_sparkdf_datasource")
def basic_sparkdf_datasource(test_backends):
    if "SparkDFDataset" not in test_backends:
        pytest.skip("Spark has not been enabled, so this test must be skipped.")
    return SparkDFDatasource("basic_sparkdf_datasource")