Example #1
def test_standalone_spark_parquet_datasource(test_parquet_folder_connection_path, spark_session):
    assert spark_session  # Ensure a SparkSession exists
    datasource = SparkDFDatasource('SparkParquet', generators={
        "subdir_reader": {
            "class_name": "SubdirReaderBatchKwargsGenerator",
            "base_directory": test_parquet_folder_connection_path
        }
    })

    assert datasource.get_available_data_asset_names()["subdir_reader"]["names"] == [('test', 'file')]
    batch = datasource.get_batch(batch_kwargs={
                                       "path": os.path.join(test_parquet_folder_connection_path,
                                                            'test.parquet')
                                   })
    assert isinstance(batch, Batch)
    # NOTE: below is a great example of CSV vs. Parquet typing: pandas reads content as string, spark as int
    assert batch.data.head()['col_1'] == 1
    assert batch.data.count() == 5

    # Limit should also work
    batch = datasource.get_batch(batch_kwargs={
                                       "path": os.path.join(test_parquet_folder_connection_path,
                                                            'test.parquet'),
                                       "limit": 2
                                   })
    assert isinstance(batch, Batch)
    # NOTE: below is a great example of CSV vs. Parquet typing: pandas reads content as string, spark as int
    assert batch.data.head()['col_1'] == 1
    assert batch.data.count() == 2
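
Examples 1 and 2 also receive a spark_session fixture that is not shown on this page. A minimal sketch of what such a fixture could look like, assuming pytest and pyspark are available; the builder options here are an assumption, not the project's actual configuration:

import pytest
from pyspark.sql import SparkSession


@pytest.fixture(scope="session")
def spark_session():
    # Hypothetical sketch: the real fixture lives in the project's conftest.py.
    # A local session is sufficient for these datasource tests.
    return (
        SparkSession.builder
        .master("local[*]")
        .appName("spark_datasource_tests")
        .getOrCreate()
    )
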
Example #2
def test_standalone_spark_parquet_datasource(test_parquet_folder_connection_path, spark_session):
    assert spark_session  # Ensure a SparkSession exists
    datasource = SparkDFDatasource('SparkParquet', base_directory=test_parquet_folder_connection_path)

    assert datasource.get_available_data_asset_names() == {
        "default": ['test']
    }
    dataset = datasource.get_batch('test',
                                   expectation_suite_name="default",
                                   batch_kwargs={
                                       "path": os.path.join(test_parquet_folder_connection_path,
                                                            'test.parquet')
                                   })
    assert isinstance(dataset, SparkDFDataset)
    # NOTE: below is a great example of CSV vs. Parquet typing: pandas reads content as string, spark as int
    assert dataset.spark_df.head()['col_1'] == 1
    assert dataset.spark_df.count() == 5

    # Limit should also work
    dataset = datasource.get_batch('test',
                                   expectation_suite_name="default",
                                   batch_kwargs={
                                       "path": os.path.join(test_parquet_folder_connection_path,
                                                            'test.parquet'),
                                       "limit": 2
                                   })
    assert isinstance(dataset, SparkDFDataset)
    # NOTE: below is a great example of CSV vs. Parquet typing: pandas reads content as string, spark as int
    assert dataset.spark_df.head()['col_1'] == 1
    assert dataset.spark_df.count() == 2
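
Both parquet examples above rely on a test_parquet_folder_connection_path fixture that is not reproduced on this page. A minimal sketch of what it might look like, assuming pytest's tmp_path_factory and pandas with a parquet engine such as pyarrow installed; the five-row contents of col_1 are taken from the assertions above, and everything else is an assumption:

import os

import pandas as pd
import pytest


@pytest.fixture(scope="module")
def test_parquet_folder_connection_path(tmp_path_factory):
    # Hypothetical sketch: the real fixture is defined in the project's conftest.py.
    # Writes a five-row parquet file whose col_1 is read back by Spark as an integer,
    # matching the head()['col_1'] == 1 and count() == 5 assertions above.
    basepath = str(tmp_path_factory.mktemp("parquet_context"))
    df = pd.DataFrame({"col_1": [1, 2, 3, 4, 5], "col_2": ["a", "b", "c", "d", "e"]})
    df.to_parquet(os.path.join(basepath, "test.parquet"))
    return basepath
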
Example #3
def test_spark_datasource_processes_dataset_options(
        test_folder_connection_path_csv, test_backends, empty_data_context):
    context: DataContext = empty_data_context
    if "SparkDFDataset" not in test_backends:
        pytest.skip(
            "Spark has not been enabled, so this test must be skipped.")
    datasource = SparkDFDatasource(
        "PandasCSV",
        batch_kwargs_generators={
            "subdir_reader": {
                "class_name": "SubdirReaderBatchKwargsGenerator",
                "base_directory": test_folder_connection_path_csv,
            }
        },
    )
    batch_kwargs = datasource.build_batch_kwargs("subdir_reader",
                                                 data_asset_name="test")
    batch_kwargs["dataset_options"] = {"caching": False, "persist": False}
    batch = datasource.get_batch(batch_kwargs)
    validator = BridgeValidator(
        batch,
        ExpectationSuite(expectation_suite_name="foo", data_context=context))
    dataset = validator.get_dataset()
    assert dataset.caching is False
    assert dataset._persist is False
Example #4
def test_standalone_spark_csv_datasource(test_folder_connection_path_csv,
                                         test_backends):
    if "SparkDFDataset" not in test_backends:
        pytest.skip(
            "Spark has not been enabled, so this test must be skipped.")
    datasource = SparkDFDatasource(
        "SparkParquet",
        batch_kwargs_generators={
            "subdir_reader": {
                "class_name": "SubdirReaderBatchKwargsGenerator",
                "base_directory": test_folder_connection_path_csv,
            }
        },
    )

    assert datasource.get_available_data_asset_names(
    )["subdir_reader"]["names"] == [("test", "file")]
    batch = datasource.get_batch(
        batch_kwargs={
            "path": os.path.join(test_folder_connection_path_csv, "test.csv"),
            "reader_options": {
                "header": True
            },
        })
    assert isinstance(batch, Batch)
    # NOTE: below is a great example of CSV vs. Parquet typing: pandas reads content as string, spark as int
    assert batch.data.head()["col_1"] == "1"
Example #5
def test_invalid_reader_sparkdf_datasource(tmp_path_factory):
    pytest.importorskip("pyspark")
    basepath = str(tmp_path_factory.mktemp("test_invalid_reader_sparkdf_datasource"))
    datasource = SparkDFDatasource('mysparksource', base_directory=basepath)

    with open(os.path.join(basepath, "idonotlooklikeacsvbutiam.notrecognized"), "w") as newfile:
        newfile.write("a,b\n1,2\n3,4\n")

    with pytest.raises(BatchKwargsError) as exc:
        datasource.get_batch("idonotlooklikeacsvbutiam.notrecognized", expectation_suite_name="default", batch_kwargs={
            "path": os.path.join(basepath, "idonotlooklikeacsvbutiam.notrecognized")
        })
    assert "Unable to determine reader for path" in exc.value.message

    with pytest.raises(BatchKwargsError) as exc:
        datasource.get_batch("idonotlooklikeacsvbutiam.notrecognized", expectation_suite_name="default", batch_kwargs={
            "path": os.path.join(basepath, "idonotlooklikeacsvbutiam.notrecognized")
        }, reader_method="blarg")
    assert "Unknown reader method: blarg" in exc.value.message

    with pytest.raises(BatchKwargsError) as exc:
        datasource.get_batch("idonotlooklikeacsvbutiam.notrecognized", expectation_suite_name="default", batch_kwargs={
            "path": os.path.join(basepath, "idonotlooklikeacsvbutiam.notrecognized")
        }, reader_method="excel")
    assert "Unsupported reader: excel" in exc.value.message

    dataset = datasource.get_batch("idonotlooklikeacsvbutiam.notrecognized", expectation_suite_name="default", batch_kwargs={
            "path": os.path.join(basepath, "idonotlooklikeacsvbutiam.notrecognized")
        },
        reader_method="csv", header=True)
    assert dataset.spark_df.head()["a"] == "1"
Example #6
def test_invalid_reader_sparkdf_datasource(tmp_path_factory):
    pytest.importorskip("pyspark")
    basepath = str(
        tmp_path_factory.mktemp("test_invalid_reader_sparkdf_datasource"))
    datasource = SparkDFDatasource('mysparksource',
                                   batch_kwargs_generators={
                                       "subdir_reader": {
                                           "class_name": "SubdirReaderBatchKwargsGenerator",
                                           "base_directory": basepath
                                       }
                                   })

    with open(os.path.join(basepath, "idonotlooklikeacsvbutiam.notrecognized"),
              "w") as newfile:
        newfile.write("a,b\n1,2\n3,4\n")

    with pytest.raises(BatchKwargsError) as exc:
        datasource.get_batch(
            batch_kwargs={
                "path": os.path.join(basepath,
                                     "idonotlooklikeacsvbutiam.notrecognized")
            })
    assert "Unable to determine reader for path" in exc.value.message

    with pytest.raises(BatchKwargsError) as exc:
        datasource.get_batch(
            batch_kwargs={
                "path": os.path.join(basepath,
                                     "idonotlooklikeacsvbutiam.notrecognized"),
                "reader_method": "blarg"
            })
    assert "Unknown reader method: blarg" in exc.value.message

    with pytest.raises(BatchKwargsError) as exc:
        datasource.get_batch(
            batch_kwargs={
                "path": os.path.join(basepath,
                                     "idonotlooklikeacsvbutiam.notrecognized"),
                "reader_method": "excel"
            })
    assert "Unknown reader: excel" in exc.value.message

    batch = datasource.get_batch(
        batch_kwargs={
            "path": os.path.join(basepath, "idonotlooklikeacsvbutiam.notrecognized"),
            "reader_method": "csv",
            "reader_options": {'header': True}
        })
    assert batch.data.head()["a"] == "1"
Example #7
def test_standalone_spark_csv_datasource(test_folder_connection_path):
    datasource = SparkDFDatasource('SparkParquet',
                                   base_directory=test_folder_connection_path)
    assert datasource.get_available_data_asset_names() == {
        "default": set(['test'])
    }
    dataset = datasource.get_batch('test', header=True)
    assert isinstance(dataset, SparkDFDataset)
    # NOTE: below is a great example of CSV vs. Parquet typing: pandas reads content as string, spark as int
    assert dataset.spark_df.head()['col_1'] == '1'
Example #8
def test_invalid_reader_sparkdf_datasource(tmp_path_factory, test_backends):
    if "SparkDFDataset" not in test_backends:
        pytest.skip("Spark has not been enabled, so this test must be skipped.")
    basepath = str(tmp_path_factory.mktemp("test_invalid_reader_sparkdf_datasource"))
    datasource = SparkDFDatasource(
        "mysparksource",
        batch_kwargs_generators={
            "subdir_reader": {
                "class_name": "SubdirReaderBatchKwargsGenerator",
                "base_directory": basepath,
            }
        },
    )

    with open(
        os.path.join(basepath, "idonotlooklikeacsvbutiam.notrecognized"), "w"
    ) as newfile:
        newfile.write("a,b\n1,2\n3,4\n")

    with pytest.raises(BatchKwargsError) as exc:
        datasource.get_batch(
            batch_kwargs={
                "path": os.path.join(basepath, "idonotlooklikeacsvbutiam.notrecognized")
            }
        )
    assert "Unable to determine reader for path" in exc.value.message

    with pytest.raises(BatchKwargsError) as exc:
        datasource.get_batch(
            batch_kwargs={
                "path": os.path.join(
                    basepath, "idonotlooklikeacsvbutiam.notrecognized"
                ),
                "reader_method": "blarg",
            }
        )
    assert "Unknown reader method: blarg" in exc.value.message

    with pytest.raises(BatchKwargsError) as exc:
        datasource.get_batch(
            batch_kwargs={
                "path": os.path.join(
                    basepath, "idonotlooklikeacsvbutiam.notrecognized"
                ),
                "reader_method": "excel",
            }
        )
    assert "Unknown reader: excel" in exc.value.message

    batch = datasource.get_batch(
        batch_kwargs={
            "path": os.path.join(basepath, "idonotlooklikeacsvbutiam.notrecognized"),
            "reader_method": "csv",
            "reader_options": {"header": True},
        }
    )
    assert batch.data.head()["a"] == "1"
Example #9
def test_pandas_datasource_processes_dataset_options(test_folder_connection_path):
    datasource = SparkDFDatasource('PandasCSV', generators={
            "subdir_reader": {
                "class_name": "SubdirReaderBatchKwargsGenerator",
                "base_directory": test_folder_connection_path
            }
        }
    )
    batch_kwargs = datasource.build_batch_kwargs("subdir_reader", name="test")
    batch_kwargs["dataset_options"] = {"caching": False, "persist": False}
    batch = datasource.get_batch(batch_kwargs)
    validator = Validator(batch, ExpectationSuite(expectation_suite_name="foo"))
    dataset = validator.get_dataset()
    assert dataset.caching is False
    assert dataset._persist is False
Example #10
def test_standalone_spark_csv_datasource(test_folder_connection_path):
    pytest.importorskip("pyspark")
    datasource = SparkDFDatasource('SparkParquet', base_directory=test_folder_connection_path)
    assert datasource.get_available_data_asset_names() == {
        "default": ['test']
    }
    dataset = datasource.get_batch('test',
                                   expectation_suite_name="default",
                                   batch_kwargs={
                                       "path": os.path.join(test_folder_connection_path,
                                                            'test.csv')
                                   },
                                   reader_options={"header": True})
    assert isinstance(dataset, SparkDFDataset)
    # NOTE: below is a great example of CSV vs. Parquet typing: pandas reads content as string, spark as int
    assert dataset.spark_df.head()['col_1'] == '1'
Example #11
def test_pandas_datasource_processes_dataset_options(test_folder_connection_path, test_backends):
    if "SparkDFDataset" not in test_backends:
        pytest.skip("Spark has not been enabled, so this test must be skipped.")
    datasource = SparkDFDatasource('PandasCSV', batch_kwargs_generators={
            "subdir_reader": {
                "class_name": "SubdirReaderBatchKwargsGenerator",
                "base_directory": test_folder_connection_path
            }
        }
    )
    batch_kwargs = datasource.build_batch_kwargs("subdir_reader", name="test")
    batch_kwargs["dataset_options"] = {"caching": False, "persist": False}
    batch = datasource.get_batch(batch_kwargs)
    validator = Validator(batch, ExpectationSuite(expectation_suite_name="foo"))
    dataset = validator.get_dataset()
    assert dataset.caching is False
    assert dataset._persist is False
Example #12
def test_standalone_spark_csv_datasource(test_folder_connection_path):
    pytest.importorskip("pyspark")
    datasource = SparkDFDatasource('SparkParquet',
                                   generators={
                                       "subdir_reader": {
                                           "class_name": "SubdirReaderBatchKwargsGenerator",
                                           "base_directory": test_folder_connection_path
                                       }
                                   })

    assert datasource.get_available_data_asset_names()["subdir_reader"]["names"] == [('test', 'file')]
    batch = datasource.get_batch(batch_kwargs={
                                       "path": os.path.join(test_folder_connection_path,
                                                            'test.csv'),
                                       "reader_options": {"header": True}
                                   })
    assert isinstance(batch, Batch)
    # NOTE: below is a great example of CSV vs. Parquet typing: pandas reads content as string, spark as int
    assert batch.data.head()['col_1'] == '1'
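
The CSV examples above (for instance Examples 7, 10, and 12) assume a test_folder_connection_path fixture containing a test.csv file; Example 4's test_folder_connection_path_csv is presumably analogous. A minimal sketch of what such a fixture could look like, again assuming pytest's tmp_path_factory and pandas; the column name comes from the assertions above, and the exact values are an assumption. Spark reads CSV columns as strings unless schema inference is enabled, which is why these tests compare against '1' rather than 1:

import os

import pandas as pd
import pytest


@pytest.fixture(scope="module")
def test_folder_connection_path(tmp_path_factory):
    # Hypothetical sketch: the real fixture is defined in the project's conftest.py.
    # Writes a small CSV with a header row so that reader_options={"header": True}
    # yields a 'col_1' column, which Spark reads back as strings ('1', '2', ...).
    basepath = str(tmp_path_factory.mktemp("csv_context"))
    df = pd.DataFrame({"col_1": [1, 2, 3, 4, 5], "col_2": ["a", "b", "c", "d", "e"]})
    df.to_csv(os.path.join(basepath, "test.csv"), index=False)
    return basepath
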