# NOTE: import paths below follow the Great Expectations test suite; exact
# module locations vary across great_expectations versions.
import os

import pytest

from great_expectations.core.batch import Batch
from great_expectations.core.expectation_suite import ExpectationSuite
from great_expectations.data_context import DataContext
from great_expectations.dataset import SparkDFDataset
from great_expectations.datasource import SparkDFDatasource
from great_expectations.exceptions import BatchKwargsError
from great_expectations.validator.validator import BridgeValidator, Validator


def test_standalone_spark_parquet_datasource(
    test_parquet_folder_connection_path, spark_session
):
    assert spark_session  # Ensure a SparkSession exists
    datasource = SparkDFDatasource(
        "SparkParquet",
        generators={
            "subdir_reader": {
                "class_name": "SubdirReaderBatchKwargsGenerator",
                "base_directory": test_parquet_folder_connection_path,
            }
        },
    )

    assert datasource.get_available_data_asset_names()["subdir_reader"]["names"] == [
        ("test", "file")
    ]
    batch = datasource.get_batch(
        batch_kwargs={
            "path": os.path.join(test_parquet_folder_connection_path, "test.parquet")
        }
    )
    assert isinstance(batch, Batch)
    # NOTE: below is a great example of CSV vs. Parquet typing: pandas reads content as string, spark as int
    assert batch.data.head()["col_1"] == 1
    assert batch.data.count() == 5

    # Limit should also work
    batch = datasource.get_batch(
        batch_kwargs={
            "path": os.path.join(test_parquet_folder_connection_path, "test.parquet"),
            "limit": 2,
        }
    )
    assert isinstance(batch, Batch)
    # NOTE: below is a great example of CSV vs. Parquet typing: pandas reads content as string, spark as int
    assert batch.data.head()["col_1"] == 1
    assert batch.data.count() == 2
def test_standalone_spark_parquet_datasource(
    test_parquet_folder_connection_path, spark_session
):
    assert spark_session  # Ensure a SparkSession exists
    datasource = SparkDFDatasource(
        "SparkParquet", base_directory=test_parquet_folder_connection_path
    )

    assert datasource.get_available_data_asset_names() == {"default": ["test"]}
    dataset = datasource.get_batch(
        "test",
        expectation_suite_name="default",
        batch_kwargs={
            "path": os.path.join(test_parquet_folder_connection_path, "test.parquet")
        },
    )
    assert isinstance(dataset, SparkDFDataset)
    # NOTE: below is a great example of CSV vs. Parquet typing: pandas reads content as string, spark as int
    assert dataset.spark_df.head()["col_1"] == 1
    assert dataset.spark_df.count() == 5

    # Limit should also work
    dataset = datasource.get_batch(
        "test",
        expectation_suite_name="default",
        batch_kwargs={
            "path": os.path.join(test_parquet_folder_connection_path, "test.parquet"),
            "limit": 2,
        },
    )
    assert isinstance(dataset, SparkDFDataset)
    # NOTE: below is a great example of CSV vs. Parquet typing: pandas reads content as string, spark as int
    assert dataset.spark_df.head()["col_1"] == 1
    assert dataset.spark_df.count() == 2
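# The parquet tests above rely on a test_parquet_folder_connection_path fixture
# that is not shown here (spark_session is likewise assumed to come from the
# project's conftest). Below is a minimal, hypothetical sketch of such a
# fixture -- not the project's actual fixture -- assuming pandas with a parquet
# engine (pyarrow or fastparquet) is installed. Its five integer col_1 values
# are chosen to satisfy the count() == 5 and head()["col_1"] == 1 assertions:
# parquet preserves the integer dtype, so Spark reads col_1 back as int.
import pandas as pd


@pytest.fixture(scope="module")
def test_parquet_folder_connection_path(tmp_path_factory):
    basepath = str(tmp_path_factory.mktemp("parquet_context"))
    df = pd.DataFrame({"col_1": [1, 2, 3, 4, 5], "col_2": ["a", "b", "c", "d", "e"]})
    df.to_parquet(os.path.join(basepath, "test.parquet"))
    return basepath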
def test_spark_datasource_processes_dataset_options(
    test_folder_connection_path_csv, test_backends, empty_data_context
):
    context: DataContext = empty_data_context
    if "SparkDFDataset" not in test_backends:
        pytest.skip("Spark has not been enabled, so this test must be skipped.")
    datasource = SparkDFDatasource(
        "PandasCSV",
        batch_kwargs_generators={
            "subdir_reader": {
                "class_name": "SubdirReaderBatchKwargsGenerator",
                "base_directory": test_folder_connection_path_csv,
            }
        },
    )
    batch_kwargs = datasource.build_batch_kwargs("subdir_reader", data_asset_name="test")
    batch_kwargs["dataset_options"] = {"caching": False, "persist": False}
    batch = datasource.get_batch(batch_kwargs)
    validator = BridgeValidator(
        batch, ExpectationSuite(expectation_suite_name="foo", data_context=context)
    )
    dataset = validator.get_dataset()
    assert dataset.caching is False
    assert dataset._persist is False
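# Several tests gate on a test_backends fixture before touching Spark. The
# real project derives the enabled backends from its test configuration; the
# sketch below is only a hypothetical stand-in showing the shape the skip
# checks assume: a list of backend names that contains "SparkDFDataset" when
# pyspark is importable.
@pytest.fixture
def test_backends():
    backends = ["PandasDataset"]
    try:
        import pyspark  # noqa: F401

        backends.append("SparkDFDataset")
    except ImportError:
        pass
    return backends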
def test_standalone_spark_csv_datasource(test_folder_connection_path_csv, test_backends):
    if "SparkDFDataset" not in test_backends:
        pytest.skip("Spark has not been enabled, so this test must be skipped.")
    datasource = SparkDFDatasource(
        "SparkParquet",
        batch_kwargs_generators={
            "subdir_reader": {
                "class_name": "SubdirReaderBatchKwargsGenerator",
                "base_directory": test_folder_connection_path_csv,
            }
        },
    )

    assert datasource.get_available_data_asset_names()["subdir_reader"]["names"] == [
        ("test", "file")
    ]
    batch = datasource.get_batch(
        batch_kwargs={
            "path": os.path.join(test_folder_connection_path_csv, "test.csv"),
            "reader_options": {"header": True},
        }
    )
    assert isinstance(batch, Batch)
    # NOTE: below is a great example of CSV vs. Parquet typing: pandas reads content as string, spark as int
    assert batch.data.head()["col_1"] == "1"
def test_invalid_reader_sparkdf_datasource(tmp_path_factory):
    pytest.importorskip("pyspark")
    basepath = str(tmp_path_factory.mktemp("test_invalid_reader_sparkdf_datasource"))
    datasource = SparkDFDatasource("mysparksource", base_directory=basepath)

    with open(
        os.path.join(basepath, "idonotlooklikeacsvbutiam.notrecognized"), "w"
    ) as newfile:
        newfile.write("a,b\n1,2\n3,4\n")

    with pytest.raises(BatchKwargsError) as exc:
        datasource.get_batch(
            "idonotlooklikeacsvbutiam.notrecognized",
            expectation_suite_name="default",
            batch_kwargs={
                "path": os.path.join(basepath, "idonotlooklikeacsvbutiam.notrecognized")
            },
        )
    assert "Unable to determine reader for path" in exc.value.message

    with pytest.raises(BatchKwargsError) as exc:
        datasource.get_batch(
            "idonotlooklikeacsvbutiam.notrecognized",
            expectation_suite_name="default",
            batch_kwargs={
                "path": os.path.join(basepath, "idonotlooklikeacsvbutiam.notrecognized")
            },
            reader_method="blarg",
        )
    assert "Unknown reader method: blarg" in exc.value.message

    with pytest.raises(BatchKwargsError) as exc:
        datasource.get_batch(
            "idonotlooklikeacsvbutiam.notrecognized",
            expectation_suite_name="default",
            batch_kwargs={
                "path": os.path.join(basepath, "idonotlooklikeacsvbutiam.notrecognized")
            },
            reader_method="excel",
        )
    assert "Unsupported reader: excel" in exc.value.message

    dataset = datasource.get_batch(
        "idonotlooklikeacsvbutiam.notrecognized",
        expectation_suite_name="default",
        batch_kwargs={
            "path": os.path.join(basepath, "idonotlooklikeacsvbutiam.notrecognized")
        },
        reader_method="csv",
        header=True,
    )
    assert dataset.spark_df.head()["a"] == "1"
def test_invalid_reader_sparkdf_datasource(tmp_path_factory):
    pytest.importorskip("pyspark")
    basepath = str(tmp_path_factory.mktemp("test_invalid_reader_sparkdf_datasource"))
    datasource = SparkDFDatasource(
        "mysparksource",
        batch_kwargs_generators={
            "subdir_reader": {
                "class_name": "SubdirReaderBatchKwargsGenerator",
                "base_directory": basepath,
            }
        },
    )

    with open(
        os.path.join(basepath, "idonotlooklikeacsvbutiam.notrecognized"), "w"
    ) as newfile:
        newfile.write("a,b\n1,2\n3,4\n")

    with pytest.raises(BatchKwargsError) as exc:
        datasource.get_batch(
            batch_kwargs={
                "path": os.path.join(basepath, "idonotlooklikeacsvbutiam.notrecognized")
            }
        )
    assert "Unable to determine reader for path" in exc.value.message

    with pytest.raises(BatchKwargsError) as exc:
        datasource.get_batch(
            batch_kwargs={
                "path": os.path.join(basepath, "idonotlooklikeacsvbutiam.notrecognized"),
                "reader_method": "blarg",
            }
        )
    assert "Unknown reader method: blarg" in exc.value.message

    with pytest.raises(BatchKwargsError) as exc:
        datasource.get_batch(
            batch_kwargs={
                "path": os.path.join(basepath, "idonotlooklikeacsvbutiam.notrecognized"),
                "reader_method": "excel",
            }
        )
    assert "Unknown reader: excel" in exc.value.message

    batch = datasource.get_batch(
        batch_kwargs={
            "path": os.path.join(basepath, "idonotlooklikeacsvbutiam.notrecognized"),
            "reader_method": "csv",
            "reader_options": {"header": True},
        }
    )
    assert batch.data.head()["a"] == "1"
def test_standalone_spark_csv_datasource(test_folder_connection_path):
    datasource = SparkDFDatasource(
        "SparkParquet", base_directory=test_folder_connection_path
    )

    assert datasource.get_available_data_asset_names() == {"default": {"test"}}
    dataset = datasource.get_batch("test", header=True)
    assert isinstance(dataset, SparkDFDataset)
    # NOTE: below is a great example of CSV vs. Parquet typing: pandas reads content as string, spark as int
    assert dataset.spark_df.head()["col_1"] == "1"
def test_invalid_reader_sparkdf_datasource(tmp_path_factory, test_backends):
    if "SparkDFDataset" not in test_backends:
        pytest.skip("Spark has not been enabled, so this test must be skipped.")
    basepath = str(tmp_path_factory.mktemp("test_invalid_reader_sparkdf_datasource"))
    datasource = SparkDFDatasource(
        "mysparksource",
        batch_kwargs_generators={
            "subdir_reader": {
                "class_name": "SubdirReaderBatchKwargsGenerator",
                "base_directory": basepath,
            }
        },
    )

    with open(
        os.path.join(basepath, "idonotlooklikeacsvbutiam.notrecognized"), "w"
    ) as newfile:
        newfile.write("a,b\n1,2\n3,4\n")

    with pytest.raises(BatchKwargsError) as exc:
        datasource.get_batch(
            batch_kwargs={
                "path": os.path.join(basepath, "idonotlooklikeacsvbutiam.notrecognized")
            }
        )
    assert "Unable to determine reader for path" in exc.value.message

    with pytest.raises(BatchKwargsError) as exc:
        datasource.get_batch(
            batch_kwargs={
                "path": os.path.join(basepath, "idonotlooklikeacsvbutiam.notrecognized"),
                "reader_method": "blarg",
            }
        )
    assert "Unknown reader method: blarg" in exc.value.message

    with pytest.raises(BatchKwargsError) as exc:
        datasource.get_batch(
            batch_kwargs={
                "path": os.path.join(basepath, "idonotlooklikeacsvbutiam.notrecognized"),
                "reader_method": "excel",
            }
        )
    assert "Unknown reader: excel" in exc.value.message

    batch = datasource.get_batch(
        batch_kwargs={
            "path": os.path.join(basepath, "idonotlooklikeacsvbutiam.notrecognized"),
            "reader_method": "csv",
            "reader_options": {"header": True},
        }
    )
    assert batch.data.head()["a"] == "1"
def test_pandas_datasource_processes_dataset_options(test_folder_connection_path):
    datasource = SparkDFDatasource(
        "PandasCSV",
        generators={
            "subdir_reader": {
                "class_name": "SubdirReaderBatchKwargsGenerator",
                "base_directory": test_folder_connection_path,
            }
        },
    )
    batch_kwargs = datasource.build_batch_kwargs("subdir_reader", name="test")
    batch_kwargs["dataset_options"] = {"caching": False, "persist": False}
    batch = datasource.get_batch(batch_kwargs)
    validator = Validator(batch, ExpectationSuite(expectation_suite_name="foo"))
    dataset = validator.get_dataset()
    assert dataset.caching is False
    assert dataset._persist is False
def test_standalone_spark_csv_datasource(test_folder_connection_path):
    pytest.importorskip("pyspark")
    datasource = SparkDFDatasource(
        "SparkParquet", base_directory=test_folder_connection_path
    )

    assert datasource.get_available_data_asset_names() == {"default": ["test"]}
    dataset = datasource.get_batch(
        "test",
        expectation_suite_name="default",
        batch_kwargs={"path": os.path.join(test_folder_connection_path, "test.csv")},
        reader_options={"header": True},
    )
    assert isinstance(dataset, SparkDFDataset)
    # NOTE: below is a great example of CSV vs. Parquet typing: pandas reads content as string, spark as int
    assert dataset.spark_df.head()["col_1"] == "1"
def test_pandas_datasource_processes_dataset_options(
    test_folder_connection_path, test_backends
):
    if "SparkDFDataset" not in test_backends:
        pytest.skip("Spark has not been enabled, so this test must be skipped.")
    datasource = SparkDFDatasource(
        "PandasCSV",
        batch_kwargs_generators={
            "subdir_reader": {
                "class_name": "SubdirReaderBatchKwargsGenerator",
                "base_directory": test_folder_connection_path,
            }
        },
    )
    batch_kwargs = datasource.build_batch_kwargs("subdir_reader", name="test")
    batch_kwargs["dataset_options"] = {"caching": False, "persist": False}
    batch = datasource.get_batch(batch_kwargs)
    validator = Validator(batch, ExpectationSuite(expectation_suite_name="foo"))
    dataset = validator.get_dataset()
    assert dataset.caching is False
    assert dataset._persist is False
def test_standalone_spark_csv_datasource(test_folder_connection_path):
    pytest.importorskip("pyspark")
    datasource = SparkDFDatasource(
        "SparkParquet",
        generators={
            "subdir_reader": {
                "class_name": "SubdirReaderBatchKwargsGenerator",
                "base_directory": test_folder_connection_path,
            }
        },
    )

    assert datasource.get_available_data_asset_names()["subdir_reader"]["names"] == [
        ("test", "file")
    ]
    batch = datasource.get_batch(
        batch_kwargs={
            "path": os.path.join(test_folder_connection_path, "test.csv"),
            "reader_options": {"header": True},
        }
    )
    assert isinstance(batch, Batch)
    # NOTE: below is a great example of CSV vs. Parquet typing: pandas reads content as string, spark as int
    assert batch.data.head()["col_1"] == "1"
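# The CSV variants above depend on test_folder_connection_path /
# test_folder_connection_path_csv fixtures that are not shown here. A minimal,
# hypothetical sketch under the same assumptions as the parquet fixture above
# (not the project's actual fixture): Spark's CSV reader does not infer schema
# by default, so col_1 comes back as the string "1", which is exactly what the
# CSV tests assert in contrast to the parquet tests' integer 1.
@pytest.fixture(scope="module")
def test_folder_connection_path_csv(tmp_path_factory):
    basepath = str(tmp_path_factory.mktemp("csv_context"))
    df = pd.DataFrame({"col_1": [1, 2, 3, 4, 5], "col_2": ["a", "b", "c", "d", "e"]})
    df.to_csv(os.path.join(basepath, "test.csv"), index=False)
    return basepath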