Example #1
def test_dataset_from_pandas_source(tmpdir):
    data_file = tmpdir + '/data.json'
    json_data = [
        {"name": "my name", "birthdate": "2020-10-01", "address": "1234 Main st", "size": 12},
        {"name": "your name", "birthdate": "2020-06-01", "address": "1313 Mockingbird Ln",
         "size": 12}
    ]
    with open(data_file, mode='w') as out:
        json.dump(json_data, out)

    store_defaults = FilesystemStoreBackendDefaults(root_directory=tmpdir)
    project_config.stores = store_defaults.stores
    project_config.expectations_store_name = store_defaults.expectations_store_name
    project_config.validations_store_name = store_defaults.validations_store_name
    project_config.checkpoint_store_name = store_defaults.checkpoint_store_name

    ctx = BaseDataContext(project_config=project_config)
    pd_dataset = PandasDataset(pandas.read_json(data_file),
                               **{'batch_kwargs': {'path': 'gcs://my_bucket/path/to/my/data'},
                                  'data_context': ctx})
    action = OpenLineageValidationAction(ctx,
                                         openlineage_host='http://localhost:5000',
                                         openlineage_namespace='test_ns',
                                         job_name='test_job')

    datasets = action._fetch_datasets_from_pandas_source(pd_dataset,
                                                         validation_result_suite=result_suite)
    assert len(datasets) == 1
    input_ds = datasets[0]
    assert input_ds.name == '/path/to/my/data'
    assert input_ds.namespace == "gcs://my_bucket"

    assert "dataSource" in input_ds.facets
    assert input_ds.facets["dataSource"].name == "gcs://my_bucket"
    assert input_ds.facets["dataSource"].uri == 'gcs://my_bucket'

    assert 'schema' in input_ds.facets
    assert len(input_ds.facets['schema'].fields) == 4
    assert all(f in input_ds.facets['schema'].fields
               for f in [SchemaField('name', 'object'),
                         SchemaField('birthdate', 'object'),
                         SchemaField('address', 'object'),
                         SchemaField('size', 'int64')])

    assert len(input_ds.inputFacets) == 3
    assert all(k in input_ds.inputFacets for k in
               ['dataQuality', 'greatExpectations_assertions', 'dataQualityMetrics'])
    assert input_ds.inputFacets['dataQuality'].rowCount == 10
    assert 'size' in input_ds.inputFacets['dataQuality'].columnMetrics
    assert input_ds.inputFacets['dataQuality'].columnMetrics['size'].sum == 60

    assert len(input_ds.inputFacets['greatExpectations_assertions'].assertions) == 2
    assert all(a in input_ds.inputFacets['greatExpectations_assertions'].assertions
               for a in [GreatExpectationsAssertion('expect_table_row_count_to_equal', True),
                         GreatExpectationsAssertion('expect_column_sum_to_be_between', True,
                                                    'size')])
Example #2
def test_DataContextConfig_with_FilesystemStoreBackendDefaults_and_simple_defaults(
    construct_data_context_config, default_pandas_datasource_config
):
    """
    What does this test and why?
    Ensure that a very simple DataContextConfig setup using FilesystemStoreBackendDefaults is created accurately
    This test sets the root_dir parameter
    """

    test_root_directory = "test_root_dir"

    store_backend_defaults = FilesystemStoreBackendDefaults(
        root_directory=test_root_directory
    )
    data_context_config = DataContextConfig(
        datasources={
            "my_pandas_datasource": DatasourceConfig(
                class_name="PandasDatasource",
                batch_kwargs_generators={
                    "subdir_reader": {
                        "class_name": "SubdirReaderBatchKwargsGenerator",
                        "base_directory": "../data/",
                    }
                },
            )
        },
        store_backend_defaults=store_backend_defaults,
    )

    # Create desired config
    data_context_id = data_context_config.anonymous_usage_statistics.data_context_id
    desired_config = construct_data_context_config(
        data_context_id=data_context_id, datasources=default_pandas_datasource_config
    )
    # Add root_directory to stores and data_docs
    desired_config["stores"][desired_config["expectations_store_name"]][
        "store_backend"
    ]["root_directory"] = test_root_directory
    desired_config["stores"][desired_config["validations_store_name"]]["store_backend"][
        "root_directory"
    ] = test_root_directory
    desired_config["stores"][desired_config["checkpoint_store_name"]]["store_backend"][
        "root_directory"
    ] = test_root_directory
    desired_config["data_docs_sites"]["local_site"]["store_backend"][
        "root_directory"
    ] = test_root_directory

    data_context_config_schema = DataContextConfigSchema()
    assert filter_properties_dict(
        properties=data_context_config_schema.dump(data_context_config)
    ) == filter_properties_dict(properties=desired_config)
    assert DataContext.validate_config(project_config=data_context_config)
Example #3
    def build_context(self):
        """
        Purpose:
            Create a DataContext and a datasource and add them to the object.
        Returns:
            Saves the DataContext and datasource to self.
        """
        self.context = ge.get_context()

        # Create the datasource configuration
        datasource_config = {
            "name": "example_datasource",
            "class_name": "Datasource",
            "module_name": "great_expectations.datasource",
            "execution_engine": {
                "module_name": "great_expectations.execution_engine",
                "class_name": "PandasExecutionEngine",
            },
            "data_connectors": {
                "default_runtime_data_connector_name": {
                    "class_name": "RuntimeDataConnector",
                    "batch_identifiers": ["default_identifier_name"],
                },
            },
        }

        # Create the data context configuration
        data_context_config = DataContextConfig(
            datasources={
                "pandas": DatasourceConfig(
                    class_name="Datasource",
                    execution_engine={
                        "class_name": "PandasExecutionEngine"
                    },
                    data_connectors={
                        "default_runtime_data_connector_name": {
                            "class_name": "RuntimeDataConnector",
                            "batch_identifiers": ["default_identifier_name"],
                        }
                    },
                )
            },
            store_backend_defaults=FilesystemStoreBackendDefaults(
                root_directory=os.path.join(os.getcwd(), "great_expectations")
            ),
        )

        # Build the context and add the datasource
        self.context = BaseDataContext(project_config=data_context_config)
        # self.context.test_yaml_config(yaml.dump(datasource_config))
        self.context.add_datasource(**datasource_config)
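
# A minimal, hypothetical usage sketch for the context built above: the
# RuntimeDataConnector registered in build_context() lets an in-memory pandas
# DataFrame be validated through a RuntimeBatchRequest. The suite name, asset
# name, and DataFrame below are illustrative assumptions, not part of the
# original snippet, and `context` stands for the BaseDataContext stored on self.
import pandas as pd
from great_expectations.core.batch import RuntimeBatchRequest

df = pd.DataFrame({"name": ["my name", "your name"], "size": [12, 12]})

batch_request = RuntimeBatchRequest(
    datasource_name="example_datasource",
    data_connector_name="default_runtime_data_connector_name",
    data_asset_name="my_asset",  # assumed asset name
    runtime_parameters={"batch_data": df},  # pass the DataFrame directly
    batch_identifiers={"default_identifier_name": "default"},
)

context.create_expectation_suite("my_suite", overwrite_existing=True)
validator = context.get_validator(
    batch_request=batch_request, expectation_suite_name="my_suite"
)
validator.expect_column_values_to_not_be_null("name")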
Example #4
def test_dataset_from_sql_source(test_db_file, tmpdir):
    connection_url = f'sqlite:///{test_db_file}'
    engine = create_engine(connection_url)

    ds = SqlAlchemyDataset(table_name=TABLE_NAME, engine=engine)

    store_defaults = FilesystemStoreBackendDefaults(root_directory=tmpdir)
    project_config.stores = store_defaults.stores
    project_config.expectations_store_name = store_defaults.expectations_store_name
    project_config.validations_store_name = store_defaults.validations_store_name
    project_config.checkpoint_store_name = store_defaults.checkpoint_store_name

    ctx = BaseDataContext(project_config=project_config)
    action = OpenLineageValidationAction(ctx,
                                         openlineage_host='http://localhost:5000',
                                         openlineage_namespace='test_ns',
                                         job_name='test_job')
    datasets = action._fetch_datasets_from_sql_source(ds, result_suite)
    assert datasets is not None
    assert len(datasets) == 1
    input_ds = datasets[0]
    assert input_ds.name == TABLE_NAME
    assert input_ds.namespace == "sqlite"

    assert "dataSource" in input_ds.facets
    assert input_ds.facets["dataSource"].name == "sqlite"
    assert input_ds.facets["dataSource"].uri == "sqlite:/" + test_db_file

    assert 'schema' in input_ds.facets
    assert len(input_ds.facets['schema'].fields) == 4
    assert all(f in input_ds.facets['schema'].fields
               for f in [SchemaField('name', 'TEXT'),
                         SchemaField('birthdate', 'TEXT'),
                         SchemaField('address', 'TEXT'),
                         SchemaField('size', 'INTEGER')])

    assert len(input_ds.inputFacets) == 3
    assert all(k in input_ds.inputFacets for k in
               ['dataQuality', 'greatExpectations_assertions', 'dataQualityMetrics'])
    assert input_ds.inputFacets['dataQuality'].rowCount == 10
    assert 'size' in input_ds.inputFacets['dataQuality'].columnMetrics
    assert input_ds.inputFacets['dataQuality'].columnMetrics['size'].sum == 60

    assert len(input_ds.inputFacets['greatExpectations_assertions'].assertions) == 2
    assert all(a in input_ds.inputFacets['greatExpectations_assertions'].assertions
               for a in [GreatExpectationsAssertion('expect_table_row_count_to_equal', True),
                         GreatExpectationsAssertion('expect_column_sum_to_be_between', True,
                                                    'size')])
Example #5
    def _get_ge_context_local(ge_project_path: str) -> BaseDataContext:
        """
        This is configured to work with an in-memory pandas DataFrame.
        This setup allows us to run validations before (perhaps unnecessarily) writing any data
        to disk, as well as at any other stage.

        Currently using local storage.

        Args:
            ge_project_path (str): The path to the Great Expectations project,
                e.g. `/home/viadot/my_flow`. Expectation suites need to be placed
                inside the `expectations` folder, e.g.
                `/home/viadot/my_flow/expectations/failure.json`.

        Returns:
            BaseDataContext: The GE context (i.e. config) required to run the validations.
        """
        data_context_config = DataContextConfig(
            datasources={
                "pandas": DatasourceConfig(
                    class_name="PandasDatasource",
                    batch_kwargs_generators={},  # override the CSV default
                )
            },
            store_backend_defaults=FilesystemStoreBackendDefaults(ge_project_path),
            validation_operators={
                "action_list_operator": {
                    "class_name": "ActionListValidationOperator",
                    "action_list": [
                        {
                            "name": "store_validation_result",
                            "action": {"class_name": "StoreValidationResultAction"},
                        },
                        {
                            "name": "store_evaluation_params",
                            "action": {"class_name": "StoreEvaluationParametersAction"},
                        },
                        {
                            "name": "update_data_docs",
                            "action": {"class_name": "UpdateDataDocsAction"},
                        },
                    ],
                }
            },
        )
        context = BaseDataContext(project_config=data_context_config)
        return context
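
# A minimal sketch, assuming _get_ge_context_local is callable as shown, of how
# the returned context might validate an in-memory DataFrame with the legacy
# batch_kwargs API. The project path, DataFrame, and the "failure" suite name
# are illustrative assumptions, not part of the original snippet.
import pandas as pd

context = _get_ge_context_local("/home/viadot/my_flow")
df = pd.DataFrame({"sales": [100, 200, 300]})

# The "pandas" datasource accepts an in-memory DataFrame via batch_kwargs;
# "failure" refers to a suite stored under the project's `expectations/` folder.
batch = context.get_batch({"dataset": df, "datasource": "pandas"}, "failure")
results = context.run_validation_operator(
    "action_list_operator", assets_to_validate=[batch]
)
print(results.success)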
Example #6
# 1. Install Great Expectations
# %pip install great-expectations
# Imports

# 2. Set up Great Expectations
# In-memory DataContext using DBFS and FilesystemStoreBackendDefaults

# CODE vvvvv vvvvv
# This root directory is for use in Databricks
root_directory = "/dbfs/great_expectations/"

# For testing purposes only, we change the root_directory to an ephemeral location created by our test runner
root_directory = os.path.join(os.getcwd(), "dbfs_temp_directory")

data_context_config = DataContextConfig(
    store_backend_defaults=FilesystemStoreBackendDefaults(
        root_directory=root_directory
    ),
)
context = BaseDataContext(project_config=data_context_config)
# CODE ^^^^^ ^^^^^

# ASSERTIONS vvvvv vvvvv
# Check the stores were initialized
uncommitted_directory = os.path.join(root_directory, "uncommitted")
assert {"checkpoints", "expectations",
        "uncommitted"}.issubset(set(os.listdir(root_directory)))
assert os.listdir(uncommitted_directory) == ["validations"]
# ASSERTIONS ^^^^^ ^^^^^

# 3. Prepare your data

# See guide
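
# A hedged sketch of what step 3 could look like (not taken from the guide): a
# small in-memory pandas DataFrame stands in for real data. On Databricks this
# would more typically be a Spark DataFrame read from DBFS; the column names
# below are illustrative assumptions.
import pandas as pd

df = pd.DataFrame(
    {"passenger_count": [1, 2, 5], "fare_amount": [7.5, 12.0, 30.25]}
)
# Later steps in the guide would add a datasource to `context` and wrap `df`
# in a batch request so it can be validated.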
Example #7
def test_dataset_from_custom_sql(test_db_file, tmpdir):
    connection_url = f'sqlite:///{test_db_file}'
    engine = create_engine(connection_url)
    engine.execute("""CREATE TABLE join_table (name text, workplace text, position text)""")
    custom_sql = f"""SELECT * FROM {TABLE_NAME} t INNER JOIN join_table j ON t.name=j.name"""

    # note the batch_kwarg key is 'query', but the constructor arg is 'custom_sql'
    ds = SqlAlchemyDataset(engine=engine,
                           custom_sql=custom_sql,
                           batch_kwargs={'query': custom_sql})

    store_defaults = FilesystemStoreBackendDefaults(root_directory=tmpdir)
    project_config.stores = store_defaults.stores
    project_config.expectations_store_name = store_defaults.expectations_store_name
    project_config.validations_store_name = store_defaults.validations_store_name
    project_config.checkpoint_store_name = store_defaults.checkpoint_store_name

    ctx = BaseDataContext(project_config=project_config)
    action = OpenLineageValidationAction(ctx,
                                         openlineage_host='http://localhost:5000',
                                         openlineage_namespace='test_ns',
                                         job_name='test_job')
    datasets = action._fetch_datasets_from_sql_source(ds, result_suite)
    assert datasets is not None
    assert len(datasets) == 2
    assert all(name in [TABLE_NAME, 'join_table'] for name in [ds.name for ds in datasets])

    input_ds = next(ds for ds in datasets if ds.name == TABLE_NAME)

    assert "dataSource" in input_ds.facets
    assert input_ds.facets["dataSource"].name == "sqlite"
    assert input_ds.facets["dataSource"].uri == "sqlite:/" + test_db_file

    assert 'schema' in input_ds.facets
    assert len(input_ds.facets['schema'].fields) == 4
    assert all(f in input_ds.facets['schema'].fields
               for f in [SchemaField('name', 'TEXT'),
                         SchemaField('birthdate', 'TEXT'),
                         SchemaField('address', 'TEXT'),
                         SchemaField('size', 'INTEGER')])
    assert len(input_ds.inputFacets) == 3
    assert all(k in input_ds.inputFacets for k in
               ['dataQuality', 'greatExpectations_assertions', 'dataQualityMetrics'])
    assert input_ds.inputFacets['dataQuality'].rowCount == 10
    assert 'size' in input_ds.inputFacets['dataQuality'].columnMetrics
    assert input_ds.inputFacets['dataQuality'].columnMetrics['size'].sum == 60

    assert len(input_ds.inputFacets['greatExpectations_assertions'].assertions) == 2
    assert all(a in input_ds.inputFacets['greatExpectations_assertions'].assertions
               for a in [GreatExpectationsAssertion('expect_table_row_count_to_equal', True),
                         GreatExpectationsAssertion('expect_column_sum_to_be_between', True,
                                                    'size')])

    input_ds = next(ds for ds in datasets if ds.name == 'join_table')
    assert 'schema' in input_ds.facets
    assert len(input_ds.facets['schema'].fields) == 3
    assert all(f in input_ds.facets['schema'].fields
               for f in [SchemaField('name', 'TEXT'),
                         SchemaField('workplace', 'TEXT'),
                         SchemaField('position', 'TEXT')])
    assert len(input_ds.inputFacets) == 3
    assert all(k in input_ds.inputFacets for k in
               ['dataQuality', 'greatExpectations_assertions', 'dataQualityMetrics'])
Example #8
"""
GE was installed inside a zip file (which is a location allowed by PEP 273).

Therefore, this test is intended to be run after installing GE inside a zip file and
then setting the appropriate PYTHONPATH env variable. If desired, this test can also be
run after installing GE in a normal filesystem location (i.e. a directory).

This test is OK if it finishes without raising an exception.

To make it easier to debug this test, it prints:
* The location of the GE library: to verify that we are testing the library that we want
* The version of the GE library: idem
* data_docs url: If everything works, this will be a url (e.g. starting with file://...)


Additional info: https://github.com/great-expectations/great_expectations/issues/3772 and
https://www.python.org/dev/peps/pep-0273/
"""

print(f"Great Expectations location: {ge.__file__}")
print(f"Great Expectations version: {ge.__version__}")

data_context_config = DataContextConfig(
    datasources={
        "example_datasource": DatasourceConfig(class_name="PandasDatasource")
    },
    store_backend_defaults=FilesystemStoreBackendDefaults(
        root_directory=tempfile.mkdtemp() + os.sep + "my_greatexp_workdir"),
)
context = BaseDataContext(project_config=data_context_config)
print(f"Great Expectations data_docs url: {context.build_data_docs()}")