Example #1
0
def test_suite_edit_multiple_datasources_with_generator_with_no_additional_args_with_suite_without_citations(
    caplog,
    site_builder_data_context_with_html_store_titanic_random,
):
    """
    Here we verify that the "suite edit" command helps the user to specify the batch
    kwargs when it is called without the optional arguments that specify the batch.

    First, we call the "suite new" command to create the expectation suite our test
    will edit - this step is just setup.

    We call the "suite edit" command without any optional arguments. This means that
    the command will help us specify the batch kwargs interactively.

    The data context has two datasources - we choose one of them. It has a generator
    configured. We choose to use the generator and select a generator asset from the list.
    """
    root_dir = site_builder_data_context_with_html_store_titanic_random.root_directory
    os.chdir(root_dir)
    runner = CliRunner(mix_stderr=False)
    result = runner.invoke(
        cli,
        ["suite", "new", "-d", root_dir, "--suite", "foo_suite", "--no-view"],
        input="2\n1\n1\n\n",
        catch_exceptions=False,
    )
    stdout = result.stdout
    assert result.exit_code == 0
    assert "A new Expectation suite 'foo_suite' was added to your project" in stdout

    # remove the citations from the suite
    context = DataContext(root_dir)
    suite = context.get_expectation_suite("foo_suite")
    suite.meta.pop("citations")
    context.save_expectation_suite(suite)

    runner = CliRunner(mix_stderr=False)
    result = runner.invoke(
        cli,
        ["suite", "edit", "foo_suite", "-d", root_dir, "--no-jupyter"],
        input="2\n1\n1\n\n",
        catch_exceptions=False,
    )

    assert result.exit_code == 0
    stdout = result.stdout
    assert "Select a datasource" in stdout
    assert "Which data would you like to use" in stdout
    assert "To continue editing this suite, run" in stdout

    expected_notebook_path = os.path.join(root_dir, "uncommitted",
                                          "foo_suite.ipynb")
    assert os.path.isfile(expected_notebook_path)

    expected_suite_path = os.path.join(root_dir, "expectations",
                                       "foo_suite.json")
    assert os.path.isfile(expected_suite_path)
    assert_no_logging_messages_or_tracebacks(caplog, result)
Example #2
0
def load_expectation_suite(context: DataContext,
                           suite_name: str) -> ExpectationSuite:
    """
    Load an expectation suite from a given context.

    Handles a suite name with or without `.json`
    """
    if suite_name.endswith(".json"):
        suite_name = suite_name[:-5]
    try:
        suite = context.get_expectation_suite(suite_name)
        return suite
    except ge_exceptions.DataContextError as e:
        cli_message(
            f"<red>Could not find a suite named `{suite_name}`.</red> Please check "
            "the name by running `great_expectations suite list` and try again."
        )
        logger.info(e)
        # TODO this should try to send a usage statistic failure
        sys.exit(1)
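A minimal usage sketch for the helper above; the project path and suite name are illustrative, not taken from these examples:

from great_expectations import DataContext

context = DataContext("./great_expectations")
# Both spellings resolve to the same suite, because the helper strips a
# trailing ".json" before the lookup; an unknown suite name exits the CLI.
suite_a = load_expectation_suite(context, "my_suite")
suite_b = load_expectation_suite(context, "my_suite.json")
assert suite_a.expectation_suite_name == suite_b.expectation_suite_name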
Example #3
0
def load_expectation_suite(
    # TODO consolidate all the myriad CLI tests into this
    context: DataContext,
    suite_name: str,
    usage_event: str,
) -> ExpectationSuite:
    """
    Load an expectation suite from a given context.

    Handles a suite name with or without `.json`
    :param usage_event: usage-statistics event name recorded if the suite cannot be loaded
    """
    if suite_name.endswith(".json"):
        suite_name = suite_name[:-5]
    try:
        suite = context.get_expectation_suite(suite_name)
        return suite
    except ge_exceptions.DataContextError as e:
        exit_with_failure_message_and_stats(
            context,
            usage_event,
            f"<red>Could not find a suite named `{suite_name}`.</red> Please check "
            "the name by running `great_expectations suite list` and try again.",
        )
Example #4
0
def test_suite_new_empty_with_no_jupyter(
    mock_webbrowser,
    mock_subprocess,
    caplog,
    data_context_parameterized_expectation_suite,
    filesystem_csv_2,
):
    """
    Running "suite new --no-jupyter" should:
    - make an empty suite
    - NOT open jupyter
    - NOT open data docs
    """
    os.mkdir(
        os.path.join(
            data_context_parameterized_expectation_suite.root_directory, "uncommitted"
        )
    )
    root_dir = data_context_parameterized_expectation_suite.root_directory
    runner = CliRunner(mix_stderr=False)
    csv = os.path.join(filesystem_csv_2, "f1.csv")
    # TODO this test must be updated to remove the --empty flag in the next major release
    result = runner.invoke(
        cli,
        ["suite", "new", "-d", root_dir, "--empty", "--suite", "foo", "--no-jupyter"],
        input=f"{csv}\n",
        catch_exceptions=False,
    )
    stdout = result.stdout

    assert result.exit_code == 0
    assert "Enter the path" in stdout
    assert "Name the new expectation suite" not in stdout
    assert (
        "Great Expectations will choose a couple of columns and generate expectations"
        not in stdout
    )
    assert "Generating example Expectation Suite..." not in stdout
    assert "The following Data Docs sites were built" not in stdout
    assert (
        "Great Expectations will create a new Expectation Suite 'foo' and store it here"
        in stdout
    )
    assert "open a notebook for you now" not in stdout

    expected_suite_path = os.path.join(root_dir, "expectations", "foo.json")
    assert os.path.isfile(expected_suite_path)

    expected_notebook = os.path.join(root_dir, "uncommitted", "edit_foo.ipynb")
    assert os.path.isfile(expected_notebook)

    context = DataContext(root_dir)
    assert "foo" in context.list_expectation_suite_names()
    suite = context.get_expectation_suite("foo")
    assert suite.expectations == []
    citations = suite.get_citations()
    citations[0].pop("citation_date")
    assert citations[0] == {
        "batch_kwargs": {
            "data_asset_name": "f1",
            "datasource": "mydatasource",
            "path": csv,
            "reader_method": "read_csv",
        },
        "batch_markers": None,
        "batch_parameters": None,
        "comment": "New suite added via CLI",
    }

    assert mock_subprocess.call_count == 0
    assert mock_webbrowser.call_count == 0

    assert_no_logging_messages_or_tracebacks(caplog, result)
Example #5
0
def test_suite_edit_one_datasources_no_generator_with_no_additional_args_and_no_citations(
    mock_webbrowser, mock_subprocess, caplog, empty_data_context, filesystem_csv_2
):
    """
    Here we verify that the "suite edit" command helps the user to specify the batch
    kwargs when it is called without the optional arguments that specify the batch.

    First, we call the "suite new" command to create the expectation suite our test
    will edit - this step is just setup.

    We call the "suite edit" command without any optional arguments. This means that
    the command will help us specify the batch kwargs interactively.

    The data context has one datasource. The datasource has no generators
    configured. The command prompts us to enter the file path.
    """
    empty_data_context.add_datasource(
        "my_datasource",
        module_name="great_expectations.datasource",
        class_name="PandasDatasource",
    )

    not_so_empty_data_context = empty_data_context
    project_root_dir = not_so_empty_data_context.root_directory

    root_dir = project_root_dir
    os.chdir(root_dir)
    runner = CliRunner(mix_stderr=False)
    result = runner.invoke(
        cli,
        ["suite", "demo", "-d", root_dir],
        input="{0:s}\nmy_new_suite\n\n".format(
            os.path.join(filesystem_csv_2, "f1.csv")
        ),
        catch_exceptions=False,
    )
    stdout = result.stdout
    assert mock_webbrowser.call_count == 1
    assert mock_subprocess.call_count == 0
    mock_subprocess.reset_mock()
    mock_webbrowser.reset_mock()
    assert result.exit_code == 0
    assert (
        "Great Expectations will store these expectations in a new Expectation Suite 'my_new_suite' here:"
        in stdout
    )

    # remove the citations from the suite
    context = DataContext(project_root_dir)
    suite = context.get_expectation_suite("my_new_suite")
    suite.meta.pop("citations")
    context.save_expectation_suite(suite)

    runner = CliRunner(mix_stderr=False)
    result = runner.invoke(
        cli,
        ["suite", "edit", "my_new_suite", "-d", root_dir],
        input="{0:s}\n\n".format(os.path.join(filesystem_csv_2, "f1.csv")),
        catch_exceptions=False,
    )

    assert result.exit_code == 0
    stdout = result.stdout
    assert "Select a datasource" not in stdout
    assert "Which data would you like to use" not in stdout
    assert "Enter the path" in stdout

    expected_notebook_path = os.path.join(
        root_dir, "uncommitted", "edit_my_new_suite.ipynb"
    )
    assert os.path.isfile(expected_notebook_path)

    expected_suite_path = os.path.join(root_dir, "expectations", "my_new_suite.json")
    assert os.path.isfile(expected_suite_path)

    assert mock_webbrowser.call_count == 0
    assert mock_subprocess.call_count == 1

    assert_no_logging_messages_or_tracebacks(caplog, result)
def test_notebook_execution_with_pandas_backend(
        titanic_data_context_no_data_docs):
    """
    This tests that the notebook is written to disk and executes without error.

    To set this test up we:
    - create a scaffold notebook
    - verify that no validations have happened

    We then:
    - execute that notebook (Note this will raise various errors like
    CellExecutionError if any cell in the notebook fails)
    - create a new context from disk
    - verify that a validation has been run with our expectation suite
    """
    # Since we'll run the notebook, we use a context with no data docs to avoid
    # the renderer's default behavior of building and opening docs, which is not
    # part of this test.
    context = titanic_data_context_no_data_docs
    root_dir = context.root_directory
    uncommitted_dir = os.path.join(root_dir, "uncommitted")
    suite_name = "my_suite"
    suite = context.create_expectation_suite(suite_name)

    csv_path = os.path.join(root_dir, "..", "data", "Titanic.csv")
    batch_kwargs = {"datasource": "mydatasource", "path": csv_path}

    # Sanity check test setup
    assert context.list_expectation_suite_names() == [suite_name]
    assert context.list_datasources() == [{
        "module_name": "great_expectations.datasource",
        "class_name": "PandasDatasource",
        "data_asset_type": {
            "module_name": "great_expectations.dataset",
            "class_name": "PandasDataset",
        },
        "batch_kwargs_generators": {
            "mygenerator": {
                "class_name": "SubdirReaderBatchKwargsGenerator",
                "base_directory": "../data",
            }
        },
        "name": "mydatasource",
    }]
    assert context.get_validation_result(suite_name) == {}
    notebook_path = os.path.join(uncommitted_dir, f"{suite_name}.ipynb")
    assert not os.path.isfile(notebook_path)

    # Create notebook
    renderer = SuiteScaffoldNotebookRenderer(titanic_data_context_no_data_docs,
                                             suite, batch_kwargs)
    renderer.render_to_disk(notebook_path)
    assert os.path.isfile(notebook_path)

    with open(notebook_path) as f:
        nb = nbformat.read(f, as_version=4)

    # Run notebook
    ep = ExecutePreprocessor(timeout=600, kernel_name="python3")
    ep.preprocess(nb, {"metadata": {"path": uncommitted_dir}})

    # Useful to inspect executed notebook
    output_notebook = os.path.join(uncommitted_dir, "output.ipynb")
    with open(output_notebook, "w") as f:
        nbformat.write(nb, f)

    # Assertions about output
    context = DataContext(root_dir)
    obs_validation_result = context.get_validation_result(suite_name)
    assert obs_validation_result.statistics == {
        "evaluated_expectations": 3,
        "successful_expectations": 3,
        "unsuccessful_expectations": 0,
        "success_percent": 100,
    }
    suite = context.get_expectation_suite(suite_name)
    assert suite.expectations
saves validation results to your results store and then updates Data Docs.

This makes viewing validation results easy for you and your team.

Usage:
- Run this file: `python {0}`.
- This can be run manually or via a scheduler such as cron.
- If your pipeline runner supports python snippets you can paste this into your
pipeline.
"""
import sys

from great_expectations import DataContext

# checkpoint configuration
context = DataContext("{1}")
suite = context.get_expectation_suite("{2}")
# You can modify your BatchKwargs to select different data
batch_kwargs = {3}

# checkpoint validation process
batch = context.get_batch(batch_kwargs, suite)
results = context.run_validation_operator("action_list_operator", [batch])

if not results["success"]:
    print("Validation Failed!")
    sys.exit(1)

print("Validation Succeeded!")
sys.exit(0)
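The numbered placeholders in the template above are filled in by the CLI when it writes the script; a hedged mapping with illustrative values (the substitution call itself is not part of this snippet):

# {0} -> the generated script's own filename, e.g. "validation_playground.py"
# {1} -> the DataContext root directory, e.g. "/project/great_expectations"
# {2} -> the expectation suite name, e.g. "my_suite"
# {3} -> the batch_kwargs dict, e.g. {"datasource": "mydatasource", "path": "data/Titanic.csv"}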
Example #8
0
import sys

from great_expectations import DataContext

# checkpoint configuration
context = DataContext("/home/demilsonfayika/staging/results")
suite = context.get_expectation_suite("staging.validation")
# You can modify your BatchKwargs to select different data
batch_kwargs = {
    "table": "staging",
    "schema": "public",
    "datasource": "stagingtable"
}

# checkpoint validation process
batch = context.get_batch(batch_kwargs, suite)
results = context.run_validation_operator("action_list_operator", [batch])

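# Unlike the generic template above, this script always exits 0 and reports
# pass/fail only through the printed JSON, so the caller must parse stdout to
# detect a validation failure.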
if not results["success"]:
    print('{"result":"fail"}')
    sys.exit(0)

print('{"result":"pass"}')
sys.exit(0)
Example #9
0
def test_suite_new_empty_suite_creates_empty_suite(mock_webbrowser,
                                                   mock_subprocess, caplog,
                                                   data_context,
                                                   filesystem_csv_2):
    """
    Running "suite new --empty" should:
    - make an empty suite
    - open jupyter
    - NOT open data docs
    """
    project_root_dir = data_context.root_directory
    os.mkdir(os.path.join(project_root_dir, "uncommitted"))
    root_dir = project_root_dir
    os.chdir(root_dir)
    runner = CliRunner(mix_stderr=False)
    csv = os.path.join(filesystem_csv_2, "f1.csv")
    result = runner.invoke(
        cli,
        ["suite", "new", "-d", root_dir, "--empty", "--suite", "foo"],
        input=f"{csv}\n",
        catch_exceptions=False,
    )
    stdout = result.stdout

    assert result.exit_code == 0
    assert "Enter the path" in stdout
    assert "Name the new expectation suite" not in stdout
    assert (
        "Great Expectations will choose a couple of columns and generate expectations"
        not in stdout)
    assert "Generating example Expectation Suite..." not in stdout
    assert "The following Data Docs sites were built" not in stdout
    assert "A new Expectation suite 'foo' was added to your project" in stdout
    assert (
        "Because you requested an empty suite, we'll open a notebook for you now to edit it!"
        in stdout)

    expected_suite_path = os.path.join(root_dir, "expectations", "foo.json")
    assert os.path.isfile(expected_suite_path)

    expected_notebook = os.path.join(root_dir, "uncommitted", "foo.ipynb")
    assert os.path.isfile(expected_notebook)

    context = DataContext(root_dir)
    assert "foo" in context.list_expectation_suite_names()
    suite = context.get_expectation_suite("foo")
    assert suite.expectations == []
    citations = suite.get_citations()
    citations[0].pop("citation_date")
    assert citations[0] == {
        "batch_kwargs": {
            "datasource": "mydatasource",
            "path": csv,
            "reader_method": "read_csv",
        },
        "batch_markers": None,
        "batch_parameters": None,
        "comment": "New suite added via CLI",
    }

    assert mock_subprocess.call_count == 1
    call_args = mock_subprocess.call_args[0][0]
    assert call_args[0] == "jupyter"
    assert call_args[1] == "notebook"
    assert expected_notebook in call_args[2]

    assert mock_webbrowser.call_count == 0

    assert_no_logging_messages_or_tracebacks(caplog, result)
def test_cli_init_on_new_project_extra_whitespace_in_url(
        mock_webbrowser, caplog, tmp_path_factory, titanic_sqlite_db_file):
    project_dir = str(tmp_path_factory.mktemp("test_cli_init_diff"))
    ge_dir = os.path.join(project_dir, "great_expectations")

    database_path = os.path.join(project_dir, "titanic.db")
    shutil.copy(titanic_sqlite_db_file, database_path)
    engine = create_engine("sqlite:///{}".format(database_path))
    engine_url_with_added_whitespace = "    " + str(engine.url) + "  "

    runner = CliRunner(mix_stderr=False)
    result = runner.invoke(
        cli,
        ["init", "-d", project_dir],
        input="Y\n2\n5\ntitanic\n{}\n1\nwarning\n\n".format(
            engine_url_with_added_whitespace, catch_exceptions=False),
    )
    stdout = result.output
    assert len(stdout) < 3000, "CLI output is unreasonably long."

    assert "Always know what to expect from your data" in stdout
    assert "What data would you like Great Expectations to connect to" in stdout
    assert "Which database backend are you using" in stdout
    assert "Give your new data source a short name" in stdout
    assert "What is the url/connection string for the sqlalchemy connection" in stdout
    assert "Attempting to connect to your database." in stdout
    assert "Great Expectations connected to your database" in stdout
    assert "Which table would you like to use?" in stdout
    assert "Name the new expectation suite [main.titanic.warning]" in stdout
    assert (
        "Great Expectations will choose a couple of columns and generate expectations about them"
        in stdout)
    assert "Generating example Expectation Suite..." in stdout
    assert "Building" in stdout
    assert "Data Docs" in stdout
    assert "A new Expectation suite 'warning' was added to your project" in stdout
    assert "Great Expectations is now set up" in stdout

    context = DataContext(ge_dir)
    assert len(context.list_datasources()) == 1
    assert context.list_datasources() == [{
        "class_name": "SqlAlchemyDatasource",
        "name": "titanic"
    }]

    first_suite = context.list_expectation_suites()[0]
    suite = context.get_expectation_suite(first_suite.expectation_suite_name)
    assert len(suite.expectations) == 13

    assert os.path.isdir(ge_dir)
    config_path = os.path.join(project_dir,
                               "great_expectations/great_expectations.yml")
    assert os.path.isfile(config_path)

    config = yaml.load(open(config_path, "r"))
    data_source_class = config["datasources"]["titanic"]["data_asset_type"][
        "class_name"]
    assert data_source_class == "SqlAlchemyDataset"

    assert_no_logging_messages_or_tracebacks(caplog, result)

    assert result.exit_code == 0
    assert mock_webbrowser.call_count == 1
    assert "{}/great_expectations/uncommitted/data_docs/local_site/validations/warning/".format(
        project_dir) in mock_webbrowser.call_args[0][0]
def test_notebook_execution_rule_based_profiler_with_pandas_backend(
    titanic_v013_multi_datasource_pandas_data_context_with_checkpoints_v1_with_empty_store_stats_enabled,
    bobby_columnar_table_multi_batch,
):
    """
    To set this test up we:

    - create a suite using Rule-Based Profiler
    - verify that no validations have happened
    - create the suite edit notebook by hijacking the private cli method

    We then:
    - execute that notebook (Note this will raise various errors like
    CellExecutionError if any cell in the notebook fails)
    - create a new context from disk
    - verify that a validation has been run with our expectation suite
    """
    context: DataContext = titanic_v013_multi_datasource_pandas_data_context_with_checkpoints_v1_with_empty_store_stats_enabled
    root_dir: str = context.root_directory
    uncommitted_dir: str = os.path.join(root_dir, "uncommitted")
    expectation_suite_name: str = "warning"

    context.create_expectation_suite(
        expectation_suite_name=expectation_suite_name)
    batch_request: dict = {
        "datasource_name": "my_datasource",
        "data_connector_name": "my_basic_data_connector",
        "data_asset_name": "Titanic_1912",
    }

    # Sanity check test setup
    original_suite: ExpectationSuite = context.get_expectation_suite(
        expectation_suite_name=expectation_suite_name)
    assert len(original_suite.expectations) == 0
    assert context.list_expectation_suite_names() == [expectation_suite_name]
    assert context.list_datasources() == [
        {
            "name": "my_datasource",
            "class_name": "Datasource",
            "module_name": "great_expectations.datasource",
            "execution_engine": {
                "class_name": "PandasExecutionEngine",
                "module_name": "great_expectations.execution_engine",
            },
            "data_connectors": {
                "my_basic_data_connector": {
                    "module_name":
                    "great_expectations.datasource.data_connector",
                    "base_directory": f"{root_dir}/../data/titanic",
                    "default_regex": {
                        "pattern": "(.*)\\.csv",
                        "group_names": ["data_asset_name"],
                    },
                    "class_name": "InferredAssetFilesystemDataConnector",
                },
                "my_special_data_connector": {
                    "glob_directive": "*.csv",
                    "assets": {
                        "users": {
                            "pattern":
                            "(.+)_(\\d+)_(\\d+)\\.csv",
                            "group_names": ["name", "timestamp", "size"],
                            "class_name":
                            "Asset",
                            "base_directory":
                            f"{root_dir}/../data/titanic",
                            "module_name":
                            "great_expectations.datasource.data_connector.asset",
                        }
                    },
                    "module_name":
                    "great_expectations.datasource.data_connector",
                    "base_directory": f"{root_dir}/../data/titanic",
                    "default_regex": {
                        "pattern": "(.+)\\.csv",
                        "group_names": ["name"]
                    },
                    "class_name": "ConfiguredAssetFilesystemDataConnector",
                },
                "my_other_data_connector": {
                    "glob_directive": "*.csv",
                    "assets": {
                        "users": {
                            "class_name":
                            "Asset",
                            "module_name":
                            "great_expectations.datasource.data_connector.asset",
                        }
                    },
                    "module_name":
                    "great_expectations.datasource.data_connector",
                    "base_directory": f"{root_dir}/../data/titanic",
                    "default_regex": {
                        "pattern": "(.+)\\.csv",
                        "group_names": ["name"]
                    },
                    "class_name": "ConfiguredAssetFilesystemDataConnector",
                },
                "my_runtime_data_connector": {
                    "module_name":
                    "great_expectations.datasource.data_connector",
                    "batch_identifiers":
                    ["pipeline_stage_name", "airflow_run_id"],
                    "class_name": "RuntimeDataConnector",
                },
            },
        },
        {
            "name": "my_additional_datasource",
            "class_name": "Datasource",
            "module_name": "great_expectations.datasource",
            "execution_engine": {
                "module_name": "great_expectations.execution_engine",
                "class_name": "PandasExecutionEngine",
            },
            "data_connectors": {
                "my_additional_data_connector": {
                    "module_name":
                    "great_expectations.datasource.data_connector",
                    "default_regex": {
                        "pattern": "(.*)\\.csv",
                        "group_names": ["data_asset_name"],
                    },
                    "base_directory": f"{root_dir}/../data/titanic",
                    "class_name": "InferredAssetFilesystemDataConnector",
                }
            },
        },
    ]

    assert context.get_validation_result(
        expectation_suite_name="warning") == {}

    # Load profiler configs & loop (run tests for each one)
    yaml_config: str = bobby_columnar_table_multi_batch["profiler_config"]

    # Instantiate Profiler
    profiler_config: dict = yaml.load(yaml_config)

    # Roundtrip through schema validation to remove any illegal fields add/or restore any missing fields.
    deserialized_config: dict = ruleBasedProfilerConfigSchema.load(
        profiler_config)
    serialized_config: dict = ruleBasedProfilerConfigSchema.dump(
        deserialized_config)

    # `class_name`/`module_name` are generally consumed through `instantiate_class_from_config`
    # so we need to manually remove those values if we wish to use the **kwargs instantiation pattern
    serialized_config.pop("class_name")
    serialized_config.pop("module_name")

    profiler: RuleBasedProfiler = RuleBasedProfiler(
        **serialized_config,
        data_context=context,
    )

    profiler_name: str = "bobby_user_workflow"

    context.save_profiler(
        profiler=profiler,
        name=profiler_name,
    )

    # Create notebook
    # do not want to actually send usage_message, since the function call is not the result of actual usage
    _suite_edit_workflow(
        context=context,
        expectation_suite_name=expectation_suite_name,
        profile=True,
        profiler_name=profiler_name,
        usage_event="test_notebook_execution",
        interactive_mode=CLISuiteInteractiveFlagCombinations.UNPROMPTED_INTERACTIVE_FALSE_MANUAL_TRUE,
        no_jupyter=True,
        create_if_not_exist=False,
        datasource_name=None,
        batch_request=batch_request,
        additional_batch_request_args=None,
        suppress_usage_message=True,
        assume_yes=True,
    )
    edit_notebook_path: str = os.path.join(uncommitted_dir,
                                           "edit_warning.ipynb")
    assert os.path.isfile(edit_notebook_path)

    run_notebook(
        notebook_path=edit_notebook_path,
        notebook_dir=uncommitted_dir,
        string_to_be_replaced=
        "context.open_data_docs(resource_identifier=validation_result_identifier)",
        replacement_string="",
    )

    # Assertions about output
    context = DataContext(context_root_dir=root_dir)
    obs_validation_result: ExpectationSuiteValidationResult = (
        context.get_validation_result(expectation_suite_name="warning"))
    assert obs_validation_result.statistics == {
        "evaluated_expectations": 13,
        "successful_expectations": 13,
        "unsuccessful_expectations": 0,
        "success_percent": 100.0,
    }

    expected_expectation_configurations: List[ExpectationConfiguration] = [
        ExpectationConfiguration(
            **{
                "meta": {
                    "profiler_details": {
                        "metric_configuration": {
                            "domain_kwargs": {},
                            "metric_dependencies": None,
                            "metric_name": "table.row_count",
                            "metric_value_kwargs": None,
                        },
                        "num_batches": 1,
                    }
                },
                "kwargs": {
                    "max_value": 1313,
                    "min_value": 1313
                },
                "expectation_type": "expect_table_row_count_to_be_between",
            }),
        ExpectationConfiguration(
            **{
                "meta": {
                    "profiler_details": {
                        "metric_configuration": {
                            "domain_kwargs": {
                                "column": "Unnamed: 0"
                            },
                            "metric_dependencies": None,
                            "metric_name": "column.min",
                            "metric_value_kwargs": None,
                        },
                        "num_batches": 1,
                    }
                },
                "kwargs": {
                    "column": "Unnamed: 0",
                    "max_value": 1,
                    "min_value": 1,
                    "mostly": 1.0,
                },
                "expectation_type": "expect_column_min_to_be_between",
            }),
        ExpectationConfiguration(
            **{
                "meta": {
                    "profiler_details": {
                        "metric_configuration": {
                            "domain_kwargs": {
                                "column": "Unnamed: 0"
                            },
                            "metric_dependencies": None,
                            "metric_name": "column.max",
                            "metric_value_kwargs": None,
                        },
                        "num_batches": 1,
                    }
                },
                "kwargs": {
                    "column": "Unnamed: 0",
                    "max_value": 1313,
                    "min_value": 1313,
                    "mostly": 1.0,
                },
                "expectation_type": "expect_column_max_to_be_between",
            }),
        ExpectationConfiguration(
            **{
                "meta": {
                    "profiler_details": {
                        "metric_configuration": {
                            "domain_kwargs": {
                                "column": "Age"
                            },
                            "metric_dependencies": None,
                            "metric_name": "column.min",
                            "metric_value_kwargs": None,
                        },
                        "num_batches": 1,
                    }
                },
                "kwargs": {
                    "column": "Age",
                    "max_value": 0.17,
                    "min_value": 0.17,
                    "mostly": 1.0,
                },
                "expectation_type": "expect_column_min_to_be_between",
            }),
        ExpectationConfiguration(
            **{
                "meta": {
                    "profiler_details": {
                        "metric_configuration": {
                            "domain_kwargs": {
                                "column": "Age"
                            },
                            "metric_dependencies": None,
                            "metric_name": "column.max",
                            "metric_value_kwargs": None,
                        },
                        "num_batches": 1,
                    }
                },
                "kwargs": {
                    "column": "Age",
                    "max_value": 71.0,
                    "min_value": 71.0,
                    "mostly": 1.0,
                },
                "expectation_type": "expect_column_max_to_be_between",
            }),
        ExpectationConfiguration(
            **{
                "meta": {
                    "profiler_details": {
                        "metric_configuration": {
                            "domain_kwargs": {
                                "column": "Survived"
                            },
                            "metric_dependencies": None,
                            "metric_name": "column.min",
                            "metric_value_kwargs": None,
                        },
                        "num_batches": 1,
                    }
                },
                "kwargs": {
                    "column": "Survived",
                    "max_value": 0,
                    "min_value": 0,
                    "mostly": 1.0,
                },
                "expectation_type": "expect_column_min_to_be_between",
            }),
        ExpectationConfiguration(
            **{
                "meta": {
                    "profiler_details": {
                        "metric_configuration": {
                            "domain_kwargs": {
                                "column": "Survived"
                            },
                            "metric_dependencies": None,
                            "metric_name": "column.max",
                            "metric_value_kwargs": None,
                        },
                        "num_batches": 1,
                    }
                },
                "kwargs": {
                    "column": "Survived",
                    "max_value": 1,
                    "min_value": 1,
                    "mostly": 1.0,
                },
                "expectation_type": "expect_column_max_to_be_between",
            }),
        ExpectationConfiguration(
            **{
                "meta": {
                    "profiler_details": {
                        "metric_configuration": {
                            "domain_kwargs": {
                                "column": "SexCode"
                            },
                            "metric_dependencies": None,
                            "metric_name": "column.min",
                            "metric_value_kwargs": None,
                        },
                        "num_batches": 1,
                    }
                },
                "kwargs": {
                    "column": "SexCode",
                    "max_value": 0,
                    "min_value": 0,
                    "mostly": 1.0,
                },
                "expectation_type": "expect_column_min_to_be_between",
            }),
        ExpectationConfiguration(
            **{
                "meta": {
                    "profiler_details": {
                        "metric_configuration": {
                            "domain_kwargs": {
                                "column": "SexCode"
                            },
                            "metric_dependencies": None,
                            "metric_name": "column.max",
                            "metric_value_kwargs": None,
                        },
                        "num_batches": 1,
                    }
                },
                "kwargs": {
                    "column": "SexCode",
                    "max_value": 1,
                    "min_value": 1,
                    "mostly": 1.0,
                },
                "expectation_type": "expect_column_max_to_be_between",
            }),
        ExpectationConfiguration(
            **{
                "meta": {},
                "kwargs": {
                    "column": "PClass",
                    "value_set": [
                        "*",
                        "1st",
                        "2nd",
                        "3rd",
                    ],
                },
                "expectation_type": "expect_column_values_to_be_in_set",
            }),
        ExpectationConfiguration(
            **{
                "meta": {},
                "kwargs": {
                    "column": "Sex",
                    "value_set": ["female", "male"]
                },
                "expectation_type": "expect_column_values_to_be_in_set",
            }),
        ExpectationConfiguration(
            **{
                "meta": {},
                "kwargs": {
                    "column": "Survived",
                    "value_set": [0, 1]
                },
                "expectation_type": "expect_column_values_to_be_in_set",
            }),
        ExpectationConfiguration(
            **{
                "meta": {},
                "kwargs": {
                    "column": "SexCode",
                    "value_set": [0, 1]
                },
                "expectation_type": "expect_column_values_to_be_in_set",
            }),
    ]

    suite: ExpectationSuite = context.get_expectation_suite(
        expectation_suite_name=expectation_suite_name)

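    # Normalize list-valued kwargs (e.g. value_set) by sorting them, so the
    # comparison against expected_expectation_configurations below does not
    # depend on element order.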
    expectation_configurations: List[ExpectationConfiguration] = []
    expectation_configuration: ExpectationConfiguration
    for expectation_configuration in suite.expectations:
        kwargs: dict = expectation_configuration.kwargs
        key: str
        value: Any
        kwargs = {
            key: sorted(value) if isinstance(value,
                                             (list, set, tuple)) else value
            for key, value in kwargs.items()
        }
        expectation_configuration.kwargs = kwargs
        expectation_configurations.append(expectation_configuration)

    assert expectation_configurations == expected_expectation_configurations
def test_notebook_execution_onboarding_data_assistant_pandas_backend(
    titanic_v013_multi_datasource_pandas_data_context_with_checkpoints_v1_with_empty_store_stats_enabled,
):
    """
    To set this test up we:

    - create a suite using User-Configurable Profiler
    - verify that no validations have happened
    - create the suite edit notebook by hijacking the private cli method

    We then:
    - execute that notebook (Note this will raise various errors like
    CellExecutionError if any cell in the notebook fails)
    - create a new context from disk
    - verify that a validation has been run with our expectation suite
    """
    context: DataContext = titanic_v013_multi_datasource_pandas_data_context_with_checkpoints_v1_with_empty_store_stats_enabled
    root_dir: str = context.root_directory
    uncommitted_dir: str = os.path.join(root_dir, "uncommitted")
    expectation_suite_name: str = "warning"

    context.create_expectation_suite(
        expectation_suite_name=expectation_suite_name)
    batch_request: dict = {
        "datasource_name": "my_datasource",
        "data_connector_name": "my_basic_data_connector",
        "data_asset_name": "Titanic_1912",
    }

    # Sanity check test setup
    original_suite: ExpectationSuite = context.get_expectation_suite(
        expectation_suite_name=expectation_suite_name)
    assert len(original_suite.expectations) == 0
    assert context.list_expectation_suite_names() == [expectation_suite_name]
    assert context.list_datasources() == [
        {
            "name": "my_datasource",
            "class_name": "Datasource",
            "module_name": "great_expectations.datasource",
            "execution_engine": {
                "class_name": "PandasExecutionEngine",
                "module_name": "great_expectations.execution_engine",
            },
            "data_connectors": {
                "my_basic_data_connector": {
                    "module_name":
                    "great_expectations.datasource.data_connector",
                    "base_directory": f"{root_dir}/../data/titanic",
                    "default_regex": {
                        "pattern": "(.*)\\.csv",
                        "group_names": ["data_asset_name"],
                    },
                    "class_name": "InferredAssetFilesystemDataConnector",
                },
                "my_special_data_connector": {
                    "glob_directive": "*.csv",
                    "assets": {
                        "users": {
                            "pattern":
                            "(.+)_(\\d+)_(\\d+)\\.csv",
                            "group_names": ["name", "timestamp", "size"],
                            "class_name":
                            "Asset",
                            "base_directory":
                            f"{root_dir}/../data/titanic",
                            "module_name":
                            "great_expectations.datasource.data_connector.asset",
                        }
                    },
                    "module_name":
                    "great_expectations.datasource.data_connector",
                    "base_directory": f"{root_dir}/../data/titanic",
                    "default_regex": {
                        "pattern": "(.+)\\.csv",
                        "group_names": ["name"]
                    },
                    "class_name": "ConfiguredAssetFilesystemDataConnector",
                },
                "my_other_data_connector": {
                    "glob_directive": "*.csv",
                    "assets": {
                        "users": {
                            "class_name":
                            "Asset",
                            "module_name":
                            "great_expectations.datasource.data_connector.asset",
                        }
                    },
                    "module_name":
                    "great_expectations.datasource.data_connector",
                    "base_directory": f"{root_dir}/../data/titanic",
                    "default_regex": {
                        "pattern": "(.+)\\.csv",
                        "group_names": ["name"]
                    },
                    "class_name": "ConfiguredAssetFilesystemDataConnector",
                },
                "my_runtime_data_connector": {
                    "module_name":
                    "great_expectations.datasource.data_connector",
                    "batch_identifiers":
                    ["pipeline_stage_name", "airflow_run_id"],
                    "class_name": "RuntimeDataConnector",
                },
            },
        },
        {
            "name": "my_additional_datasource",
            "class_name": "Datasource",
            "module_name": "great_expectations.datasource",
            "execution_engine": {
                "module_name": "great_expectations.execution_engine",
                "class_name": "PandasExecutionEngine",
            },
            "data_connectors": {
                "my_additional_data_connector": {
                    "module_name":
                    "great_expectations.datasource.data_connector",
                    "default_regex": {
                        "pattern": "(.*)\\.csv",
                        "group_names": ["data_asset_name"],
                    },
                    "base_directory": f"{root_dir}/../data/titanic",
                    "class_name": "InferredAssetFilesystemDataConnector",
                }
            },
        },
    ]

    assert context.get_validation_result(
        expectation_suite_name="warning") == {}

    # Create notebook
    # do not want to actually send usage_message, since the function call is not the result of actual usage
    _suite_edit_workflow(
        context=context,
        expectation_suite_name=expectation_suite_name,
        profile=True,
        profiler_name=None,
        usage_event="test_notebook_execution",
        interactive_mode=CLISuiteInteractiveFlagCombinations.UNPROMPTED_INTERACTIVE_FALSE_MANUAL_TRUE,
        no_jupyter=True,
        create_if_not_exist=False,
        datasource_name=None,
        batch_request=batch_request,
        additional_batch_request_args=None,
        suppress_usage_message=True,
        assume_yes=True,
    )
    edit_notebook_path: str = os.path.join(uncommitted_dir,
                                           "edit_warning.ipynb")
    assert os.path.isfile(edit_notebook_path)

    run_notebook(
        notebook_path=edit_notebook_path,
        notebook_dir=uncommitted_dir,
        string_to_be_replaced=
        "context.open_data_docs(resource_identifier=validation_result_identifier)",
        replacement_string="",
    )

    # Assertions about output
    context = DataContext(context_root_dir=root_dir)
    obs_validation_result: ExpectationSuiteValidationResult = (
        context.get_validation_result(expectation_suite_name="warning"))
    assert obs_validation_result.statistics == {
        "evaluated_expectations": 2,
        "successful_expectations": 2,
        "unsuccessful_expectations": 0,
        "success_percent": 100.0,
    }

    expected_expectation_configurations: List[ExpectationConfiguration] = [
        ExpectationConfiguration(
            **{
                "kwargs": {
                    "max_value": 1313,
                    "min_value": 1313
                },
                "expectation_type": "expect_table_row_count_to_be_between",
                "meta": {
                    "profiler_details": {
                        "metric_configuration": {
                            "domain_kwargs": {},
                            "metric_dependencies": None,
                            "metric_name": "table.row_count",
                            "metric_value_kwargs": None,
                        },
                        "num_batches": 1,
                    }
                },
            }),
        ExpectationConfiguration(
            **{
                "kwargs": {
                    "column_set": [
                        "Age",
                        "Name",
                        "PClass",
                        "Sex",
                        "SexCode",
                        "Survived",
                        "Unnamed: 0",
                    ],
                    "exact_match":
                    None,
                },
                "expectation_type": "expect_table_columns_to_match_set",
                "meta": {
                    "profiler_details": {
                        "success_ratio": 1.0
                    }
                },
            }),
    ]

    suite: ExpectationSuite = context.get_expectation_suite(
        expectation_suite_name=expectation_suite_name)

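    # Normalize list-valued kwargs (e.g. column_set) by sorting them, so the
    # comparison against expected_expectation_configurations below does not
    # depend on element order.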
    expectation_configurations: List[ExpectationConfiguration] = []
    expectation_configuration: ExpectationConfiguration
    for expectation_configuration in suite.expectations:
        kwargs: dict = expectation_configuration.kwargs
        key: str
        value: Any
        kwargs = {
            key: sorted(value) if isinstance(value,
                                             (list, set, tuple)) else value
            for key, value in kwargs.items()
        }
        expectation_configuration.kwargs = kwargs
        expectation_configurations.append(expectation_configuration)

    assert expectation_configurations == expected_expectation_configurations

    columns_with_expectations: Set[str]
    expectations_from_suite: Set[str]
    (
        columns_with_expectations,
        expectations_from_suite,
    ) = get_set_of_columns_and_expectations_from_suite(suite=suite)

    expected_expectations: Set[str] = {
        "expect_table_row_count_to_be_between",
        "expect_table_columns_to_match_set",
    }
    assert columns_with_expectations == set()
    assert expectations_from_suite == expected_expectations
Example #13
0
pipeline.
"""
import sys

from great_expectations import DataContext

# checkpoint configuration
context = DataContext("{1}")
checkpoint = context.get_checkpoint("{0}")

# load batches of data
batches_to_validate = []
for batch in checkpoint["batches"]:
    batch_kwargs = batch["batch_kwargs"]
    for suite_name in batch["expectation_suite_names"]:
        suite = context.get_expectation_suite(suite_name)
        batch = context.get_batch(batch_kwargs, suite)
        batches_to_validate.append(batch)

# run the validation operator
results = context.run_validation_operator(
    checkpoint["validation_operator_name"],
    assets_to_validate=batches_to_validate,
    # TODO prepare for new RunID - checkpoint name and timestamp
    # run_id=RunID(checkpoint)
)

# take action based on results
if not results["success"]:
    print("Validation Failed!")
    sys.exit(1)
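A hedged sketch of the checkpoint structure the template above iterates over; the key names come from the loop, while the concrete values are illustrative only:

example_checkpoint = {
    "validation_operator_name": "action_list_operator",
    "batches": [
        {
            "batch_kwargs": {"datasource": "mydatasource", "path": "data/Titanic.csv"},
            "expectation_suite_names": ["warning"],
        },
    ],
}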
Example #14
0
def test_suite_edit_multiple_datasources_with_generator_with_no_additional_args_with_suite_without_citations(
    mock_webbrowser,
    mock_subprocess,
    caplog,
    site_builder_data_context_with_html_store_titanic_random,
):
    """
    Here we verify that the "suite edit" command helps the user to specify the batch
    kwargs when it is called without the optional arguments that specify the batch.

    First, we call the "suite new" command to create the expectation suite our test
    will edit - this step is just setup.

    We call the "suite edit" command without any optional arguments. This means that
    the command will help us specify the batch kwargs interactively.

    The data context has two datasources - we choose one of them. It has a generator
    configured. We choose to use the generator and select a generator asset from the list.

    The command should:
    - NOT open Data Docs
    - open jupyter
    """
    root_dir = site_builder_data_context_with_html_store_titanic_random.root_directory
    os.chdir(root_dir)
    runner = CliRunner(mix_stderr=False)
    result = runner.invoke(
        cli,
        ["suite", "demo", "-d", root_dir, "--suite", "foo_suite"],
        input="2\n1\n1\n\n",
        catch_exceptions=False,
    )
    assert result.exit_code == 0
    assert mock_webbrowser.call_count == 2
    assert mock_subprocess.call_count == 0
    mock_webbrowser.reset_mock()
    mock_subprocess.reset_mock()

    # remove the citations from the suite
    context = DataContext(root_dir)
    suite = context.get_expectation_suite("foo_suite")
    assert isinstance(suite, ExpectationSuite)
    suite.meta.pop("citations")
    context.save_expectation_suite(suite)

    # Actual testing really starts here
    runner = CliRunner(mix_stderr=False)
    result = runner.invoke(
        cli,
        ["suite", "edit", "foo_suite", "-d", root_dir,],
        input="2\n1\n1\n\n",
        catch_exceptions=False,
    )

    assert result.exit_code == 0
    stdout = result.stdout
    assert "A batch of data is required to edit the suite" in stdout
    assert "Select a datasource" in stdout
    assert "Which data would you like to use" in stdout

    expected_notebook_path = os.path.join(
        root_dir, "uncommitted", "edit_foo_suite.ipynb"
    )
    assert os.path.isfile(expected_notebook_path)

    expected_suite_path = os.path.join(root_dir, "expectations", "foo_suite.json")
    assert os.path.isfile(expected_suite_path)

    assert mock_webbrowser.call_count == 0
    assert mock_subprocess.call_count == 1

    assert_no_logging_messages_or_tracebacks(caplog, result)
Example #15
0
def test_cli_init_on_new_project_extra_whitespace_in_url(
        mock_webbrowser, caplog, tmp_path_factory, titanic_sqlite_db_file, sa):
    project_dir = str(tmp_path_factory.mktemp("test_cli_init_diff"))
    ge_dir = os.path.join(project_dir, "great_expectations")

    database_path = os.path.join(project_dir, "titanic.db")
    shutil.copy(titanic_sqlite_db_file, database_path)
    engine = sa.create_engine(f"sqlite:///{database_path}", pool_recycle=3600)
    engine_url_with_added_whitespace = "    " + str(engine.url) + "  "

    inspector = sa.inspect(engine)

    # get the default schema and table for testing
    schemas = inspector.get_schema_names()
    default_schema = schemas[0]

    tables = [
        table_name
        for table_name in inspector.get_table_names(schema=default_schema)
    ]
    default_table = tables[0]

    runner = CliRunner(mix_stderr=False)
    result = runner.invoke(
        cli,
        ["init", "-d", project_dir],
        input=
        "\n\n2\n6\ntitanic\n{url}\n\n\n1\n{schema}\n{table}\nwarning\n\n\n\n".
        format(
            url=engine_url_with_added_whitespace,
            schema=default_schema,
            table=default_table,
        ),
        catch_exceptions=False,
    )
    stdout = result.output
    assert len(stdout) < 6000, "CLI output is unreasonably long."

    assert "Always know what to expect from your data" in stdout
    assert "What data would you like Great Expectations to connect to" in stdout
    assert "Which database backend are you using" in stdout
    assert "Give your new Datasource a short name" in stdout
    assert "What is the url/connection string for the sqlalchemy connection" in stdout
    assert "Attempting to connect to your database." in stdout
    assert "Great Expectations connected to your database" in stdout
    assert (
        "You have selected a datasource that is a SQL database. How would you like to specify the data?"
        in stdout)
    assert "Name the new Expectation Suite [main.titanic.warning]" in stdout
    assert (
        "Great Expectations will choose a couple of columns and generate expectations about them"
        in stdout)
    assert "Generating example Expectation Suite..." in stdout
    assert "Building" in stdout
    assert "Data Docs" in stdout
    assert "Great Expectations is now set up" in stdout

    context = DataContext(ge_dir)
    assert len(context.list_datasources()) == 1
    assert context.list_datasources() == [{
        "class_name": "SqlAlchemyDatasource",
        "name": "titanic",
        "module_name": "great_expectations.datasource",
        "credentials": {
            "url": str(engine.url)
        },
        "data_asset_type": {
            "class_name": "SqlAlchemyDataset",
            "module_name": "great_expectations.dataset",
        },
    }]

    first_suite = context.list_expectation_suites()[0]
    suite = context.get_expectation_suite(first_suite.expectation_suite_name)
    assert len(suite.expectations) == 14

    assert os.path.isdir(ge_dir)
    config_path = os.path.join(project_dir,
                               "great_expectations/great_expectations.yml")
    assert os.path.isfile(config_path)

    config = yaml.load(open(config_path))
    data_source_class = config["datasources"]["titanic"]["data_asset_type"][
        "class_name"]
    assert data_source_class == "SqlAlchemyDataset"

    assert_no_logging_messages_or_tracebacks(caplog, result)

    assert result.exit_code == 0
    assert mock_webbrowser.call_count == 1
    assert (
        "{}/great_expectations/uncommitted/data_docs/local_site/validations/warning/"
        .format(project_dir) in mock_webbrowser.call_args[0][0])
Example #16
0
def test_suite_edit_multiple_datasources_with_generator_with_no_additional_args_with_suite_containing_citations(
    mock_webbrowser,
    mock_subprocess,
    caplog,
    site_builder_data_context_with_html_store_titanic_random,
):
    """
    Here we verify that the "suite edit" command uses the batch kwargs found in
    citations in the existing suite when it is called without the optional
    arguments that specify the batch.

    First, we call the "suite new" command to create the expectation suite our
    test will edit - this step is just setup.

    We call the "suite edit" command without any optional arguments.

    The command should:
    - NOT open Data Docs
    - NOT open jupyter
    """
    root_dir = site_builder_data_context_with_html_store_titanic_random.root_directory
    os.chdir(root_dir)
    runner = CliRunner(mix_stderr=False)
    result = runner.invoke(
        cli,
        ["suite", "demo", "-d", root_dir, "--suite", "foo_suite"],
        input="2\n1\n1\n\n",
        catch_exceptions=False,
    )
    assert mock_webbrowser.call_count == 2
    assert mock_subprocess.call_count == 0
    mock_subprocess.reset_mock()
    mock_webbrowser.reset_mock()
    assert result.exit_code == 0
    context = DataContext(root_dir)
    suite = context.get_expectation_suite("foo_suite")
    assert isinstance(suite, ExpectationSuite)

    runner = CliRunner(mix_stderr=False)
    result = runner.invoke(
        cli,
        ["suite", "edit", "foo_suite", "-d", root_dir],
        input="2\n1\n1\n\n",
        catch_exceptions=False,
    )

    assert result.exit_code == 0
    stdout = result.stdout
    assert "Select a datasource" not in stdout
    assert "Which data would you like to use" not in stdout

    expected_notebook_path = os.path.join(
        root_dir, "uncommitted", "edit_foo_suite.ipynb"
    )
    assert os.path.isfile(expected_notebook_path)

    expected_suite_path = os.path.join(root_dir, "expectations", "foo_suite.json")
    assert os.path.isfile(expected_suite_path)

    assert mock_webbrowser.call_count == 0
    assert mock_subprocess.call_count == 1

    assert_no_logging_messages_or_tracebacks(caplog, result)
Example #17
0
def test_cli_init_on_new_project(mock_webbrowser, caplog, tmp_path_factory,
                                 titanic_sqlite_db_file, sa):
    project_dir = str(tmp_path_factory.mktemp("test_cli_init_diff"))
    ge_dir = os.path.join(project_dir, "great_expectations")

    database_path = os.path.join(project_dir, "titanic.db")
    shutil.copy(titanic_sqlite_db_file, database_path)
    engine = sa.create_engine(f"sqlite:///{database_path}", pool_recycle=3600)

    inspector = sa.inspect(engine)

    # get the default schema and table for testing
    schemas = inspector.get_schema_names()
    default_schema = schemas[0]

    tables = [
        table_name
        for table_name in inspector.get_table_names(schema=default_schema)
    ]
    default_table = tables[0]

    runner = CliRunner(mix_stderr=False)
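    # The piped input walks through init's interactive prompts in order: initial
    # confirmations, data source type, database backend, Datasource name,
    # connection URL, how to specify the data, schema, table, Expectation Suite
    # name, and the trailing confirmations.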
    result = runner.invoke(
        cli,
        ["init", "-d", project_dir],
        input="\n\n2\n6\ntitanic\n{url}\n\n\n1\n{schema}\n{table}\nwarning\n\n\n\n".format(
            url=engine.url, schema=default_schema, table=default_table
        ),
        catch_exceptions=False,
    )
    stdout = result.output
    assert len(stdout) < 6000, "CLI output is unreasonably long."

    assert "Always know what to expect from your data" in stdout
    assert "What data would you like Great Expectations to connect to" in stdout
    assert "Which database backend are you using" in stdout
    assert "Give your new Datasource a short name" in stdout
    assert "What is the url/connection string for the sqlalchemy connection" in stdout
    assert "Attempting to connect to your database." in stdout
    assert "Great Expectations connected to your database" in stdout
    assert (
        "You have selected a datasource that is a SQL database. How would you like to specify the data?"
        in stdout)
    assert "Name the new Expectation Suite [main.titanic.warning]" in stdout
    assert (
        "Great Expectations will choose a couple of columns and generate expectations about them"
        in stdout)
    assert "Generating example Expectation Suite..." in stdout
    assert "Building" in stdout
    assert "Data Docs" in stdout
    assert "Great Expectations is now set up" in stdout

    context = DataContext(ge_dir)
    assert len(context.list_datasources()) == 1
    assert context.list_datasources()[0]["class_name"] == "SqlAlchemyDatasource"
    assert context.list_datasources()[0]["name"] == "titanic"

    first_suite = context.list_expectation_suites()[0]
    suite = context.get_expectation_suite(first_suite.expectation_suite_name)
    assert len(suite.expectations) == 14

    assert os.path.isdir(ge_dir)
    config_path = os.path.join(project_dir,
                               "great_expectations/great_expectations.yml")
    assert os.path.isfile(config_path)

    config = yaml.load(open(config_path))
    data_source_class = config["datasources"]["titanic"]["data_asset_type"][
        "class_name"]
    assert data_source_class == "SqlAlchemyDataset"

    # Profilers are v014+ specific
    os.rmdir(os.path.join(ge_dir, "profilers"))

    obs_tree = gen_directory_tree_str(ge_dir)

    # Instead of monkey patching guids, just regex out the guids
    guid_safe_obs_tree = re.sub(r"[a-z0-9]{32}(?=\.(json|html))",
                                "foobarbazguid", obs_tree)
    # print(guid_safe_obs_tree)
    assert (guid_safe_obs_tree == """\
great_expectations/
    .gitignore
    great_expectations.yml
    checkpoints/
    expectations/
        .ge_store_backend_id
        warning.json
    plugins/
        custom_data_docs/
            renderers/
            styles/
                data_docs_custom_styles.css
            views/
    uncommitted/
        config_variables.yml
        data_docs/
            local_site/
                index.html
                expectations/
                    warning.html
                static/
                    fonts/
                        HKGrotesk/
                            HKGrotesk-Bold.otf
                            HKGrotesk-BoldItalic.otf
                            HKGrotesk-Italic.otf
                            HKGrotesk-Light.otf
                            HKGrotesk-LightItalic.otf
                            HKGrotesk-Medium.otf
                            HKGrotesk-MediumItalic.otf
                            HKGrotesk-Regular.otf
                            HKGrotesk-SemiBold.otf
                            HKGrotesk-SemiBoldItalic.otf
                    images/
                        favicon.ico
                        glossary_scroller.gif
                        iterative-dev-loop.png
                        logo-long-vector.svg
                        logo-long.png
                        short-logo-vector.svg
                        short-logo.png
                        validation_failed_unexpected_values.gif
                    styles/
                        data_docs_custom_styles_template.css
                        data_docs_default_styles.css
                validations/
                    warning/
                        20190926T134241.000000Z/
                            20190926T134241.000000Z/
                                foobarbazguid.html
        validations/
            .ge_store_backend_id
            warning/
                20190926T134241.000000Z/
                    20190926T134241.000000Z/
                        foobarbazguid.json
""")

    assert_no_logging_messages_or_tracebacks(caplog, result)

    assert result.exit_code == 0
    assert mock_webbrowser.call_count == 1
    assert (
        "{}/great_expectations/uncommitted/data_docs/local_site/validations/warning/"
        .format(project_dir) in mock_webbrowser.call_args[0][0])
Example #18
def test_cli_init_on_new_project(mock_webbrowser, caplog, tmp_path_factory,
                                 titanic_sqlite_db_file):
    project_dir = str(tmp_path_factory.mktemp("test_cli_init_diff"))
    ge_dir = os.path.join(project_dir, "great_expectations")

    database_path = os.path.join(project_dir, "titanic.db")
    shutil.copy(titanic_sqlite_db_file, database_path)
    engine = create_engine("sqlite:///{}".format(database_path))

    runner = CliRunner(mix_stderr=False)
    result = runner.invoke(
        cli,
        ["init", "-d", project_dir],
        input="Y\n2\n5\ntitanic\n{}\n1\nwarning\n\n".format(engine.url),
        catch_exceptions=False,
    )
    stdout = result.output
    assert len(stdout) < 3000, "CLI output is unreasonably long."

    assert "Always know what to expect from your data" in stdout
    assert "What data would you like Great Expectations to connect to" in stdout
    assert "Which database backend are you using" in stdout
    assert "Give your new data source a short name" in stdout
    assert "What is the url/connection string for the sqlalchemy connection" in stdout
    assert "Attempting to connect to your database." in stdout
    assert "Great Expectations connected to your database" in stdout
    assert "Which table would you like to use?" in stdout
    assert "Name the new expectation suite [main.titanic.warning]" in stdout
    assert (
        "Great Expectations will choose a couple of columns and generate expectations about them"
        in stdout)
    assert "Generating example Expectation Suite..." in stdout
    assert "Building" in stdout
    assert "Data Docs" in stdout
    assert "A new Expectation suite 'warning' was added to your project" in stdout
    assert "Great Expectations is now set up" in stdout

    context = DataContext(ge_dir)
    assert len(context.list_datasources()) == 1
    assert context.list_datasources() == [{
        "class_name": "SqlAlchemyDatasource",
        "name": "titanic"
    }]

    first_suite = context.list_expectation_suites()[0]
    suite = context.get_expectation_suite(first_suite.expectation_suite_name)
    assert len(suite.expectations) == 13

    assert os.path.isdir(ge_dir)
    config_path = os.path.join(project_dir,
                               "great_expectations/great_expectations.yml")
    assert os.path.isfile(config_path)

    config = yaml.load(open(config_path, "r"))
    data_source_class = config["datasources"]["titanic"]["data_asset_type"][
        "class_name"]
    assert data_source_class == "SqlAlchemyDataset"

    obs_tree = gen_directory_tree_str(ge_dir)

    # Instead of monkey patching datetime, just regex out the time directories
    date_safe_obs_tree = re.sub(r"\d*T\d*\.\d*Z", "9999.9999", obs_tree)
    # Instead of monkey patching guids, just regex out the guids
    guid_safe_obs_tree = re.sub(r"[a-z0-9]{32}(?=\.(json|html))",
                                "foobarbazguid", date_safe_obs_tree)
    assert (guid_safe_obs_tree == """\
great_expectations/
    .gitignore
    great_expectations.yml
    expectations/
        warning.json
    notebooks/
        pandas/
            validation_playground.ipynb
        spark/
            validation_playground.ipynb
        sql/
            validation_playground.ipynb
    plugins/
        custom_data_docs/
            renderers/
            styles/
                data_docs_custom_styles.css
            views/
    uncommitted/
        config_variables.yml
        data_docs/
            local_site/
                index.html
                expectations/
                    warning.html
                static/
                    fonts/
                        HKGrotesk/
                            HKGrotesk-Bold.otf
                            HKGrotesk-BoldItalic.otf
                            HKGrotesk-Italic.otf
                            HKGrotesk-Light.otf
                            HKGrotesk-LightItalic.otf
                            HKGrotesk-Medium.otf
                            HKGrotesk-MediumItalic.otf
                            HKGrotesk-Regular.otf
                            HKGrotesk-SemiBold.otf
                            HKGrotesk-SemiBoldItalic.otf
                    images/
                        favicon.ico
                        glossary_scroller.gif
                        iterative-dev-loop.png
                        logo-long-vector.svg
                        logo-long.png
                        short-logo-vector.svg
                        short-logo.png
                        validation_failed_unexpected_values.gif
                    styles/
                        data_docs_custom_styles_template.css
                        data_docs_default_styles.css
                validations/
                    warning/
                        9999.9999/
                            foobarbazguid.html
        validations/
            warning/
                9999.9999/
                    foobarbazguid.json
""")

    assert_no_logging_messages_or_tracebacks(caplog, result)

    assert result.exit_code == 0
    assert mock_webbrowser.call_count == 1
    assert "{}/great_expectations/uncommitted/data_docs/local_site/validations/warning/".format(
        project_dir) in mock_webbrowser.call_args[0][0]
Example #19
import sys
from great_expectations import DataContext
# checkpoint configuration
context = DataContext("/home/paulcrickard/peoplepipeline/great_expectations")
suite = context.get_expectation_suite("people.validate")
# You can modify your BatchKwargs to select different data
batch_kwargs = {
    "path": "/home/paulcrickard/peoplepipeline/people.csv",
    "datasource": "files_datasource",
    "reader_method": "read_csv",
}

# checkpoint validation process
batch = context.get_batch(batch_kwargs, suite)
results = context.run_validation_operator("action_list_operator", [batch])

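# Note: both branches below exit with status 0; the pass/fail outcome is
# signaled only by the JSON printed to stdout.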
if not results["success"]:
    print('{"result":"fail"}')
    sys.exit(0)

print('{"result":"pass"}')
sys.exit(0)
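
A brief usage sketch: invoking the script above from another process and reading its stdout, assuming it was saved as a standalone file (the path validate_people.py is hypothetical). Since the script exits 0 either way, the caller inspects the printed JSON to decide pass/fail.

import json
import subprocess

# Hypothetical path to the checkpoint script shown above.
proc = subprocess.run(
    ["python", "/home/paulcrickard/peoplepipeline/validate_people.py"],
    capture_output=True,
    text=True,
)
# Parse the last line of stdout as the {"result": ...} JSON the script prints.
outcome = json.loads(proc.stdout.strip().splitlines()[-1])
if outcome["result"] != "pass":
    raise RuntimeError("people.csv failed its Expectation Suite")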