Example #1
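# Shared imports for the examples below (a best-effort reconstruction; exact
# module paths vary across great_expectations versions, so treat these as
# assumptions rather than a verbatim excerpt from the original test files):
import os
import uuid

import click
import pandas as pd
import pytest

from great_expectations.core.batch import Batch
from great_expectations.core.expectation_suite import ExpectationSuite
from great_expectations.data_context import DataContext
from great_expectations.dataset import SqlAlchemyDataset
from great_expectations.datasource import (
    PandasDatasource,
    SparkDFDatasource,
    SqlAlchemyDatasource,
)
from great_expectations.datasource.batch_kwargs_generator.table_batch_kwargs_generator import (
    TableBatchKwargsGenerator,
)
from great_expectations.validator.validator import BridgeValidator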
def test_spark_datasource_processes_dataset_options(
        test_folder_connection_path_csv, test_backends, empty_data_context):
    context: DataContext = empty_data_context
    if "SparkDFDataset" not in test_backends:
        pytest.skip(
            "Spark has not been enabled, so this test must be skipped.")
    datasource = SparkDFDatasource(
        "PandasCSV",
        batch_kwargs_generators={
            "subdir_reader": {
                "class_name": "SubdirReaderBatchKwargsGenerator",
                "base_directory": test_folder_connection_path_csv,
            }
        },
    )
    batch_kwargs = datasource.build_batch_kwargs("subdir_reader",
                                                 data_asset_name="test")
    batch_kwargs["dataset_options"] = {"caching": False, "persist": False}
    batch = datasource.get_batch(batch_kwargs)
    validator = BridgeValidator(
        batch,
        ExpectationSuite(expectation_suite_name="foo", data_context=context))
    dataset = validator.get_dataset()
    assert dataset.caching is False
    assert dataset._persist is False
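# The tests in these examples rely on pytest fixtures (test_backends,
# empty_data_context, sqlitedb_engine, ...) defined in the test suite's
# conftest and not reproduced here. A minimal sketch of what the
# test_folder_connection_path_csv fixture might look like (the file name and
# contents are assumptions, chosen so the SubdirReaderBatchKwargsGenerator
# exposes a "test" data asset):
import pandas as pd
import pytest


@pytest.fixture
def test_folder_connection_path_csv(tmp_path):
    df = pd.DataFrame(
        {"col_1": [1, 2, 3, 4, 5], "col_2": ["a", "b", "c", "d", "e"]}
    )
    df.to_csv(tmp_path / "test.csv", index=False)
    return str(tmp_path)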
Example #2
def test_sqlalchemy_source_limit(sqlitedb_engine, empty_data_context):
    context: DataContext = empty_data_context
    df1 = pd.DataFrame({
        "col_1": [1, 2, 3, 4, 5],
        "col_2": ["a", "b", "c", "d", "e"]
    })
    df2 = pd.DataFrame({
        "col_1": [0, 1, 2, 3, 4],
        "col_2": ["b", "c", "d", "e", "f"]
    })
    df1.to_sql(name="table_1", con=sqlitedb_engine, index=True)
    df2.to_sql(name="table_2", con=sqlitedb_engine, index=True, schema="main")
    datasource = SqlAlchemyDatasource("SqlAlchemy", engine=sqlitedb_engine)
    limited_batch = datasource.get_batch({
        "table": "table_1",
        "limit": 1,
        "offset": 2
    })
    assert isinstance(limited_batch, Batch)
    limited_dataset = BridgeValidator(
        limited_batch,
        expectation_suite=ExpectationSuite("test", data_context=context),
        expectation_engine=SqlAlchemyDataset,
    ).get_dataset()
    assert limited_dataset._table.name.startswith(
        "ge_temp_")  # we have generated a temporary table
    assert len(limited_dataset.head(10)) == 1  # and it is only one row long
    assert limited_dataset.head(
        10)["col_1"][0] == 3  # offset should have been applied
Example #3
def test_sqlalchemy_source_templating(sqlitedb_engine, empty_data_context):
    context: DataContext = empty_data_context
    datasource = SqlAlchemyDatasource(
        engine=sqlitedb_engine,
        batch_kwargs_generators={
            "foo": {
                "class_name": "QueryBatchKwargsGenerator"
            }
        },
    )
    generator = datasource.get_batch_kwargs_generator("foo")
    generator.add_query(data_asset_name="test",
                        query="select 'cat' as ${col_name};")
    batch = datasource.get_batch(
        generator.build_batch_kwargs(
            "test", query_parameters={"col_name": "animal_name"}))
    dataset = BridgeValidator(
        batch,
        expectation_suite=ExpectationSuite("test", data_context=context),
        expectation_engine=SqlAlchemyDataset,
    ).get_dataset()
    res = dataset.expect_column_to_exist("animal_name")
    assert res.success is True
    res = dataset.expect_column_values_to_be_in_set("animal_name", ["cat"])
    assert res.success is True
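# The ${col_name} placeholder above follows Python's string.Template syntax.
# A minimal sketch of the substitution step the datasource performs when
# building the final query (an assumption about the mechanism, not a verbatim
# copy of great_expectations internals):
from string import Template

substituted = Template("select 'cat' as ${col_name};").safe_substitute(
    {"col_name": "animal_name"}
)
assert substituted == "select 'cat' as animal_name;"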
Example #4
def test_sqlalchemy_datasource_processes_dataset_options(
        test_db_connection_string):
    datasource = SqlAlchemyDatasource(
        "SqlAlchemy", credentials={"url": test_db_connection_string})
    batch_kwargs = datasource.process_batch_parameters(
        dataset_options={"caching": False})
    batch_kwargs["query"] = "select * from table_1;"
    batch = datasource.get_batch(batch_kwargs)
    validator = BridgeValidator(batch,
                                ExpectationSuite(expectation_suite_name="foo"))
    dataset = validator.get_dataset()
    assert dataset.caching is False

    batch_kwargs = datasource.process_batch_parameters(
        dataset_options={"caching": True})
    batch_kwargs["query"] = "select * from table_1;"
    batch = datasource.get_batch(batch_kwargs)
    validator = BridgeValidator(batch,
                                ExpectationSuite(expectation_suite_name="foo"))
    dataset = validator.get_dataset()
    assert dataset.caching is True

    batch_kwargs = {
        "query": "select * from table_1;",
        "dataset_options": {
            "caching": False
        },
    }
    batch = datasource.get_batch(batch_kwargs)
    validator = BridgeValidator(batch,
                                ExpectationSuite(expectation_suite_name="foo"))
    dataset = validator.get_dataset()
    assert dataset.caching is False
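# The three blocks above demonstrate the two ways dataset_options can be
# supplied: through process_batch_parameters(dataset_options=...) or inline
# in the batch_kwargs dict. Either way, the options end up on the resulting
# dataset, surfacing here as dataset.caching.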
Example #5
def test_pandas_datasource_processes_dataset_options(test_folder_connection_path_csv):
    datasource = PandasDatasource(
        "PandasCSV",
        batch_kwargs_generators={
            "subdir_reader": {
                "class_name": "SubdirReaderBatchKwargsGenerator",
                "base_directory": test_folder_connection_path_csv,
            }
        },
    )
    batch_kwargs = datasource.build_batch_kwargs(
        "subdir_reader", data_asset_name="test"
    )
    batch_kwargs["dataset_options"] = {"caching": False}
    batch = datasource.get_batch(batch_kwargs)
    validator = BridgeValidator(batch, ExpectationSuite(expectation_suite_name="foo"))
    dataset = validator.get_dataset()
    assert dataset.caching is False
Example #6
def _get_batch_kwargs_for_sqlalchemy_datasource(context,
                                                datasource_name,
                                                additional_batch_kwargs=None):
    data_asset_name = None
    sql_query = None
    datasource = context.get_datasource(datasource_name)
    msg_prompt_how_to_connect_to_data = """
You have selected a datasource that is a SQL database. How would you like to specify the data?
1. Enter a table name and schema
2. Enter a custom SQL query
3. List all tables in the database (this may take a very long time)
"""
    default_schema = _get_default_schema(datasource)
    temp_generator = TableBatchKwargsGenerator(name="temp",
                                               datasource=datasource)

    while data_asset_name is None:
        single_or_multiple_data_asset_selection = click.prompt(
            msg_prompt_how_to_connect_to_data,
            type=click.Choice(["1", "2", "3"]),
            show_choices=False,
        )
        if single_or_multiple_data_asset_selection == "1":  # name the table and schema
            schema_name = click.prompt(
                "Please provide the schema name of the table (this is optional)",
                default=default_schema,
            )
            table_name = click.prompt(
                "Please provide the table name (this is required)")
            data_asset_name = f"{schema_name}.{table_name}"

        elif single_or_multiple_data_asset_selection == "2":  # SQL query
            sql_query = click.prompt("Please provide the SQL query")
            data_asset_name = "custom_sql_query"

        elif single_or_multiple_data_asset_selection == "3":  # list it all
            msg_prompt_warning = fr"""Warning: If you have a large number of tables in your datasource, this may take a very long time. \m
                    Would you like to continue?"""
            confirmation = click.prompt(msg_prompt_warning,
                                        type=click.Choice(["y", "n"]),
                                        show_choices=True)
            if confirmation == "y":
                # avoid this call until necessary
                available_data_asset_names = (
                    temp_generator.get_available_data_asset_names()["names"])
                available_data_asset_names_str = [
                    "{} ({})".format(name[0], name[1])
                    for name in available_data_asset_names
                ]

                data_asset_names_to_display = available_data_asset_names_str
                choices = "\n".join([
                    "    {}. {}".format(i, name)
                    for i, name in enumerate(data_asset_names_to_display, 1)
                ])
                msg_prompt_enter_data_asset_name = (
                    "\nWhich table would you like to use? (Choose one)\n")
                prompt = msg_prompt_enter_data_asset_name + choices + os.linesep
                selection = click.prompt(prompt, show_default=False)
                selection = selection.strip()
                try:
                    data_asset_index = int(selection) - 1
                    try:
                        data_asset_name = [
                            name[0] for name in available_data_asset_names
                        ][data_asset_index]
                    except IndexError:
                        print(
                            f"You have specified {selection}, which is an incorrect index"
                        )
                except ValueError:
                    print(
                        f"You have specified {selection}, which is an incorrect value"
                    )

    if additional_batch_kwargs is None:
        additional_batch_kwargs = {}

    # Some backends require named temporary table parameters. We specifically elicit those and add them
    # where appropriate.
    temp_table_kwargs = {}

    if datasource.engine.dialect.name.lower() == "bigquery":
        # bigquery also requires special handling
        bigquery_temp_table = click.prompt(
            "Great Expectations will create a table to use for "
            "validation." + os.linesep +
            "Please enter a name for this table: ",
            default="SOME_PROJECT.SOME_DATASET.ge_tmp_" +
            str(uuid.uuid4())[:8],
        )
        temp_table_kwargs = {
            "bigquery_temp_table": bigquery_temp_table,
        }

    # now building the actual batch_kwargs
    if sql_query is None:
        batch_kwargs = temp_generator.build_batch_kwargs(
            data_asset_name, **additional_batch_kwargs)
        batch_kwargs.update(temp_table_kwargs)
    else:
        batch_kwargs = {"query": sql_query, "datasource": datasource_name}
        batch_kwargs.update(temp_table_kwargs)
        # Building a validator here executes the custom query (materializing
        # the temporary table), so an invalid query fails fast rather than
        # later during validation.
        BridgeValidator(
            batch=datasource.get_batch(batch_kwargs),
            expectation_suite=ExpectationSuite("throwaway"),
        ).get_dataset()

    batch_kwargs["data_asset_name"] = data_asset_name
    return data_asset_name, batch_kwargs
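# A sketch of how this helper is driven (the datasource name and
# additional_batch_kwargs below are hypothetical). _get_default_schema is a
# private helper defined alongside this function and is not reproduced in
# this excerpt.
#
# data_asset_name, batch_kwargs = _get_batch_kwargs_for_sqlalchemy_datasource(
#     context=context,
#     datasource_name="my_sqlalchemy_datasource",
#     additional_batch_kwargs={"limit": 1000},
# )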