def test_standalone_sqlalchemy_datasource(test_db_connection_string, sa):
    datasource = SqlAlchemyDatasource(
        "SqlAlchemy",
        connection_string=test_db_connection_string,
        echo=False,
        batch_kwargs_generators={
            "default": {
                "class_name": "TableBatchKwargsGenerator"
            }
        },
    )

    assert set(
        datasource.get_available_data_asset_names()["default"]["names"]) == {
            ("main.table_1", "table"),
            ("main.table_2", "table"),
        }
    batch_kwargs = datasource.build_batch_kwargs("default", "main.table_1")
    batch = datasource.get_batch(batch_kwargs=batch_kwargs)
    assert isinstance(batch, Batch)
    batch_data = batch.data
    assert isinstance(
        batch_data,
        great_expectations.dataset.sqlalchemy_dataset.SqlAlchemyBatchReference,
    )
    dataset = SqlAlchemyDataset(**batch.data.get_init_kwargs())
    assert len(dataset.head(10)) == 5
예제 #2
0
def test_sqlalchemy_source_limit(sqlitedb_engine):
    df1 = pd.DataFrame({
        "col_1": [1, 2, 3, 4, 5],
        "col_2": ["a", "b", "c", "d", "e"]
    })
    df2 = pd.DataFrame({
        "col_1": [0, 1, 2, 3, 4],
        "col_2": ["b", "c", "d", "e", "f"]
    })
    df1.to_sql("table_1", con=sqlitedb_engine, index=True)
    df2.to_sql("table_2", con=sqlitedb_engine, index=True, schema="main")
    datasource = SqlAlchemyDatasource("SqlAlchemy", engine=sqlitedb_engine)
    limited_batch = datasource.get_batch({
        "table": "table_1",
        "limit": 1,
        "offset": 2
    })
    assert isinstance(limited_batch, Batch)
    limited_dataset = Validator(
        limited_batch,
        expectation_suite=ExpectationSuite("test"),
        expectation_engine=SqlAlchemyDataset,
    ).get_dataset()
    assert limited_dataset._table.name.startswith(
        "ge_tmp_")  # we have generated a temporary table
    assert len(limited_dataset.head(10)) == 1  # and it is only one row long
    assert limited_dataset.head(
        10)["col_1"][0] == 3  # offset should have been applied
예제 #3
0
def test_sqlalchemy_source_limit(sqlitedb_engine):
    df1 = pd.DataFrame({
        'col_1': [1, 2, 3, 4, 5],
        'col_2': ['a', 'b', 'c', 'd', 'e']
    })
    df2 = pd.DataFrame({
        'col_1': [0, 1, 2, 3, 4],
        'col_2': ['b', 'c', 'd', 'e', 'f']
    })
    df1.to_sql('table_1', con=sqlitedb_engine, index=True)
    df2.to_sql('table_2', con=sqlitedb_engine, index=True, schema='main')
    datasource = SqlAlchemyDatasource('SqlAlchemy', engine=sqlitedb_engine)
    limited_batch = datasource.get_batch({
        "table": "table_1",
        "limit": 1,
        "offset": 2
    })
    assert isinstance(limited_batch, Batch)
    limited_dataset = Validator(
        limited_batch,
        expectation_suite=ExpectationSuite("test"),
        expectation_engine=SqlAlchemyDataset).get_dataset()
    assert limited_dataset._table.name.startswith(
        "ge_tmp_")  # we have generated a temporary table
    assert len(limited_dataset.head(10)) == 1  # and it is only one row long
    assert limited_dataset.head(
        10)['col_1'][0] == 3  # offset should have been applied
def test_sqlalchemy_source_templating(sqlitedb_engine, empty_data_context):
    context: DataContext = empty_data_context
    datasource = SqlAlchemyDatasource(
        engine=sqlitedb_engine,
        batch_kwargs_generators={
            "foo": {
                "class_name": "QueryBatchKwargsGenerator"
            }
        },
    )
    generator = datasource.get_batch_kwargs_generator("foo")
    generator.add_query(data_asset_name="test",
                        query="select 'cat' as ${col_name};")
    batch = datasource.get_batch(
        generator.build_batch_kwargs(
            "test", query_parameters={"col_name": "animal_name"}))
    dataset = BridgeValidator(
        batch,
        expectation_suite=ExpectationSuite("test", data_context=context),
        expectation_engine=SqlAlchemyDataset,
    ).get_dataset()
    res = dataset.expect_column_to_exist("animal_name")
    assert res.success is True
    res = dataset.expect_column_values_to_be_in_set("animal_name", ["cat"])
    assert res.success is True
예제 #5
0
def test_sqlalchemysource_templating(sqlitedb_engine):
    datasource = SqlAlchemyDatasource(engine=sqlitedb_engine)
    generator = datasource.get_generator()
    generator.add_query("test", "select 'cat' as ${col_name};")
    df = datasource.get_batch("test", col_name="animal_name")
    res = df.expect_column_to_exist("animal_name")
    assert res["success"] == True
def test_standalone_sqlalchemy_datasource(test_db_connection_string):
    datasource = SqlAlchemyDatasource(
        'SqlAlchemy', connection_string=test_db_connection_string, echo=False)

    assert datasource.get_available_data_asset_names() == {"default": {"main.table_1", "main.table_2"}}
    dataset1 = datasource.get_batch("main.table_1")
    dataset2 = datasource.get_batch("main.table_2")
    assert isinstance(dataset1, SqlAlchemyDataset)
    assert isinstance(dataset2, SqlAlchemyDataset)
def test_query_generator_view(sqlite_view_engine):
    datasource = SqlAlchemyDatasource(
        engine=sqlite_view_engine,
        batch_kwargs_generators={"table": {"class_name": "TableBatchKwargsGenerator"}},
    )  # Build a datasource with a queries generator to introspect our database with a view
    names = set(datasource.get_available_data_asset_names()["table"]["names"])

    # We should see both the table *and* the primary view, but *not* the temp view
    assert names == {("main.test_table", "table"), ("main.test_view", "view")}
def test_query_generator_view(sqlite_view_engine):
    datasource = SqlAlchemyDatasource(
        engine=sqlite_view_engine, generators={"query": {
            "type": "queries"
        }}
    )  # Build a datasource with a queries generator to introspect our database with a view
    names = set(datasource.get_available_data_asset_names()["query"])

    # We should see both the table *and* the primary view, but *not* the temp view
    assert names == {"main.test_table", "main.test_view"}
예제 #9
0
def test_sqlalchemy_source_templating(sqlitedb_engine):
    datasource = SqlAlchemyDatasource(
        engine=sqlitedb_engine,
        generators={"foo": {
            "class_name": "QueryGenerator"
        }})
    generator = datasource.get_generator("foo")
    generator.add_query("test", "select 'cat' as ${col_name};")
    df = datasource.get_batch(
        "test", "my_suite",
        generator.yield_batch_kwargs("test", col_name="animal_name"))
    res = df.expect_column_to_exist("animal_name")
    assert res["success"] is True
def test_sqlalchemy_datasource_query_and_table_handling(sqlitedb_engine):
    # MANUALLY SET DIALECT NAME FOR TEST
    datasource = SqlAlchemyDatasource('SqlAlchemy', engine=sqlitedb_engine)
    with mock.patch("great_expectations.dataset.sqlalchemy_dataset.SqlAlchemyBatchReference.__init__",
                    return_value=None) as mock_batch:
        datasource.get_batch({
            "query": "select * from foo;"
        })
    mock_batch.assert_called_once_with(engine=sqlitedb_engine, schema=None, query="select * from foo;", table_name=None)

    # Normally, we do not allow both query and table_name
    with mock.patch("great_expectations.dataset.sqlalchemy_dataset.SqlAlchemyBatchReference.__init__",
                    return_value=None) as mock_batch:
        datasource.get_batch({
            "query": "select * from foo;",
            "table_name": "bar"
        })
    mock_batch.assert_called_once_with(engine=sqlitedb_engine, schema=None, query="select * from foo;", table_name=None)

    # Snowflake should allow *both* query *and* table_name
    sqlitedb_engine.dialect.name = "snowflake"
    with mock.patch("great_expectations.dataset.sqlalchemy_dataset.SqlAlchemyBatchReference.__init__",
                    return_value=None) as mock_batch:
        datasource.get_batch({
            "query": "select * from foo;",
            "table_name": "bar"
        })
    mock_batch.assert_called_once_with(engine=sqlitedb_engine, schema=None, query="select * from foo;",
                                       table_name="bar")
def test_sqlalchemy_source_templating(sqlitedb_engine):
    datasource = SqlAlchemyDatasource(engine=sqlitedb_engine, generators={
        "foo": {
            "class_name": "QueryBatchKwargsGenerator"
        }
    })
    generator = datasource.get_generator("foo")
    generator.add_query("test", "select 'cat' as ${col_name};")
    batch = datasource.get_batch(generator.build_batch_kwargs("test", query_parameters={'col_name': "animal_name"}))
    dataset = Validator(batch, expectation_suite=ExpectationSuite("test"), expectation_engine=SqlAlchemyDataset).get_dataset()
    res = dataset.expect_column_to_exist("animal_name")
    assert res.success is True
    res = dataset.expect_column_values_to_be_in_set('animal_name', ['cat'])
    assert res.success is True
예제 #12
0
def test_sqlalchemy_datasource_processes_dataset_options(
        test_db_connection_string):
    datasource = SqlAlchemyDatasource(
        "SqlAlchemy", credentials={"url": test_db_connection_string})
    batch_kwargs = datasource.process_batch_parameters(
        dataset_options={"caching": False})
    batch_kwargs["query"] = "select * from table_1;"
    batch = datasource.get_batch(batch_kwargs)
    validator = Validator(batch,
                          ExpectationSuite(expectation_suite_name="foo"))
    dataset = validator.get_dataset()
    assert dataset.caching is False

    batch_kwargs = datasource.process_batch_parameters(
        dataset_options={"caching": True})
    batch_kwargs["query"] = "select * from table_1;"
    batch = datasource.get_batch(batch_kwargs)
    validator = Validator(batch,
                          ExpectationSuite(expectation_suite_name="foo"))
    dataset = validator.get_dataset()
    assert dataset.caching is True

    batch_kwargs = {
        "query": "select * from table_1;",
        "dataset_options": {
            "caching": False
        },
    }
    batch = datasource.get_batch(batch_kwargs)
    validator = Validator(batch,
                          ExpectationSuite(expectation_suite_name="foo"))
    dataset = validator.get_dataset()
    assert dataset.caching is False
예제 #13
0
def test_sqlalchemy_source_limit(sqlitedb_engine):
    df1 = pd.DataFrame({
        'col_1': [1, 2, 3, 4, 5],
        'col_2': ['a', 'b', 'c', 'd', 'e']
    })
    df2 = pd.DataFrame({
        'col_1': [0, 1, 2, 3, 4],
        'col_2': ['b', 'c', 'd', 'e', 'f']
    })
    df1.to_sql('table_1', con=sqlitedb_engine, index=True)
    df2.to_sql('table_2', con=sqlitedb_engine, index=True, schema='main')
    datasource = SqlAlchemyDatasource('SqlAlchemy', engine=sqlitedb_engine)
    limited_dataset = datasource.get_data_asset("table_1",
                                                "default",
                                                limit=1,
                                                offset=2)
    assert isinstance(limited_dataset, SqlAlchemyDataset)
    assert limited_dataset._table.name.startswith(
        "ge_tmp_")  # we have generated a temporary table
    assert len(limited_dataset.head(10)) == 1  # and it is only one row long
    assert limited_dataset.head(
        10)['col_1'][0] == 3  # offset should have been applied
예제 #14
0
def _add_sqlalchemy_datasource(context, prompt_for_datasource_name=True):

    msg_success_database = (
        "\n<green>Great Expectations connected to your database!</green>"
    )

    if not _verify_sqlalchemy_dependent_modules():
        return None

    db_choices = [str(x) for x in list(range(1, 1 + len(SupportedDatabases)))]
    selected_database = (
        int(
            click.prompt(
                msg_prompt_choose_database,
                type=click.Choice(db_choices),
                show_choices=False,
            )
        )
        - 1
    )  # don't show user a zero index list :)

    selected_database = list(SupportedDatabases)[selected_database]

    toolkit.send_usage_message(
        data_context=context,
        event="cli.new_ds_choice",
        event_payload={"type": "sqlalchemy", "db": selected_database.name},
        success=True,
    )

    datasource_name = "my_{}_db".format(selected_database.value.lower())
    if selected_database == SupportedDatabases.OTHER:
        datasource_name = "my_database"
    if prompt_for_datasource_name:
        datasource_name = click.prompt(
            msg_prompt_datasource_name, default=datasource_name
        )

    credentials = {}
    # Since we don't want to save the database credentials in the config file that will be
    # committed in the repo, we will use our Variable Substitution feature to store the credentials
    # in the credentials file (that will not be committed, since it is in the uncommitted directory)
    # with the datasource's name as the variable name.
    # The value of the datasource's "credentials" key in the config file (great_expectations.yml) will
    # be ${datasource name}.
    # Great Expectations will replace the ${datasource name} with the value from the credentials file in runtime.

    while True:
        cli_message(msg_db_config.format(datasource_name))

        if selected_database == SupportedDatabases.MYSQL:
            if not _verify_mysql_dependent_modules():
                return None

            credentials = _collect_mysql_credentials(default_credentials=credentials)
        elif selected_database == SupportedDatabases.POSTGRES:
            if not _verify_postgresql_dependent_modules():
                return None

            credentials = _collect_postgres_credentials(default_credentials=credentials)
        elif selected_database == SupportedDatabases.REDSHIFT:
            if not _verify_redshift_dependent_modules():
                return None

            credentials = _collect_redshift_credentials(default_credentials=credentials)
        elif selected_database == SupportedDatabases.SNOWFLAKE:
            if not _verify_snowflake_dependent_modules():
                return None

            credentials = _collect_snowflake_credentials(
                default_credentials=credentials
            )
        elif selected_database == SupportedDatabases.BIGQUERY:
            if not _verify_bigquery_dependent_modules():
                return None

            credentials = _collect_bigquery_credentials(default_credentials=credentials)
        elif selected_database == SupportedDatabases.OTHER:
            sqlalchemy_url = click.prompt(
                """What is the url/connection string for the sqlalchemy connection?
(reference: https://docs.sqlalchemy.org/en/latest/core/engines.html#database-urls)
""",
                show_default=False,
            ).strip()
            credentials = {"url": sqlalchemy_url}

        context.save_config_variable(datasource_name, credentials)

        message = """
<red>Cannot connect to the database.</red>
  - Please check your environment and the configuration you provided.
  - Database Error: {0:s}"""
        try:
            cli_message(
                "<cyan>Attempting to connect to your database. This may take a moment...</cyan>"
            )

            configuration = SqlAlchemyDatasource.build_configuration(
                credentials="${" + datasource_name + "}"
            )

            configuration["class_name"] = "SqlAlchemyDatasource"
            configuration["module_name"] = "great_expectations.datasource"
            errors = DatasourceConfigSchema().validate(configuration)
            if len(errors) != 0:
                raise ge_exceptions.GreatExpectationsError(
                    "Invalid Datasource configuration: {:s}".format(errors)
                )

            cli_message(
                """
Great Expectations will now add a new Datasource '{0:s}' to your deployment, by adding this entry to your great_expectations.yml:

{1:s}
The credentials will be saved in uncommitted/config_variables.yml under the key '{0:s}'
""".format(
                    datasource_name,
                    textwrap.indent(
                        toolkit.yaml.dump({datasource_name: configuration}), "  "
                    ),
                )
            )

            toolkit.confirm_proceed_or_exit()
            context.add_datasource(name=datasource_name, **configuration)
            cli_message(msg_success_database)
            break
        except ModuleNotFoundError as de:
            cli_message(message.format(str(de)))
            return None

        except DatasourceInitializationError as de:
            cli_message(message.format(str(de)))
            if not click.confirm("Enter the credentials again?", default=True):
                context.add_datasource(
                    datasource_name,
                    initialize=False,
                    module_name="great_expectations.datasource",
                    class_name="SqlAlchemyDatasource",
                    data_asset_type={"class_name": "SqlAlchemyDataset"},
                    credentials="${" + datasource_name + "}",
                )
                # TODO this message about continuing may not be accurate
                cli_message(
                    """
We saved datasource {:s} in {:s} and the credentials you entered in {:s}.
Since we could not connect to the database, you can complete troubleshooting in the configuration files documented in the how-to guides here:
<blue>https://docs.greatexpectations.io/en/latest/guides/how_to_guides/configuring_datasources.html?utm_source=cli&utm_medium=init&utm_campaign={:s}#{:s}</blue> .

After you connect to the datasource, run great_expectations init to continue.

""".format(
                        datasource_name,
                        DataContext.GE_YML,
                        context.get_config()["config_variables_file_path"],
                        rtd_url_ge_version,
                        selected_database.value.lower(),
                    )
                )
                return None

    return datasource_name
예제 #15
0
def postgresql_sqlalchemy_datasource(postgresql_engine):
    return SqlAlchemyDatasource("postgresql_sqlalchemy_datasource",
                                engine=postgresql_engine)
예제 #16
0
def basic_sqlalchemy_datasource(sqlitedb_engine):
    return SqlAlchemyDatasource("basic_sqlalchemy_datasource",
                                engine=sqlitedb_engine)
예제 #17
0
def _add_sqlalchemy_datasource(context):
    if not load_library("sqlalchemy"):
        return None

    db_choices = [str(x) for x in list(range(1, 1 + len(SupportedDatabases)))]
    selected_database = int(
        click.prompt(
            msg_prompt_choose_database,
            type=click.Choice(db_choices),
            show_choices=False)) - 1  # don't show user a zero index list :)

    selected_database = list(SupportedDatabases)[selected_database]

    data_source_name = click.prompt(msg_prompt_datasource_name,
                                    default="my_{}_db".format(
                                        selected_database.value.lower()),
                                    show_default=True)

    credentials = {}
    # Since we don't want to save the database credentials in the config file that will be
    # committed in the repo, we will use our Variable Substitution feature to store the credentials
    # in the credentials file (that will not be committed, since it is in the uncommitted directory)
    # with the datasource's name as the variable name.
    # The value of the datasource's "credentials" key in the config file (great_expectations.yml) will
    # be ${datasource name}.
    # GE will replace the ${datasource name} with the value from the credentials file in runtime.

    while True:
        cli_message(msg_db_config.format(data_source_name))

        if selected_database == SupportedDatabases.MYSQL:
            if not load_library("pymysql"):
                return None
            credentials = _collect_mysql_credentials(
                default_credentials=credentials)
        elif selected_database == SupportedDatabases.POSTGRES:
            credentials = _collect_postgres_credentials(
                default_credentials=credentials)
        elif selected_database == SupportedDatabases.REDSHIFT:
            if not load_library("psycopg2"):
                return None
            credentials = _collect_redshift_credentials(
                default_credentials=credentials)
        elif selected_database == SupportedDatabases.SNOWFLAKE:
            if not load_library("snowflake",
                                install_instructions_string=
                                "pip install snowflake-sqlalchemy"):
                return None
            credentials = _collect_snowflake_credentials(
                default_credentials=credentials)
        elif selected_database == SupportedDatabases.OTHER:
            sqlalchemy_url = click.prompt(
                """What is the url/connection string for the sqlalchemy connection?
(reference: https://docs.sqlalchemy.org/en/latest/core/engines.html#database-urls)
""",
                show_default=False)
            credentials = {"url": sqlalchemy_url}

        context.save_config_variable(data_source_name, credentials)

        message = """
<red>Cannot connect to the database.</red>
  - Please check your environment and the configuration you provided.
  - Database Error: {0:s}"""
        try:
            configuration = SqlAlchemyDatasource.build_configuration(
                credentials="${" + data_source_name + "}")
            context.add_datasource(name=data_source_name,
                                   class_name='SqlAlchemyDatasource',
                                   **configuration)
            break
        except ModuleNotFoundError as de:
            cli_message(message.format(str(de)))
            return None

        except DatasourceInitializationError as de:
            cli_message(message.format(str(de)))
            if not click.confirm("Enter the credentials again?".format(
                    str(de)),
                                 default=True):
                context.add_datasource(
                    data_source_name,
                    initialize=False,
                    module_name="great_expectations.datasource",
                    class_name="SqlAlchemyDatasource",
                    data_asset_type={"class_name": "SqlAlchemyDataset"},
                    credentials="${" + data_source_name + "}",
                    generators={"default": {
                        "class_name": "TableGenerator"
                    }})
                cli_message("""
We saved datasource {0:s} in {1:s} and the credentials you entered in {2:s}.
Since we could not connect to the database, you can complete troubleshooting in the configuration files. Read here:
<blue>https://docs.greatexpectations.io/en/latest/tutorials/add-sqlalchemy-datasource.html?utm_source=cli&utm_medium=init&utm_campaign={3:s}#{4:s}</blue> .

After you connect to the datasource, run great_expectations profile to continue.

""".format(data_source_name, DataContext.GE_YML,
                context.get_project_config().get("config_variables_file_path"),
                rtd_url_ge_version, selected_database.value.lower()))
                return None

    return data_source_name