def test_instantiation_via_url_and_retrieve_data_with_other_dialect(sa):
    """Ensure that we can still retrieve data when the dialect is not recognized."""
    # 1. Create engine with sqlite db
    sqlite_path = file_relative_path(
        __file__,
        os.path.join("..", "test_sets", "test_cases_for_sql_data_connector.db"),
    )
    my_execution_engine = SqlAlchemyExecutionEngine(url=f"sqlite:///{sqlite_path}")
    assert my_execution_engine.connection_string is None
    assert my_execution_engine.credentials is None
    assert my_execution_engine.url[-36:] == "test_cases_for_sql_data_connector.db"

    # 2. Change dialect to one not listed in GESqlDialect
    my_execution_engine.engine.dialect.name = "other_dialect"

    # 3. Get data
    num_rows_in_sample: int = 10
    batch_spec = SqlAlchemyDatasourceBatchSpec(
        table_name="table_partitioned_by_date_column__A",
        sampling_method="_sample_using_limit",
        sampling_kwargs={"n": num_rows_in_sample},
    )
    batch_data, _ = my_execution_engine.get_batch_data_and_markers(batch_spec=batch_spec)

    # 4. Assert dialect and data are as expected
    assert batch_data.dialect == GESqlDialect.OTHER

    my_execution_engine.load_batch_data("__", batch_data)
    validator = Validator(my_execution_engine)
    assert len(validator.head(fetch_all=True)) == num_rows_in_sample
def _build_sa_engine(df, sa):
    """Load *df* into an in-memory sqlite table named "test" and return a
    SqlAlchemyExecutionEngine with that table registered as batch "my_id"."""
    sqlite_engine = sa.create_engine("sqlite://", echo=False)
    df.to_sql("test", sqlite_engine, index=False)

    execution_engine = SqlAlchemyExecutionEngine(engine=sqlite_engine)
    batch = SqlAlchemyBatchData(execution_engine=execution_engine, table_name="test")
    execution_engine.load_batch_data("my_id", batch)
    return execution_engine
def test_sa_expect_column_value_z_scores_to_be_less_than_impl(postgresql_engine):
    """Validate expect_column_value_z_scores_to_be_less_than end-to-end against
    a small postgres table; the expectation should succeed."""
    sample_df = pd.DataFrame({"a": [1, 5, 22, 3, 5, 10]})
    sample_df.to_sql(
        name="z_score_test_data",
        con=postgresql_engine,
        index=False,
        if_exists="replace",
    )

    config = ExpectationConfiguration(
        expectation_type="expect_column_value_z_scores_to_be_less_than",
        kwargs={
            "column": "a",
            "mostly": 0.9,
            "threshold": 4,
            "double_sided": True,
        },
    )
    expectation = ExpectColumnValueZScoresToBeLessThan(config)

    engine = SqlAlchemyExecutionEngine(engine=postgresql_engine)
    batch = SqlAlchemyBatchData(execution_engine=engine, table_name="z_score_test_data")
    engine.load_batch_data("my_id", batch)

    result = expectation.validate(Validator(execution_engine=engine))
    assert result == ExpectationValidationResult(success=True)
def test_instantiation_via_connection_string(sa, test_db_connection_string):
    """An engine built from a connection string stores it, leaves credentials
    and url unset, and can fetch sampled batch data.

    Fix: use `is None` instead of `== None` for None checks (PEP 8 / E711),
    consistent with the sibling instantiation tests in this file.
    """
    my_execution_engine = SqlAlchemyExecutionEngine(
        connection_string=test_db_connection_string
    )
    assert my_execution_engine.connection_string == test_db_connection_string
    assert my_execution_engine.credentials is None
    assert my_execution_engine.url is None

    # Sanity check: the engine can actually serve a sampled batch.
    my_execution_engine.get_batch_data_and_markers(
        BatchSpec(
            table_name="main.table_1",
            sampling_method="_sample_using_limit",
            sampling_kwargs={"n": 5},
        )
    )
def test_cases_for_sql_data_connector_sqlite_execution_engine(sa):
    """Build a SqlAlchemyExecutionEngine over the shared sqlite test database,
    registering sqrt and md5 SQL functions that sqlite lacks natively."""
    if sa is None:
        raise ValueError("SQL Database tests require sqlalchemy to be installed.")

    db_file_path: str = file_relative_path(
        __file__,
        os.path.join("..", "test_sets", "test_cases_for_sql_data_connector.db"),
    )
    engine: sa.engine.Engine = sa.create_engine(get_sqlite_connection_url(db_file_path))
    conn: sa.engine.Connection = engine.connect()
    raw_connection = engine.raw_connection()

    def _sqrt(x):
        return math.sqrt(x)

    def _md5(x, d):
        # Last `d` hex digits of the md5 of the stringified value.
        return hashlib.md5(str(x).encode("utf-8")).hexdigest()[-1 * d:]

    raw_connection.create_function("sqrt", 1, _sqrt)
    raw_connection.create_function("md5", 2, _md5)

    # Build a SqlAlchemyDataset using that database
    return SqlAlchemyExecutionEngine(
        name="test_sql_execution_engine",
        engine=conn,
    )
def test_map_of_type_sa(sa):
    """table.column_types should report column "a" as a FLOAT for a numeric
    column containing a None (sqlite backend)."""
    sqlite_engine = sa.create_engine("sqlite://")
    frame = pd.DataFrame({"a": [1, 2, 3, 3, None]})
    frame.to_sql("test", sqlite_engine, index=False)

    batch = SqlAlchemyBatchData(engine=sqlite_engine, table_name="test")
    engine = SqlAlchemyExecutionEngine(
        engine=sqlite_engine, batch_data_dict={"my_id": batch}
    )

    column_types_metric = MetricConfiguration(
        metric_name="table.column_types",
        metric_domain_kwargs={},
        metric_value_kwargs={},
    )
    results = engine.resolve_metrics(metrics_to_resolve=(column_types_metric,))

    first_column = results[column_types_metric.id][0]
    assert first_column["name"] == "a"
    assert isinstance(first_column["type"], sa.FLOAT)
def _build_sa_engine(df, sa):
    """Load *df* into an in-memory sqlite table named "test" and return a
    SqlAlchemyExecutionEngine with that table pre-registered as batch "my_id"."""
    sqlite_engine = sa.create_engine("sqlite://", echo=False)
    df.to_sql("test", sqlite_engine, index=False)

    batch = SqlAlchemyBatchData(engine=sqlite_engine, table_name="test")
    return SqlAlchemyExecutionEngine(
        engine=sqlite_engine, batch_data_dict={"my_id": batch}
    )
def ge_validator_sqlalchemy() -> Validator:
    """Return a Validator over a postgres-backed execution engine with a single
    (dataless) batch describing the public.foo2 table."""
    batch_request = BatchRequest(
        datasource_name="my_postgresql_datasource",
        data_connector_name="whole_table",
        data_asset_name="foo2",
    )
    batch_definition = BatchDefinition(
        datasource_name="my_postgresql_datasource",
        data_connector_name="whole_table",
        data_asset_name="foo2",
        batch_identifiers=IDDict(),
    )
    batch_spec = SqlAlchemyDatasourceBatchSpec(
        {
            "data_asset_name": "foo2",
            "table_name": "foo2",
            "batch_identifiers": {},
            "schema_name": "public",
            "type": "table",
        }
    )
    batch = Batch(
        data=None,
        batch_request=batch_request,
        batch_definition=batch_definition,
        batch_spec=batch_spec,
    )
    return Validator(
        execution_engine=SqlAlchemyExecutionEngine(
            connection_string="postgresql://localhost:5432/test"
        ),
        batches=[batch],
    )
def inner_func(
    cls,
    execution_engine: SqlAlchemyExecutionEngine,
    metric_domain_kwargs: Dict,
    metric_value_kwargs: Dict,
    metrics: Dict[str, Any],
    runtime_configuration: Dict,
):
    """Resolve a SQLAlchemy column-aggregate metric.

    Builds the compute domain (optionally filtered to non-null rows), validates
    that the requested column exists in the batch, then delegates to the wrapped
    metric function.

    NOTE(review): `kwargs`, `domain_type`, and `metric_fn` are free variables
    captured from an enclosing (decorator) scope that is not visible in this
    chunk — presumably the metric-provider decorator; verify against the
    surrounding definition.

    Returns:
        Tuple of (metric_aggregate, compute_domain_kwargs, accessor_domain_kwargs).

    Raises:
        ge_exceptions.InvalidMetricAccessorDomainKwargsKeyError: if the column
            named in the domain kwargs is not present in metrics["table.columns"].
    """
    # Per-call override wins; otherwise fall back to the class-level attribute.
    filter_column_isnull = kwargs.get(
        "filter_column_isnull", getattr(cls, "filter_column_isnull", False))
    if filter_column_isnull:
        compute_domain_kwargs = execution_engine.add_column_row_condition(
            metric_domain_kwargs)
    else:
        # We do not copy here because if compute domain is different, it will be copied by get_compute_domain
        compute_domain_kwargs = metric_domain_kwargs
    (
        selectable,
        compute_domain_kwargs,
        accessor_domain_kwargs,
    ) = execution_engine.get_compute_domain(
        compute_domain_kwargs, domain_type=domain_type)
    column_name: str = accessor_domain_kwargs["column"]
    sqlalchemy_engine: sa.engine.Engine = execution_engine.engine
    # Guard: fail fast with a descriptive error rather than a raw SQL error.
    if column_name not in metrics["table.columns"]:
        raise ge_exceptions.InvalidMetricAccessorDomainKwargsKeyError(
            message=
            f'Error: The column "{column_name}" in BatchData does not exist.'
        )
    dialect = sqlalchemy_engine.dialect
    # Underscore-prefixed kwargs expose execution context to the metric function.
    metric_aggregate = metric_fn(
        cls,
        column=sa.column(column_name),
        **metric_value_kwargs,
        _dialect=dialect,
        _table=selectable,
        _column_name=column_name,
        _sqlalchemy_engine=sqlalchemy_engine,
        _metrics=metrics,
    )
    return metric_aggregate, compute_domain_kwargs, accessor_domain_kwargs
def test_instantiation_via_url(sa):
    """An engine built from a sqlite URL stores only the url (not
    connection_string/credentials) and can fetch sampled batch data."""
    sqlite_path = file_relative_path(
        __file__,
        os.path.join("..", "test_sets", "test_cases_for_sql_data_connector.db"),
    )
    my_execution_engine = SqlAlchemyExecutionEngine(url=f"sqlite:///{sqlite_path}")

    assert my_execution_engine.connection_string is None
    assert my_execution_engine.credentials is None
    assert my_execution_engine.url[-36:] == "test_cases_for_sql_data_connector.db"

    my_execution_engine.get_batch_data_and_markers(
        batch_spec=SqlAlchemyDatasourceBatchSpec(
            table_name="table_partitioned_by_date_column__A",
            sampling_method="_sample_using_limit",
            sampling_kwargs={"n": 5},
        )
    )
def test_get_batch_data_and_markers_using_query(sqlite_view_engine, test_df):
    """A RuntimeQueryBatchSpec should materialize a temp table (bringing the
    count to 2) and produce a ge_load_time marker."""
    my_execution_engine: SqlAlchemyExecutionEngine = SqlAlchemyExecutionEngine(
        engine=sqlite_view_engine
    )
    test_df.to_sql("test_table_0", con=my_execution_engine.engine)

    batch_data, batch_markers = my_execution_engine.get_batch_data_and_markers(
        batch_spec=RuntimeQueryBatchSpec(query="SELECT * FROM test_table_0")
    )

    assert len(get_sqlite_temp_table_names(sqlite_view_engine)) == 2
    assert batch_markers.get("ge_load_time") is not None
def test_cases_for_sql_data_connector_sqlite_execution_engine(sa):
    """Build a SqlAlchemyExecutionEngine over the shared sqlite test database."""
    if sa is None:
        raise ValueError("SQL Database tests require sqlalchemy to be installed.")

    db_file_path: str = file_relative_path(
        __file__,
        os.path.join("..", "test_sets", "test_cases_for_sql_data_connector.db"),
    )
    engine: sa.engine.Engine = sa.create_engine(get_sqlite_connection_url(db_file_path))
    conn: sa.engine.Connection = engine.connect()

    # Build a SqlAlchemyDataset using that database
    return SqlAlchemyExecutionEngine(
        name="test_sql_execution_engine",
        engine=conn,
    )
def _sqlalchemy(
    cls,
    execution_engine: SqlAlchemyExecutionEngine,
    metric_domain_kwargs: Dict,
    metric_value_kwargs: Dict,
    metrics: Dict[Tuple, Any],
    runtime_configuration: Dict,
):
    """Compute a whole-table checksum via a dialect-specific SQL query.

    Supported dialects: sqlite and bigquery; any other dialect logs an error
    and returns 0. Columns listed in runtime_configuration["ignore_columns"]
    are excluded from the checksum.

    Fix: the sqlite branch re-read `execution_engine.engine.dialect.name`
    instead of using the `dialect_name` local computed just above —
    now both branches consistently use `dialect_name`.

    Returns:
        int: the checksum value, or 0 if the dialect is unsupported.
    """
    (
        selectable,
        compute_domain_kwargs,
        accessor_domain_kwargs,
    ) = execution_engine.get_compute_domain(
        metric_domain_kwargs, MetricDomainTypes.TABLE
    )
    # get the all column names of table using existing metric.
    columns = metrics.get("table.columns")
    ignore_columns = runtime_configuration["ignore_columns"]
    dialect_name = execution_engine.engine.dialect.name
    selectcolumns = select_column_list(columns, ignore_columns)

    if dialect_name == "sqlite":
        cksumquery = get_sqlite_checksum_query(
            selectable.name, selectcolumns, ignore_columns
        )
    elif dialect_name == "bigquery":
        cksumquery = get_bigquery_checksum_query(
            selectable.name, selectcolumns, ignore_columns
        )
    else:
        logger.error("sql dialect is not supported: " + dialect_name)
        return 0

    if DEBUG:
        logger.error("\n***********cksumquery***********\n" + cksumquery)
    return int(execution_engine.engine.execute(cksumquery).scalar())
def test_instantiation_via_credentials(sa, test_backends, test_df):
    """An engine built from a credentials dict stores the credentials (minus
    drivername) and leaves connection_string and url unset. Requires postgres."""
    if "postgresql" not in test_backends:
        pytest.skip("test_database_store_backend_get_url_for_key requires postgresql")

    db_host = os.getenv("GE_TEST_LOCAL_DB_HOSTNAME", "localhost")
    my_execution_engine = SqlAlchemyExecutionEngine(
        credentials={
            "drivername": "postgresql",
            "username": "******",
            "password": "",
            "host": db_host,
            "port": "5432",
            "database": "test_ci",
        }
    )

    assert my_execution_engine.connection_string is None
    # drivername is consumed during engine construction and not echoed back.
    assert my_execution_engine.credentials == {
        "username": "******",
        "password": "",
        "host": db_host,
        "port": "5432",
        "database": "test_ci",
    }
    assert my_execution_engine.url is None
def test_sample_using_random(sqlite_view_engine, test_df):
    """Random sampling semantics: with p=1.0 on a one-row table the sample is
    deterministic across draws; with p=0.2 on the full table two draws return
    the expected row count but different rows."""
    my_execution_engine: SqlAlchemyExecutionEngine = SqlAlchemyExecutionEngine(
        engine=sqlite_view_engine
    )

    def _count_rows(batch_data):
        # COUNT(*) over the sampled selectable.
        return batch_data.execution_engine.engine.execute(
            sqlalchemy.select([sqlalchemy.func.count()]).select_from(
                batch_data.selectable
            )
        ).scalar()

    def _fetch_rows(batch_data):
        # SELECT * over the sampled selectable.
        return batch_data.execution_engine.engine.execute(
            sqlalchemy.select([sqlalchemy.text("*")]).select_from(
                batch_data.selectable
            )
        ).fetchall()

    # First, make sure that degenerative case never passes.
    test_df_0: pd.DataFrame = test_df.iloc[:1]
    test_df_0.to_sql("test_table_0", con=my_execution_engine.engine)

    p = 1.0
    batch_spec = SqlAlchemyDatasourceBatchSpec(
        table_name="test_table_0",
        schema_name="main",
        sampling_method="_sample_using_random",
        sampling_kwargs={"p": p},
    )

    batch_data = my_execution_engine.get_batch_data(batch_spec=batch_spec)
    assert _count_rows(batch_data) == round(p * test_df_0.shape[0])
    rows_0: List[tuple] = _fetch_rows(batch_data)

    batch_data = my_execution_engine.get_batch_data(batch_spec=batch_spec)
    assert _count_rows(batch_data) == round(p * test_df_0.shape[0])
    rows_1: List[tuple] = _fetch_rows(batch_data)

    assert len(rows_0) == len(rows_1) == 1
    assert rows_0 == rows_1

    # Second, verify that realistic case always returns different random sample of rows.
    test_df_1: pd.DataFrame = test_df
    test_df_1.to_sql("test_table_1", con=my_execution_engine.engine)

    p = 2.0e-1
    batch_spec = SqlAlchemyDatasourceBatchSpec(
        table_name="test_table_1",
        schema_name="main",
        sampling_method="_sample_using_random",
        sampling_kwargs={"p": p},
    )

    batch_data = my_execution_engine.get_batch_data(batch_spec=batch_spec)
    assert _count_rows(batch_data) == round(p * test_df_1.shape[0])
    rows_0 = _fetch_rows(batch_data)

    batch_data = my_execution_engine.get_batch_data(batch_spec=batch_spec)
    assert _count_rows(batch_data) == round(p * test_df_1.shape[0])
    rows_1 = _fetch_rows(batch_data)

    assert len(rows_0) == len(rows_1)
    assert not (rows_0 == rows_1)
def test_instantiation_error_states(sa, test_db_connection_string):
    """Constructing the engine with no connection information must raise
    InvalidConfigError."""
    with pytest.raises(ge_exceptions.InvalidConfigError):
        SqlAlchemyExecutionEngine()
def _sqlalchemy(
    cls,
    execution_engine: SqlAlchemyExecutionEngine,
    metric_domain_kwargs: Dict,
    metric_value_kwargs: Dict,
    metrics: Dict[Tuple, Any],
    runtime_configuration: Dict,
):
    """Count rows whose column value falls between min_value and max_value.

    Bounds come from metric_value_kwargs; strict_min / strict_max switch the
    corresponding comparison from >= / <= to > / <. A missing (None) bound is
    treated as unbounded on that side, but at least one bound must be given.

    Returns:
        int: number of rows in the compute domain satisfying the condition.

    Raises:
        ValueError: if both bounds are None, or min_value > max_value.
    """
    min_value = metric_value_kwargs.get("min_value")
    max_value = metric_value_kwargs.get("max_value")
    strict_min = metric_value_kwargs.get("strict_min")
    strict_max = metric_value_kwargs.get("strict_max")
    if min_value is not None and max_value is not None and min_value > max_value:
        raise ValueError("min_value cannot be greater than max_value")
    if min_value is None and max_value is None:
        raise ValueError("min_value and max_value cannot both be None")
    dialect_name = execution_engine.engine.dialect.name.lower()
    # Normalize API-level +/- infinity sentinels (both the numpy-style and
    # CAST-style representations) into this dialect's own representation.
    # NOTE(review): the second `if` in each pair re-tests the possibly
    # reassigned value, so the statement order here is load-bearing.
    if (min_value == get_sql_dialect_floating_point_infinity_value(
            schema="api_np", negative=True)) or (
                min_value == get_sql_dialect_floating_point_infinity_value(
                    schema="api_cast", negative=True)):
        min_value = get_sql_dialect_floating_point_infinity_value(
            schema=dialect_name, negative=True)
    if (min_value == get_sql_dialect_floating_point_infinity_value(
            schema="api_np", negative=False)) or (
                min_value == get_sql_dialect_floating_point_infinity_value(
                    schema="api_cast", negative=False)):
        min_value = get_sql_dialect_floating_point_infinity_value(
            schema=dialect_name, negative=False)
    if (max_value == get_sql_dialect_floating_point_infinity_value(
            schema="api_np", negative=True)) or (
                max_value == get_sql_dialect_floating_point_infinity_value(
                    schema="api_cast", negative=True)):
        max_value = get_sql_dialect_floating_point_infinity_value(
            schema=dialect_name, negative=True)
    if (max_value == get_sql_dialect_floating_point_infinity_value(
            schema="api_np", negative=False)) or (
                max_value == get_sql_dialect_floating_point_infinity_value(
                    schema="api_cast", negative=False)):
        max_value = get_sql_dialect_floating_point_infinity_value(
            schema=dialect_name, negative=False)
    (
        selectable,
        compute_domain_kwargs,
        accessor_domain_kwargs,
    ) = execution_engine.get_compute_domain(
        domain_kwargs=metric_domain_kwargs, domain_type=MetricDomainTypes.COLUMN)
    column = sa.column(accessor_domain_kwargs["column"])
    # Build the WHERE condition; strict flags choose exclusive comparisons.
    if min_value is None:
        if strict_max:
            condition = column < max_value
        else:
            condition = column <= max_value
    elif max_value is None:
        if strict_min:
            condition = column > min_value
        else:
            condition = column >= min_value
    else:
        if strict_min and strict_max:
            condition = sa.and_(column > min_value, column < max_value)
        elif strict_min:
            condition = sa.and_(column > min_value, column <= max_value)
        elif strict_max:
            condition = sa.and_(column >= min_value, column < max_value)
        else:
            condition = sa.and_(column >= min_value, column <= max_value)
    # COUNT(*) over the domain with the condition applied.
    return execution_engine.engine.execute(
        sa.select([sa.func.count()
                   ]).select_from(selectable).where(condition)).scalar()