def run(
    self,
    method: str,
    left_check_table: Dict,
    right_check_table: Dict,
    result_table: Optional[Dict] = None,
    columns: Optional[List[str]] = None,
    time_filter: Optional[Union[str, List[Dict], TimeFilter]] = None,
    left_custom_sql: Optional[str] = None,
    right_custom_sql: Optional[str] = None,
    context: Optional[Dict] = None,
    example_selector: ExampleSelector = default_example_selector,
) -> Union[CheckResult, ConsistencyCheck]:
    """
    Run a consistency check of ``method`` between the left and right tables.

    :param method: name of the consistency-check method to execute
    :param left_check_table: kwargs used to build a ``Table`` for the left side
    :param right_check_table: kwargs used to build a ``Table`` for the right side
    :param result_table: optional kwargs for ``ResultTable``; when given, the
        result is persisted via the right connection and the raw result returned
    :param columns: optional column subset; not allowed with custom SQLs
    :param time_filter: optional time-filter spec (string / list of dicts /
        ``TimeFilter``); not allowed with custom SQLs
    :param left_custom_sql: SQL that fully replaces the generated left query
    :param right_custom_sql: SQL that fully replaces the generated right query
    :param context: extra context merged in by ``get_context``
    :param example_selector: strategy used to select failure examples
    :return: the raw result when ``result_table`` is set, otherwise a
        ``CheckResult`` initialized from it
    :raises ValueError: when custom SQLs are combined with ``columns`` or
        ``time_filter``
    """
    # Custom SQL replaces the generated query entirely, so options that would
    # only affect the generated query are rejected to avoid silent no-ops.
    if left_custom_sql and right_custom_sql:
        if columns or time_filter:
            raise ValueError(
                "When using custom sqls you cannot change 'columns' or 'time_filter' attribute"
            )
    time_filter = parse_time_filter(time_filter)
    left_check_table = Table(**left_check_table)
    right_check_table = Table(**right_check_table)
    context = self.get_context(left_check_table, right_check_table, context)

    result = self.do_consistency_check(
        method,
        columns,
        time_filter,
        left_check_table,
        right_check_table,
        left_custom_sql,
        right_custom_sql,
        context,
        example_selector,
    )

    if result_table:
        # Persist into the right-hand database and return the raw result.
        result_table = ResultTable(**result_table, model_cls=self.model_cls)
        quality_check_class = create_default_check_class(result_table)
        self.right_conn.ensure_table(quality_check_class.__table__)
        self.upsert(quality_check_class, result)
        return result

    # No result table requested - wrap the raw result in an in-memory object.
    obj = CheckResult()
    obj.init_row_consistency(**result)
    return obj
def test_direct_time_filter_usage(dummy_contessa, ctx):
    """A ``TimeFilter`` instance can be passed directly as a rule's time_filter."""
    t = Table(**{"schema_name": "tmp", "table_name": "hello_world"})
    e = SqlExecutor(t, dummy_contessa.conn, ctx)
    rule = NotNullRule(
        "not_null_name",
        "not_null",
        "src",
        time_filter=TimeFilter(
            columns=[
                # Both bounds: 10 days back up to "now" (the task timestamp).
                TimeFilterColumn("created_at", since=timedelta(days=10), until="now"),
                # Lower bound only - no upper-bound clause is expected below.
                TimeFilterColumn("updated_at", since=timedelta(days=1)),
            ],
            conjunction=TimeFilterConjunction.AND,
        ),
    )
    time_filter = e.compose_where_time_filter(rule)
    computed_created = (ctx["task_ts"] - timedelta(days=10)).strftime("%Y-%m-%d %H:%M:%S")
    computed_updated = (ctx["task_ts"] - timedelta(days=1)).strftime("%Y-%m-%d %H:%M:%S")
    # Column predicates are joined with AND, matching the chosen conjunction.
    expected = (
        f"(created_at >= '{computed_created} UTC'::timestamptz AND created_at < '{ctx['task_ts']} UTC'::timestamptz) AND "
        f"(updated_at >= '{computed_updated} UTC'::timestamptz)")
    assert time_filter == expected, "TimeFilter type can be used directly"
def test_sql_apply_only_failures(conn, ctx):
    """In only-failures mode the rule reports failures, while totals stay at zero."""
    conn.execute("""
        drop table if exists public.tmp_table;
        create table public.tmp_table(
          src text,
          dst text
        );
        insert into public.tmp_table(src, dst)
        values ('bts', 'abc'), ('aaa', NULL)
    """)
    refresh_executors(Table(schema_name="public", table_name="tmp_table"), conn, context=ctx)

    failing_sql = """
        select src from {{ table_fullname }} where src != 'aaa'
    """
    failures_rule = CustomSqlRule(
        "sql_test_name",
        "sql_test",
        "src",
        failing_sql,
        "example description",
        only_failures_mode=True,
    )
    outcome = failures_rule.apply(conn)

    # The query yields one row ('bts'), counted purely as a failure.
    assert (outcome.total_records, outcome.failed, outcome.passed) == (0, 1, 0)
    conn.execute("""DROP TABLE tmp_table;""")
def test_rule_context_formatted_in_where():
    """Jinja context (ts_nodash) is rendered both in the rule's sql and its condition."""
    class TestRule(SqlRule):
        @property
        def sql(self):
            return "select a, b, c from {{table_fullname}}_{{ts_nodash}}"

    r = TestRule(
        name="test_rule_name",
        type="test_rule_type",
        condition="created_at >= '{{ts_nodash}}'::timestamptz - interval '10 minutes'",
        description="Greater than 0 when bags <> 0",
    )
    check_table = Table("raw", "table")
    context = ContessaRunner.get_context(check_table, {"ts_nodash": "20190101T000000"})
    # executor holds context of run, so set it
    refresh_executors(check_table, "", context)

    result = r.sql_with_where
    # Whitespace differences are irrelevant - both sides are normalized below.
    expected = """
        select a, b, c from raw.table_20190101T000000
        where created_at >= '20190101T000000'::timestamptz - interval '10 minutes'
    """
    assert normalize_str(result) == normalize_str(expected)
def test_sql_apply_extra_ctx(conn, ctx):
    """Extra context keys (dst_table, task_ts) are usable inside a custom sql."""
    ctx["dst_table"] = "public.dst_table"
    conn.execute("""
        drop table if exists public.dst_table;
        create table public.dst_table(
          src text,
          dst text,
          created timestamptz
        );
        insert into public.dst_table(src, dst, created)
        values
          ('bts', 'abc', '2018-09-12T12:00:00'),
          ('aaa', NULL, '2018-09-12T12:00:00'),
          ('aaa', NULL, '2019-07-31T12:00:00')
    """)
    refresh_executors(Table("public", "dst_table"), conn, context=ctx)

    templated_sql = """
        select src = 'aaa' from {{ dst_table }}
        where created between timestamptz '{{task_ts}}' and timestamptz '{{task_ts}}' + interval '60 seconds'
    """
    ctx_rule = CustomSqlRule("sql_test_name", "sql_test", "col1", templated_sql, "example description")
    outcome = ctx_rule.apply(conn)

    # Two rows fall inside the task_ts window: one 'aaa' (fail), one 'bts' (pass).
    assert (outcome.total_records, outcome.failed, outcome.passed) == (2, 1, 1)
    conn.execute("""DROP TABLE public.dst_table;""")
def test_sql_apply(conn, ctx):
    """A custom sql returning one boolean per row produces per-row results."""
    conn.execute("""
        drop table if exists public.tmp_table;
        create table public.tmp_table(
          src text,
          dst text
        );
        insert into public.tmp_table(src, dst)
        values ('bts', 'abc'), ('aaa', NULL)
    """)
    refresh_executors(Table(schema_name="public", table_name="tmp_table"), conn, context=ctx)

    boolean_sql = """
        select src = 'aaa' from {{ table_fullname }}
    """
    row_rule = CustomSqlRule("sql_test", boolean_sql, "example description")
    outcome = row_rule.apply(conn)

    # One boolean per inserted row: 'bts' -> False, 'aaa' -> True.
    wanted = pd.Series([False, True])
    assert list(wanted) == list(outcome)
    conn.execute("""DROP TABLE tmp_table;""")
def test_sql_standard_formatting(conn, ctx):
    """
    A standard (non-templated-beyond-context) custom sql in only-failures mode
    counts matching rows as failures while totals stay unknown (0).
    """
    ctx["dst_table"] = "public.dst_table"
    conn.execute("""
        drop table if exists public.dst_table;
        create table public.dst_table(
          src text,
          dst text,
          created timestamptz
        );
        insert into public.dst_table(src, dst, created)
        values ('bts', 'abc', '2018-09-12T12:00:00'), ('aaa', NULL, '2018-09-12T12:00:00'), ('aaa', NULL, '2019-07-31T12:00:00')
    """)
    refresh_executors(Table("public", "dst_table"), conn, context=ctx)
    sql = """
        select dst from {{ dst_table }} where dst LIKE 'a%'
    """
    rule = CustomSqlRule(
        "sql_test_name",
        "sql_test",
        None,
        sql,
        "example description",
        only_failures_mode=True,
    )
    results = rule.apply(conn)
    assert results.total_records == 0  # unknown because of only_failures_mode
    assert results.failed == 1  # only 'abc' matches LIKE 'a%'
    assert results.passed == 0
    # Clean up after ourselves like the sibling tests do - otherwise the
    # leftover public.dst_table leaks into later tests in the session.
    conn.execute("""DROP TABLE public.dst_table;""")
def test_compose_kwargs_sql_executor(dummy_contessa, ctx):
    """SqlExecutor hands only the connection to the rule's apply kwargs."""
    table = Table(**{"schema_name": "tmp", "table_name": "hello_world"})
    executor = SqlExecutor(table, dummy_contessa.conn, ctx)
    null_rule = NotNullRule("not_null", "src", time_filter="created_at")
    composed = executor.compose_kwargs(null_rule)
    assert composed == {"conn": dummy_contessa.conn}
def run(
    self,
    raw_rules: List[Dict[str, str]],
    check_table: Dict,
    result_table: Optional[Dict] = None,  # todo - docs for quality name, maybe defaults..
    context: Optional[Dict] = None,
) -> Union[CheckResult, QualityCheck]:
    """
    Run all quality rules against ``check_table``.

    :param raw_rules: rule definitions, normalized via ``normalize_rules``
    :param check_table: kwargs used to build a ``Table`` to check
    :param result_table: optional kwargs for ``ResultTable``; when given,
        results are upserted into the database, otherwise they are only
        returned as in-memory ``CheckResult`` objects
    :param context: extra context merged in by ``get_context``
    :return: the check result objects produced by ``do_quality_checks``
    """
    check_table = Table(**check_table)
    context = self.get_context(check_table, context)
    normalized_rules = self.normalize_rules(raw_rules)
    refresh_executors(check_table, self.conn, context)

    if result_table:
        # Persisted mode - make sure the destination table exists up front.
        result_table = ResultTable(**result_table, model_cls=self.model_cls)
        quality_check_class = self.get_quality_check_class(result_table)
        self.conn.ensure_table(quality_check_class.__table__)
    else:
        # In-memory mode - nothing is written to the database.
        quality_check_class = CheckResult

    rules = self.build_rules(normalized_rules)
    objs = self.do_quality_checks(quality_check_class, rules, context)

    if result_table:
        self.conn.upsert(objs)
    return objs
def test_compose_kwargs_pd_executor(dummy_contessa, ctx):
    """PandasExecutor passes the fetched dataframe to the rule's apply kwargs."""
    table = Table(**{"schema_name": "tmp", "table_name": "hello_world"})
    executor = PandasExecutor(table, dummy_contessa.conn, ctx)
    null_rule = NotNullRule("not_null", "src", time_filter="created_at")
    frame = pd.DataFrame([{"created_at": datetime(2017, 10, 10)}])
    # Stub out the actual database fetch.
    executor.conn.get_pandas_df = lambda x: frame
    composed = executor.compose_kwargs(null_rule)
    assert composed.keys() == {"df": frame}.keys()
def test_compose_kwargs_sql_executor_time_filter(dummy_contessa, ctx):
    """Every accepted time_filter shape composes the expected WHERE fragment."""
    t = Table(**{"schema_name": "tmp", "table_name": "hello_world"})
    e = SqlExecutor(t, dummy_contessa.conn, ctx)

    # Plain string column name -> a 30-day window (the lookback the expected
    # string encodes for an unspecified duration).
    rule = NotNullRule("not_null_name", "not_null", "src", time_filter="created_at")
    time_filter = e.compose_where_time_filter(rule)
    computed_datetime = (ctx["task_ts"] - timedelta(days=30)).strftime("%Y-%m-%d %H:%M:%S")
    expected = f"(created_at >= '{computed_datetime} UTC'::timestamptz AND created_at < '{ctx['task_ts']} UTC'::timestamptz)"
    assert time_filter == expected, "time_filter is string"

    # List of dicts with only a column behaves the same as the plain string.
    rule = NotNullRule("not_null_name", "not_null", "src", time_filter=[{
        "column": "created_at"
    }])
    time_filter = e.compose_where_time_filter(rule)
    computed_datetime = (ctx["task_ts"] - timedelta(days=30)).strftime("%Y-%m-%d %H:%M:%S")
    expected = f"(created_at >= '{computed_datetime} UTC'::timestamptz AND created_at < '{ctx['task_ts']} UTC'::timestamptz)"
    assert time_filter == expected, "time_filter has only column"

    # Two columns with explicit day windows are OR-ed together.
    rule = NotNullRule(
        "not_null_name",
        "not_null",
        "src",
        time_filter=[
            {
                "column": "created_at",
                "days": 10
            },
            {
                "column": "updated_at",
                "days": 1
            },
        ],
    )
    time_filter = e.compose_where_time_filter(rule)
    computed_created = (ctx["task_ts"] - timedelta(days=10)).strftime("%Y-%m-%d %H:%M:%S")
    computed_updated = (ctx["task_ts"] - timedelta(days=1)).strftime("%Y-%m-%d %H:%M:%S")
    expected = (
        f"(created_at >= '{computed_created} UTC'::timestamptz AND created_at < '{ctx['task_ts']} UTC'::timestamptz) OR "
        f"(updated_at >= '{computed_updated} UTC'::timestamptz AND updated_at < '{ctx['task_ts']} UTC'::timestamptz)"
    )
    assert time_filter == expected, "time_filter has 2 members"
def test_new_rule(conn, ctx):
    """A user-defined SqlRule subclass can inject its own jinja sql parameters."""
    class CountSqlRule(SqlRule):
        executor_cls = SqlExecutor

        def __init__(self, name, type, count, description=None, **kwargs):
            super().__init__(name, type, description=description, **kwargs)
            self.count = count

        def get_sql_parameters(self):
            # Extend the base jinja parameters with the expected row count.
            context = super().get_sql_parameters()
            context.update({"target_count": self.count})
            return context

        @property
        def sql(self):
            return """
                SELECT COUNT(*) = {{target_count}} FROM {{table_fullname}}
            """

    conn.execute("""
        drop table if exists public.tmp_table;
        create table public.tmp_table(
          a text,
          b text
        );
        insert into public.tmp_table(a, b)
        values ('bts', 'abc'), ('aaa', NULL)
    """)
    refresh_executors(Table(schema_name="public", table_name="tmp_table"), conn, context=ctx)

    # Both rows counted -> COUNT(*) = 2 holds -> the single record passes.
    rule = CountSqlRule("count_name", "count", 2)
    results = rule.apply(conn)
    expected = AggregatedResult(total_records=1, failed=0, passed=1)
    assert (expected.failed, expected.passed) == (results.failed, results.passed)

    # With the condition only one row qualifies -> COUNT(*) = 2 fails.
    rule = CountSqlRule("count_name", "count", 2, condition="a = 'bts'")
    results = rule.apply(conn)
    expected = AggregatedResult(total_records=1, failed=1, passed=0)
    assert (expected.failed, expected.passed) == (results.failed, results.passed)
    conn.execute("""DROP TABLE tmp_table;""")
def test_new_rule(conn, ctx):
    """A custom SqlRule subclass with extra sql parameters yields per-row booleans."""
    class CountSqlRule(SqlRule):
        executor_cls = SqlExecutor

        def __init__(self, name, count, **kwargs):
            super().__init__(name, **kwargs)
            self.count = count

        def get_sql_parameters(self):
            params = super().get_sql_parameters()
            params.update({"target_count": self.count})
            return params

        @property
        def sql(self):
            return """
                SELECT COUNT(*) = {{target_count}} FROM {{table_fullname}}
            """

    conn.execute("""
        drop table if exists public.tmp_table;
        create table public.tmp_table(
          a text,
          b text
        );
        insert into public.tmp_table(a, b)
        values ('bts', 'abc'), ('aaa', NULL)
    """)
    refresh_executors(Table(schema_name="public", table_name="tmp_table"), conn, context=ctx)

    # Unconditioned: both rows counted, so COUNT(*) = 2 is true.
    count_rule = CountSqlRule("count", 2)
    assert list(pd.Series([True])) == list(count_rule.apply(conn))

    # Condition restricts to a single row, so COUNT(*) = 2 is false.
    conditioned_rule = CountSqlRule("count", 2, condition="a = 'bts'")
    assert list(pd.Series([False])) == list(conditioned_rule.apply(conn))

    conn.execute("""DROP TABLE tmp_table;""")
def test_one_column_rule_sql(rule, expected, conn, ctx):
    """Parametrized: each one-column rule yields the expected per-row booleans."""
    conn.execute("""
        drop table if exists public.tmp_table;
        create table public.tmp_table(
          value int,
          value2 int
        );
        insert into public.tmp_table(value, value2)
        values (1, 2), (4, 5), (5, 3), (NULL, NULL), (4, 11)
    """)
    refresh_executors(Table(schema_name="public", table_name="tmp_table"), conn, context=ctx)
    outcome = rule.apply(conn)
    expected_series = pd.Series(expected, name=rule.column)
    assert list(expected_series) == list(outcome)
def test_sql_missing_jinja_param(conn, ctx):
    """Rendering fails loudly when the sql references an unknown jinja variable."""
    ctx["dst_table"] = "public.dst_table"
    refresh_executors(Table("public", "dst_table"), conn, context=ctx)
    broken_sql = """
        select dst from {{ dst_table }} where dst LIKE '{{ missing_pattern }}'
    """
    broken_rule = CustomSqlRule(
        "sql_test_name",
        "sql_test",
        None,
        broken_sql,
        "example description",
        only_failures_mode=True,
    )
    with pytest.raises(jinja2.exceptions.UndefinedError):
        broken_rule.apply(conn)
def run(
    self,
    raw_rules: List[Dict[str, str]],
    check_table: Dict,
    result_table: Dict,  # todo - docs for quality name, maybe defaults..
    context: Optional[Dict] = None,
):
    """
    Run quality rules against ``check_table`` and insert the results.

    :param raw_rules: rule definitions, normalized via ``normalize_rules``
    :param check_table: kwargs used to build the ``Table`` to check
    :param result_table: kwargs used to build the ``ResultTable`` the check
        results are written into
    :param context: extra context merged in by ``get_context``
    """
    check_table = Table(**check_table)
    result_table = ResultTable(**result_table)
    context = self.get_context(check_table, context)
    normalized_rules = self.normalize_rules(raw_rules)
    refresh_executors(check_table, self.conn, context)

    # Make sure the destination table exists before any results are written.
    quality_check_class = self.get_quality_check_class(result_table)
    self.ensure_table(quality_check_class)

    rules = self.build_rules(normalized_rules)
    objs = self.do_quality_checks(quality_check_class, rules, context)
    self.insert(objs)
def test_one_column_rule_sql_condition(rule, expected, conn, ctx):
    """Parametrized: conditioned rules produce the expected per-row booleans."""
    conn.execute("""
        drop table if exists public.tmp_table;
        create table public.tmp_table(
          value int,
          conditional boolean,
          date timestamptz
        );
        insert into public.tmp_table(value, conditional, date)
        values
          (1, TRUE, NULL),
          (4, TRUE, NULL),
          (5, FALSE, NULL),
          (NULL, FALSE, NULL),
          (4, FALSE, '2019-10-02T13:30:00+0020')
    """)
    refresh_executors(Table(schema_name="public", table_name="tmp_table"), conn, context=ctx)
    outcome = rule.apply(conn)
    expected_series = pd.Series(expected, name=rule.column)
    assert list(expected_series) == list(outcome)
def test_cmp_with_other_col_rules(rule, expected, conn, ctx):
    """Parametrized: column-comparison rules yield the expected per-row booleans."""
    conn.execute("""
        drop table if exists public.tmp_table;
        create table public.tmp_table(
          value1 int,
          value2 int,
          value3 int,
          value4 int
        );
        insert into public.tmp_table(value1, value2, value3, value4)
        values (1, 2, 1, 1), (1, 1, 1, NULL), (1, 1, 1, 1)
    """)
    refresh_executors(Table(schema_name="public", table_name="tmp_table"), conn, context=ctx)
    outcome = rule.apply(conn)
    expected_series = pd.Series(expected, name=rule.column)
    assert list(expected_series) == list(outcome)
def test_one_column_rule_sql(rule, expected, conn, ctx):
    """Parametrized: each one-column rule produces the expected aggregated counts."""
    conn.execute("""
        drop table if exists public.tmp_table;
        create table public.tmp_table(
          value int,
          value2 int
        );
        insert into public.tmp_table(value, value2)
        values (1, 2), (4, 5), (5, 3), (NULL, NULL), (4, 11)
    """)
    refresh_executors(Table(schema_name="public", table_name="tmp_table"), conn, context=ctx)
    outcome = rule.apply(conn)
    assert outcome.total_records == expected.total_records
    assert outcome.failed == expected.failed
    assert outcome.passed == expected.passed
def construct_automatic_time_filter(
    self,
    left_check_table: Dict,
    created_at_column: Optional[str] = None,
    updated_at_column: Optional[str] = None,
) -> TimeFilter:
    """
    Build a TimeFilter whose lower bound is the minimum value of a time column
    in the left table.

    ``updated_at_column`` takes precedence over ``created_at_column`` when both
    are supplied.

    :param left_check_table: kwargs used to build a ``Table`` for the left side
    :param created_at_column: name of a creation-timestamp column, if any
    :param updated_at_column: name of an update-timestamp column, if any
    :return: a ``TimeFilter`` with a single column bounded from below
    :raises ValueError: when neither column name is provided
    """
    left_check_table = Table(**left_check_table)
    if created_at_column is None and updated_at_column is None:
        raise ValueError(
            "Automatic time filter need at least one time column")
    since_column = updated_at_column or created_at_column
    # Query the left database for the earliest value of the chosen column.
    since_sql = f"SELECT min({since_column}) FROM {left_check_table.fullname}"
    logging.info(since_sql)
    since = self.left_conn.get_records(since_sql).scalar()
    # NOTE(review): if the table is empty, min() yields NULL and `since` is
    # None - presumably TimeFilterColumn tolerates that; confirm.
    return TimeFilter(
        columns=[
            TimeFilterColumn(since_column, since=since),
        ],
        conjunction=TimeFilterConjunction.AND,
    )
def test_cmp_with_other_col_rules(rule, expected, conn, ctx):
    """Parametrized: column-comparison rules produce the expected aggregated counts."""
    conn.execute("""
        drop table if exists public.tmp_table;
        create table public.tmp_table(
          value1 int,
          value2 int,
          value3 int,
          value4 int
        );
        insert into public.tmp_table(value1, value2, value3, value4)
        values (1, 2, 1, 1), (1, 1, 1, NULL), (1, 1, 1, 1)
    """)
    refresh_executors(Table(schema_name="public", table_name="tmp_table"), conn, context=ctx)
    outcome = rule.apply(conn)
    assert outcome.total_records == expected.total_records
    assert outcome.failed == expected.failed
    assert outcome.passed == expected.passed
def setUpClass(cls) -> None:
    """Set up the shared connector and the alembic migration-table handle."""
    cls.conn = Connector(TEST_DB_URI)
    cls.alembic_cfg = Config(ALEMBIC_INI_PATH)
    # Resolve the version-table name from the alembic configuration.
    version_table = cls.alembic_cfg.get_main_option("version_table")
    cls.migration_table = Table(DATA_QUALITY_SCHEMA, version_table)
def e(conn, ctx):
    """Fixture: a minimal Executor implementation bound to a dummy table."""
    class _NoKwargsExecutor(Executor):
        # The abstract hook is satisfied with an empty kwargs dict.
        def compose_kwargs(self, rule):
            return {}

    return _NoKwargsExecutor(Table("tmp", "hello_world"), conn, ctx)