def test_sql_apply_only_failures(conn, ctx): conn.execute(""" drop table if exists public.tmp_table; create table public.tmp_table( src text, dst text ); insert into public.tmp_table(src, dst) values ('bts', 'abc'), ('aaa', NULL) """) refresh_executors(Table(schema_name="public", table_name="tmp_table"), conn, context=ctx) sql = """ select src from {{ table_fullname }} where src != 'aaa' """ rule = CustomSqlRule( "sql_test_name", "sql_test", "src", sql, "example description", only_failures_mode=True, ) results = rule.apply(conn) assert results.total_records == 0 assert results.failed == 1 assert results.passed == 0 conn.execute("""DROP TABLE tmp_table;""")
def test_sql_standard_formatting(conn, ctx): ctx["dst_table"] = "public.dst_table" conn.execute(""" drop table if exists public.dst_table; create table public.dst_table( src text, dst text, created timestamptz ); insert into public.dst_table(src, dst, created) values ('bts', 'abc', '2018-09-12T12:00:00'), ('aaa', NULL, '2018-09-12T12:00:00'), ('aaa', NULL, '2019-07-31T12:00:00') """) refresh_executors(Table("public", "dst_table"), conn, context=ctx) sql = """ select dst from {{ dst_table }} where dst LIKE 'a%' """ rule = CustomSqlRule( "sql_test_name", "sql_test", None, sql, "example description", only_failures_mode=True, ) results = rule.apply(conn) assert results.total_records == 0 # unknown because of only_failures_mode assert results.failed == 1 assert results.passed == 0
def test_sql_apply_extra_ctx(conn, ctx): ctx["dst_table"] = "public.dst_table" conn.execute(""" drop table if exists public.dst_table; create table public.dst_table( src text, dst text, created timestamptz ); insert into public.dst_table(src, dst, created) values ('bts', 'abc', '2018-09-12T12:00:00'), ('aaa', NULL, '2018-09-12T12:00:00'), ('aaa', NULL, '2019-07-31T12:00:00') """) refresh_executors(Table("public", "dst_table"), conn, context=ctx) sql = """ select src = 'aaa' from {{ dst_table }} where created between timestamptz '{{task_ts}}' and timestamptz '{{task_ts}}' + interval '60 seconds' """ rule = CustomSqlRule("sql_test_name", "sql_test", "col1", sql, "example description") results = rule.apply(conn) assert results.total_records == 2 assert results.failed == 1 assert results.passed == 1 conn.execute("""DROP TABLE public.dst_table;""")
def test_sql_apply(conn, ctx): conn.execute(""" drop table if exists public.tmp_table; create table public.tmp_table( src text, dst text ); insert into public.tmp_table(src, dst) values ('bts', 'abc'), ('aaa', NULL) """) refresh_executors(Table(schema_name="public", table_name="tmp_table"), conn, context=ctx) sql = """ select src = 'aaa' from {{ table_fullname }} """ rule = CustomSqlRule("sql_test", sql, "example description") results = rule.apply(conn) expected = pd.Series([False, True]) assert list(expected) == list(results) conn.execute("""DROP TABLE tmp_table;""")
def test_rule_context_formatted_in_where(): class TestRule(SqlRule): @property def sql(self): return "select a, b, c from {{table_fullname}}_{{ts_nodash}}" r = TestRule( name="test_rule_name", type="test_rule_type", condition="created_at >= '{{ts_nodash}}'::timestamptz - interval '10 minutes'", description="Greater than 0 when bags <> 0", ) check_table = Table("raw", "table") context = ContessaRunner.get_context(check_table, {"ts_nodash": "20190101T000000"}) # executor holds context of run, so set it refresh_executors(check_table, "", context) result = r.sql_with_where expected = """ select a, b, c from raw.table_20190101T000000 where created_at >= '20190101T000000'::timestamptz - interval '10 minutes' """ assert normalize_str(result) == normalize_str(expected)
def run( self, raw_rules: List[Dict[str, str]], check_table: Dict, result_table: Optional[ Dict] = None, # todo - docs for quality name, maybe defaults.. context: Optional[Dict] = None, ) -> Union[CheckResult, QualityCheck]: check_table = Table(**check_table) context = self.get_context(check_table, context) normalized_rules = self.normalize_rules(raw_rules) refresh_executors(check_table, self.conn, context) if result_table: result_table = ResultTable(**result_table, model_cls=self.model_cls) quality_check_class = self.get_quality_check_class(result_table) self.conn.ensure_table(quality_check_class.__table__) else: quality_check_class = CheckResult rules = self.build_rules(normalized_rules) objs = self.do_quality_checks(quality_check_class, rules, context) if result_table: self.conn.upsert(objs) return objs
def test_new_rule(conn, ctx): class CountSqlRule(SqlRule): executor_cls = SqlExecutor def __init__(self, name, type, count, description=None, **kwargs): super().__init__(name, type, description=description, **kwargs) self.count = count def get_sql_parameters(self): context = super().get_sql_parameters() context.update({"target_count": self.count}) return context @property def sql(self): return """ SELECT COUNT(*) = {{target_count}} FROM {{table_fullname}} """ conn.execute(""" drop table if exists public.tmp_table; create table public.tmp_table( a text, b text ); insert into public.tmp_table(a, b) values ('bts', 'abc'), ('aaa', NULL) """) refresh_executors(Table(schema_name="public", table_name="tmp_table"), conn, context=ctx) rule = CountSqlRule("count_name", "count", 2) results = rule.apply(conn) expected = AggregatedResult(total_records=1, failed=0, passed=1) assert (expected.failed, expected.passed) == (results.failed, results.passed) rule = CountSqlRule("count_name", "count", 2, condition="a = 'bts'") results = rule.apply(conn) expected = AggregatedResult(total_records=1, failed=1, passed=0) assert (expected.failed, expected.passed) == (results.failed, results.passed) conn.execute("""DROP TABLE tmp_table;""")
def test_new_rule(conn, ctx): class CountSqlRule(SqlRule): executor_cls = SqlExecutor def __init__(self, name, count, **kwargs): super().__init__(name, **kwargs) self.count = count def get_sql_parameters(self): context = super().get_sql_parameters() context.update({"target_count": self.count}) return context @property def sql(self): return """ SELECT COUNT(*) = {{target_count}} FROM {{table_fullname}} """ conn.execute(""" drop table if exists public.tmp_table; create table public.tmp_table( a text, b text ); insert into public.tmp_table(a, b) values ('bts', 'abc'), ('aaa', NULL) """) refresh_executors(Table(schema_name="public", table_name="tmp_table"), conn, context=ctx) rule = CountSqlRule("count", 2) results = rule.apply(conn) expected = pd.Series([True]) assert list(expected) == list(results) rule = CountSqlRule("count", 2, condition="a = 'bts'") results = rule.apply(conn) expected = pd.Series([False]) assert list(expected) == list(results) conn.execute("""DROP TABLE tmp_table;""")
def test_one_column_rule_sql(rule, expected, conn, ctx): conn.execute(""" drop table if exists public.tmp_table; create table public.tmp_table( value int, value2 int ); insert into public.tmp_table(value, value2) values (1, 2), (4, 5), (5, 3), (NULL, NULL), (4, 11) """) refresh_executors(Table(schema_name="public", table_name="tmp_table"), conn, context=ctx) results = rule.apply(conn) expected = pd.Series(expected, name=rule.column) assert list(expected) == list(results)
def test_sql_missing_jinja_param(conn, ctx): ctx["dst_table"] = "public.dst_table" refresh_executors(Table("public", "dst_table"), conn, context=ctx) sql = """ select dst from {{ dst_table }} where dst LIKE '{{ missing_pattern }}' """ rule = CustomSqlRule( "sql_test_name", "sql_test", None, sql, "example description", only_failures_mode=True, ) with pytest.raises(jinja2.exceptions.UndefinedError): rule.apply(conn)
def run( self, raw_rules: List[Dict[str, str]], check_table: Dict, result_table: Dict, # todo - docs for quality name, maybe defaults.. context: Optional[Dict] = None, ): check_table = Table(**check_table) result_table = ResultTable(**result_table) context = self.get_context(check_table, context) normalized_rules = self.normalize_rules(raw_rules) refresh_executors(check_table, self.conn, context) quality_check_class = self.get_quality_check_class(result_table) self.ensure_table(quality_check_class) rules = self.build_rules(normalized_rules) objs = self.do_quality_checks(quality_check_class, rules, context) self.insert(objs)
def test_one_column_rule_sql_condition(rule, expected, conn, ctx): conn.execute(""" drop table if exists public.tmp_table; create table public.tmp_table( value int, conditional boolean, date timestamptz ); insert into public.tmp_table(value, conditional, date) values (1, TRUE, NULL), (4, TRUE, NULL), (5, FALSE, NULL), (NULL, FALSE, NULL), (4, FALSE, '2019-10-02T13:30:00+0020') """) refresh_executors(Table(schema_name="public", table_name="tmp_table"), conn, context=ctx) results = rule.apply(conn) expected = pd.Series(expected, name=rule.column) assert list(expected) == list(results)
def test_cmp_with_other_col_rules(rule, expected, conn, ctx): conn.execute(""" drop table if exists public.tmp_table; create table public.tmp_table( value1 int, value2 int, value3 int, value4 int ); insert into public.tmp_table(value1, value2, value3, value4) values (1, 2, 1, 1), (1, 1, 1, NULL), (1, 1, 1, 1) """) refresh_executors(Table(schema_name="public", table_name="tmp_table"), conn, context=ctx) results = rule.apply(conn) expected = pd.Series(expected, name=rule.column) assert list(expected) == list(results)
def test_one_column_rule_sql(rule, expected, conn, ctx): conn.execute(""" drop table if exists public.tmp_table; create table public.tmp_table( value int, value2 int ); insert into public.tmp_table(value, value2) values (1, 2), (4, 5), (5, 3), (NULL, NULL), (4, 11) """) refresh_executors(Table(schema_name="public", table_name="tmp_table"), conn, context=ctx) results = rule.apply(conn) assert (expected.total_records, expected.failed, expected.passed) == ( results.total_records, results.failed, results.passed, )
def test_cmp_with_other_col_rules(rule, expected, conn, ctx): conn.execute(""" drop table if exists public.tmp_table; create table public.tmp_table( value1 int, value2 int, value3 int, value4 int ); insert into public.tmp_table(value1, value2, value3, value4) values (1, 2, 1, 1), (1, 1, 1, NULL), (1, 1, 1, 1) """) refresh_executors(Table(schema_name="public", table_name="tmp_table"), conn, context=ctx) results = rule.apply(conn) assert (expected.total_records, expected.failed, expected.passed) == ( results.total_records, results.failed, results.passed, )