예제 #1
0
    def run(
        self,
        method: str,
        left_check_table: Dict,
        right_check_table: Dict,
        result_table: Optional[Dict] = None,
        columns: Optional[List[str]] = None,
        time_filter: Optional[Union[str, List[Dict], TimeFilter]] = None,
        left_custom_sql: str = None,
        right_custom_sql: str = None,
        context: Optional[Dict] = None,
        example_selector: ExampleSelector = default_example_selector,
    ) -> Union[CheckResult, ConsistencyCheck]:
        if left_custom_sql and right_custom_sql:
            if columns or time_filter:
                raise ValueError(
                    "When using custom sqls you cannot change 'columns' or 'time_filter' attribute"
                )

        time_filter = parse_time_filter(time_filter)

        left_check_table = Table(**left_check_table)
        right_check_table = Table(**right_check_table)
        context = self.get_context(left_check_table, right_check_table,
                                   context)

        result = self.do_consistency_check(
            method,
            columns,
            time_filter,
            left_check_table,
            right_check_table,
            left_custom_sql,
            right_custom_sql,
            context,
            example_selector,
        )

        if result_table:
            result_table = ResultTable(**result_table,
                                       model_cls=self.model_cls)
            quality_check_class = create_default_check_class(result_table)
            self.right_conn.ensure_table(quality_check_class.__table__)
            self.upsert(quality_check_class, result)
            return result

        obj = CheckResult()
        obj.init_row_consistency(**result)
        return obj
예제 #2
0
def test_direct_time_filter_usage(dummy_contessa, ctx):
    t = Table(**{"schema_name": "tmp", "table_name": "hello_world"})
    e = SqlExecutor(t, dummy_contessa.conn, ctx)

    rule = NotNullRule(
        "not_null_name",
        "not_null",
        "src",
        time_filter=TimeFilter(
            columns=[
                TimeFilterColumn("created_at",
                                 since=timedelta(days=10),
                                 until="now"),
                TimeFilterColumn("updated_at", since=timedelta(days=1)),
            ],
            conjunction=TimeFilterConjunction.AND,
        ),
    )
    time_filter = e.compose_where_time_filter(rule)
    computed_created = (ctx["task_ts"] -
                        timedelta(days=10)).strftime("%Y-%m-%d %H:%M:%S")
    computed_updated = (ctx["task_ts"] -
                        timedelta(days=1)).strftime("%Y-%m-%d %H:%M:%S")
    expected = (
        f"(created_at >= '{computed_created} UTC'::timestamptz AND created_at < '{ctx['task_ts']} UTC'::timestamptz) AND "
        f"(updated_at >= '{computed_updated} UTC'::timestamptz)")
    assert time_filter == expected, "TimeFilter type can be used directly"
예제 #3
0
def test_sql_apply_only_failures(conn, ctx):
    conn.execute("""
        drop table if exists public.tmp_table;

        create table public.tmp_table(
          src text,
          dst text
        );

        insert into public.tmp_table(src, dst)
        values ('bts', 'abc'), ('aaa', NULL)
    """)
    refresh_executors(Table(schema_name="public", table_name="tmp_table"),
                      conn,
                      context=ctx)

    sql = """
        select src
        from {{ table_fullname }}
        where src != 'aaa'
    """
    rule = CustomSqlRule(
        "sql_test_name",
        "sql_test",
        "src",
        sql,
        "example description",
        only_failures_mode=True,
    )
    results = rule.apply(conn)
    assert results.total_records == 0
    assert results.failed == 1
    assert results.passed == 0
    conn.execute("""DROP TABLE tmp_table;""")
예제 #4
0
def test_rule_context_formatted_in_where():
    class TestRule(SqlRule):
        @property
        def sql(self):
            return "select a, b, c from {{table_fullname}}_{{ts_nodash}}"

    r = TestRule(
        name="test_rule_name",
        type="test_rule_type",
        condition="created_at >= '{{ts_nodash}}'::timestamptz - interval '10 minutes'",
        description="Greater than 0 when bags <> 0",
    )
    check_table = Table("raw", "table")
    context = ContessaRunner.get_context(check_table, {"ts_nodash": "20190101T000000"})

    # executor holds context of run, so set it
    refresh_executors(check_table, "", context)

    result = r.sql_with_where
    expected = """
		select a, b, c
		from raw.table_20190101T000000
		where created_at >= '20190101T000000'::timestamptz - interval '10 minutes'
	"""
    assert normalize_str(result) == normalize_str(expected)
예제 #5
0
def test_sql_apply_extra_ctx(conn, ctx):
    ctx["dst_table"] = "public.dst_table"
    conn.execute("""
        drop table if exists public.dst_table;

        create table public.dst_table(
          src text,
          dst text,
          created timestamptz
        );

        insert into public.dst_table(src, dst, created)
        values ('bts', 'abc', '2018-09-12T12:00:00'), ('aaa', NULL, '2018-09-12T12:00:00'), ('aaa', NULL, '2019-07-31T12:00:00')
    """)
    refresh_executors(Table("public", "dst_table"), conn, context=ctx)

    sql = """
        select
        src = 'aaa'
        from {{ dst_table }}
        where created between timestamptz '{{task_ts}}' and timestamptz '{{task_ts}}' + interval '60 seconds'
    """
    rule = CustomSqlRule("sql_test_name", "sql_test", "col1", sql,
                         "example description")
    results = rule.apply(conn)
    assert results.total_records == 2
    assert results.failed == 1
    assert results.passed == 1
    conn.execute("""DROP TABLE public.dst_table;""")
예제 #6
0
def test_sql_apply(conn, ctx):
    conn.execute("""
        drop table if exists public.tmp_table;

        create table public.tmp_table(
          src text,
          dst text
        );

        insert into public.tmp_table(src, dst)
        values ('bts', 'abc'), ('aaa', NULL)
    """)
    refresh_executors(Table(schema_name="public", table_name="tmp_table"),
                      conn,
                      context=ctx)

    sql = """
        select
        src = 'aaa'
        from {{ table_fullname }}
    """
    rule = CustomSqlRule("sql_test", sql, "example description")
    results = rule.apply(conn)
    expected = pd.Series([False, True])
    assert list(expected) == list(results)
    conn.execute("""DROP TABLE tmp_table;""")
예제 #7
0
def test_sql_standard_formatting(conn, ctx):
    ctx["dst_table"] = "public.dst_table"
    conn.execute("""
        drop table if exists public.dst_table;

        create table public.dst_table(
          src text,
          dst text,
          created timestamptz
        );

        insert into public.dst_table(src, dst, created)
        values ('bts', 'abc', '2018-09-12T12:00:00'), ('aaa', NULL, '2018-09-12T12:00:00'), ('aaa', NULL, '2019-07-31T12:00:00')
    """)
    refresh_executors(Table("public", "dst_table"), conn, context=ctx)

    sql = """
        select dst
        from {{ dst_table }}
        where dst LIKE 'a%'
    """
    rule = CustomSqlRule(
        "sql_test_name",
        "sql_test",
        None,
        sql,
        "example description",
        only_failures_mode=True,
    )
    results = rule.apply(conn)
    assert results.total_records == 0  # unknown because of only_failures_mode
    assert results.failed == 1
    assert results.passed == 0
예제 #8
0
def test_compose_kwargs_sql_executor(dummy_contessa, ctx):
    t = Table(**{"schema_name": "tmp", "table_name": "hello_world"})
    e = SqlExecutor(t, dummy_contessa.conn, ctx)
    rule = NotNullRule("not_null", "src", time_filter="created_at")
    kwargs = e.compose_kwargs(rule)
    expected = {"conn": dummy_contessa.conn}
    assert kwargs == expected
예제 #9
0
    def run(
        self,
        raw_rules: List[Dict[str, str]],
        check_table: Dict,
        result_table: Optional[
            Dict] = None,  # todo - docs for quality name, maybe defaults..
        context: Optional[Dict] = None,
    ) -> Union[CheckResult, QualityCheck]:
        check_table = Table(**check_table)
        context = self.get_context(check_table, context)

        normalized_rules = self.normalize_rules(raw_rules)
        refresh_executors(check_table, self.conn, context)

        if result_table:
            result_table = ResultTable(**result_table,
                                       model_cls=self.model_cls)
            quality_check_class = self.get_quality_check_class(result_table)
            self.conn.ensure_table(quality_check_class.__table__)
        else:
            quality_check_class = CheckResult

        rules = self.build_rules(normalized_rules)
        objs = self.do_quality_checks(quality_check_class, rules, context)

        if result_table:
            self.conn.upsert(objs)
        return objs
예제 #10
0
def test_compose_kwargs_pd_executor(dummy_contessa, ctx):
    t = Table(**{"schema_name": "tmp", "table_name": "hello_world"})
    e = PandasExecutor(t, dummy_contessa.conn, ctx)
    rule = NotNullRule("not_null", "src", time_filter="created_at")
    df = pd.DataFrame([{"created_at": datetime(2017, 10, 10)}])
    e.conn.get_pandas_df = lambda x: df
    kwargs = e.compose_kwargs(rule)
    expected = {"df": df}
    assert kwargs.keys() == expected.keys()
예제 #11
0
def test_compose_kwargs_sql_executor_time_filter(dummy_contessa, ctx):
    t = Table(**{"schema_name": "tmp", "table_name": "hello_world"})
    e = SqlExecutor(t, dummy_contessa.conn, ctx)

    rule = NotNullRule("not_null_name",
                       "not_null",
                       "src",
                       time_filter="created_at")
    time_filter = e.compose_where_time_filter(rule)
    computed_datetime = (ctx["task_ts"] -
                         timedelta(days=30)).strftime("%Y-%m-%d %H:%M:%S")
    expected = f"(created_at >= '{computed_datetime} UTC'::timestamptz AND created_at < '{ctx['task_ts']} UTC'::timestamptz)"
    assert time_filter == expected, "time_filter is string"

    rule = NotNullRule("not_null_name",
                       "not_null",
                       "src",
                       time_filter=[{
                           "column": "created_at"
                       }])
    time_filter = e.compose_where_time_filter(rule)
    computed_datetime = (ctx["task_ts"] -
                         timedelta(days=30)).strftime("%Y-%m-%d %H:%M:%S")
    expected = f"(created_at >= '{computed_datetime} UTC'::timestamptz AND created_at < '{ctx['task_ts']} UTC'::timestamptz)"
    assert time_filter == expected, "time_filter has only column"

    rule = NotNullRule(
        "not_null_name",
        "not_null",
        "src",
        time_filter=[
            {
                "column": "created_at",
                "days": 10
            },
            {
                "column": "updated_at",
                "days": 1
            },
        ],
    )
    time_filter = e.compose_where_time_filter(rule)
    computed_created = (ctx["task_ts"] -
                        timedelta(days=10)).strftime("%Y-%m-%d %H:%M:%S")
    computed_updated = (ctx["task_ts"] -
                        timedelta(days=1)).strftime("%Y-%m-%d %H:%M:%S")
    expected = (
        f"(created_at >= '{computed_created} UTC'::timestamptz AND created_at < '{ctx['task_ts']} UTC'::timestamptz) OR "
        f"(updated_at >= '{computed_updated} UTC'::timestamptz AND updated_at < '{ctx['task_ts']} UTC'::timestamptz)"
    )
    assert time_filter == expected, "time_filter has 2 members"
예제 #12
0
def test_new_rule(conn, ctx):
    class CountSqlRule(SqlRule):
        executor_cls = SqlExecutor

        def __init__(self, name, type, count, description=None, **kwargs):
            super().__init__(name, type, description=description, **kwargs)
            self.count = count

        def get_sql_parameters(self):
            context = super().get_sql_parameters()
            context.update({"target_count": self.count})
            return context

        @property
        def sql(self):
            return """
                SELECT COUNT(*) = {{target_count}} FROM {{table_fullname}}
            """

    conn.execute("""
        drop table if exists public.tmp_table;
    
        create table public.tmp_table(
          a text,
          b text
        );
    
        insert into public.tmp_table(a, b)
        values ('bts', 'abc'), ('aaa', NULL)
    """)

    refresh_executors(Table(schema_name="public", table_name="tmp_table"),
                      conn,
                      context=ctx)
    rule = CountSqlRule("count_name", "count", 2)
    results = rule.apply(conn)
    expected = AggregatedResult(total_records=1, failed=0, passed=1)
    assert (expected.failed, expected.passed) == (results.failed,
                                                  results.passed)

    rule = CountSqlRule("count_name", "count", 2, condition="a = 'bts'")
    results = rule.apply(conn)
    expected = AggregatedResult(total_records=1, failed=1, passed=0)
    assert (expected.failed, expected.passed) == (results.failed,
                                                  results.passed)
    conn.execute("""DROP TABLE tmp_table;""")
예제 #13
0
def test_new_rule(conn, ctx):
    class CountSqlRule(SqlRule):
        executor_cls = SqlExecutor

        def __init__(self, name, count, **kwargs):
            super().__init__(name, **kwargs)
            self.count = count

        def get_sql_parameters(self):
            context = super().get_sql_parameters()
            context.update({"target_count": self.count})
            return context

        @property
        def sql(self):
            return """
                SELECT COUNT(*) = {{target_count}} FROM {{table_fullname}}
            """

    conn.execute("""
        drop table if exists public.tmp_table;
    
        create table public.tmp_table(
          a text,
          b text
        );
    
        insert into public.tmp_table(a, b)
        values ('bts', 'abc'), ('aaa', NULL)
    """)

    refresh_executors(Table(schema_name="public", table_name="tmp_table"),
                      conn,
                      context=ctx)
    rule = CountSqlRule("count", 2)
    results = rule.apply(conn)
    expected = pd.Series([True])
    assert list(expected) == list(results)

    rule = CountSqlRule("count", 2, condition="a = 'bts'")
    results = rule.apply(conn)
    expected = pd.Series([False])
    assert list(expected) == list(results)
    conn.execute("""DROP TABLE tmp_table;""")
예제 #14
0
def test_one_column_rule_sql(rule, expected, conn, ctx):
    conn.execute("""
            drop table if exists public.tmp_table;

            create table public.tmp_table(
              value int,
              value2 int
            );

            insert into public.tmp_table(value, value2)
            values (1, 2), (4, 5), (5, 3), (NULL, NULL), (4, 11)
        """)
    refresh_executors(Table(schema_name="public", table_name="tmp_table"),
                      conn,
                      context=ctx)

    results = rule.apply(conn)
    expected = pd.Series(expected, name=rule.column)
    assert list(expected) == list(results)
예제 #15
0
def test_sql_missing_jinja_param(conn, ctx):
    ctx["dst_table"] = "public.dst_table"
    refresh_executors(Table("public", "dst_table"), conn, context=ctx)

    sql = """
        select dst
        from {{ dst_table }}
        where dst LIKE '{{ missing_pattern }}'
    """
    rule = CustomSqlRule(
        "sql_test_name",
        "sql_test",
        None,
        sql,
        "example description",
        only_failures_mode=True,
    )

    with pytest.raises(jinja2.exceptions.UndefinedError):
        rule.apply(conn)
예제 #16
0
    def run(
        self,
        raw_rules: List[Dict[str, str]],
        check_table: Dict,
        result_table: Dict,  # todo - docs for quality name, maybe defaults..
        context: Optional[Dict] = None,
    ):
        check_table = Table(**check_table)
        result_table = ResultTable(**result_table)
        context = self.get_context(check_table, context)

        normalized_rules = self.normalize_rules(raw_rules)
        refresh_executors(check_table, self.conn, context)
        quality_check_class = self.get_quality_check_class(result_table)
        self.ensure_table(quality_check_class)

        rules = self.build_rules(normalized_rules)
        objs = self.do_quality_checks(quality_check_class, rules, context)

        self.insert(objs)
예제 #17
0
def test_one_column_rule_sql_condition(rule, expected, conn, ctx):
    conn.execute("""
            drop table if exists public.tmp_table;

            create table public.tmp_table(
              value int,
              conditional boolean,
              date timestamptz
            );

            insert into public.tmp_table(value, conditional, date)
            values (1, TRUE, NULL), (4, TRUE, NULL), (5, FALSE, NULL), (NULL, FALSE, NULL), (4, FALSE, '2019-10-02T13:30:00+0020')
        """)
    refresh_executors(Table(schema_name="public", table_name="tmp_table"),
                      conn,
                      context=ctx)

    results = rule.apply(conn)
    expected = pd.Series(expected, name=rule.column)
    assert list(expected) == list(results)
예제 #18
0
def test_cmp_with_other_col_rules(rule, expected, conn, ctx):
    conn.execute("""
        drop table if exists public.tmp_table;

        create table public.tmp_table(
          value1 int,
          value2 int,
          value3 int,
          value4 int
        );

        insert into public.tmp_table(value1, value2, value3, value4)
        values (1, 2, 1, 1), (1, 1, 1, NULL), (1, 1, 1, 1)
    """)
    refresh_executors(Table(schema_name="public", table_name="tmp_table"),
                      conn,
                      context=ctx)

    results = rule.apply(conn)
    expected = pd.Series(expected, name=rule.column)
    assert list(expected) == list(results)
예제 #19
0
def test_one_column_rule_sql(rule, expected, conn, ctx):
    conn.execute("""
            drop table if exists public.tmp_table;

            create table public.tmp_table(
              value int,
              value2 int
            );

            insert into public.tmp_table(value, value2)
            values (1, 2), (4, 5), (5, 3), (NULL, NULL), (4, 11)
        """)
    refresh_executors(Table(schema_name="public", table_name="tmp_table"),
                      conn,
                      context=ctx)

    results = rule.apply(conn)
    assert (expected.total_records, expected.failed, expected.passed) == (
        results.total_records,
        results.failed,
        results.passed,
    )
예제 #20
0
    def construct_automatic_time_filter(
        self,
        left_check_table: Dict,
        created_at_column=None,
        updated_at_column=None,
    ) -> TimeFilter:
        left_check_table = Table(**left_check_table)

        if created_at_column is None and updated_at_column is None:
            raise ValueError(
                "Automatic time filter need at least one time column")

        since_column = updated_at_column or created_at_column
        since_sql = f"SELECT min({since_column}) FROM {left_check_table.fullname}"
        logging.info(since_sql)
        since = self.left_conn.get_records(since_sql).scalar()

        return TimeFilter(
            columns=[
                TimeFilterColumn(since_column, since=since),
            ],
            conjunction=TimeFilterConjunction.AND,
        )
예제 #21
0
def test_cmp_with_other_col_rules(rule, expected, conn, ctx):
    conn.execute("""
        drop table if exists public.tmp_table;

        create table public.tmp_table(
          value1 int,
          value2 int,
          value3 int,
          value4 int
        );

        insert into public.tmp_table(value1, value2, value3, value4)
        values (1, 2, 1, 1), (1, 1, 1, NULL), (1, 1, 1, 1)
    """)
    refresh_executors(Table(schema_name="public", table_name="tmp_table"),
                      conn,
                      context=ctx)

    results = rule.apply(conn)
    assert (expected.total_records, expected.failed, expected.passed) == (
        results.total_records,
        results.failed,
        results.passed,
    )
예제 #22
0
    def setUpClass(cls) -> None:
        cls.conn = Connector(TEST_DB_URI)
        cls.alembic_cfg = Config(ALEMBIC_INI_PATH)

        migration_table_name = cls.alembic_cfg.get_main_option("version_table")
        cls.migration_table = Table(DATA_QUALITY_SCHEMA, migration_table_name)
예제 #23
0
def e(conn, ctx):
    class ConcreteExecutor(Executor):
        def compose_kwargs(self, rule):
            return {}

    return ConcreteExecutor(Table("tmp", "hello_world"), conn, ctx)