Пример #1
0
 def setUp(self):
     """
     Recreate the data-quality schema and seed both result tables.

     Drops and recreates ``DATA_QUALITY_SCHEMA``, creates one quality-check
     table and one consistency-check table using the 0.2.4 DDL helpers, and
     inserts a single row into each of them.
     """
     self.QUALITY_TABLE_1 = ResultTable(DATA_QUALITY_SCHEMA, "table_1",
                                        QualityCheck)
     self.CONSISTENCY_TABLE_1 = ResultTable(DATA_QUALITY_SCHEMA, "table_2",
                                            ConsistencyCheck)
     sql = [
         f"DROP SCHEMA IF EXISTS {DATA_QUALITY_SCHEMA} CASCADE;",
         f"CREATE SCHEMA IF NOT EXISTS {DATA_QUALITY_SCHEMA};",
         self.ddl_quality_check_0_2_4(self.QUALITY_TABLE_1),
         self.ddl_consistency_check_0_2_4(self.CONSISTENCY_TABLE_1),
         f"""
             INSERT INTO {self.QUALITY_TABLE_1.fullname}(
                 attribute, rule_name, rule_type, rule_description, total_records,
                 time_filter, task_ts)
             VALUES('a', 'stuff', 'not_null', 'This is the rule.', 10, NULL,
                 '{FakedDatetime.now().isoformat()}');
         """,
         # The comma above matters: without it Python concatenates the two
         # adjacent literals and both INSERTs are sent in a single execute().
         f"""
             INSERT INTO {self.CONSISTENCY_TABLE_1.fullname}(
                 type, name, description, left_table, right_table, status, time_filter, task_ts
             )
             VALUES(
                 '{ConsistencyChecker.COUNT}', 'hello', 'aa', 'tmp.a', 'tmp.b', 'hello',
                 NULL, '{FakedDatetime.now().isoformat()}'
             );
         """,
     ]
     # Run the statements one by one, in the order listed.
     for s in sql:
         self.conn.execute(s)
Пример #2
0
def test_quality_check_init_row(rule, results, conn: Connector):
    """
    Verify the values ``init_row`` sets on a new quality-check instance.

    Builds the default quality-check class for ``data_quality.booking``,
    checks its generated table and class names, creates the table and then
    asserts every attribute populated by ``init_row``.
    """
    DQBase.metadata.clear()
    booking_table = ResultTable(schema_name="data_quality",
                                table_name="booking")
    check_cls = create_default_quality_check_class(booking_table)
    assert check_cls.__tablename__ == "quality_check_booking"
    assert check_cls.__name__ == "DataQualityQualityCheckBooking"
    task_ts = datetime.datetime(2019, 8, 10, 10, 0, 0)

    check_cls.__table__.create(conn.engine)
    row = check_cls()
    row.init_row(rule, results, conn, context={"task_ts": task_ts})

    # Attributes compared by equality, in the order they are checked.
    expected_values = {
        "task_ts": task_ts,
        "attribute": "src",
        "rule_name": "not_null",
        "rule_description": "True when data is null.",
        "total_records": 5,
        "failed": 2,
        "passed": 3,
        "failed_percentage": 40,
        "passed_percentage": 60,
    }
    for field, expected in expected_values.items():
        assert getattr(row, field) == expected

    # Attributes that must be left unset.
    for field in ("median_30_day_failed", "median_30_day_passed",
                  "time_filter"):
        assert getattr(row, field) is None

    assert row.status == "invalid"
Пример #3
0
    def run(
        self,
        raw_rules: List[Dict[str, str]],
        check_table: Dict,
        result_table: Optional[
            Dict] = None,  # todo - docs for quality name, maybe defaults..
        context: Optional[Dict] = None,
    ) -> Union[CheckResult, QualityCheck]:
        """
        Execute the given rules against ``check_table``.

        :param raw_rules: rule definitions, passed through ``normalize_rules``
        :param check_table: keyword arguments used to build the checked ``Table``
        :param result_table: keyword arguments for the ``ResultTable``; when
            falsy, ``CheckResult`` is used as the check class and nothing is
            upserted
        :param context: optional context forwarded to ``get_context``
        :return: the value returned by ``do_quality_checks``
        """
        table = Table(**check_table)
        run_context = self.get_context(table, context)

        normalized = self.normalize_rules(raw_rules)
        refresh_executors(table, self.conn, run_context)

        if not result_table:
            # No destination given: keep the falsy value for the later check.
            destination = result_table
            check_cls = CheckResult
        else:
            destination = ResultTable(**result_table,
                                      model_cls=self.model_cls)
            check_cls = self.get_quality_check_class(destination)
            self.conn.ensure_table(check_cls.__table__)

        checks = self.do_quality_checks(
            check_cls,
            self.build_rules(normalized),
            run_context,
        )

        if destination:
            self.conn.upsert(checks)
        return checks
Пример #4
0
    def setUp(self):
        """
        Recreate the data-quality schema with two 0.1.4 tables.

        Both tables are created from ``get_quality_table_creation_script_0_1_4``;
        three rows are inserted into the first one only.
        """
        self.DATA_QUALITY_TABLE_1 = ResultTable(
            DATA_QUALITY_SCHEMA, "example_table", QualityCheck)
        self.DATA_QUALITY_TABLE_2 = ResultTable(
            DATA_QUALITY_SCHEMA, "another_table", QualityCheck)

        reset_schema = [
            f"DROP SCHEMA IF EXISTS {DATA_QUALITY_SCHEMA} CASCADE;",
            f"CREATE SCHEMA IF NOT EXISTS {DATA_QUALITY_SCHEMA};",
        ]
        create_tables = [
            get_quality_table_creation_script_0_1_4(self.DATA_QUALITY_TABLE_1),
            get_quality_table_creation_script_0_1_4(self.DATA_QUALITY_TABLE_2),
        ]
        # All three seed rows are sent as one multi-statement string.
        seed_rows = f"""
            INSERT INTO {self.DATA_QUALITY_TABLE_1.fullname} (attribute, rule_name, rule_description, total_records, failed, median_30_day_failed, failed_percentage, passed, median_30_day_passed, passed_percentage, status, time_filter, task_ts, created_at, id) VALUES ('src', 'not_null', 'True when data is null.', 41136, 0, null, 0, 41136, null, 100, 'valid', null, '2019-11-09 00:00:00.000000', '2019-11-12 12:41:28.391365', 75597);
            INSERT INTO {self.DATA_QUALITY_TABLE_1.fullname} (attribute, rule_name, rule_description, total_records, failed, median_30_day_failed, failed_percentage, passed, median_30_day_passed, passed_percentage, status, time_filter, task_ts, created_at, id) VALUES ('dst', 'not_null', 'True when data is null.', 41136, 0, null, 0, 41136, null, 100, 'valid', null, '2019-11-09 00:00:00.000000', '2019-11-12 12:41:28.391365', 75598);
            INSERT INTO {self.DATA_QUALITY_TABLE_1.fullname} (attribute, rule_name, rule_description, total_records, failed, median_30_day_failed, failed_percentage, passed, median_30_day_passed, passed_percentage, status, time_filter, task_ts, created_at, id) VALUES ('departure_time', 'not_null', 'True when data is null.', 41136, 0, null, 0, 41136, null, 100, 'valid', null, '2019-11-09 00:00:00.000000', '2019-11-12 12:41:28.391365', 75599);
            """

        for statement in [*reset_schema, *create_tables, seed_rows]:
            self.conn.execute(statement)
Пример #5
0
    def run(
        self,
        method: str,
        left_check_table: Dict,
        right_check_table: Dict,
        result_table: Optional[Dict] = None,
        columns: Optional[List[str]] = None,
        time_filter: Optional[Union[str, List[Dict], TimeFilter]] = None,
        left_custom_sql: str = None,
        right_custom_sql: str = None,
        context: Optional[Dict] = None,
        example_selector: ExampleSelector = default_example_selector,
    ) -> Union[CheckResult, ConsistencyCheck]:
        """
        Run a consistency check between a left and a right table.

        :param method: check method, forwarded to ``do_consistency_check``
        :param left_check_table: keyword arguments for the left ``Table``
        :param right_check_table: keyword arguments for the right ``Table``
        :param result_table: keyword arguments for the ``ResultTable``; when
            truthy the result is upserted and returned as-is, otherwise it is
            wrapped in a ``CheckResult``
        :param columns: optional columns, forwarded to the check
        :param time_filter: optional filter, normalized by ``parse_time_filter``
        :param left_custom_sql: optional custom SQL for the left side
        :param right_custom_sql: optional custom SQL for the right side
        :param context: optional context forwarded to ``get_context``
        :param example_selector: forwarded to ``do_consistency_check``
        :raises ValueError: if both custom SQLs are given together with
            ``columns`` or ``time_filter``
        """
        if left_custom_sql and right_custom_sql and (columns or time_filter):
            raise ValueError(
                "When using custom sqls you cannot change 'columns' or 'time_filter' attribute"
            )

        parsed_filter = parse_time_filter(time_filter)

        left_table = Table(**left_check_table)
        right_table = Table(**right_check_table)
        run_context = self.get_context(left_table, right_table, context)

        result = self.do_consistency_check(
            method,
            columns,
            parsed_filter,
            left_table,
            right_table,
            left_custom_sql,
            right_custom_sql,
            run_context,
            example_selector,
        )

        if not result_table:
            # Nothing to persist: hand back an in-memory check result.
            check_result = CheckResult()
            check_result.init_row_consistency(**result)
            return check_result

        destination = ResultTable(**result_table, model_cls=self.model_cls)
        check_cls = create_default_check_class(destination)
        self.right_conn.ensure_table(check_cls.__table__)
        self.upsert(check_cls, result)
        return result
Пример #6
0
    def run(
        self,
        raw_rules: List[Dict[str, str]],
        check_table: Dict,
        result_table: Dict,  # todo - docs for quality name, maybe defaults..
        context: Optional[Dict] = None,
    ):
        """
        Execute the given rules against ``check_table`` and insert the results.

        :param raw_rules: rule definitions, passed through ``normalize_rules``
        :param check_table: keyword arguments used to build the checked ``Table``
        :param result_table: keyword arguments used to build the ``ResultTable``
        :param context: optional context forwarded to ``get_context``
        """
        table = Table(**check_table)
        destination = ResultTable(**result_table)
        run_context = self.get_context(table, context)

        normalized = self.normalize_rules(raw_rules)
        refresh_executors(table, self.conn, run_context)
        check_cls = self.get_quality_check_class(destination)
        self.ensure_table(check_cls)

        self.insert(
            self.do_quality_checks(
                check_cls,
                self.build_rules(normalized),
                run_context,
            ))
Пример #7
0
def test_set_medians(conn: Connector, monkeypatch):
    """
    Verify ``set_medians`` against five seeded rows.

    Four rows are dated September 2018 and one July 2018; the test expects
    medians of 10.5 (failed) and 155 (passed) with ``contessa.models.datetime``
    replaced by ``FakedDatetime``.
    """
    DQBase.metadata.clear()
    result_table = ResultTable(schema_name="data_quality", table_name="t")
    check_cls = create_default_quality_check_class(result_table)
    check_cls.__table__.create(conn.engine)
    row = check_cls()

    seed_sql = """
        insert into data_quality.quality_check_t(failed, passed, task_ts)
        values
          (10, 200, '2018-09-11T13:00:00'),
          (3, 22, '2018-09-10T13:00:00'),
          (11, 110, '2018-09-09T13:00:00'),
          (55, 476, '2018-09-08T13:00:00'),
          (77, 309, '2018-07-12T13:00:00') -- should not be taken
    """
    conn.execute(seed_sql)

    # Swap in the fake clock used by the models module.
    monkeypatch.setattr("contessa.models.datetime", FakedDatetime)
    row.set_medians(conn)

    assert row.median_30_day_failed == 10.5
    assert row.median_30_day_passed == 155
Пример #8
0
def test_set_medians(conn: Connector, monkeypatch):
    """
    Verify ``set_medians`` against five seeded rows.

    Four rows are dated September 2018 and one July 2018; the test expects
    medians of 10.5 (failed) and 155 (passed) with ``contessa.models.datetime``
    replaced by ``FakedDatetime``.
    """
    DQBase.metadata.clear()
    result_table = ResultTable(
        schema_name="data_quality",
        table_name="t",
        model_cls=QualityCheck,
    )
    check_cls = create_default_check_class(result_table)
    check_cls.__table__.create(conn.engine)
    row = check_cls()

    seed_sql = """
        insert into data_quality.quality_check_t(attribute, rule_name, rule_type, failed, passed, task_ts, time_filter)
        values
          ('a', 'b', 'not_null', 10, 200, '2018-09-11T13:00:00', 'not_set'),
          ('a', 'b', 'not_null', 3, 22, '2018-09-10T13:00:00', 'not_set'),
          ('a', 'b', 'not_null', 11, 110, '2018-09-09T13:00:00', 'not_set'),
          ('a', 'b', 'not_null', 55, 476, '2018-09-08T13:00:00', 'not_set'),
          ('a', 'b', 'not_null', 77, 309, '2018-07-12T13:00:00', 'not_set') -- should not be taken
    """
    conn.execute(seed_sql)

    # Swap in the fake clock used by the models module.
    monkeypatch.setattr("contessa.models.datetime", FakedDatetime)
    row.set_medians(conn)

    assert row.median_30_day_failed == 10.5
    assert row.median_30_day_passed == 155
Пример #9
0
def test_generic_typ_qc_class_no_prefix(dummy_contessa):
    """Check the class name generated for the ``tmp.mytable`` result table."""
    result_table = ResultTable("tmp", "mytable", QualityCheck)
    generated_cls = dummy_contessa.get_quality_check_class(result_table)
    assert generated_cls.__name__ == "TmpQualityCheckMytable"