def factor_aggregate_value_not_in_range(
        data_service: TopicDataService, rule: MonitorRule,
        date_range: Tuple[datetime, datetime],
        arithmetic: EntityColumnAggregateArithmetic) -> RuleResult:
    """
    aggregate the factor of given rule by given arithmetic on rows in date range,
    and check the aggregated value against min/max declared on rule params.

    :return: IGNORED when factor cannot be found,
             SUCCESS when there is no row at all or the value passes in_range check,
             FAILED when the value is not a decimal or does not pass in_range check
    """
    found, factor = find_factor(data_service, rule.factorId, rule)
    if not found:
        return RuleResult.IGNORED

    entity_helper = data_service.get_data_entity_helper()
    column_name = entity_helper.get_column_name(factor.name)
    aggregate_column = EntityStraightAggregateColumn(arithmetic=arithmetic, columnName=column_name)
    rows = data_service.find_straight_values(
        columns=[aggregate_column],
        criteria=build_date_range_criteria(date_range))

    if len(rows) == 0:
        # nothing retrieved, nothing to complain about
        return RuleResult.SUCCESS

    # only the first row is inspected, value is read by column name
    parsed, value = is_decimal(rows[0].get(column_name))
    if not parsed:
        # value is not a decimal, comparison is impossible
        return RuleResult.FAILED

    if in_range(value, rule.params.min, rule.params.max):
        return RuleResult.SUCCESS
    return RuleResult.FAILED
def exchange_topic_data_service(data_service: TopicDataService, topic_id: TopicId) -> TopicDataService:
    """
    build a data service for the topic of given id,
    using the principal service carried by the given data service.

    :raises DqcException: when topic or its schema cannot be found
    """
    principal_service = data_service.get_principal_service()
    topic_service = get_topic_service(principal_service)

    topic = topic_service.find_by_id(topic_id)
    if topic is None:
        raise DqcException(f'Topic[id={topic_id}] not found.')

    # schema is looked up by topic name within tenant of current principal
    topic_name = topic.name
    tenant_id = principal_service.get_tenant_id()
    schema = topic_service.find_schema_by_name(topic_name, tenant_id)
    if schema is None:
        raise DqcException(f'Topic[name={topic_name}] not found.')

    storage = ask_topic_storage(schema, principal_service)
    return ask_topic_data_service(schema, storage, data_service.get_principal_service())
def run_retrieve_all_data_rules(
        data_service: TopicDataService, rules: List[MonitorRule],
        date_range: Tuple[datetime, datetime],
        changed_rows_count_in_range: int, total_rows_count: int
) -> List[Tuple[MonitorRule, RuleResult]]:
    """
    run rules which should retrieve all data,
    make sure pass-in rules are qualified, will not check them inside.

    rows in date range are retrieved once for all concerned factors,
    then values of each factor are translated to numbers and passed to every rule of that factor.

    :return: list of (rule, result) pairs, one for each rule which is executed
    """
    rules_by_factor = group_rules_by_factor(rules)
    factors = find_factors_and_log_missed(data_service, rules_by_factor)
    data_entity_helper = data_service.get_data_entity_helper()
    column_names = ArrayHelper(factors).map(
        lambda x: data_entity_helper.get_column_name(x.name)).to_list()
    rows = data_service.find_distinct_values(
        criteria=build_date_range_criteria(date_range),
        column_names=column_names,
        distinct_value_on_single_column=True)

    def to_decimal_or_zero(value: Any) -> Any:
        # all rules here deal with numbers; value which cannot be cast is treated as 0
        parsed, decimal_value = is_decimal(value)
        return decimal_value if parsed else 0

    def translate_to_array(data_rows: List[Dict[str, Any]], factor: Factor) -> List[List[Any]]:
        # previously a filter was applied here, which dropped uncastable (and zero) values
        # and left (parsed, value) tuples in the result, contradicting the "treated as 0" intent.
        # now each value is mapped to its decimal, or 0 when it cannot be cast.
        # NOTE(review): rows are read by factor name, same as before — confirm key shape of returned rows
        return ArrayHelper(data_rows) \
            .map(lambda x: x.get(factor.name)) \
            .map(to_decimal_or_zero) \
            .map(lambda x: [x]) \
            .to_list()

    def run_rules(factor: Factor, data: List[Any]) -> List[Tuple[MonitorRule, RuleResult]]:
        concerned_rules = rules_by_factor.get(factor.factorId)
        if concerned_rules is None or len(concerned_rules) == 0:
            return []

        def run_rule(rule: MonitorRule) -> Tuple[MonitorRule, RuleResult]:
            # rule implementation is looked up by rule code
            result = retrieve_all_data_rules_map[rule.code](
                data_service, factor, data, rule, date_range,
                changed_rows_count_in_range, total_rows_count)
            return rule, result

        return ArrayHelper(concerned_rules).map(run_rule).to_list()

    # run rules factor by factor, and flatten results into one list
    return ArrayHelper(factors) \
        .map(lambda x: (x, translate_to_array(rows, x))) \
        .map(lambda x: run_rules(x[0], x[1])) \
        .reduce(lambda all_results, x: [*all_results, *x], [])
def factor_string_length_not_in_range(
        data_service: TopicDataService, rule: MonitorRule,
        date_range: Tuple[datetime, datetime],
        changed_rows_count_in_range: int, total_rows_count: int) -> RuleResult:
    """
    count rows in date range whose char length of factor value is
    less than min or greater than max declared on rule params.

    :return: IGNORED when factor cannot be found, SUCCESS when no such row, otherwise FAILED
    """
    found, factor = find_factor(data_service, rule.factorId, rule)
    if not found:
        return RuleResult.IGNORED

    def char_length_of_factor() -> ComputedLiteral:
        # a fresh literal is built for each usage
        return ComputedLiteral(
            operator=ComputedLiteralOperator.CHAR_LENGTH,
            elements=[build_column_name_literal(factor, data_service)])

    shorter_than_min = EntityCriteriaExpression(
        left=char_length_of_factor(),
        operator=EntityCriteriaOperator.LESS_THAN,
        right=rule.params.min)
    longer_than_max = EntityCriteriaExpression(
        left=char_length_of_factor(),
        operator=EntityCriteriaOperator.GREATER_THAN,
        right=rule.params.max)
    out_of_range = EntityCriteriaJoint(
        conjunction=EntityCriteriaJointConjunction.OR,
        children=[shorter_than_min, longer_than_max])

    count = data_service.count_by_criteria([out_of_range, *build_date_range_criteria(date_range)])
    if count == 0:
        return RuleResult.SUCCESS
    return RuleResult.FAILED
def find_task_rows(
        process_date: date, scheduler: TopicSnapshotScheduler,
        source_topic_schema: TopicSchema, source_topic_service: TopicDataService,
        principal_service: PrincipalService) -> List[int]:
    """
    find distinct values of id column of source topic.
    when scheduler declares filters, they are parsed and applied as criteria,
    otherwise no criteria is applied.

    :return: values of id column of found rows
    """
    id_column = TopicDataColumnNames.ID.value

    filter_declared = \
        scheduler.filter is not None \
        and scheduler.filter.filters is not None \
        and len(scheduler.filter.filters) != 0

    if filter_declared:
        parsed_criteria = parse_condition_for_storage(
            scheduler.filter, [source_topic_schema], principal_service, True)
        variables = build_variables(process_date, scheduler.frequency)
        criteria = [parsed_criteria.run(variables, principal_service)]
    else:
        # no filter declared, retrieve without criteria
        criteria = None

    rows = source_topic_service.find_distinct_values(criteria, [id_column], False)
    return ArrayHelper(rows).map(lambda row: row.get(id_column)).to_list()
def factor_value_assert(
        data_service: TopicDataService, rule: MonitorRule,
        date_range: Tuple[datetime, datetime],
        assert_expression: Callable[[Factor], EntityCriteriaExpression]) -> RuleResult:
    """
    count rows in date range which match the expression built from factor by given function.

    :return: IGNORED when factor cannot be found, SUCCESS when no row matches, otherwise FAILED
    """
    found, factor = find_factor(data_service, rule.factorId, rule)
    if not found:
        return RuleResult.IGNORED

    criteria = [assert_expression(factor), *build_date_range_criteria(date_range)]
    matched_count = data_service.count_by_criteria(criteria)
    if matched_count == 0:
        return RuleResult.SUCCESS
    return RuleResult.FAILED
def find_factor(
        data_service: TopicDataService, factor_id: Optional[FactorId],
        rule: MonitorRule) -> Tuple[bool, Optional[Factor]]:
    """
    find factor by given id from topic of given data service.
    an error is logged when factor id is blank or factor is not found.

    :return: (True, factor) when found, otherwise (False, None)
    """
    if is_blank(factor_id):
        logger.error(f'Factor id not declared on rule[{rule.dict()}].')
        return False, None

    factors = data_service.get_topic().factors
    matched = ArrayHelper(factors).find(lambda candidate: candidate.factorId == factor_id)
    if matched is not None:
        return True, matched

    logger.error(f'Factor[id={factor_id}] on rule[{rule.dict()}] not found.')
    return False, None
def run_retrieve_distinct_data_rules(
        data_service: TopicDataService, rules: List[MonitorRule],
        date_range: Tuple[datetime, datetime],
        changed_rows_count_in_range: int, total_rows_count: int
) -> List[Tuple[MonitorRule, RuleResult]]:
    """
    run rules which should retrieve distinct data and count,
    make sure pass-in rules are qualified, will not check them inside.

    for each factor which has rules, values and their counts are retrieved once,
    then passed to every rule of that factor.

    :return: list of (rule, result) pairs, one for each rule which is executed
    """
    rules_by_factor = group_rules_by_factor(rules)
    factors = find_factors_and_log_missed(data_service, rules_by_factor)
    data_entity_helper = data_service.get_data_entity_helper()

    def load_value_counts(factor: Factor) -> List[Tuple[Any, int]]:
        # retrieve value and count of given factor, in date range
        criteria = build_date_range_criteria(date_range)
        column_name = data_entity_helper.get_column_name(factor.name)
        count_column = EntityStraightAggregateColumn(
            arithmetic=EntityColumnAggregateArithmetic.COUNT,
            columnName=column_name,
            alias='count')
        value_column = EntityStraightAggregateColumn(columnName=column_name)
        rows = data_service.find_straight_values(
            criteria=criteria, columns=[count_column, value_column])
        # translate each row to (value, count)
        return ArrayHelper(rows).map(
            lambda row: (row.get(column_name), row.get('count'))).to_list()

    def run_rules(factor: Factor) -> List[Tuple[MonitorRule, RuleResult]]:
        concerned_rules = rules_by_factor.get(factor.factorId)
        if concerned_rules is None or len(concerned_rules) == 0:
            # no rule on this factor, skip data retrieving
            return []

        data = load_value_counts(factor)
        results = []
        for rule in concerned_rules:
            # rule implementation is looked up by rule code
            run = retrieve_distinct_data_rules_map[rule.code]
            result = run(
                data_service, factor, data, rule, date_range,
                changed_rows_count_in_range, total_rows_count)
            results.append((rule, result))
        return results

    # flatten results of all factors into one list
    return ArrayHelper(factors).map(run_rules) \
        .reduce(lambda all_results, x: [*all_results, *x], [])
def factor_mismatch_type(
        data_service: TopicDataService, rule: MonitorRule,
        date_range: Tuple[datetime, datetime],
        changed_rows_count_in_range: int, total_rows_count: int) -> RuleResult:
    """
    count rows in date range which match the mismatch statement built for factor.

    :return: IGNORED when factor cannot be found or detection is not needed,
             SUCCESS when no row matches, otherwise FAILED
    """
    found, factor = find_factor(data_service, rule.factorId, rule)
    if not found:
        return RuleResult.IGNORED

    should_detect, mismatch_criteria = build_mismatch_statement(factor, data_service)
    if not should_detect:
        # detection is not needed for this factor
        return RuleResult.IGNORED

    criteria = [*mismatch_criteria, *build_date_range_criteria(date_range)]
    mismatched_count = data_service.count_by_criteria(criteria)
    if mismatched_count == 0:
        return RuleResult.SUCCESS
    return RuleResult.FAILED
def rows_count_mismatch_with_another(
        data_service: TopicDataService, rule: Optional[MonitorRule],
        date_range: Tuple[datetime, datetime], has_data: bool) -> int:
    """
    compute count of rows in date range of current topic, and hand it to do_it with given rule.
    counting is skipped, and 0 is used, when has_data is false.

    :return: count of rows in date range of current topic
    """
    # count rows only when data exists, otherwise it is 0
    changed_row_count = \
        data_service.count_by_criteria(build_date_range_criteria(date_range)) if has_data else 0
    do_it(data_service, rule, date_range, changed_row_count)
    return changed_row_count
def factor_empty_over_coverage(
        data_service: TopicDataService, rule: MonitorRule,
        date_range: Tuple[datetime, datetime],
        changed_rows_count_in_range: int, total_rows_count: int
) -> RuleResult:
    """
    check whether percentage of rows with empty factor value is over the coverage rate declared on rule params.
    empty rows are counted on whole topic (date range is not applied), and compared with total rows count.

    :return: SUCCESS when topic has no row or empty rate is not over coverage rate,
             IGNORED when factor cannot be found,
             FAILED when empty rate is over coverage rate
    """
    if total_rows_count == 0:
        # no data at all, and avoid dividing by zero
        return RuleResult.SUCCESS
    found, factor = find_factor(data_service, rule.factorId, rule)
    if not found:
        return RuleResult.IGNORED

    count = data_service.count_by_criteria([
        EntityCriteriaExpression(
            left=build_column_name_literal(factor, data_service),
            operator=EntityCriteriaOperator.IS_EMPTY
        )
    ])
    # rate is a percentage
    rate = count / total_rows_count * 100
    # rule is violated when empty rate exceeds the declared coverage rate.
    # previously the result was inverted, reporting SUCCESS when rate was over coverage.
    return RuleResult.FAILED if rate > rule.params.coverageRate else RuleResult.SUCCESS
def factor_string_length_mismatch(
        data_service: TopicDataService, rule: MonitorRule,
        date_range: Tuple[datetime, datetime],
        changed_rows_count_in_range: int, total_rows_count: int) -> RuleResult:
    """
    count rows in date range whose char length of factor value
    does not equal the length declared on rule params.

    :return: IGNORED when factor cannot be found, SUCCESS when no such row, otherwise FAILED
    """
    found, factor = find_factor(data_service, rule.factorId, rule)
    if not found:
        return RuleResult.IGNORED

    char_length = ComputedLiteral(
        operator=ComputedLiteralOperator.CHAR_LENGTH,
        elements=[build_column_name_literal(factor, data_service)])
    length_mismatched = EntityCriteriaExpression(
        left=char_length,
        operator=EntityCriteriaOperator.NOT_EQUALS,
        right=rule.params.length)

    criteria = [length_mismatched, *build_date_range_criteria(date_range)]
    mismatched_count = data_service.count_by_criteria(criteria)
    if mismatched_count == 0:
        return RuleResult.SUCCESS
    return RuleResult.FAILED
def do_it(
        data_service: TopicDataService, rule: Optional[MonitorRule],
        date_range: Tuple[datetime, datetime], changed_row_count: int) -> None:
    """
    compare given rows count with rows count in date range of another topic,
    which is declared on rule params, and trigger the rule with comparison result.
    do nothing when rule is not given, or another topic id is not declared (an error is logged).
    """
    if rule is None:
        return

    another_topic_id = rule.params.topicId
    if is_blank(another_topic_id):
        logger.error(f'Another topic id not declared on rule[{rule.dict()}].')
        return

    # get count of changed rows of another topic
    another_data_service = exchange_topic_data_service(data_service, another_topic_id)
    changed_row_count_of_another = another_data_service.count_by_criteria(
        build_date_range_criteria(date_range))

    if changed_row_count != changed_row_count_of_another:
        result = RuleResult.FAILED
    else:
        result = RuleResult.SUCCESS
    trigger(rule, result, date_range[0], data_service.get_principal_service())
def factor_and_another(
        data_service: TopicDataService, rule: MonitorRule,
        date_range: Tuple[datetime, datetime],
        changed_rows_count_in_range: int, total_rows_count: int
) -> RuleResult:
    """
    count rows in date range where value of factor does not equal value of another factor,
    another factor is declared on rule params.

    :return: IGNORED when either factor cannot be found, SUCCESS when no such row, otherwise FAILED
    """
    found, factor = find_factor(data_service, rule.factorId, rule)
    if not found:
        return RuleResult.IGNORED
    another_found, another_factor = find_factor(data_service, rule.params.factorId, rule)
    if not another_found:
        return RuleResult.IGNORED

    values_differ = EntityCriteriaExpression(
        left=build_column_name_literal(factor, data_service),
        operator=EntityCriteriaOperator.NOT_EQUALS,
        right=build_column_name_literal(another_factor, data_service),
    )
    criteria = [values_differ, *build_date_range_criteria(date_range)]
    differ_count = data_service.count_by_criteria(criteria)
    if differ_count == 0:
        return RuleResult.SUCCESS
    return RuleResult.FAILED
def build_column_name_literal(factor: Factor, data_service: TopicDataService) -> ColumnNameLiteral:
    """
    build a column name literal for given factor,
    column name is translated from factor name by data entity helper of given data service.
    """
    entity_helper = data_service.get_data_entity_helper()
    column_name = entity_helper.get_column_name(factor.name)
    return ColumnNameLiteral(columnName=column_name)