def find_last(self, data_id: int, topic_id: TopicId, tenant_id: TenantId) -> Optional[PipelineMonitorLog]:
	"""
	Find the most recent monitor log row for the given data row.

	Queries the monitor log topic for the single newest record (sorted by
	insert time, descending) matching tenant, topic and data row ids.
	Returns None when no matching row exists.
	"""
	schema = self.get_topic_schema()
	storage = self.ask_storages().ask_topic_storage(schema)
	data_service = ask_topic_data_service(schema, storage, self.principalService)
	# column names of this topic are lower-cased in storage
	# noinspection SpellCheckingInspection
	pager = data_service.get_data_entity_helper().get_entity_pager(
		criteria=[
			EntityCriteriaExpression(
				left=ColumnNameLiteral(columnName=TopicDataColumnNames.TENANT_ID.value), right=tenant_id),
			EntityCriteriaExpression(left=ColumnNameLiteral(columnName='topicid'), right=topic_id),
			EntityCriteriaExpression(left=ColumnNameLiteral(columnName='dataid'), right=data_id)
		],
		sort=[
			EntitySortColumn(
				name=TopicDataColumnNames.INSERT_TIME.value,
				method=EntitySortMethod.DESC,
			)
		],
		pageable=Pageable(pageNumber=1, pageSize=1))
	page = data_service.page(pager)
	if page.itemCount == 0:
		return None
	return PipelineMonitorLog(**page.data[0])
async def patch_topic_data(
		topic_name: Optional[str] = None,
		patch_type: Optional[PipelineTriggerType] = PipelineTriggerType.MERGE,
		tenant_id: Optional[TenantId] = None,
		data=Body(...),
		principal_service: PrincipalService = Depends(get_any_admin_principal)
) -> None:
	"""
	data patch will not trigger any pipeline
	"""
	if is_blank(topic_name):
		raise_400('Topic name is required.')
	if patch_type is None:
		# default to merge when not declared
		patch_type = PipelineTriggerType.MERGE
	if patch_type == PipelineTriggerType.INSERT_OR_MERGE:
		raise_400('Patch type can be one of insert/merge/delete.')
	tenant_id = validate_tenant_id(tenant_id, principal_service)
	principal_service = fake_to_tenant(principal_service, tenant_id)
	schema = get_topic_schema(topic_name, tenant_id, principal_service)
	storage = ask_topic_storage(schema, principal_service)
	service = ask_topic_data_service(schema, storage, principal_service)
	# dispatch on the declared patch type; anything not listed here is rejected
	handlers = {
		PipelineTriggerType.INSERT: service.trigger_by_insert,
		PipelineTriggerType.MERGE: service.trigger_by_merge,
		PipelineTriggerType.DELETE: service.trigger_by_delete
	}
	handler = handlers.get(patch_type)
	if handler is None:
		raise DataKernelException(f'Patch type [{patch_type}] is not supported.')
	handler(data)
async def fetch_topic_data_count(
		topic_id: Optional[TopicId] = None,
		tenant_id: Optional[TenantId] = None,
		criteria: Optional[ParameterJoint] = None,
		principal_service: PrincipalService = Depends(get_any_admin_principal)
) -> List[str]:
	"""
	Fetch distinct data row ids of given topic, optionally filtered by criteria.

	NOTE(review): despite the name, this returns the distinct row ids as
	strings, not a count — confirm intent against the route's consumers.
	"""
	if is_blank(topic_id):
		raise_400('Topic id is required.')
	tenant_id = validate_tenant_id(tenant_id, principal_service)
	principal_service = fake_to_tenant(principal_service, tenant_id)
	schema = get_topic_service(principal_service).find_schema_by_id(topic_id, tenant_id)
	storage = ask_topic_storage(schema, principal_service)
	service = ask_topic_data_service(schema, storage, principal_service)
	id_column = TopicDataColumnNames.ID.value
	if criteria is None:
		rows = service.find_distinct_values(None, [id_column], False)
	else:
		parsed_criteria = parse_condition_for_storage(criteria, [schema], principal_service, False)
		empty_variables = PipelineVariables(None, None, None)
		rows = service.find_distinct_values(
			[parsed_criteria.run(empty_variables, principal_service)], [id_column], False)
	# stringify ids to avoid precision loss on javascript clients
	return ArrayHelper(rows).map(lambda row: str(row.get(id_column))).to_list()
def find_topic_data_service(
		topic_id: TopicId, tenant_id: TenantId, principal_service: PrincipalService
) -> Tuple[TopicSchema, TopicDataService]:
	"""
	Locate the topic schema by id and build a data service on its storage.

	Returns the schema together with the constructed data service.
	"""
	schema = get_topic_service(principal_service).find_schema_by_id(topic_id, tenant_id)
	storage = ask_topic_storage(schema, principal_service)
	return schema, ask_topic_data_service(schema, storage, principal_service)
def find(self, topic_id: TopicId, start_time: datetime, end_time: datetime) -> Optional[TopicProfile]:
	"""
	Build a data profile report for rows of given topic updated within the
	given time window.

	Raw topics are rejected. Returns None when there are no rows or only a
	single row (profiling a single row is meaningless); otherwise returns the
	profile report parsed from JSON, with Infinity/NaN constants normalized.
	"""
	schema = get_topic_schema(topic_id, self.principalService)
	if is_raw_topic(schema.get_topic()):
		raise DqcException(f'Raw topic[name={schema.get_topic().name}] is not supported for profiling.')
	storage = ask_topic_storage(schema, self.principalService)
	service = ask_topic_data_service(schema, storage, self.principalService)
	criteria = [
		EntityCriteriaExpression(
			left=ColumnNameLiteral(columnName=TopicDataColumnNames.TENANT_ID.value),
			right=self.principalService.get_tenant_id()),
		EntityCriteriaExpression(
			left=ColumnNameLiteral(columnName=TopicDataColumnNames.UPDATE_TIME.value),
			operator=EntityCriteriaOperator.GREATER_THAN_OR_EQUALS, right=start_time),
		EntityCriteriaExpression(
			left=ColumnNameLiteral(columnName=TopicDataColumnNames.UPDATE_TIME.value),
			operator=EntityCriteriaOperator.LESS_THAN_OR_EQUALS, right=end_time)
	]
	data = service.find(criteria)
	columns = [
		TopicDataColumnNames.ID.value,
		*ArrayHelper(schema.get_topic().factors).map(lambda x: x.name).to_list(),
		TopicDataColumnNames.TENANT_ID.value,
		TopicDataColumnNames.INSERT_TIME.value,
		TopicDataColumnNames.UPDATE_TIME.value
	]

	def row_to_list(row: Dict[str, Any]) -> List[Any]:
		return ArrayHelper(columns).map(lambda x: row.get(x)).to_list()

	data_frame = build_data_frame(ArrayHelper(data).map(row_to_list).to_list(), columns)
	data_frame = convert_data_frame_type_by_topic(data_frame, schema.get_topic())
	# FIX: drop by the enum VALUES (plain strings) — the frame's columns were
	# built from `.value` strings, so passing the enum members themselves only
	# matched if TopicDataColumnNames is a str enum; with errors='ignore' a
	# mismatch was silently swallowed and the bookkeeping columns leaked into
	# the profile.
	data_frame.drop(
		[
			TopicDataColumnNames.TENANT_ID.value,
			TopicDataColumnNames.UPDATE_TIME.value,
			TopicDataColumnNames.INSERT_TIME.value,
			TopicDataColumnNames.AGGREGATE_ASSIST.value,
			TopicDataColumnNames.ID.value,
			TopicDataColumnNames.VERSION.value
		],
		axis=1, inplace=True, errors='ignore')
	if data_frame.empty or len(data_frame.index) == 1:
		# nothing (or a single row) to profile
		return None
	logger.info(f'memory_usage {data_frame.memory_usage(deep=True).sum()} bytes')
	profile = ProfileReport(data_frame, title=f'{schema.get_topic().name} data profile report', minimal=True)
	json_data = profile.to_json()
	# pandas-profiling may emit bare Infinity/NaN constants, which strict JSON
	# consumers reject; normalize them while parsing
	json_constants_map = {
		'-Infinity': float('-Infinity'),
		'Infinity': float('Infinity'),
		'NaN': None,
	}
	return loads(json_data, parse_constant=lambda constant: json_constants_map[constant])
def exchange_topic_data_service(data_service: TopicDataService, topic_id: TopicId) -> TopicDataService:
	"""
	Build a data service for another topic, reusing the principal of the
	given data service.

	Raises DqcException when the topic or its schema cannot be found.
	"""
	principal_service = data_service.get_principal_service()
	topic_service = get_topic_service(principal_service)
	topic = topic_service.find_by_id(topic_id)
	if topic is None:
		raise DqcException(f'Topic[id={topic_id}] not found.')
	schema = topic_service.find_schema_by_name(topic.name, principal_service.get_tenant_id())
	if schema is None:
		raise DqcException(f'Topic[name={topic.name}] not found.')
	storage = ask_topic_storage(schema, principal_service)
	return ask_topic_data_service(schema, storage, principal_service)
async def fetch_topic_data(
		topic_name: Optional[str] = None,
		topic_id: Optional[TopicId] = None,
		tenant_id: Optional[TenantId] = None,
		criteria: TopicPageable = None,
		principal_service: PrincipalService = Depends(get_any_admin_principal)
) -> DataPage:
	"""
	Page topic data; topic may be identified by id (takes precedence) or name.

	Row ids in the returned page are stringified.
	"""
	if is_blank(topic_name) and is_blank(topic_id):
		raise_400('Topic id or name is required.')
	tenant_id = validate_tenant_id(tenant_id, principal_service)
	principal_service = fake_to_tenant(principal_service, tenant_id)
	if is_not_blank(topic_id):
		schema = get_topic_service(principal_service).find_schema_by_id(topic_id, tenant_id)
	else:
		schema = get_topic_schema(topic_name, tenant_id, principal_service)
	storage = ask_topic_storage(schema, principal_service)
	service = ask_topic_data_service(schema, storage, principal_service)

	# fall back to first page / 100 rows when not declared or not positive
	page_number = 1
	if criteria is not None and criteria.pageNumber is not None and criteria.pageNumber > 0:
		page_number = criteria.pageNumber
	page_size = 100
	if criteria is not None and criteria.pageSize is not None and criteria.pageSize > 0:
		page_size = criteria.pageSize
	pageable = Pageable(pageNumber=page_number, pageSize=page_size)

	if criteria is None or is_blank(criteria.jointType) or criteria.filters is None:
		page = service.page_and_unwrap(None, pageable)
	else:
		parsed_criteria = parse_condition_for_storage(criteria, [schema], principal_service, False)
		empty_variables = PipelineVariables(None, None, None)
		page = service.page_and_unwrap(
			[parsed_criteria.run(empty_variables, principal_service)], pageable)

	id_column = TopicDataColumnNames.ID.value

	def stringify_id(row: Dict[str, Any]) -> Dict[str, Any]:
		# copy-on-write: only rows carrying an id column are duplicated
		if id_column not in row:
			return row
		duplicated = row.copy()
		duplicated[id_column] = str(row[id_column])
		return duplicated

	page.data = ArrayHelper(page.data).map(stringify_id).to_list()
	return page
async def truncate_topic_data(
		topic_name: Optional[str] = None,
		tenant_id: Optional[TenantId] = None,
		principal_service: PrincipalService = Depends(get_any_admin_principal)
) -> None:
	"""
	Truncate all data of given topic.

	The truncate feature must be enabled explicitly; otherwise the route
	behaves as if it does not exist.
	"""
	if not ask_truncate_topic_data():
		raise_404('Not Found')
	if is_blank(topic_name):
		raise_400('Topic name is required.')
	tenant_id = validate_tenant_id(tenant_id, principal_service)
	principal_service = fake_to_tenant(principal_service, tenant_id)
	schema = get_topic_schema(topic_name, tenant_id, principal_service)
	storage = ask_topic_storage(schema, principal_service)
	ask_topic_data_service(schema, storage, principal_service).truncate()
def get_topic_data_service(
		self, topic_id: TopicId, rules_count: int) -> Tuple[bool, Optional[TopicDataService]]:
	"""
	Build a topic data service for given topic id.

	Returns (True, service) on success. When the topic or its schema cannot
	be found, logs the failure (mentioning how many monitor rules are skipped)
	and returns (False, None) instead of raising.
	"""
	topic_service = get_topic_service(self.principalService)
	topic = topic_service.find_by_id(topic_id)
	if topic is None:
		# ignore and log
		logger.error(f'Topic[id={topic_id}] not found, ignored {rules_count} monitor rule(s).')
		return False, None
	schema = topic_service.find_schema_by_name(topic.name, self.principalService.get_tenant_id())
	if schema is None:
		# ignore and log
		logger.error(f'Topic[name={topic.name}] not found, ignored {rules_count} monitor rule(s).')
		return False, None
	storage = ask_topic_storage(schema, self.principalService)
	return True, ask_topic_data_service(schema, storage, self.principalService)
async def fetch_topic_data_count(
		topic_id: Optional[TopicId],
		tenant_id: Optional[TenantId] = None,
		criteria: Optional[ParameterJoint] = None,
		principal_service: PrincipalService = Depends(get_any_admin_principal)
) -> int:
	"""
	Count data rows of given topic, optionally filtered by given criteria.
	"""
	if is_blank(topic_id):
		raise_400('Topic id is required.')
	tenant_id = validate_tenant_id(tenant_id, principal_service)
	principal_service = fake_to_tenant(principal_service, tenant_id)
	schema = get_topic_service(principal_service).find_schema_by_id(topic_id, tenant_id)
	storage = ask_topic_storage(schema, principal_service)
	service = ask_topic_data_service(schema, storage, principal_service)
	if criteria is None:
		return service.count()
	parsed_criteria = parse_condition_for_storage(criteria, [schema], principal_service, False)
	empty_variables = PipelineVariables(None, None, None)
	return service.count_by_criteria([parsed_criteria.run(empty_variables, principal_service)])
def page(self, criteria: PipelineMonitorLogCriteria) -> DataPage:
	"""
	Page monitor logs matching given criteria.

	Tenant is always filtered; trace/topic/pipeline id and status are applied
	only when present. Unparseable start/end dates are silently ignored. Rows
	are unwrapped from the raw-topic-data column into PipelineMonitorLog;
	rows without raw data are dropped.
	"""
	schema = self.get_topic_schema()
	storage = self.ask_storages().ask_topic_storage(schema)
	data_service = ask_topic_data_service(schema, storage, self.principalService)
	entity_criteria = [
		EntityCriteriaExpression(
			left=ColumnNameLiteral(columnName=TopicDataColumnNames.TENANT_ID.value),
			right=criteria.tenantId)
	]
	# optional equality filters; column names are lower-cased in storage
	# noinspection SpellCheckingInspection
	optional_equals = [
		('traceid', criteria.traceId),
		('topicid', criteria.topicId),
		('pipelineid', criteria.pipelineId),
		('status', criteria.status)
	]
	for column_name, value in optional_equals:
		if is_not_blank(value):
			entity_criteria.append(
				EntityCriteriaExpression(left=ColumnNameLiteral(columnName=column_name), right=value))
	start_date_parsed, start_date = is_date(criteria.startDate, ask_datetime_formats())
	end_date_parsed, end_date = is_date(criteria.endDate, ask_datetime_formats())
	if start_date_parsed:
		entity_criteria.append(EntityCriteriaExpression(
			left=ColumnNameLiteral(columnName=TopicDataColumnNames.INSERT_TIME.value),
			operator=EntityCriteriaOperator.GREATER_THAN_OR_EQUALS,
			right=start_date))
	if end_date_parsed:
		entity_criteria.append(EntityCriteriaExpression(
			left=ColumnNameLiteral(columnName=TopicDataColumnNames.INSERT_TIME.value),
			operator=EntityCriteriaOperator.LESS_THAN_OR_EQUALS,
			right=end_date))
	page = data_service.page(data_service.get_data_entity_helper().get_entity_pager(
		criteria=entity_criteria,
		pageable=Pageable(pageNumber=criteria.pageNumber, pageSize=criteria.pageSize)))
	page.data = ArrayHelper(page.data) \
		.map(lambda row: row.get(TopicDataColumnNames.RAW_TOPIC_DATA.value)) \
		.filter(lambda raw: raw is not None) \
		.map(lambda raw: PipelineMonitorLog(**raw)) \
		.to_list()
	return page
def ask_topic_data_service(self, schema: TopicSchema) -> TopicDataService:
	"""Ask topic data service upon the storage of given schema."""
	return ask_topic_data_service(schema, self.storages.ask_topic_storage(schema), self.principalService)