def initial_report(name: str, project_id: str, schedule_type: str,
                   schedule_date_key: int, schedule_time_key: int):
    """Create a FactEtlReportModel stamped with the current time.

    The current instant is taken as a timezone-aware UTC datetime and
    converted to the zone returned by ``tz.gettz('Asia/Ho_Chi_Minh')``.
    That converted value supplies the executor timestamp and, through
    ``func.time_to_date_key`` / ``func.time_to_time_key``, the executor
    date and time keys.

    Args:
        name: Stored on the report as ``job_name``.
        project_id: Stored on the report as ``project_id``.
        schedule_type: Passed through unchanged to the report.
        schedule_date_key: Passed through unchanged to the report.
        schedule_time_key: Passed through unchanged to the report.

    Returns:
        The newly built ``FactEtlReportModel``; it is not persisted here.
    """
    to_zone = tz.gettz('Asia/Ho_Chi_Minh')
    # Build an aware "now" directly instead of patching a tzinfo onto the
    # naive result of the deprecated ``datetime.utcnow()``.
    now = datetime.datetime.now(tz=tz.tzutc()).astimezone(to_zone)
    return FactEtlReportModel(
        project_id=project_id,
        job_name=name,
        executor_date_timestamp=now,
        executor_date_key=func.time_to_date_key(now),
        executor_time_key=func.time_to_time_key(now),
        schedule_type=schedule_type,
        schedule_date_key=schedule_date_key,
        schedule_time_key=schedule_time_key,
    )
def fact_performance(self):
    """Build fact_performance rows and push them through ``self.db.update``.

    One ``FactPerformanceModel`` is created per record yielded by
    ``self.get_performance()``. ``performance_key`` is a running number
    starting at 1, and ``document_key`` is always ``None`` in this variant.
    Each record's ``time`` is shifted forward by 7 hours before the
    timestamp and its date/time keys are derived
    (NOTE(review): presumably UTC -> UTC+7; confirm the source timezone).

    Any exception raised while reading, building or writing is not
    propagated: it marks the report ``FAILED`` and is stored as the
    report description. The report is always appended to
    ``self.reports``, with the elapsed run time in seconds.

    Returns:
        The ETL report describing this run.
    """
    report = report_func.initial_report(
        'fact_performance', self.project_id, self.schedule_type,
        self.schedule_date_key, self.schedule_time_key)
    start_run = time.time()
    try:
        utc_7_offset = datetime.timedelta(hours=7)
        rows = []
        for performance_key, performance in enumerate(
                self.get_performance(), start=1):
            # Resolve the process key once per record and reuse it.
            process_key = func.get_process_key_performance_gda(
                performance['type'], performance['task_def_key'])
            captured_at = performance['time'] + utc_7_offset
            rows.append(FactPerformanceModel(
                performance_key=performance_key,
                ori_performance_id=func.bson_object_to_string(
                    performance['_id']),
                document_key=None,
                project_id=self.project_id,
                group_id=performance['group_id'],
                document_id=performance['doc_id'],
                reworked=func.int_to_bool(performance['rework_count']),
                work_type_key=func.get_working_type_id_by_name(
                    performance['work_type']),
                process_key=process_key,
                number_of_record=performance['records'],
                number_of_item=performance['items'],
                number_of_field=performance['fields'],
                number_of_character=performance['chars'],
                user_name=performance['username'],
                ip=performance['ip'],
                captured_date_timestamp=captured_at,
                captured_date_key=func.time_to_date_key(captured_at),
                captured_time_key=func.time_to_time_key(captured_at),
                total_time_second=performance['total_time'],
            ))
        self.db.update([row.__dict__ for row in rows], self.schema,
                       self.fact_performance_table)
        report.status_code = 'PASSED'
    except Exception as e:
        # Job boundary: failures are recorded on the report, not raised.
        report.status_code = 'FAILED'
        report.description = str(e)
    finally:
        report.total_time_run_second = time.time() - start_run
        self.reports.append(report)
    return report
def fact_data_extract(self):
    """Flatten document and transform records into fact_data_extraction rows.

    ``self.get_docs_and_trans()`` supplies two collections:

    * documents - every record is scanned for the keys ``keyed_data``,
      ``final_data``, ``qc_ed_data`` and ``apr_ed_data``. One row is
      emitted per field (except ``ImagePath``) using the field's
      ``'text'`` value. ``apr_ed_data`` is not processed; a note is
      written to the report description instead.
    * transforms - one row is emitted per record field that is not in the
      transform ignore list, using the raw field value.

    Process keys written by this job:
        3  human input keyed_data (kpi)
        4  machine save input keyed_data
        5  human qc input keyed_data (kpi)
        6  machine save qc keyed_data
        12 human check bad_image keyed_data (not kpi)
        10 rows taken from ``final_data``
        8  rows taken from ``qc_ed_data`` -> ``qc_fields_err``
        11 rows taken from transform records

    ``document_key`` and ``performance_key`` are always ``None`` here.
    Source timestamps are shifted forward by 7 hours before being stored
    (NOTE(review): presumably UTC -> UTC+7; confirm the source timezone).

    All rows are sent in a single ``self.db.update`` call. Any exception
    marks the report ``FAILED`` with the message as description instead of
    propagating. The report is always appended to ``self.reports`` with
    the elapsed run time in seconds.

    Returns:
        The ETL report describing this run.
    """
    report = report_func.initial_report(
        'fact_data_extraction', self.project_id, self.schedule_type,
        self.schedule_date_key, self.schedule_time_key)
    start_run = time.time()
    results = []

    def keyed_process_key(source, task_def_key):
        # Map a keyed_data entry to its process key; None when unrecognised.
        by_machine = source == 'queue_transform'
        if task_def_key.startswith('Type'):
            return 4 if by_machine else 3
        if task_def_key.startswith('Proof'):
            return 6 if by_machine else 5
        if not by_machine and task_def_key == 'Verify_Hold_Type':
            return 12
        # NOTE(review): an unknown source/task combination used to hit an
        # unbound local or silently reuse the previous entry's key. It now
        # yields None - confirm whether failing the job is preferred.
        return None

    def add_row(ori_document_id, document_id, doc_set_id, record_id,
                modified_at, user_name, process_key, field_name,
                field_value):
        # Append a single fact row to ``results``.
        results.append(FactDataExtractionModel(
            document_key=None,
            performance_key=None,
            ori_document_id=ori_document_id,
            project_id=self.project_id,
            document_id=document_id,
            doc_set_id=doc_set_id,
            record_id=record_id,
            last_modified_date_key=func.time_to_date_key(modified_at),
            last_modified_time_key=func.time_to_time_key(modified_at),
            last_modified_timestamp=modified_at,
            user_name=user_name,
            process_key=process_key,
            field_name=field_name,
            field_value=field_value,
        ))

    def add_text_rows(document_id, doc_set_id, record_id, fields,
                      modified_at, user_name, process_key, ignored):
        # Emit one row per non-ignored field of a document-side entry.
        for field_name, field_value_dict in fields.items():
            if field_name in ignored:
                continue
            add_row(document_id, document_id, doc_set_id, record_id,
                    modified_at, user_name, process_key, field_name,
                    field_value_dict['text'])

    try:
        data_docs, data_trans = self.get_docs_and_trans()
        col_ignores = {'ImagePath'}
        trans_ignore = {
            'doc_id', 'doc_uri', 'fileName', 'fileName_Bad',
            'filter_control', 'getBatchName', 'keyer', 'keyer_proof',
            'keyer_type', 'FolderOutput', 'Image',
        }
        utc_7_offset = datetime.timedelta(hours=7)

        for doc in data_docs:
            records = doc['records']
            document_id = func.bson_object_to_string(doc['_id'])
            doc_set_id = func.bson_object_to_string(doc['doc_set_id'])
            # record_id is the 1-based position of the record.
            for record_id, record in enumerate(records, start=1):
                for key, value in record.items():
                    if key == 'keyed_data':
                        for entry in value:
                            process_key = keyed_process_key(
                                entry['source'], entry['task_def_key'])
                            add_text_rows(
                                document_id, doc_set_id, record_id,
                                entry['data'][0],
                                entry['createdtime'] + utc_7_offset,
                                entry['keyer'], process_key, col_ignores)
                    elif key == 'final_data':
                        # Only the first final_data entry is read.
                        entry = value[0]
                        add_text_rows(
                            document_id, doc_set_id, record_id,
                            entry['data'][0],
                            entry['createdtime'] + utc_7_offset,
                            entry['keyer'], 10, col_ignores)
                    elif key == 'qc_ed_data':
                        # Only the first nested qc entry is read.
                        entry = value[0][0]
                        if 'qc_fields_err' not in entry:
                            continue
                        add_text_rows(
                            document_id, doc_set_id, record_id,
                            entry['qc_fields_err'][0],
                            entry['createdtime'] + utc_7_offset,
                            entry['keyer'], 8, col_ignores)
                    elif key == 'apr_ed_data':
                        report.description = 'Not handle aprove qc data because not have sample data'

        for trans in data_trans:
            document_id = func.bson_object_to_string(trans['doc_id'])
            ori_document_id = func.bson_object_to_string(trans['_id'])
            doc_set_id = func.bson_object_to_string(trans['doc_set_id'])
            records = trans['records']
            modified_at = trans['last_modified'] + utc_7_offset
            for record_id, record in enumerate(records, start=1):
                for field_name, field_value in record.items():
                    if field_name in trans_ignore:
                        continue
                    add_row(ori_document_id, document_id, doc_set_id,
                            record_id, modified_at, None, 11, field_name,
                            field_value)

        self.db.update([item.__dict__ for item in results], self.schema,
                       self.fact_data_extraction_table)
        report.status_code = 'PASSED'
    except Exception as e:
        # Job boundary: failures are recorded on the report, not raised.
        report.status_code = 'FAILED'
        report.description = str(e)
    finally:
        report.total_time_run_second = time.time() - start_run
        self.reports.append(report)
    return report
def fact_performance(self):
    """Build fact_performance rows and insert them with ``self.db.create``.

    ``performance_key`` continues from the current maximum reported by
    ``self.db.get_max_id_table`` for the ``performance_key`` column, or
    starts at 1 when that returns ``None``. For every record yielded by
    ``self.get_performance()``:

    * the document key is looked up with
      ``self.get_document_key_by_document_id``;
    * ``time`` is shifted forward by 7 hours before the timestamp and its
      date/time keys are derived (NOTE(review): presumably UTC -> UTC+7;
      confirm the source timezone);
    * a ``FactPerformanceModel`` row is collected; and
    * a summary dict is appended to ``self.performance_key_checks``. This
      happens while iterating, so entries remain even if the later insert
      fails.

    Diagnostic output is printed after the loop: the distinct ``type`` and
    ``task_def_key`` values seen, and how many records resolved to process
    keys 3, 5 and 8.

    Any exception marks the report ``FAILED`` with the message as
    description instead of propagating. The report is always appended to
    ``self.reports`` with the elapsed run time in seconds.

    Returns:
        The ETL report describing this run.
    """
    report = report_func.initial_report(
        'fact_performance', self.project_id, self.schedule_type,
        self.schedule_date_key, self.schedule_time_key)
    start_run = time.time()
    try:
        max_key = self.db.get_max_id_table(
            schema=self.schema, table=self.fact_performance_table,
            col='performance_key')
        next_key = 1 if max_key is None else max_key + 1

        utc_7_offset = datetime.timedelta(hours=7)
        rows = []
        # Diagnostics only: distinct values seen and per-process tallies.
        types = []
        tasks = []
        process_counts = {3: 0, 5: 0, 8: 0}

        for performance in self.get_performance():
            module_type = performance['type']
            task_def_key = performance['task_def_key']
            process_key = func.get_process_key_performance_gda(
                module_type, task_def_key)

            if module_type not in types:
                types.append(module_type)
            if task_def_key not in tasks:
                # Record unseen task keys in ``tasks`` (not ``types``).
                tasks.append(task_def_key)
            if process_key in process_counts:
                process_counts[process_key] += 1

            captured_at = performance['time'] + utc_7_offset
            document_key = self.get_document_key_by_document_id(
                performance['doc_id'])
            rows.append(FactPerformanceModel(
                performance_key=next_key,
                ori_performance_id=func.bson_object_to_string(
                    performance['_id']),
                document_key=document_key,
                project_id=self.project_id,
                group_id=performance['group_id'],
                document_id=performance['doc_id'],
                reworked=func.int_to_bool(performance['rework_count']),
                work_type_key=func.get_working_type_id_by_name(
                    performance['work_type']),
                process_key=process_key,
                number_of_record=performance['records'],
                number_of_item=performance['items'],
                number_of_field=performance['fields'],
                number_of_character=performance['chars'],
                user_name=performance['username'],
                ip=performance['ip'],
                captured_date_timestamp=captured_at,
                captured_date_key=func.time_to_date_key(captured_at),
                captured_time_key=func.time_to_time_key(captured_at),
                total_time_second=performance['total_time'],
            ))
            self.performance_key_checks.append({
                'performance_key': next_key,
                'user_name': performance['username'],
                'document_key': document_key,
                'module_type': module_type,
                'task_def_key': task_def_key,
            })
            next_key += 1

        print('types', types)
        print('tasks', tasks)
        print('human_input', process_counts[3])
        print('human_verify_input', process_counts[5])
        print('human_qc_data', process_counts[8])

        self.db.create([row.__dict__ for row in rows], self.schema,
                       self.fact_performance_table)
        report.status_code = 'PASSED'
    except Exception as e:
        # Job boundary: failures are recorded on the report, not raised.
        report.status_code = 'FAILED'
        report.description = str(e)
    finally:
        report.total_time_run_second = time.time() - start_run
        self.reports.append(report)
    return report