def upgrade_category(self, source_id, category_no, be_att_id):
    be_id = self.get_be_id_by_be_att_id(be_att_id)
    core_tables = get_be_core_table_names(
        self.dnx_config.config_db_url,
        self.dnx_config.org_business_entities_collection, be_id)
    bt_current_dataset = self.dnx_db_path + core_tables[0]
    if is_dir_exists(bt_current_dataset):
        next_cat = self.get_next_be_att_id_category(source_id, be_att_id, category_no)
        # Current and next category partitions of the BT current dataset.
        current_category_dataset = bt_current_dataset + \
            "\\SourceID=" + str(source_id) + \
            "\\AttributeID=" + str(be_att_id) + \
            "\\ResetDQStage=" + str(category_no) + \
            "\\process_no=" + str(self.process_no)
        next_category_dataset = bt_current_dataset + \
            "\\SourceID=" + str(source_id) + \
            "\\AttributeID=" + str(be_att_id) + \
            "\\ResetDQStage=" + str(next_cat) + \
            "\\process_no=" + str(self.process_no)
        dq_result_dataset = self.result_db_path + core_tables[3]
        # DQ results for this partition where no issue was raised (is_issue=0).
        partitioned_dq_result_dataset = dq_result_dataset + \
            "\\SourceID=" + str(source_id) + \
            "\\AttributeID=" + str(be_att_id) + \
            "\\ResetDQStage=" + str(category_no) + \
            "\\process_no=" + str(self.process_no) + \
            "\\is_issue=0"
        if is_dir_exists(partitioned_dq_result_dataset):
            rowkeys = read_all_from_parquet(partitioned_dq_result_dataset,
                                            ['RowKey'], True, filter=None)
            # rowkeys = read_all_from_parquet_delayed(partitioned_dq_result_dataset, ['RowKey']).compute()
            suffix = "_old"
            # bt_dataset_old = current_category_dataset + suffix
            if is_dir_exists(current_category_dataset):
                # Move the current partition aside, then rewrite its rows into the
                # current or next category depending on their DQ result.
                bt_dataset_old = self.switch_dataset(current_category_dataset, suffix)
                rowkeys = rowkeys.set_index('RowKey')
                # parallel_delayed_upgrade_rowkeys = []
                for bt_current in read_batches_from_parquet(
                        bt_dataset_old, None,
                        int(self.parameters_dict['bt_batch_size']), True):
                    self.upgrade_rowkeys(bt_current, rowkeys,
                                         current_category_dataset,
                                         next_category_dataset)
                    # delayed_upgrade_rowkeys = delayed(self.upgrade_rowkeys)(delayed(bt_current), delayed(rowkeys), current_category_dataset, next_category_dataset)
                    # self.parallel_delayed_upgrade_rowkeys.append(delayed_upgrade_rowkeys)
                # compute(*parallel_delayed_upgrade_rowkeys, num_workers=self.cpu_num_workers)
                delete_dataset(bt_dataset_old)
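# Hedged sketch (not part of the engine): the partition paths above are built by
# string concatenation with hard-coded backslashes, which ties the layout to Windows.
# A hypothetical helper such as build_partition_path() expresses the same Hive-style
# "key=value" directory layout in an OS-neutral way; the key names mirror those above.
import os

def build_partition_path(root, **partitions):
    """Join a dataset root with key=value partition directories."""
    parts = [root] + ["{}={}".format(key, value) for key, value in partitions.items()]
    return os.path.join(*parts)

# Illustrative values only:
# build_partition_path(bt_current_dataset, SourceID=1, AttributeID=7,
#                      ResetDQStage=2, process_no=0)
# -> '<bt_current_dataset>/SourceID=1/AttributeID=7/ResetDQStage=2/process_no=0'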
def execute_lvl_data_rules(self, src_f_data, base_bt_current_data_set,
                           result_data_set, result_data_set_tmp, source_id,
                           be_att_dr_id, category_no, be_att_id, rule_id,
                           g_result, current_lvl_no, next_pass, next_fail,
                           join_with_f, kwargs):
    # print('++++++++ source_id:', source_id, 'be_att_dr_id:', be_att_dr_id, 'category_no:', category_no)
    # print('++++++++ be_att_id:', be_att_id, 'rule_id:', rule_id, 'g_result:', g_result, 'current_lvl_no:', current_lvl_no,
    #       'next_pass:', next_pass, 'next_fail:', next_fail)
    result_data_set_tmp = result_data_set_tmp + "_" + str(be_att_dr_id) + \
        "_process_no_" + str(self.process_no)
    suffix = "_old"
    result_data_set_tmp_old = self.switch_dataset(result_data_set_tmp, suffix)
    for bt_current_data_df in self.get_bt_current_data(
            src_f_data, base_bt_current_data_set, source_id, category_no,
            be_att_id, join_with_f):
        # print('len_bt_current_data_df', len(bt_current_data_df.index))
        if not bt_current_data_df.empty:
            if current_lvl_no > 1:
                # Levels above 1 only re-validate the row keys kept by the previous level.
                result_df = pd.DataFrame()
                if is_dir_exists(result_data_set_tmp_old):
                    for row_keys_df in self.get_tmp_rowkeys(
                            result_data_set_tmp_old):  # filter with level number too!
                        bt_nxt_lvl_current_data_df = bt_current_data_df[
                            bt_current_data_df['RowKey'].isin(row_keys_df)]
                        if not bt_nxt_lvl_current_data_df.empty:
                            result_lvl_df = self.validate_data_rule(
                                bt_nxt_lvl_current_data_df, be_att_dr_id,
                                rule_id, kwargs)
                            result_df = result_df.append(result_lvl_df)
            else:
                result_df = self.validate_data_rule(bt_current_data_df,
                                                    be_att_dr_id, rule_id, kwargs)
            self.insert_result_df(result_df, g_result, result_data_set,
                                  next_pass, next_fail, result_data_set_tmp,
                                  source_id, category_no, be_att_id)
    delete_dataset(result_data_set_tmp_old)
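# Hedged sketch (not part of the engine): pandas removed DataFrame.append in 2.0, so
# the level-by-level accumulation above can instead collect per-batch frames in a list
# and concatenate once. The accumulate_results name is illustrative only.
import pandas as pd

def accumulate_results(frames):
    """Concatenate per-level result frames; returns an empty frame if none are usable."""
    frames = [f for f in frames if f is not None and not f.empty]
    return pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()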
def load_source_data(self, p_be_id=None, no_of_cores=1, cpu_num_workers=1):
    print('start loading BT:', p_be_id)
    be_ids = self.get_be_ids(self.dnx_config.config_db_url, p_be_id)
    parallel_prepare_and_save_src_data = []
    for i, be_id in be_ids.iterrows():
        be_id = be_id['be_id']
        be_source_ids = self.get_be_source_ids(be_id)
        core_tables = get_be_core_table_names(
            self.dnx_config.config_db_url,
            self.dnx_config.org_business_entities_collection, be_id)
        bt_current_collection, bt_collection, source_collection = \
            core_tables[0], core_tables[1], core_tables[2]
        self.switch_bt_current_dataset(bt_current_collection)
        # print(be_source_ids)
        for i, source_id in be_source_ids.iterrows():
            source_id = source_id['SourceID']
            connection_credentials = self.get_source_connection_credentials(source_id)
            source_url, source_schema, source_query = \
                connection_credentials[0], connection_credentials[1], connection_credentials[2]
            row_key_column_name = self.get_rowkey_column_name(source_id, be_id)
            f_col = self.get_source_column_name(source_id, be_id)
            source_data_set = self.src_db_path + source_collection
            src_f_data_set = self.src_f_db_path + source_collection
            delete_dataset(source_data_set)
            delete_dataset(src_f_data_set)
            for file_seq, chunk_data in enumerate(
                    get_chuncks_of_data_from_source(
                        source_url, source_schema, source_query,
                        int(self.parameters_dict['source_batch_size']))):
                # chunk_data = delayed(chunk_data)
                delayed_prepare_and_save_src_data = delayed(
                    self.prepare_and_save_src_data)(
                        source_id, delayed(chunk_data), row_key_column_name,
                        f_col, no_of_cores, source_data_set, src_f_data_set)
                parallel_prepare_and_save_src_data.append(
                    delayed_prepare_and_save_src_data)
    with ProgressBar():
        compute(*parallel_prepare_and_save_src_data, num_workers=cpu_num_workers)
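# Hedged sketch (standalone, not part of the engine): load_source_data relies on Dask's
# "collect delayed calls, then compute them together" idiom. This minimal example uses a
# dummy task in place of prepare_and_save_src_data; all names here are illustrative.
from dask import delayed, compute
from dask.diagnostics import ProgressBar

def _process_chunk_sketch(chunk):
    # stand-in for prepare_and_save_src_data; just sums the chunk
    return sum(chunk)

def _fan_out_sketch():
    tasks = [delayed(_process_chunk_sketch)(chunk)
             for chunk in ([1, 2], [3, 4], [5, 6])]
    with ProgressBar():
        return compute(*tasks, num_workers=2)  # -> (3, 7, 11)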
def etl_be(self, source_id, bt_current_collection, bt_collection,
           source_collection, process_no, cpu_num_workers):
    base_bt_current_data_set = self.dnx_db_path + bt_current_collection
    bt_data_set = self.dnx_db_path + bt_collection
    base_source_data_set = self.src_db_path + source_collection
    source_data_set = base_source_data_set + '\\SourceID=' + str(source_id) + \
        '\\' + self.dnx_config.process_no_column_name + '=' + process_no
    bt_current_data_ddf = pd.DataFrame()
    bt_current_data_df = pd.DataFrame()
    bt_current_collection_old = base_bt_current_data_set + "_old"
    if int(self.parameters_dict['get_delta']) == 1:
        if is_dir_exists(bt_current_collection_old):
            pass
            # bt_current_data_ddf = read_all_from_parquet_delayed(dataset=bt_current_collection_old,
            #                                                     columns=bt_columns,
            #                                                     filter=None,
            #                                                     nthreads=self.cpu_num_workers)
    if is_dir_exists(source_data_set):
        parallel_delayed_load_data = []
        for batch_no, get_source_data in enumerate(
                self.get_chunks_from_source_data(source_id, source_data_set)):
            bt_current_data_set = base_bt_current_data_set
            source_data_df, bt_ids = delayed(get_source_data[0]), delayed(get_source_data[1])
            # if is_dir_exists(bt_current_collection_old):
            #     bt_ids = delayed(data_to_list)(bt_ids['bt_id'])
            #     bt_current_data_df = read_all_from_parquet_delayed(bt_current_collection_old, bt_columns, None)
            #     bt_current_data_df = delayed(self.get_bt_current_data)(bt_current_collection_old, bt_columns, bt_ids)
            delayed_load_data = delayed(self.load_data)(
                source_data_df, bt_current_data_df, bt_data_set,
                bt_current_data_set, bt_current_collection_old, None)
            parallel_delayed_load_data.append(delayed_load_data)
        with ProgressBar():
            compute(*parallel_delayed_load_data, num_workers=cpu_num_workers)
    delete_dataset(bt_current_collection_old)
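# Hedged sketch (assumption): read_all_from_parquet / read_all_from_parquet_delayed are
# project helpers whose implementations are not shown here. Reading selected columns of
# a partitioned parquet dataset into pandas can be done directly with pyarrow, roughly:
import pyarrow.parquet as pq

def _read_columns_sketch(dataset_path, columns=None):
    """Load the requested columns of a parquet dataset into a pandas DataFrame."""
    return pq.read_table(dataset_path, columns=columns).to_pandas()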
def switch_bt_current_dataset(self, bt_current_collection):
    current_data_set = self.dnx_db_path + bt_current_collection
    current_data_set_old = current_data_set + "_old"
    delete_dataset(current_data_set_old)
    rename_dataset(current_data_set, current_data_set_old)
    return current_data_set_old
def switch_dataset(self, dataset, suffix):
    current_dataset = dataset
    current_dataset_old = current_dataset + suffix
    delete_dataset(current_dataset_old)
    rename_dataset(current_dataset, current_dataset_old)
    return current_dataset_old
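# Hedged sketch (assumption): delete_dataset and rename_dataset come from the project's
# dc_methods helpers and their real bodies are not shown here. Functionally they behave
# like directory-level remove/rename, roughly as below; names with a _sketch suffix are
# illustrative only.
import os
import shutil

def _delete_dataset_sketch(path):
    """Remove a parquet dataset directory if it exists."""
    if os.path.isdir(path):
        shutil.rmtree(path)

def _rename_dataset_sketch(src, dst):
    """Rename a dataset directory if the source exists."""
    if os.path.isdir(src):
        os.rename(src, dst)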
bt_time = datetime.datetime.now()
to_run = module_path + '/run_engine.py'
inputs = "BT=" + str(BT)
dc_multiprocessing(to_run, no_of_subprocess=None, inputs=inputs, desc=None)  # 65,010,912 bt current
bt_end_time = datetime.datetime.now()

if DQ == 1:
    dq_time = datetime.datetime.now()
    parquet_db_root_path = dnx_config.parquet_db_root_path
    result_db_path = parquet_db_root_path + dnx_config.result_db_name + '\\'
    dc_methods.delete_dataset(result_db_path)
    to_run = module_path + '/run_engine.py'
    source_categories = dq.StartDQ.get_source_categories(dnx_config.config_db_url)
    for i, source_id_category_no in source_categories.iterrows():
        category_no = source_id_category_no['category_no']
        # run rules only
        inputs = "DQ=" + str(DQ) + " dq_type=" + str(1) + " dq_category_no=" + str(category_no)
        dc_multiprocessing(to_run, no_of_subprocess=None, inputs=inputs, desc=None, dq_type=1)
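# Hedged sketch (assumption): run_engine.py's actual argument handling is not shown in
# this section. The space-separated "KEY=value" inputs strings built above could be
# parsed roughly like this; parse_inputs is a hypothetical name.
def parse_inputs(inputs):
    """Split a space-separated 'KEY=value' string into a dict of string values."""
    return dict(pair.split('=', 1) for pair in inputs.split())

# parse_inputs("DQ=1 dq_type=1 dq_category_no=2")
# -> {'DQ': '1', 'dq_type': '1', 'dq_category_no': '2'}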