Example #1
File: dq.py Project: OmarNour/dc_pyArrow
    def upgrade_category(self, source_id, category_no, be_att_id):
        be_id = self.get_be_id_by_be_att_id(be_att_id)
        core_tables = get_be_core_table_names(
            self.dnx_config.config_db_url,
            self.dnx_config.org_business_entities_collection, be_id)
        bt_current_dataset = self.dnx_db_path + core_tables[0]
        if is_dir_exists(bt_current_dataset):
            next_cat = self.get_next_be_att_id_category(
                source_id, be_att_id, category_no)
            current_category_dataset = bt_current_dataset + "\\SourceID=" + str(
                source_id) + "\\AttributeID=" + str(
                    be_att_id) + "\\ResetDQStage=" + str(
                        category_no) + "\\process_no=" + str(self.process_no)
            next_category_dataset = bt_current_dataset + "\\SourceID=" + str(
                source_id) + "\\AttributeID=" + str(
                    be_att_id) + "\\ResetDQStage=" + str(
                        next_cat) + "\\process_no=" + str(self.process_no)
            dq_result_dataset = self.result_db_path + core_tables[3]

            partioned_dq_result_dataset = dq_result_dataset + \
                                          "\\SourceID=" + str(source_id) + \
                                          "\\AttributeID=" + str(be_att_id) + \
                                          "\\ResetDQStage=" + str(category_no) + \
                                          "\\process_no="+str(self.process_no) +\
                                          "\\is_issue=0"

            if is_dir_exists(partioned_dq_result_dataset):
                rowkeys = read_all_from_parquet(partioned_dq_result_dataset,
                                                ['RowKey'],
                                                True,
                                                filter=None)
                # rowkeys = read_all_from_parquet_delayed(partioned_dq_result_dataset, ['RowKey']).compute()
                suffix = "_old"
                # bt_dataset_old = current_category_dataset+suffix
                if is_dir_exists(current_category_dataset):
                    bt_dataset_old = self.switch_dataset(
                        current_category_dataset, suffix)

                    rowkeys = rowkeys.set_index('RowKey')
                    # parallel_delayed_upgrade_rowkeys = []
                    for bt_current in read_batches_from_parquet(
                            bt_dataset_old, None,
                            int(self.parameters_dict['bt_batch_size']), True):
                        self.upgrade_rowkeys(bt_current, rowkeys,
                                             current_category_dataset,
                                             next_category_dataset)
                        # delayed_upgrade_rowkeys = delayed(self.upgrade_rowkeys)(delayed(bt_current), delayed(rowkeys), current_category_dataset, next_category_dataset)
                        # self.parallel_delayed_upgrade_rowkeys.append(delayed_upgrade_rowkeys)
                    # compute(*parallel_delayed_upgrade_rowkeys, num_workers=self.cpu_num_workers)
                    delete_dataset(bt_dataset_old)
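
The batching above leans on the project's read_batches_from_parquet helper, which streams the renamed "_old" partition back in fixed-size chunks so RowKeys can be upgraded without loading the whole dataset. The helper itself is not shown here; below is a minimal sketch of a batched reader in that spirit, built on pyarrow.dataset (the name and signature are assumptions, not the project's actual API):

import pyarrow.dataset as ds

# Hypothetical batched reader in the spirit of
# read_batches_from_parquet(dataset, columns, batch_size, as_pandas).
def read_parquet_in_batches(dataset_path, columns=None, batch_size=100_000, as_pandas=True):
    """Yield one batch at a time from a hive-partitioned parquet dataset."""
    dataset = ds.dataset(dataset_path, format="parquet", partitioning="hive")
    for batch in dataset.to_batches(columns=columns, batch_size=batch_size):
        yield batch.to_pandas() if as_pandas else batch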
Example #2
File: dq.py Project: OmarNour/dc_pyArrow
    def execute_lvl_data_rules(self, src_f_data, base_bt_current_data_set,
                               result_data_set, result_data_set_tmp, source_id,
                               be_att_dr_id, category_no, be_att_id, rule_id,
                               g_result, current_lvl_no, next_pass, next_fail,
                               join_with_f, kwargs):

        # print('++++++++ source_id:', source_id, 'be_att_dr_id:', be_att_dr_id, 'category_no:', category_no)
        # print('++++++++ be_att_id:', be_att_id, 'rule_id:', rule_id, 'g_result:', g_result, 'current_lvl_no:', current_lvl_no,
        #       'next_pass:', next_pass, 'next_fail:', next_fail)

        result_data_set_tmp = result_data_set_tmp + "_" + str(
            be_att_dr_id) + "_process_no_" + str(self.process_no)

        suffix = "_old"
        result_data_set_tmp_old = self.switch_dataset(result_data_set_tmp,
                                                      suffix)

        for bt_current_data_df in self.get_bt_current_data(
                src_f_data, base_bt_current_data_set, source_id, category_no,
                be_att_id, join_with_f):
            # print('len_bt_current_data_df', len(bt_current_data_df.index))
            if not bt_current_data_df.empty:
                if current_lvl_no > 1:
                    result_df = pd.DataFrame()
                    if is_dir_exists(result_data_set_tmp_old):
                        for row_keys_df in self.get_tmp_rowkeys(
                                result_data_set_tmp_old
                        ):  # filter with level number too!
                            bt_nxt_lvl_current_data_df = bt_current_data_df[
                                bt_current_data_df['RowKey'].isin(row_keys_df)]

                            if not bt_nxt_lvl_current_data_df.empty:
                                result_lvl_df = self.validate_data_rule(
                                    bt_nxt_lvl_current_data_df, be_att_dr_id,
                                    rule_id, kwargs)
                                result_df = result_df.append(result_lvl_df)

                else:
                    result_df = self.validate_data_rule(
                        bt_current_data_df, be_att_dr_id, rule_id, kwargs)
                self.insert_result_df(result_df, g_result, result_data_set,
                                      next_pass, next_fail,
                                      result_data_set_tmp, source_id,
                                      category_no, be_att_id)
        delete_dataset(result_data_set_tmp_old)
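
get_tmp_rowkeys drives the multi-level chaining in execute_lvl_data_rules: each batch of RowKeys recorded by the previous level is used to filter bt_current_data_df before the next rule runs. A hedged sketch of such a generator, assuming the temp result dataset is parquet and that yielding a plain RowKey Series is what makes the later isin() membership test well-defined (this is not the project's actual implementation):

import pyarrow.dataset as ds

# Hypothetical sketch of get_tmp_rowkeys: stream RowKey values from the
# "_old" temp result dataset in chunks (assumed layout, illustrative only).
def get_tmp_rowkeys(tmp_dataset_path, batch_size=100_000):
    dataset = ds.dataset(tmp_dataset_path, format="parquet", partitioning="hive")
    for batch in dataset.to_batches(columns=["RowKey"], batch_size=batch_size):
        yield batch.to_pandas()["RowKey"]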
Example #3
File: bt.py Project: OmarNour/dc_pyArrow
 def load_source_data(self, p_be_id=None, no_of_cores=1, cpu_num_workers=1):
     print('start loading BT:', p_be_id)
     be_ids = self.get_be_ids(self.dnx_config.config_db_url, p_be_id)
     parallel_prepare_and_save_src_data = []
     for i, be_id in be_ids.iterrows():
         be_id = be_id['be_id']
         be_source_ids = self.get_be_source_ids(be_id)
         core_tables = get_be_core_table_names(
             self.dnx_config.config_db_url,
             self.dnx_config.org_business_entities_collection, be_id)
         bt_current_collection, bt_collection, source_collection = core_tables[
             0], core_tables[1], core_tables[2]
         self.switch_bt_current_dataset(bt_current_collection)
         # print(be_source_ids)
         for i, source_id in be_source_ids.iterrows():
             source_id = source_id['SourceID']
             connection_credentials = self.get_source_connection_credentials(
                 source_id)
             source_url, source_schema, source_query = connection_credentials[
                 0], connection_credentials[1], connection_credentials[2]
             row_key_column_name = self.get_rowkey_column_name(
                 source_id, be_id)
             f_col = self.get_source_column_name(source_id, be_id)
             source_data_set = self.src_db_path + source_collection
             src_f_data_set = self.src_f_db_path + source_collection
             delete_dataset(source_data_set)
             delete_dataset(src_f_data_set)
             for file_seq, chunk_data in enumerate(
                     get_chuncks_of_data_from_source(
                         source_url, source_schema, source_query,
                         int(self.parameters_dict['source_batch_size']))):
                 # chunk_data = delayed(chunk_data)
                 delayed_prepare_and_save_src_data = delayed(
                     self.prepare_and_save_src_data)(source_id,
                                                     delayed(chunk_data),
                                                     row_key_column_name,
                                                     f_col, no_of_cores,
                                                     source_data_set,
                                                     src_f_data_set)
                 parallel_prepare_and_save_src_data.append(
                     delayed_prepare_and_save_src_data)
     with ProgressBar():
         compute(*parallel_prepare_and_save_src_data,
                 num_workers=cpu_num_workers)
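
load_source_data does no work inside the loop itself: it only builds a list of dask delayed tasks, one per source chunk, and then computes them all at once under a ProgressBar with a bounded number of workers. A minimal, self-contained sketch of that fan-out pattern (the chunk function here is a placeholder, not prepare_and_save_src_data):

from dask import delayed, compute
from dask.diagnostics import ProgressBar

def process_chunk(chunk):
    # stand-in for per-chunk work such as prepare_and_save_src_data
    return sum(chunk)

# build the lazy task list first, then execute it in one compute() call
tasks = [delayed(process_chunk)(chunk) for chunk in ([1, 2], [3, 4], [5, 6])]
with ProgressBar():
    results = compute(*tasks, num_workers=2)
print(results)  # (3, 7, 11)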
Example #4
File: bt.py Project: OmarNour/dc_pyArrow
    def etl_be(self, source_id, bt_current_collection, bt_collection,
               source_collection, process_no, cpu_num_workers):
        base_bt_current_data_set = self.dnx_db_path + bt_current_collection
        bt_data_set = self.dnx_db_path + bt_collection
        base_source_data_set = self.src_db_path + source_collection
        source_data_set = base_source_data_set + '\\SourceID=' + str(
            source_id
        ) + '\\' + self.dnx_config.process_no_column_name + '=' + process_no

        bt_current_data_ddf = pd.DataFrame()
        bt_current_data_df = pd.DataFrame()
        bt_current_collection_old = base_bt_current_data_set + "_old"
        if int(self.parameters_dict['get_delta']) == 1:
            if is_dir_exists(bt_current_collection_old):
                pass
                # bt_current_data_ddf = read_all_from_parquet_delayed(dataset=bt_current_collection_old,
                #                                                     columns=bt_columns,
                #                                                     filter=None,
                #                                                     nthreads=self.cpu_num_workers)

        if is_dir_exists(source_data_set):
            parallel_delayed_load_data = []
            for batch_no, get_source_data in enumerate(
                    self.get_chunks_from_source_data(source_id,
                                                     source_data_set)):
                bt_current_data_set = base_bt_current_data_set

                source_data_df, bt_ids = delayed(get_source_data[0]), delayed(
                    get_source_data[1])
                # if is_dir_exists(bt_current_collection_old):
                # bt_ids = delayed(data_to_list)(bt_ids['bt_id'])
                # bt_current_data_df = read_all_from_parquet_delayed(bt_current_collection_old, bt_columns, None)

                # bt_current_data_df = delayed(self.get_bt_current_data)(bt_current_collection_old, bt_columns, bt_ids)

                delayed_load_data = delayed(self.load_data)(
                    source_data_df, bt_current_data_df, bt_data_set,
                    bt_current_data_set, bt_current_collection_old, None)
                parallel_delayed_load_data.append(delayed_load_data)
            with ProgressBar():
                compute(*parallel_delayed_load_data,
                        num_workers=cpu_num_workers)
            delete_dataset(bt_current_collection_old)
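
etl_be locates the staged chunk for one source by concatenating Windows path separators onto the dataset root ('\\SourceID=...\\process_no=...'). As a portable alternative only (a sketch, not the project's code), the same hive-style partition path could be assembled with pathlib:

from pathlib import Path

# Hypothetical, OS-independent construction of the partition path that
# etl_be builds by string concatenation (illustrative names only).
def source_partition_path(base_source_data_set, source_id, process_no_column_name, process_no):
    return (Path(base_source_data_set)
            / f"SourceID={source_id}"
            / f"{process_no_column_name}={process_no}")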
Example #5
File: bt.py Project: OmarNour/dc_pyArrow
 def switch_bt_current_dataset(self, bt_current_collection):
     current_data_set = self.dnx_db_path + bt_current_collection
     current_data_set_old = current_data_set + "_old"
     delete_dataset(current_data_set_old)
     rename_dataset(current_data_set, current_data_set_old)
     return current_data_set_old
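
switch_bt_current_dataset rotates the live BT dataset aside: any previous "_old" copy is dropped, the current directory is renamed to "_old", and the old path is returned so callers can read from it while fresh data is written under the original path. The delete_dataset and rename_dataset helpers are not shown in these examples; directory-based equivalents might look like this (assuming each dataset is a plain directory of parquet files):

import os
import shutil

# Hypothetical stand-ins for the project's delete_dataset / rename_dataset helpers.
def delete_dataset(path):
    if os.path.isdir(path):
        shutil.rmtree(path)

def rename_dataset(src, dst):
    if os.path.isdir(src):
        os.rename(src, dst)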
Example #6
File: dq.py Project: OmarNour/dc_pyArrow
 def switch_dataset(self, dataset, suffix):
     current_dataset = dataset
     current_dataset_old = current_dataset + suffix
     delete_dataset(current_dataset_old)
     rename_dataset(current_dataset, current_dataset_old)
     return current_dataset_old
            bt_time = datetime.datetime.now()
            to_run = module_path + '/run_engine.py'
            inputs = "BT=" + str(BT)
            dc_multiprocessing(to_run,
                               no_of_subprocess=None,
                               inputs=inputs,
                               desc=None)
            # 65,010,912 bt current
            bt_end_time = datetime.datetime.now()

        if DQ == 1:
            dq_time = datetime.datetime.now()

            parquet_db_root_path = dnx_config.parquet_db_root_path
            result_db_path = parquet_db_root_path + dnx_config.result_db_name + '\\'
            dc_methods.delete_dataset(result_db_path)

            to_run = module_path + '/run_engine.py'
            source_categories = dq.StartDQ.get_source_categories(
                dnx_config.config_db_url)
            for i, source_id_category_no in source_categories.iterrows():
                category_no = source_id_category_no['category_no']
                # run rules only
                inputs = "DQ=" + str(DQ) + " dq_type=" + str(
                    1) + " dq_category_no=" + str(category_no)
                dc_multiprocessing(to_run,
                                   no_of_subprocess=None,
                                   inputs=inputs,
                                   desc=None,
                                   dq_type=1)
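
The driver fragment above hands BT and DQ runs to run_engine.py through dc_multiprocessing, passing the run parameters as a space-separated "KEY=value" string. dc_multiprocessing itself is project-specific and not shown; a hedged sketch of launching such a worker script with the standard library could look like this:

import subprocess
import sys

# Hypothetical launcher for run_engine.py with "KEY=value" inputs
# (not the project's dc_multiprocessing implementation).
def run_engine(to_run, inputs):
    cmd = [sys.executable, to_run] + inputs.split()  # e.g. "DQ=1 dq_type=1 dq_category_no=2"
    subprocess.run(cmd, check=True)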