Example #1
File: dq.py Project: OmarNour/dc_pyArrow
    def upgrade_category(self, source_id, category_no, be_att_id):
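        # Promote rows whose RowKeys passed DQ (is_issue=0) from the current ResetDQStage
        # partition to the next category, batch by batch, then drop the temporary "_old" copy.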
        be_id = self.get_be_id_by_be_att_id(be_att_id)
        core_tables = get_be_core_table_names(
            self.dnx_config.config_db_url,
            self.dnx_config.org_business_entities_collection, be_id)
        bt_current_dataset = self.dnx_db_path + core_tables[0]
        if is_dir_exists(bt_current_dataset):
            next_cat = self.get_next_be_att_id_category(
                source_id, be_att_id, category_no)
            current_category_dataset = bt_current_dataset + "\\SourceID=" + str(
                source_id) + "\\AttributeID=" + str(
                    be_att_id) + "\\ResetDQStage=" + str(
                        category_no) + "\\process_no=" + str(self.process_no)
            next_category_dataset = bt_current_dataset + "\\SourceID=" + str(
                source_id) + "\\AttributeID=" + str(
                    be_att_id) + "\\ResetDQStage=" + str(
                        next_cat) + "\\process_no=" + str(self.process_no)
            dq_result_dataset = self.result_db_path + core_tables[3]

            partioned_dq_result_dataset = dq_result_dataset + \
                                          "\\SourceID=" + str(source_id) + \
                                          "\\AttributeID=" + str(be_att_id) + \
                                          "\\ResetDQStage=" + str(category_no) + \
                                          "\\process_no="+str(self.process_no) +\
                                          "\\is_issue=0"

            if is_dir_exists(partioned_dq_result_dataset):
                rowkeys = read_all_from_parquet(partioned_dq_result_dataset,
                                                ['RowKey'],
                                                True,
                                                filter=None)
                # rowkeys = read_all_from_parquet_delayed(partioned_dq_result_dataset, ['RowKey']).compute()
                suffix = "_old"
                # bt_dataset_old = current_category_dataset+suffix
                if is_dir_exists(current_category_dataset):
                    bt_dataset_old = self.switch_dataset(
                        current_category_dataset, suffix)

                    rowkeys = rowkeys.set_index('RowKey')
                    # parallel_delayed_upgrade_rowkeys = []
                    for bt_current in read_batches_from_parquet(
                            bt_dataset_old, None,
                            int(self.parameters_dict['bt_batch_size']), True):
                        self.upgrade_rowkeys(bt_current, rowkeys,
                                             current_category_dataset,
                                             next_category_dataset)
                        # delayed_upgrade_rowkeys = delayed(self.upgrade_rowkeys)(delayed(bt_current), delayed(rowkeys), current_category_dataset, next_category_dataset)
                        # self.parallel_delayed_upgrade_rowkeys.append(delayed_upgrade_rowkeys)
                    # compute(*parallel_delayed_upgrade_rowkeys, num_workers=self.cpu_num_workers)
                    delete_dataset(bt_dataset_old)
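Note: these examples rely on project helpers such as is_dir_exists, read_all_from_parquet, switch_dataset and delete_dataset whose definitions are not shown on this page. Below is only a minimal sketch of what the simpler ones might look like, assuming plain os/shutil plus pyarrow; the names and signatures are inferred from the call sites above and are not the project's actual implementations.

    import os
    import shutil

    import pyarrow.parquet as pq

    def is_dir_exists(path):
        # Assumed behaviour: True if the dataset directory exists on disk.
        return os.path.isdir(path)

    def read_all_from_parquet(dataset, columns, use_threads, filter=None):
        # Assumed behaviour: read the whole parquet dataset into a pandas DataFrame.
        # The real helper presumably pushes `filter` down to pyarrow; that is omitted here.
        return pq.read_table(dataset, columns=columns, use_threads=use_threads).to_pandas()

    def switch_dataset(path, suffix):
        # Assumed behaviour: rename the dataset directory with the given suffix and return the new path.
        renamed = path + suffix
        os.rename(path, renamed)
        return renamed

    def delete_dataset(path):
        # Assumed behaviour: remove the dataset directory and its contents, if present.
        shutil.rmtree(path, ignore_errors=True)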
Example #2
File: bt.py Project: OmarNour/dc_pyArrow
    def etl_be(self, source_id, bt_current_collection, bt_collection,
               source_collection, process_no, cpu_num_workers):
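        # Schedule one delayed load_data() call per chunk of the source partition, run them in
        # parallel with Dask, then delete the "_old" current collection.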
        base_bt_current_data_set = self.dnx_db_path + bt_current_collection
        bt_data_set = self.dnx_db_path + bt_collection
        base_source_data_set = self.src_db_path + source_collection
        source_data_set = base_source_data_set + '\\SourceID=' + str(
            source_id
        ) + '\\' + self.dnx_config.process_no_column_name + '=' + process_no

        bt_current_data_ddf = pd.DataFrame()
        bt_current_data_df = pd.DataFrame()
        bt_current_collection_old = base_bt_current_data_set + "_old"
        if int(self.parameters_dict['get_delta']) == 1:
            if is_dir_exists(bt_current_collection_old):
                pass
                # bt_current_data_ddf = read_all_from_parquet_delayed(dataset=bt_current_collection_old,
                #                                                     columns=bt_columns,
                #                                                     filter=None,
                #                                                     nthreads=self.cpu_num_workers)

        if is_dir_exists(source_data_set):
            parallel_delayed_load_data = []
            for batch_no, get_source_data in enumerate(
                    self.get_chunks_from_source_data(source_id,
                                                     source_data_set)):
                bt_current_data_set = base_bt_current_data_set

                source_data_df, bt_ids = delayed(get_source_data[0]), delayed(
                    get_source_data[1])
                # if is_dir_exists(bt_current_collection_old):
                # bt_ids = delayed(data_to_list)(bt_ids['bt_id'])
                # bt_current_data_df = read_all_from_parquet_delayed(bt_current_collection_old, bt_columns, None)

                # bt_current_data_df = delayed(self.get_bt_current_data)(bt_current_collection_old, bt_columns, bt_ids)

                delayed_load_data = delayed(self.load_data)(
                    source_data_df, bt_current_data_df, bt_data_set,
                    bt_current_data_set, bt_current_collection_old, None)
                parallel_delayed_load_data.append(delayed_load_data)
            with ProgressBar():
                compute(*parallel_delayed_load_data,
                        num_workers=cpu_num_workers)
            delete_dataset(bt_current_collection_old)
Example #3
File: dq.py Project: OmarNour/dc_pyArrow
    def execute_data_rules(self, category_no, be_att_dr_id, source_id,
                           join_with_f):
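        # Run every active rule level for this be_att_dr_id in level_no order; the F dataset is
        # only read for level 1 when join_with_f == 1.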
        # print('execute_data_rules started')
        be_data_rule_lvls_query = "select be_att_id, rule_id, next_pass, next_fail, kwargs from " + \
                                  self.dnx_config.be_attributes_data_rules_lvls_collection + \
                                  " where active = 1 and be_att_dr_id = " + str(be_att_dr_id) + " order by level_no"
        be_data_rule_lvls = get_all_data_from_source(
            self.dnx_config.config_db_url, None, be_data_rule_lvls_query)
        no_of_lvls = len(be_data_rule_lvls.index)

        for current_lvl_no, data_rule_lvls in enumerate(
                be_data_rule_lvls.iterrows(), start=1):
            data_rule_lvls = data_rule_lvls[1]
            be_att_id = data_rule_lvls['be_att_id']
            rule_id = data_rule_lvls['rule_id']
            next_pass = data_rule_lvls['next_pass']
            next_fail = data_rule_lvls['next_fail']
            kwargs = data_rule_lvls['kwargs']

            g_result = 1 if no_of_lvls == current_lvl_no else 0
            # print('no_of_lvls', be_att_dr_id, g_result, no_of_lvls, current_lvl_no)

            be_id = self.get_be_id_by_be_att_id(str(be_att_id))
            core_tables = get_be_core_table_names(
                self.dnx_config.config_db_url,
                self.dnx_config.org_business_entities_collection, be_id)
            bt_current_collection = core_tables[0]
            source_collection = core_tables[2]
            dq_result_collection = core_tables[3]

            # print(core_tables)
            base_bt_current_data_set = self.dnx_db_path + bt_current_collection
            src_f_data_set = self.src_f_db_path + source_collection + "\\SourceID=" + str(
                source_id)
            result_data_set = self.result_db_path + dq_result_collection
            # self.all_result_data_set.append(result_data_set) if result_data_set not in self.all_result_data_set else None
            result_data_set_tmp = result_data_set + "_tmp"

            if current_lvl_no == 1 and join_with_f == 1:
                src_f_data = read_all_from_parquet_delayed(src_f_data_set)
            else:
                src_f_data = None
                # src_f_data = src_f_data.set_index('rowkey')
            if is_dir_exists(base_bt_current_data_set):
                self.execute_lvl_data_rules(
                    src_f_data, base_bt_current_data_set, result_data_set,
                    result_data_set_tmp, source_id, be_att_dr_id, category_no,
                    be_att_id, rule_id, g_result, current_lvl_no, next_pass,
                    next_fail, join_with_f, kwargs)
Example #4
File: bt.py Project: OmarNour/dc_pyArrow
    def load_data(self,
                  p_source_data,
                  p_current_data,
                  bt_data_set,
                  bt_current_data_set,
                  bt_current_collection_old,
                  p_bt_ids=None):
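        # With delta loading enabled and an "_old" current collection on disk, write the unchanged
        # rows and, depending on the etl flag returned by get_delta(), the modified, expired and
        # new rows to their datasets; otherwise write the source data straight to the current dataset.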
        if int(self.parameters_dict['get_delta']) == 1 and is_dir_exists(
                bt_current_collection_old):
            # bt_ids = p_bt_ids.set_index('bt_id')
            # bt_current_data_df = p_current_data.merge(bt_ids, left_index=True, right_index=True)
            # bt_current_data_df = bt_current_data_df.compute()
            # print('p_bt_idsp_bt_ids', p_bt_ids)
            # filter_bt_ids = [['bt_id', p_bt_ids], ]
            # bt_current_data_df = self.get_bt_current_data(bt_current_collection_old, bt_columns, filter_bt_ids)
            # print('bt_current_data_df', len(bt_current_data_df.index))
            # print(bt_current_data_df.index)
            # print('p_source_data', len(p_source_data.index))
            # print(p_source_data.index)
            get_delta_result = self.get_delta(p_source_data, p_current_data)
            same_df = get_delta_result[5]
            save_to_parquet(same_df, bt_current_data_set, bt_partition_cols,
                            bt_object_cols)

            if get_delta_result[3] in (0, 2):  #etl_occurred
                assert len(get_delta_result[0]) == len(get_delta_result[1])

                modified_df = get_delta_result[0]
                expired_df = get_delta_result[1]
                # expired_ids = get_delta_result[4]

                # modified_df['batch_no'] = batch_no
                save_to_parquet(modified_df, bt_current_data_set,
                                bt_partition_cols, bt_object_cols)

                # expired_df['batch_no'] = batch_no
                save_to_parquet(expired_df, bt_data_set, bt_partition_cols,
                                bt_object_cols)

            if get_delta_result[3] in (1, 2):  # etl_occurred
                new_data_df = get_delta_result[2]
                save_to_parquet(new_data_df, bt_current_data_set,
                                bt_partition_cols, bt_object_cols)
        else:
            save_to_parquet(p_source_data, bt_current_data_set,
                            bt_partition_cols, bt_object_cols)
Example #5
File: dq.py Project: OmarNour/dc_pyArrow
    def execute_lvl_data_rules(self, src_f_data, base_bt_current_data_set,
                               result_data_set, result_data_set_tmp, source_id,
                               be_att_dr_id, category_no, be_att_id, rule_id,
                               g_result, current_lvl_no, next_pass, next_fail,
                               join_with_f, kwargs):
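        # Rotate the per-rule temp result dataset to "_old", validate each batch of current BT data
        # (levels above 1 only see RowKeys carried over from the previous level's temp results),
        # persist the results, then drop the old temp dataset.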

        # print('++++++++ source_id:', source_id, 'be_att_dr_id:', be_att_dr_id, 'category_no:', category_no)
        # print('++++++++ be_att_id:', be_att_id, 'rule_id:', rule_id, 'g_result:', g_result, 'current_lvl_no:', current_lvl_no,
        #       'next_pass:', next_pass, 'next_fail:', next_fail)

        result_data_set_tmp = result_data_set_tmp + "_" + str(
            be_att_dr_id) + "_process_no_" + str(self.process_no)

        suffix = "_old"
        result_data_set_tmp_old = self.switch_dataset(result_data_set_tmp,
                                                      suffix)

        for bt_current_data_df in self.get_bt_current_data(
                src_f_data, base_bt_current_data_set, source_id, category_no,
                be_att_id, join_with_f):
            # print('len_bt_current_data_df', len(bt_current_data_df.index))
            if not bt_current_data_df.empty:
                if current_lvl_no > 1:
                    result_df = pd.DataFrame()
                    if is_dir_exists(result_data_set_tmp_old):
                        for row_keys_df in self.get_tmp_rowkeys(
                                result_data_set_tmp_old
                        ):  # filter with level number too!
                            bt_nxt_lvl_current_data_df = bt_current_data_df[
                                bt_current_data_df['RowKey'].isin(row_keys_df)]

                            if not bt_nxt_lvl_current_data_df.empty:
                                result_lvl_df = self.validate_data_rule(
                                    bt_nxt_lvl_current_data_df, be_att_dr_id,
                                    rule_id, kwargs)
                                result_df = result_df.append(result_lvl_df)

                else:
                    result_df = self.validate_data_rule(
                        bt_current_data_df, be_att_dr_id, rule_id, kwargs)
                self.insert_result_df(result_df, g_result, result_data_set,
                                      next_pass, next_fail,
                                      result_data_set_tmp, source_id,
                                      category_no, be_att_id)
        delete_dataset(result_data_set_tmp_old)
Example #6
File: bt.py Project: OmarNour/dc_pyArrow
    def get_bt_current_data(self, bt_dataset, columns, bt_ids):
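        # Read the BT dataset in batches filtered on bt_id and append the non-empty batches into
        # a single DataFrame.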
        bt_df = pd.DataFrame()
        if is_dir_exists(bt_dataset):
            filter_bt_ids = [
                ['bt_id', bt_ids],
            ]
            # bt_df = read_all_from_parquet_delayed(dataset=bt_dataset,
            #                                               columns=columns,
            #                                               use_threads=True,#self.cpu_num_workers,
            # filter=filter).compute()
            for df in read_batches_from_parquet(
                    bt_dataset,
                    columns,
                    int(self.parameters_dict['bt_batch_size']),
                    True,  #self.cpu_num_workers,
                    filter=filter_bt_ids):
                if not df.empty:
                    # print('bt_df.columns', bt_df.columns)
                    # print('bt_df.info', bt_df.info())
                    bt_df = bt_df.append(df)

        return bt_df
Example #7
File: dq.py Project: OmarNour/dc_pyArrow
    def show_results(config_db_url, result_db_path,
                     org_business_entities_collection):
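        # For each business entity, read its DQ result dataset with Dask and print bt_id counts
        # grouped by source, attribute, category, data rule and is_issue.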
        print(
            "**********************************************************************"
        )
        be_ids = bt.StartBT.get_be_ids(config_db_url)
        for i, be_id in be_ids.iterrows():
            be_id = be_id['be_id']
            core_tables = get_be_core_table_names(
                config_db_url, org_business_entities_collection, be_id)
            dq_result_collection = core_tables[3]
            result_path = result_db_path + dq_result_collection
            # result_path =None
            # result_path = "C:\\dc\\parquet_db\\Result\\result_4383_10"
            if is_dir_exists(result_path):
                df1 = dd.read_parquet(path=result_path, engine='pyarrow')[[
                    'p_SourceID', 'p_AttributeID', 'p_ResetDQStage',
                    'p_be_att_dr_id', 'p_data_rule_id', 'p_is_issue', 'bt_id'
                ]]
                df2 = df1.reset_index()
                df2.columns = [
                    'indx', 'SourceID', 'AttributeID', 'Category_no',
                    'be_att_dr_id', 'rule_id', 'is_issue', 'bt_id'
                ]
                df2 = df2.groupby([
                    'SourceID', 'AttributeID', 'Category_no', 'be_att_dr_id',
                    'rule_id', 'is_issue'
                ]).agg({'bt_id': ['count']})
                df2.columns = ["cells#"]
                with ProgressBar():
                    print(df2.compute())
                print(
                    "----------------------------------------------------------------------"
                )
        print(
            "**********************************************************************"
        )
Example #8
File: dq.py Project: OmarNour/dc_pyArrow
    def get_bt_current_data(self, src_f_data, bt_dataset, source_id,
                            category_no, be_att_id, join_with_f):
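        # Build the partition path for this source/attribute/category and yield each parquet
        # file's rows (inner-joined with the F dataset on RowKey when join_with_f == 1) as a
        # computed pandas DataFrame.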
        complete_dataset = bt_dataset + \
                           "\\SourceID=" + str(source_id) +\
                           "\\AttributeID=" + str(be_att_id) +\
                           "\\ResetDQStage=" + str(category_no) +\
                           "\\process_no="+str(self.process_no)
        # print('complete_dataset', complete_dataset)
        if is_dir_exists(complete_dataset):
            # print('src_f_data', src_f_data.compute().columns)

            for file_name in get_files_in_dir(complete_dataset):
                pa_file_path = complete_dataset + "\\" + file_name
                # bt_current_df = read_batches_from_parquet(dataset_root_path=pa_file_path,
                #                                           columns=['bt_id', 'RowKey', 'AttributeValue'],
                #                                           batch_size=int(self.parameters_dict['bt_batch_size']),
                #                                           use_threads=True, filter=None, filter_index=True)
                bt_current_df = read_all_from_parquet_delayed(
                    dataset=pa_file_path,
                    columns=['bt_id', 'RowKey', 'AttributeValue'],
                    filter=None)

                # src_f_data_df = src_f_data[src_f_data['rowkey'].isin(bt_current_df['RowKey'])]
                ###############
                if join_with_f == 1:
                    bt_current_df = bt_current_df.merge(
                        src_f_data,
                        left_on=['RowKey'],
                        # left_index=True,
                        # right_index=True,
                        right_on=['rowkey'],
                        suffixes=('_new', '_cbt'),
                        how='inner')

                ######################
                yield bt_current_df.compute()