Example #1
    def convert_feature_to_bin(self, data_instance):
        LOGGER.info("convert feature to bins")
        param_obj = FeatureBinningParam(bin_num=self.bin_num)
        binning_obj = QuantileBinning(param_obj)
        binning_obj.fit_split_points(data_instance)
        self.data_bin, self.bin_split_points, self.bin_sparse_points = binning_obj.convert_feature_to_bin(data_instance)
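The same four-step pattern recurs throughout the examples below: build a FeatureBinningParam, wrap it in a QuantileBinning, fit split points on a data table, then call convert_feature_to_bin to map each feature value to its bin index. The sketch below shows that flow as a standalone helper; the import paths are assumptions based on the FATE project layout and may differ between versions.

    # A minimal sketch, assuming FATE-style import paths (verify against your version).
    from federatedml.param.feature_binning_param import FeatureBinningParam
    from federatedml.feature.binning.quantile_binning import QuantileBinning

    def quantile_bin_table(data_table, bin_num=32):
        # fit per-feature quantile split points, then bin every feature value
        param_obj = FeatureBinningParam(bin_num=bin_num)
        binning_obj = QuantileBinning(param_obj)
        binning_obj.fit_split_points(data_table)
        # returns (binned table, per-feature split points, bin indices of sparse zeros)
        return binning_obj.convert_feature_to_bin(data_table)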
Example #2
    def test_new_sparse_quantile(self):
        param_obj = FeatureBinningParam(bin_num=4)
        binning_obj = QuantileBinning(param_obj)
        binning_obj.fit_split_points(self.sparse_table)
        data_bin, bin_split_points, bin_sparse = binning_obj.convert_feature_to_bin(self.sparse_table)
        bin_result = {key: inst.features for key, inst in data_bin.collect()}
        for i in range(20):
            # binning must preserve the sparsity pattern of each instance
            self.assertEqual(len(self.sparse_inst[i][1].features.sparse_vec), len(bin_result[i].sparse_vec))
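The assertion checks that binning a sparse table preserves each instance's sparsity pattern: the binned instance stores the same number of explicit entries as the input, while the bin indices of the implicit zero values are reported separately (the bin_sparse element of the returned tuple).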
Example #3
    def convert_feature_to_bin(self, data_instance):
        LOGGER.info("convert feature to bins")
        param_obj = FeatureBinningParam(bin_num=self.bin_num)
        if self.use_missing:
            # register NoneType placeholders as abnormal values so missing
            # entries get their own bin instead of skewing the quantiles
            binning_obj = QuantileBinning(param_obj, abnormal_list=[NoneType()])
        else:
            binning_obj = QuantileBinning(param_obj)

        binning_obj.fit_split_points(data_instance)
        self.data_bin, self.bin_split_points, self.bin_sparse_points = binning_obj.convert_feature_to_bin(data_instance)
Example #4
    def test_new_dense_quantile(self):
        param_obj = FeatureBinningParam(bin_num=4)
        binning_obj = QuantileBinning(param_obj)
        binning_obj.fit_split_points(self.dense_table)
        data_bin, bin_splitpoints, bin_sparse = binning_obj.convert_feature_to_bin(self.dense_table)
        bin_result = {key: inst.features for key, inst in data_bin.collect()}
        for i in range(100):
            # every feature of row i should land in bin (i % 16) // 4
            self.assertTrue((bin_result[i] == np.ones(20, dtype='int') * ((i % 16) // 4)).all())
            if i < 20:
                split_point = np.array(bin_splitpoints[i])
                self.assertTrue((split_point == np.asarray([3, 7, 11, 15], dtype='int')).all())

        for split_points in bin_splitpoints:
            self.assertTrue(len(split_points) <= 4)
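The expected values can be read off from the fixture implied by the assertions: each of the 20 features of row i takes the value i % 16, so the values 0 through 15 occur with equal frequency across the 100 rows. Splitting into 4 quantile bins therefore puts the upper boundaries at 3, 7, 11 and 15, and every feature of row i lands in bin (i % 16) // 4.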
Example #5
    def fit(self, expect_table, actual_table):

        LOGGER.info('start psi computing')

        header1 = expect_table.schema['header']
        header2 = actual_table.schema['header']

        if set(header1) != set(header2):
            raise ValueError('table headers must be identical when computing psi values')

        # baseline table should not contain empty columns
        abnormal_detection.empty_column_detection(expect_table)

        self.all_feature_list = header1

        # make sure no duplicate features
        self.all_feature_list = self.check_duplicates(self.all_feature_list)

        # bi-directional mapping between feature names and indices
        self.tag_id_mapping = {v: k for k, v in enumerate(self.all_feature_list)}
        self.id_tag_mapping = dict(enumerate(self.all_feature_list))

        # convert missing values (nan) to NoneType for dense tables
        if not self.is_sparse(expect_table):
            expect_table = self.convert_missing_val(expect_table)

        if not self.is_sparse(actual_table):
            actual_table = self.convert_missing_val(actual_table)

        if not (self.check_table_content(expect_table)
                and self.check_table_content(actual_table)):
            raise ValueError('contents of input tables must be instances of class "Instance"')

        param = FeatureBinningParam(method=consts.QUANTILE,
                                    bin_num=self.max_bin_num,
                                    local_only=True,
                                    error=self.binning_error)
        binning_obj = QuantileBinning(params=param,
                                      abnormal_list=[NoneType()],
                                      allow_duplicate=False)
        binning_obj.fit_split_points(expect_table)

        data_bin, bin_split_points, bin_sparse_points = binning_obj.convert_feature_to_bin(expect_table)
        LOGGER.debug('bin split points are {}, shape is {}'.format(bin_split_points, bin_split_points.shape))
        self.binning_obj = binning_obj

        self.data_bin1 = data_bin
        self.bin_split_points = bin_split_points
        self.bin_sparse_points = bin_sparse_points
        LOGGER.debug('expect table binning done')

        count_func1 = functools.partial(
            map_partition_handle,
            feat_num=len(self.all_feature_list),
            max_bin_num=self.max_bin_num + 1,  # one extra bin reserved for missing values
            missing_val=self.dense_missing_val,
            is_sparse=self.is_sparse(self.data_bin1))

        map_rs1 = self.data_bin1.applyPartitions(count_func1)
        count1 = count_rs_to_dict(map_rs1.reduce(map_partition_reduce))

        data_bin2, bin_split_points2, bin_sparse_points2 = binning_obj.convert_feature_to_bin(actual_table)
        self.data_bin2 = data_bin2
        LOGGER.debug('actual table binning done')

        count_func2 = functools.partial(
            map_partition_handle,
            feat_num=len(self.all_feature_list),
            max_bin_num=self.max_bin_num + 1,  # one extra bin reserved for missing values
            missing_val=self.dense_missing_val,
            is_sparse=self.is_sparse(self.data_bin2))

        map_rs2 = self.data_bin2.applyPartitions(count_func2)
        count2 = count_rs_to_dict(map_rs2.reduce(map_partition_reduce))

        self.count1, self.count2 = count1, count2

        LOGGER.info('psi counting done')

        # compute psi from counting result
        psi_result = psi_computer(count1, count2, expect_table.count(),
                                  actual_table.count())
        self.psi_rs = psi_result

        # get total psi score of features
        total_scores = {}
        for idx, rs in enumerate(self.psi_rs):
            feat_name = self.id_tag_mapping[idx]
            total_scores[feat_name] = rs['total_psi']
        self.total_scores = total_scores

        # id-feature mapping convert, str interval computation
        self.str_intervals = self.get_string_interval(
            bin_split_points,
            self.id_tag_mapping,
            missing_bin_idx=self.max_bin_num)

        self.interval_perc1 = self.count_dict_to_percentage(
            copy.deepcopy(count1), expect_table.count())
        self.interval_perc2 = self.count_dict_to_percentage(
            copy.deepcopy(count2), actual_table.count())

        self.set_summary(self.generate_summary())
        LOGGER.info('psi computation done')
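Example #5 delegates the actual scoring to psi_computer. For reference, the standard population stability index it presumably implements is, per feature, PSI = sum_b (a_b - e_b) * ln(a_b / e_b), where e_b and a_b are the fractions of the expected and actual tables falling in bin b (including the extra missing-value bin). A minimal sketch under that assumption follows; the helper name and the eps smoothing are illustrative, not the module's actual API.

    import math

    def psi_score(expect_counts, actual_counts, expect_total, actual_total, eps=1e-6):
        # standard PSI: sum over bins of (actual% - expect%) * ln(actual% / expect%)
        # eps guards against empty bins; the real psi_computer may handle this differently
        total = 0.0
        for e, a in zip(expect_counts, actual_counts):
            e_pct = max(e / expect_total, eps)
            a_pct = max(a / actual_total, eps)
            total += (a_pct - e_pct) * math.log(a_pct / e_pct)
        return total

A score below 0.1 is conventionally read as a stable distribution, 0.1 to 0.25 as moderate drift, and above 0.25 as significant drift.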