Example #1
    def convert_feature_to_bin(self, data_instance):
        LOGGER.info("convert feature to bins")
        param_obj = FeatureBinningParam(bin_num=self.bin_num)
        binning_obj = QuantileBinning(param_obj)
        binning_obj.fit_split_points(data_instance)
        self.data_bin, self.bin_split_points, self.bin_sparse_points = binning_obj.convert_feature_to_bin(
            data_instance)
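Note: `convert_feature_to_bin` maps each feature value to the index of its quantile bin. A minimal numpy sketch of the same idea (illustrative data, not FATE's distributed implementation):

    import numpy as np

    x = np.random.RandomState(0).normal(size=1000)          # hypothetical feature column
    bin_num = 32
    percentiles = [100.0 * i / bin_num for i in range(1, bin_num)]
    split_points = np.percentile(x, percentiles)            # bin_num - 1 split points
    bin_indices = np.searchsorted(split_points, x)          # bin index in [0, bin_num - 1]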
Example #2
    def test_new_sparse_quantile(self):
        param_obj = FeatureBinningParam(bin_num=4)
        binning_obj = QuantileBinning(param_obj)
        binning_obj.fit_split_points(self.sparse_table)
        data_bin, bin_splitpoints, bin_sparse = binning_obj.convert_feature_to_bin(self.sparse_table)
        bin_result = {key: inst.features for key, inst in data_bin.collect()}
        for i in range(20):
            self.assertEqual(len(self.sparse_inst[i][1].features.sparse_vec), len(bin_result[i].sparse_vec))
Example #3
    def convert_feature_to_bin(self, data_instance):
        LOGGER.info("convert feature to bins")
        param_obj = FeatureBinningParam(bin_num=self.bin_num)
        if self.use_missing:
            binning_obj = QuantileBinning(param_obj, abnormal_list=[NoneType()])
        else:
            binning_obj = QuantileBinning(param_obj)

        binning_obj.fit_split_points(data_instance)
        self.data_bin, self.bin_split_points, self.bin_sparse_points = binning_obj.convert_feature_to_bin(data_instance)
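When `use_missing` is enabled, `NoneType()` placeholders are passed as the abnormal list so they are excluded from the quantile computation. A hedged plain-numpy sketch of that masking step (illustrative values, not FATE's implementation):

    import numpy as np

    raw = [1.0, None, 3.0, 4.0, None, 6.0]                  # column with missing values
    valid = np.array([v for v in raw if v is not None])     # drop abnormal values first
    split_points = np.percentile(valid, [25, 50, 75])       # split points from valid values only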
Example #4
    def test_quantile_binning(self):
        error = 0.01
        compress_thres = int(self.data_num / (self.data_num * error))

        head_size = 5000
        bin_num = 10
        bin_percent = [int(i * (100.0 / bin_num)) for i in range(1, bin_num)]

        bin_param = FeatureBinningParam(method='quantile',
                                        compress_thres=compress_thres,
                                        head_size=head_size,
                                        error=error,
                                        cols=self.cols,
                                        bin_num=bin_num)
        quan_bin = QuantileBinning(bin_param)
        t0 = time.time()
        split_points = quan_bin.fit_split_points(self.table)
        t1 = time.time()
        print('Spend time: {}'.format(t1 - t0))

        # collect and test numpy quantile speed
        local_table = self.table.collect()
        total_data = []
        for _, data_inst in local_table:
            total_data.append(data_inst.features)
        total_data = np.array(total_data)
        for col in self.cols:
            col_idx = self.col_dict.get(col)
            x = total_data[:, col_idx]
            sk = np.percentile(x, bin_percent, interpolation="midpoint")
        t2 = time.time()
        print('collect and use numpy time: {}'.format(t2 - t1))
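One caveat for reproducing this benchmark on current NumPy: the `interpolation` keyword of `np.percentile` was renamed to `method` in NumPy 1.22, so the equivalent call there is:

    sk = np.percentile(x, bin_percent, method="midpoint")   # NumPy >= 1.22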
Example #5
    def average_run(self, data_instances, bin_num=10, abnormal_list=None):
        if self.bin_param is None:
            bin_param = FeatureBinningParam(bin_num=bin_num)
            self.bin_param = bin_param
        else:
            bin_param = self.bin_param

        if self.bin_method == consts.QUANTILE:
            bin_obj = QuantileBinning(params=bin_param,
                                      abnormal_list=abnormal_list,
                                      allow_duplicate=True)
        else:
            raise ValueError(
                "H**o Split Point do not accept bin_method: {}".format(
                    self.bin_method))

        abnormal_detection.empty_table_detection(data_instances)
        abnormal_detection.empty_feature_detection(data_instances)

        split_points = bin_obj.fit_split_points(data_instances)
        split_points = {k: np.array(v) for k, v in split_points.items()}
        split_points_weights = DictWeights(d=split_points)

        self.aggregator.send_model(split_points_weights, self.suffix)
        dict_split_points = self.aggregator.get_aggregated_model(self.suffix)
        split_points = {
            k: list(v)
            for k, v in dict_split_points.unboxed.items()
        }
        self.bin_obj = bin_obj
        return split_points
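The aggregator returns the (securely computed) mean of every party's split points. A minimal local sketch of that averaging step, assuming each party contributes a dict of equal-length arrays (party names are illustrative):

    import numpy as np

    party_a = {"x0": np.array([1.0, 2.0, 3.0])}
    party_b = {"x0": np.array([3.0, 4.0, 5.0])}
    averaged = {k: (party_a[k] + party_b[k]) / 2 for k in party_a}   # {"x0": array([2., 3., 4.])}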
Example #6
    def _get_quantile_median(self):
        bin_param = FeatureBinningParam(bin_num=2, cols=self.cols)
        binning_obj = QuantileBinning(bin_param)
        split_points = binning_obj.fit_split_points(self.data_instances)
        medians = {}
        for col_name, split_point in split_points.items():
            medians[col_name] = split_point[0]
        return medians
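With `bin_num=2` there is exactly one split point per column, and for quantile binning that point is the (approximate) median, which is why `split_point[0]` is taken. A quick numpy check of the same identity:

    import numpy as np

    x = np.arange(101, dtype=float)
    split_point = np.percentile(x, 50)    # the single split of a 2-bin quantile binning
    assert split_point == np.median(x)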
Example #7
    def test_new_dense_quantile(self):
        param_obj = FeatureBinningParam(bin_num=4)
        binning_obj = QuantileBinning(param_obj)
        binning_obj.fit_split_points(self.dense_table)
        data_bin, bin_splitpoints, bin_sparse = binning_obj.convert_feature_to_bin(self.dense_table)
        bin_result = {key: inst.features for key, inst in data_bin.collect()}
        # print(bin_result)
        for i in range(100):
            self.assertTrue((bin_result[i] == np.ones(20, dtype='int') * ((i % 16) // 4)).all())
            if i < 20:
                # col_name = 'x' + str(i)
                col_idx = i
                split_point = np.array(bin_splitpoints[col_idx])
                self.assertTrue((split_point == np.asarray([3, 7, 11, 15], dtype='int')).all())

        for split_points in bin_splitpoints:
            self.assertTrue(len(split_points) <= 4)
Example #8
    def _get_quantile_median(self):
        cols_index = self._get_cols_index()
        bin_param = FeatureBinningParam(bin_num=2, cols=cols_index)
        binning_obj = QuantileBinning(bin_param, abnormal_list=self.abnormal_list)
        split_points = binning_obj.fit_split_points(self.data_instances)
        medians = {}
        for col_name, split_point in split_points.items():
            medians[col_name] = split_point[0]
        return medians
Example #9
    def test_quantile_binning(self):
        return  # NOTE: early return disables this test; the code below never runs as written

        compress_thres = 10000
        head_size = 5000
        error = 0.01
        bin_num = 10
        bin_param = FeatureBinningParam(method='quantile',
                                        compress_thres=compress_thres,
                                        head_size=head_size,
                                        error=error,
                                        cols=self.cols,
                                        bin_num=bin_num)
        quan_bin = QuantileBinning(bin_param)
        split_points = quan_bin.fit_split_points(self.table)
        for col_idx, col in enumerate(self.cols):
            bin_percent = [i * (1.0 / bin_num) for i in range(1, bin_num)]
            feature_idx = self.col_dict.get(col)
            x = self.numpy_table[:, feature_idx]
            x = sorted(x)
            for bin_idx, percent in enumerate(bin_percent):
                min_rank = int(
                    math.floor(percent * self.data_num -
                               self.data_num * error))
                max_rank = int(
                    math.ceil(percent * self.data_num + self.data_num * error))
                if min_rank < 0:
                    min_rank = 0
                if max_rank > len(x) - 1:
                    max_rank = len(x) - 1
                try:
                    self.assertTrue(x[min_rank] <= split_points[col_idx]
                                    [bin_idx] <= x[max_rank])
                except AssertionError:
                    print(x[min_rank], x[max_rank],
                          split_points[col_idx][bin_idx])
                    found_index = x.index(split_points[col_idx][bin_idx])
                    print("min_rank: {}, found_rank: {}, max_rank: {}".format(
                        min_rank, found_index, max_rank))
                self.assertTrue(x[min_rank] <= split_points[col_idx][bin_idx]
                                <= x[max_rank])
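The assertion window above encodes the epsilon-approximate quantile guarantee: for target quantile q over N rows, the returned point must fall between the order statistics at ranks floor(q*N - error*N) and ceil(q*N + error*N). A standalone numpy check of that bound (illustrative data; an exact percentile trivially satisfies it, while a sketch-based estimate may sit anywhere in the window):

    import numpy as np

    N, error, q = 10000, 0.01, 0.3
    x = np.sort(np.random.RandomState(1).rand(N))
    estimate = np.percentile(x, 100 * q)
    lo = x[max(int(np.floor(q * N - error * N)), 0)]
    hi = x[min(int(np.ceil(q * N + error * N)), N - 1)]
    assert lo <= estimate <= hi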
Example #10
class MultivariateStatisticalSummary(object):
    """

    """
    def __init__(self,
                 data_instances,
                 cols_index=-1,
                 abnormal_list=None,
                 error=consts.DEFAULT_RELATIVE_ERROR,
                 stat_order=2,
                 bias=True):
        self.finish_fit_statics = False  # Use for static data
        # self.finish_fit_summaries = False   # Use for quantile data
        self.binning_obj: QuantileBinning = None
        self.summary_statistics = None
        self.header = None
        # self.quantile_summary_dict = {}
        self.cols_dict = {}
        # self.medians = None
        self.data_instances = data_instances
        self.cols_index = None
        if not isinstance(abnormal_list, list):
            abnormal_list = [abnormal_list]

        self.abnormal_list = abnormal_list
        self.__init_cols(data_instances, cols_index, stat_order, bias)
        self.label_summary = None
        self.error = error

    def __init_cols(self, data_instances, cols_index, stat_order, bias):
        header = data_overview.get_header(data_instances)
        self.header = header
        if cols_index == -1:
            self.cols_index = [i for i in range(len(header))]
        else:
            self.cols_index = cols_index
        LOGGER.debug(
            f"col_index: {cols_index}, self.col_index: {self.cols_index}")
        self.cols_dict = {
            header[indices]: indices
            for indices in self.cols_index
        }
        self.summary_statistics = SummaryStatistics(
            length=len(self.cols_index),
            abnormal_list=self.abnormal_list,
            stat_order=stat_order,
            bias=bias)

    def _static_sums(self):
        """
        Statics sum, sum_square, max_value, min_value,
        so that variance is available.
        """
        is_sparse = data_overview.is_sparse_data(self.data_instances)
        partition_cal = functools.partial(self.static_in_partition,
                                          cols_index=self.cols_index,
                                          summary_statistics=copy.deepcopy(
                                              self.summary_statistics),
                                          is_sparse=is_sparse)
        self.summary_statistics = self.data_instances.applyPartitions(partition_cal). \
            reduce(lambda x, y: self.copy_merge(x, y))
        # self.summary_statistics = summary_statistic_dict.reduce(self.aggregate_statics)
        self.finish_fit_statics = True

    def _static_quantile_summaries(self):
        """
        Static summaries so that can query a specific quantile point
        """
        if self.binning_obj is not None:
            return self.binning_obj
        bin_param = FeatureBinningParam(bin_num=2,
                                        bin_indexes=self.cols_index,
                                        error=self.error)
        self.binning_obj = QuantileBinning(bin_param,
                                           abnormal_list=self.abnormal_list)
        self.binning_obj.fit_split_points(self.data_instances)

        return self.binning_obj

    @staticmethod
    def copy_merge(s1, s2):
        new_s1 = copy.deepcopy(s1)
        return new_s1.merge(s2)

    @staticmethod
    def static_in_partition(data_instances, cols_index, summary_statistics,
                            is_sparse):
        """
        Statics sums, sum_square, max and min value through one traversal

        Parameters
        ----------
        data_instances : DTable
            The input data

        cols_index : indices
            Specify which column(s) need to apply statistic.

        summary_statistics: SummaryStatistics

        Returns
        -------
        Dict of SummaryStatistics object

        """

        for k, instances in data_instances:
            if not is_sparse:
                if isinstance(instances, Instance):
                    features = instances.features
                else:
                    features = instances
                    # try:
                    #     features = np.array(instances, dtype=float)
                    # except ValueError as e:
                    #     raise ValueError(f"Static Module accept numeric input only. Error info: {e}")
                # LOGGER.debug(f"In statics, features: {features}")
                row_values = [
                    x for idx, x in enumerate(features) if idx in cols_index
                ]
                # row_values = features[cols_index]
            else:
                sparse_data = instances.features.get_sparse_vector()
                row_values = np.array(
                    [sparse_data.get(x, 0) for x in cols_index])
            summary_statistics.add_rows(row_values)
        return summary_statistics

    @staticmethod
    def static_summaries_in_partition(data_instances, cols_dict, abnormal_list,
                                      error):
        """
        Statics sums, sum_square, max and min value through one traversal

        Parameters
        ----------
        data_instances : DTable
            The input data

        cols_dict : dict
            Specify which column(s) need to apply statistic.

        abnormal_list: list
            Specify which values are not permitted.

        Returns
        -------
        Dict of SummaryStatistics object

        """
        summary_dict = {}
        for col_name in cols_dict:
            summary_dict[col_name] = QuantileSummaries(
                abnormal_list=abnormal_list, error=error)

        for k, instances in data_instances:
            if isinstance(instances, Instance):
                features = instances.features
            else:
                features = instances

            for col_name, col_index in cols_dict.items():
                value = features[col_index]
                summary_obj = summary_dict[col_name]
                summary_obj.insert(value)

        return summary_dict

    @staticmethod
    def aggregate_statics(s_dict1, s_dict2):
        if s_dict1 is None and s_dict2 is None:
            return None
        if s_dict1 is None:
            return s_dict2
        if s_dict2 is None:
            return s_dict1

        new_dict = {}
        for col_name, static_1 in s_dict1.items():
            static_1.merge(s_dict2[col_name])
            new_dict[col_name] = static_1
        return new_dict

    def get_median(self):
        if self.binning_obj is None:
            self._static_quantile_summaries()

        medians = self.binning_obj.query_quantile_point(query_points=0.5)
        return medians

    @property
    def median(self):
        median_dict = self.get_median()
        return np.array(
            [median_dict[self.header[idx]] for idx in self.cols_index])

    def get_quantile_point(self, quantile):
        """
        Return the specific quantile point value

        Parameters
        ----------
        quantile : float, 0 <= quantile <= 1
            Specify which column(s) need to apply statistic.

        Returns
        -------
        return a dict of result quantile points.
        eg.
        quantile_point = {"x1": 3, "x2": 5... }
        """

        if self.binning_obj is None:
            self._static_quantile_summaries()
        quantile_points = self.binning_obj.query_quantile_point(quantile)
        return quantile_points

    def get_mean(self):
        """
        Return the mean value(s) of the given column

        Returns
        -------
        return a dict of result mean.

        """
        return self.get_statics("mean")

    def get_variance(self):
        return self.get_statics("variance")

    def get_std_variance(self):
        return self.get_statics("stddev")

    def get_max(self):
        return self.get_statics("max_value")

    def get_min(self):
        return self.get_statics("min_value")

    def get_statics(self, data_type):
        """
        Return the specific static value(s) of the given column

        Parameters
        ----------
        data_type : str, "mean", "variance", "std_variance", "max_value" or "mim_value"
            Specify which type to show.

        Returns
        -------
        return a list of result result. The order is the same as cols.
        """
        if not self.finish_fit_statics:
            self._static_sums()

        if hasattr(self.summary_statistics, data_type):
            result_row = getattr(self.summary_statistics, data_type)

        elif hasattr(self, data_type):
            result_row = getattr(self, data_type)
        else:
            raise ValueError(
                f"Statistic data type: {data_type} cannot be recognized")
        # LOGGER.debug(f"col_index: {self.cols_index}, result_row: {result_row},"
        #              f"header: {self.header}, data_type: {data_type}")

        result = {}

        result_row = result_row.tolist()
        for col_idx, header_idx in enumerate(self.cols_index):
            result[self.header[header_idx]] = result_row[col_idx]
        return result

    def get_missing_ratio(self):
        return self.get_statics("missing_ratio")

    @property
    def missing_ratio(self):
        missing_static_obj = MissingStatistic()
        all_missing_ratio = missing_static_obj.fit(self.data_instances)
        return np.array(
            [all_missing_ratio[self.header[idx]] for idx in self.cols_index])

    @property
    def missing_count(self):
        missing_ratio = self.missing_ratio
        missing_count = missing_ratio * self.data_instances.count()
        return missing_count.astype(int)

    @staticmethod
    def get_label_static_dict(data_instances):
        result_dict = {}
        for instance in data_instances:
            label_key = instance[1].label
            if label_key not in result_dict:
                result_dict[label_key] = 1
            else:
                result_dict[label_key] += 1
        return result_dict

    @staticmethod
    def merge_result_dict(dict_a, dict_b):
        for k, v in dict_b.items():
            if k in dict_a:
                dict_a[k] += v
            else:
                dict_a[k] = v
        return dict_a

    def get_label_histogram(self):
        label_histogram = self.data_instances.applyPartitions(
            self.get_label_static_dict).reduce(self.merge_result_dict)
        return label_histogram
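A hedged usage sketch of the class above, assuming a DTable `table` of Instance rows is already loaded:

    summary = MultivariateStatisticalSummary(table, cols_index=-1)   # -1 selects all columns
    means = summary.get_mean()              # {"x0": ..., "x1": ...}
    medians = summary.get_median()          # backed by QuantileBinning with bin_num=2
    p90 = summary.get_quantile_point(0.9)   # arbitrary quantile query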
Example #11
    def fit(self, expect_table, actual_table):

        LOGGER.info('start psi computing')

        header1 = expect_table.schema['header']
        header2 = actual_table.schema['header']

        if not set(header1) == set(header2):
            raise ValueError(
                'table header must be the same while computing psi values')

        # baseline table should not contain empty columns
        abnormal_detection.empty_column_detection(expect_table)

        self.all_feature_list = header1

        # make sure no duplicate features
        self.all_feature_list = self.check_duplicates(self.all_feature_list)

        # kv bi-directional mapping
        self.tag_id_mapping = {
            v: k
            for k, v in enumerate(self.all_feature_list)
        }
        self.id_tag_mapping = {
            k: v
            for k, v in enumerate(self.all_feature_list)
        }

        if not self.is_sparse(
                expect_table):  # convert missing value: nan to NoneType
            expect_table = self.convert_missing_val(expect_table)

        if not self.is_sparse(
                actual_table):  # convert missing value: nan to NoneType
            actual_table = self.convert_missing_val(actual_table)

        if not (self.check_table_content(expect_table)
                and self.check_table_content(actual_table)):
            raise ValueError(
                'contents of input table must be instances of class "Instance"'
            )

        param = FeatureBinningParam(method=consts.QUANTILE,
                                    bin_num=self.max_bin_num,
                                    local_only=True,
                                    error=self.binning_error)
        binning_obj = QuantileBinning(params=param,
                                      abnormal_list=[NoneType()],
                                      allow_duplicate=False)
        binning_obj.fit_split_points(expect_table)

        data_bin, bin_split_points, bin_sparse_points = binning_obj.convert_feature_to_bin(
            expect_table)
        LOGGER.debug('bin split points is {}, shape is {}'.format(
            bin_split_points, bin_split_points.shape))
        self.binning_obj = binning_obj

        self.data_bin1 = data_bin
        self.bin_split_points = bin_split_points
        self.bin_sparse_points = bin_sparse_points
        LOGGER.debug('expect table binning done')

        count_func1 = functools.partial(
            map_partition_handle,
            feat_num=len(self.all_feature_list),
            max_bin_num=self.max_bin_num +
            1,  # an additional bin for missing value
            missing_val=self.dense_missing_val,
            is_sparse=self.is_sparse(self.data_bin1))

        map_rs1 = self.data_bin1.applyPartitions(count_func1)
        count1 = count_rs_to_dict(map_rs1.reduce(map_partition_reduce))

        data_bin2, bin_split_points2, bin_sparse_points2 = binning_obj.convert_feature_to_bin(
            actual_table)
        self.data_bin2 = data_bin2
        LOGGER.debug('actual table binning done')

        count_func2 = functools.partial(
            map_partition_handle,
            feat_num=len(self.all_feature_list),
            max_bin_num=self.max_bin_num +
            1,  # an additional bin for missing value
            missing_val=self.dense_missing_val,
            is_sparse=self.is_sparse(self.data_bin2))

        map_rs2 = self.data_bin2.applyPartitions(count_func2)
        count2 = count_rs_to_dict(map_rs2.reduce(map_partition_reduce))

        self.count1, self.count2 = count1, count2

        LOGGER.info('psi counting done')

        # compute psi from counting result
        psi_result = psi_computer(count1, count2, expect_table.count(),
                                  actual_table.count())
        self.psi_rs = psi_result

        # get total psi score of features
        total_scores = {}
        for idx, rs in enumerate(self.psi_rs):
            feat_name = self.id_tag_mapping[idx]
            total_scores[feat_name] = rs['total_psi']
        self.total_scores = total_scores

        # id-feature mapping convert, str interval computation
        self.str_intervals = self.get_string_interval(
            bin_split_points,
            self.id_tag_mapping,
            missing_bin_idx=self.max_bin_num)

        self.interval_perc1 = self.count_dict_to_percentage(
            copy.deepcopy(count1), expect_table.count())
        self.interval_perc2 = self.count_dict_to_percentage(
            copy.deepcopy(count2), actual_table.count())

        self.set_summary(self.generate_summary())
        LOGGER.info('psi computation done')
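For reference, the per-bin quantity that `psi_computer` aggregates is the standard PSI formula, PSI = sum_b (a_b - e_b) * ln(a_b / e_b), where e_b and a_b are the expected and actual proportions in bin b. A minimal numpy version (the epsilon floor for empty bins is an assumption here, not necessarily FATE's handling):

    import numpy as np

    def psi(expect_counts, actual_counts, eps=1e-6):
        e = np.asarray(expect_counts, dtype=float)
        a = np.asarray(actual_counts, dtype=float)
        e = np.clip(e / e.sum(), eps, None)   # bin proportions, floored to avoid log(0)
        a = np.clip(a / a.sum(), eps, None)
        return np.sum((a - e) * np.log(a / e))

    assert abs(psi([10, 20, 30], [10, 20, 30])) < 1e-12   # identical distributions -> PSI 0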
Example #12
class HomoFeatureBinningClient(object):
    def __init__(self, bin_method=consts.QUANTILE):
        self.aggregator = secure_mean_aggregator.Client(
            enable_secure_aggregate=True)
        self.suffix = tuple()
        self.bin_method = bin_method
        self.bin_obj: QuantileBinning = None
        self.bin_param = None
        self.abnormal_list = None

    def set_suffix(self, suffix):
        self.suffix = suffix

    def average_run(self, data_instances, bin_num=10, abnormal_list=None):
        if self.bin_param is None:
            bin_param = FeatureBinningParam(bin_num=bin_num)
            self.bin_param = bin_param
        else:
            bin_param = self.bin_param

        if self.bin_method == consts.QUANTILE:
            bin_obj = QuantileBinning(params=bin_param,
                                      abnormal_list=abnormal_list,
                                      allow_duplicate=True)
        else:
            raise ValueError(
                "H**o Split Point do not accept bin_method: {}".format(
                    self.bin_method))

        abnormal_detection.empty_table_detection(data_instances)
        abnormal_detection.empty_feature_detection(data_instances)

        split_points = bin_obj.fit_split_points(data_instances)
        split_points = {k: np.array(v) for k, v in split_points.items()}
        split_points_weights = DictWeights(d=split_points)

        self.aggregator.send_model(split_points_weights, self.suffix)
        dict_split_points = self.aggregator.get_aggregated_model(self.suffix)
        split_points = {
            k: list(v)
            for k, v in dict_split_points.unboxed.items()
        }
        self.bin_obj = bin_obj
        return split_points

    def convert_feature_to_bin(self, data_instances, split_points=None):
        if self.bin_obj is None:
            return None, None, None
        return self.bin_obj.convert_feature_to_bin(data_instances,
                                                   split_points)

    def set_bin_param(self, bin_param: FeatureBinningParam):
        if self.bin_param is not None:
            raise RuntimeError("Bin param has been set and it's immutable")
        self.bin_param = bin_param
        return self

    def set_abnormal_list(self, abnormal_list):
        self.abnormal_list = abnormal_list
        return self

    def fit(self, data_instances):
        if self.bin_obj is not None:
            return self

        if self.bin_param is None:
            self.bin_param = FeatureBinningParam()

        self.bin_obj = QuantileBinning(params=self.bin_param,
                                       abnormal_list=self.abnormal_list,
                                       allow_duplicate=True)
        self.bin_obj.fit_split_points(data_instances)
        return self

    def query_quantile_points(self, data_instances, quantile_points):
        if self.bin_obj is None:
            self.fit(data_instances)

        # bin_col_names = self.bin_obj.bin_inner_param.bin_names
        query_result = self.bin_obj.query_quantile_point(quantile_points)

        query_points = DictWeights(d=query_result)

        suffix = tuple(list(self.suffix) + [str(quantile_points)])
        self.aggregator.send_model(query_points, suffix)
        query_points = self.aggregator.get_aggregated_model(suffix)
        query_points = {k: v for k, v in query_points.unboxed.items()}
        return query_points
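A hedged usage sketch of the client, assuming a federation context and a loaded `data_instances` table (the suffix value is illustrative):

    client = HomoFeatureBinningClient(bin_method=consts.QUANTILE)
    client.set_suffix(("epoch_0",))
    split_points = client.average_run(data_instances, bin_num=10)            # federated, averaged split points
    data_bin, splits, sparse = client.convert_feature_to_bin(data_instances)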