def convert_feature_to_bin(self, data_instance):
    LOGGER.info("convert feature to bins")
    param_obj = FeatureBinningParam(bin_num=self.bin_num)
    binning_obj = QuantileBinning(param_obj)
    binning_obj.fit_split_points(data_instance)
    self.data_bin, self.bin_split_points, self.bin_sparse_points = binning_obj.convert_feature_to_bin(
        data_instance)
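Conceptually, fit_split_points picks per-feature quantile boundaries and convert_feature_to_bin replaces each value with the index of the bin it falls into. A self-contained numpy sketch of that idea for a single feature (an illustration only, not FATE's distributed implementation):

import numpy as np

x = np.array([3.0, 7.5, 1.2, 9.9, 4.4, 6.1, 2.8, 8.3])
bin_num = 4

# Inner quantile boundaries, analogous to fit_split_points for one feature.
split_points = np.quantile(x, [i / bin_num for i in range(1, bin_num)])

# Bin index of each value, analogous to convert_feature_to_bin for one feature.
bin_idx = np.digitize(x, split_points)
print(split_points)  # 3 boundaries
print(bin_idx)       # values in {0, 1, 2, 3}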
def test_new_sparse_quantile(self):
    param_obj = FeatureBinningParam(bin_num=4)
    binning_obj = QuantileBinning(param_obj)
    binning_obj.fit_split_points(self.sparse_table)
    data_bin, bin_splitpoints, bin_sparse = binning_obj.convert_feature_to_bin(self.sparse_table)
    bin_result = {key: inst.features for key, inst in data_bin.collect()}
    for i in range(20):
        # binning must preserve the sparsity pattern of each instance
        self.assertEqual(len(self.sparse_inst[i][1].features.sparse_vec),
                         len(bin_result[i].sparse_vec))
def convert_feature_to_bin(self, data_instance):
    LOGGER.info("convert feature to bins")
    param_obj = FeatureBinningParam(bin_num=self.bin_num)
    if self.use_missing:
        # treat NoneType (missing) values as abnormal so they are excluded
        # from quantile computation and routed to a dedicated bin
        binning_obj = QuantileBinning(param_obj, abnormal_list=[NoneType()])
    else:
        binning_obj = QuantileBinning(param_obj)
    binning_obj.fit_split_points(data_instance)
    self.data_bin, self.bin_split_points, self.bin_sparse_points = binning_obj.convert_feature_to_bin(
        data_instance)
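A plain-numpy analogy of why missing values are passed as an abnormal list: they must be excluded before quantile boundaries are computed, and they typically receive an extra bin of their own. This sketch uses NaN as the missing marker and is an analogy only, not FATE's NoneType handling:

import numpy as np

x = np.array([3.0, np.nan, 7.5, 1.2, np.nan, 9.9, 4.4])
bin_num = 2

# Exclude missing values before computing boundaries,
# analogous to passing abnormal_list=[NoneType()].
valid = x[~np.isnan(x)]
split_points = np.quantile(valid, [i / bin_num for i in range(1, bin_num)])

# Missing values are routed to an extra dedicated bin index.
missing_bin = bin_num
bin_idx = np.where(np.isnan(x), missing_bin, np.digitize(x, split_points))
print(bin_idx)  # NaN rows get bin index 2 here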
def test_quantile_binning(self):
    error = 0.01
    compress_thres = int(self.data_num / (self.data_num * error))  # == int(1 / error)
    head_size = 5000
    bin_num = 10
    bin_percent = [int(i * (100.0 / bin_num)) for i in range(1, bin_num)]
    bin_param = FeatureBinningParam(method='quantile', compress_thres=compress_thres,
                                    head_size=head_size, error=error,
                                    cols=self.cols, bin_num=bin_num)
    quan_bin = QuantileBinning(bin_param)
    t0 = time.time()
    split_points = quan_bin.fit_split_points(self.table)
    t1 = time.time()
    print('Fit split points time: {}'.format(t1 - t0))

    # collect locally and time numpy's percentile for comparison
    local_table = self.table.collect()
    total_data = []
    for _, data_inst in local_table:
        total_data.append(data_inst.features)
    total_data = np.array(total_data)
    for col in self.cols:
        col_idx = self.col_dict.get(col)
        x = total_data[:, col_idx]
        # result is unused; this loop only measures numpy's speed
        sk = np.percentile(x, bin_percent, interpolation="midpoint")
    t2 = time.time()
    print('Collect and numpy percentile time: {}'.format(t2 - t1))
def _get_quantile_median(self):
    bin_param = FeatureBinningParam(bin_num=2, cols=self.cols)
    binning_obj = QuantileBinning(bin_param)
    split_points = binning_obj.fit_split_points(self.data_instances)
    # with bin_num=2, the single split point of each column is its median
    medians = {}
    for col_name, split_point in split_points.items():
        medians[col_name] = split_point[0]
    return medians
def test_new_dense_quantile(self):
    param_obj = FeatureBinningParam(bin_num=4)
    binning_obj = QuantileBinning(param_obj)
    binning_obj.fit_split_points(self.dense_table)
    data_bin, bin_splitpoints, bin_sparse = binning_obj.convert_feature_to_bin(self.dense_table)
    bin_result = {key: inst.features for key, inst in data_bin.collect()}
    for i in range(100):
        self.assertTrue((bin_result[i] == np.ones(20, dtype='int') * ((i % 16) // 4)).all())
        if i < 20:
            col_idx = i
            split_point = np.array(bin_splitpoints[col_idx])
            self.assertTrue((split_point == np.asarray([3, 7, 11, 15], dtype='int')).all())
    for split_points in bin_splitpoints:
        self.assertTrue(len(split_points) <= 4)
def _get_quantile_median(self):
    cols_index = self._get_cols_index()
    bin_param = FeatureBinningParam(bin_num=2, cols=cols_index)
    binning_obj = QuantileBinning(bin_param, abnormal_list=self.abnormal_list)
    split_points = binning_obj.fit_split_points(self.data_instances)
    # with bin_num=2, the single split point of each column is its median
    medians = {}
    for col_name, split_point in split_points.items():
        medians[col_name] = split_point[0]
    return medians
def test_quantile_binning(self):
    compress_thres = 10000
    head_size = 5000
    error = 0.01
    bin_num = 10
    bin_param = FeatureBinningParam(method='quantile', compress_thres=compress_thres,
                                    head_size=head_size, error=error,
                                    cols=self.cols, bin_num=bin_num)
    quan_bin = QuantileBinning(bin_param)
    split_points = quan_bin.fit_split_points(self.table)
    bin_percent = [i * (1.0 / bin_num) for i in range(1, bin_num)]
    for col_idx, col in enumerate(self.cols):
        feature_idx = self.col_dict.get(col)
        x = self.numpy_table[:, feature_idx]
        x = sorted(x)
        for bin_idx, percent in enumerate(bin_percent):
            # an epsilon-approximate quantile may deviate by at most
            # data_num * error ranks from the exact quantile
            min_rank = int(math.floor(percent * self.data_num - self.data_num * error))
            max_rank = int(math.ceil(percent * self.data_num + self.data_num * error))
            min_rank = max(min_rank, 0)
            max_rank = min(max_rank, len(x) - 1)
            try:
                self.assertTrue(x[min_rank] <= split_points[col_idx][bin_idx] <= x[max_rank])
            except AssertionError:
                print(x[min_rank], x[max_rank], split_points[col_idx][bin_idx])
                found_index = x.index(split_points[col_idx][bin_idx])
                print("min_rank: {}, found_rank: {}, max_rank: {}".format(
                    min_rank, found_index, max_rank))
                raise
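The rank bounds this test checks come from the epsilon-approximate quantile guarantee: a returned q-quantile has true rank within N * error of q * N. The same property can be checked in isolation against numpy's exact quantiles; the sketch below uses synthetic data, not the FATE test fixtures:

import math
import numpy as np

rng = np.random.default_rng(0)
x = np.sort(rng.normal(size=10000))
n, error = len(x), 0.01

for percent in [i / 10 for i in range(1, 10)]:
    q = np.quantile(x, percent)          # stand-in for a returned split point
    min_rank = max(int(math.floor(percent * n - n * error)), 0)
    max_rank = min(int(math.ceil(percent * n + n * error)), n - 1)
    assert x[min_rank] <= q <= x[max_rank]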
class MultivariateStatisticalSummary(object):
    """
    Compute per-column statistics (mean, variance, max/min, quantiles,
    missing ratio) over a distributed data table.
    """

    def __init__(self, data_instances, cols_index=-1, abnormal_list=None,
                 error=consts.DEFAULT_RELATIVE_ERROR, stat_order=2, bias=True):
        self.finish_fit_statics = False  # set once one-pass statistics are computed
        self.binning_obj: QuantileBinning = None
        self.summary_statistics = None
        self.header = None
        self.cols_dict = {}
        self.data_instances = data_instances
        self.cols_index = None
        if not isinstance(abnormal_list, list):
            abnormal_list = [abnormal_list]
        self.abnormal_list = abnormal_list
        self.__init_cols(data_instances, cols_index, stat_order, bias)
        self.label_summary = None
        self.error = error

    def __init_cols(self, data_instances, cols_index, stat_order, bias):
        header = data_overview.get_header(data_instances)
        self.header = header
        if cols_index == -1:
            self.cols_index = [i for i in range(len(header))]
        else:
            self.cols_index = cols_index
        LOGGER.debug(f"col_index: {cols_index}, self.col_index: {self.cols_index}")
        self.cols_dict = {header[indices]: indices for indices in self.cols_index}
        self.summary_statistics = SummaryStatistics(length=len(self.cols_index),
                                                    abnormal_list=self.abnormal_list,
                                                    stat_order=stat_order,
                                                    bias=bias)

    def _static_sums(self):
        """
        Compute sum, sum_square, max_value and min_value in one pass,
        so that variance is available.
        """
        is_sparse = data_overview.is_sparse_data(self.data_instances)
        partition_cal = functools.partial(self.static_in_partition,
                                          cols_index=self.cols_index,
                                          summary_statistics=copy.deepcopy(self.summary_statistics),
                                          is_sparse=is_sparse)
        self.summary_statistics = self.data_instances.applyPartitions(partition_cal). \
            reduce(lambda x, y: self.copy_merge(x, y))
        self.finish_fit_statics = True

    def _static_quantile_summaries(self):
        """
        Fit quantile summaries so that specific quantile points can be queried.
        """
        if self.binning_obj is not None:
            return self.binning_obj
        bin_param = FeatureBinningParam(bin_num=2, bin_indexes=self.cols_index,
                                        error=self.error)
        self.binning_obj = QuantileBinning(bin_param, abnormal_list=self.abnormal_list)
        self.binning_obj.fit_split_points(self.data_instances)
        return self.binning_obj

    @staticmethod
    def copy_merge(s1, s2):
        new_s1 = copy.deepcopy(s1)
        return new_s1.merge(s2)

    @staticmethod
    def static_in_partition(data_instances, cols_index, summary_statistics, is_sparse):
        """
        Compute sums, sum_square, max and min values in a single traversal.

        Parameters
        ----------
        data_instances : DTable
            The input data

        cols_index : indices
            Specify which column(s) to apply statistics to.

        summary_statistics : SummaryStatistics

        is_sparse : bool
            Whether the input data is sparse.

        Returns
        -------
        SummaryStatistics object
        """
        for k, instances in data_instances:
            if not is_sparse:
                if isinstance(instances, Instance):
                    features = instances.features
                else:
                    features = instances
                row_values = [x for idx, x in enumerate(features) if idx in cols_index]
            else:
                sparse_data = instances.features.get_sparse_vector()
                row_values = np.array([sparse_data.get(x, 0) for x in cols_index])
            summary_statistics.add_rows(row_values)
        return summary_statistics

    @staticmethod
    def static_summaries_in_partition(data_instances, cols_dict, abnormal_list, error):
        """
        Build a QuantileSummaries object per column in a single traversal.

        Parameters
        ----------
        data_instances : DTable
            The input data

        cols_dict : dict
            Specify which column(s) to apply statistics to.

        abnormal_list : list
            Specify which values are not permitted.

        error : float
            Relative error bound of the quantile summaries.

        Returns
        -------
        Dict of QuantileSummaries objects
        """
        summary_dict = {}
        for col_name in cols_dict:
            summary_dict[col_name] = QuantileSummaries(abnormal_list=abnormal_list,
                                                       error=error)
        for k, instances in data_instances:
            if isinstance(instances, Instance):
                features = instances.features
            else:
                features = instances
            for col_name, col_index in cols_dict.items():
                value = features[col_index]
                summary_obj = summary_dict[col_name]
                summary_obj.insert(value)
        return summary_dict

    @staticmethod
    def aggregate_statics(s_dict1, s_dict2):
        if s_dict1 is None and s_dict2 is None:
            return None
        if s_dict1 is None:
            return s_dict2
        if s_dict2 is None:
            return s_dict1
        new_dict = {}
        for col_name, static_1 in s_dict1.items():
            static_1.merge(s_dict2[col_name])
            new_dict[col_name] = static_1
        return new_dict

    def get_median(self):
        if self.binning_obj is None:
            self._static_quantile_summaries()
        medians = self.binning_obj.query_quantile_point(query_points=0.5)
        return medians

    @property
    def median(self):
        median_dict = self.get_median()
        return np.array([median_dict[self.header[idx]] for idx in self.cols_index])

    def get_quantile_point(self, quantile):
        """
        Return the value at the given quantile point.

        Parameters
        ----------
        quantile : float, 0 <= quantile <= 1
            The quantile point to query.

        Returns
        -------
        A dict of quantile point values, e.g.
        quantile_point = {"x1": 3, "x2": 5, ...}
        """
        if self.binning_obj is None:
            self._static_quantile_summaries()
        quantile_points = self.binning_obj.query_quantile_point(quantile)
        return quantile_points

    def get_mean(self):
        """
        Return the mean value(s) of the given column(s).

        Returns
        -------
        A dict of mean values.
        """
        return self.get_statics("mean")

    def get_variance(self):
        return self.get_statics("variance")

    def get_std_variance(self):
        return self.get_statics("stddev")

    def get_max(self):
        return self.get_statics("max_value")

    def get_min(self):
        return self.get_statics("min_value")

    def get_statics(self, data_type):
        """
        Return the requested statistic(s) of the given column(s).

        Parameters
        ----------
        data_type : str, "mean", "variance", "stddev", "max_value" or "min_value"
            Specify which statistic to return.

        Returns
        -------
        A dict of results keyed by column name, in the same order as cols.
        """
        if not self.finish_fit_statics:
            self._static_sums()
        if hasattr(self.summary_statistics, data_type):
            result_row = getattr(self.summary_statistics, data_type)
        elif hasattr(self, data_type):
            result_row = getattr(self, data_type)
        else:
            raise ValueError(f"Statistic data type: {data_type} cannot be recognized")
        result = {}
        result_row = result_row.tolist()
        for col_idx, header_idx in enumerate(self.cols_index):
            result[self.header[header_idx]] = result_row[col_idx]
        return result

    def get_missing_ratio(self):
        return self.get_statics("missing_ratio")

    @property
    def missing_ratio(self):
        missing_static_obj = MissingStatistic()
        all_missing_ratio = missing_static_obj.fit(self.data_instances)
        return np.array([all_missing_ratio[self.header[idx]] for idx in self.cols_index])

    @property
    def missing_count(self):
        missing_ratio = self.missing_ratio
        missing_count = missing_ratio * self.data_instances.count()
        return missing_count.astype(int)

    @staticmethod
    def get_label_static_dict(data_instances):
        result_dict = {}
        for instance in data_instances:
            label_key = instance[1].label
            if label_key not in result_dict:
                result_dict[label_key] = 1
            else:
                result_dict[label_key] += 1
        return result_dict

    @staticmethod
    def merge_result_dict(dict_a, dict_b):
        for k, v in dict_b.items():
            if k in dict_a:
                dict_a[k] += v
            else:
                dict_a[k] = v
        return dict_a

    def get_label_histogram(self):
        label_histogram = self.data_instances.applyPartitions(
            self.get_label_static_dict).reduce(self.merge_result_dict)
        return label_histogram
def fit(self, expect_table, actual_table):
    LOGGER.info('start psi computing')
    header1 = expect_table.schema['header']
    header2 = actual_table.schema['header']
    if not set(header1) == set(header2):
        raise ValueError('table headers must be the same when computing psi values')

    # baseline table should not contain empty columns
    abnormal_detection.empty_column_detection(expect_table)

    self.all_feature_list = header1
    # make sure there are no duplicate features
    self.all_feature_list = self.check_duplicates(self.all_feature_list)

    # kv bi-directional mapping
    self.tag_id_mapping = {v: k for k, v in enumerate(self.all_feature_list)}
    self.id_tag_mapping = {k: v for k, v in enumerate(self.all_feature_list)}

    if not self.is_sparse(expect_table):  # convert missing value: nan to NoneType
        expect_table = self.convert_missing_val(expect_table)
    if not self.is_sparse(actual_table):  # convert missing value: nan to NoneType
        actual_table = self.convert_missing_val(actual_table)

    if not (self.check_table_content(expect_table) and self.check_table_content(actual_table)):
        raise ValueError('contents of input table must be instances of class "Instance"')

    param = FeatureBinningParam(method=consts.QUANTILE, bin_num=self.max_bin_num,
                                local_only=True, error=self.binning_error)
    binning_obj = QuantileBinning(params=param, abnormal_list=[NoneType()],
                                  allow_duplicate=False)
    binning_obj.fit_split_points(expect_table)

    data_bin, bin_split_points, bin_sparse_points = binning_obj.convert_feature_to_bin(expect_table)
    LOGGER.debug('bin split points is {}, shape is {}'.format(bin_split_points,
                                                              bin_split_points.shape))
    self.binning_obj = binning_obj
    self.data_bin1 = data_bin
    self.bin_split_points = bin_split_points
    self.bin_sparse_points = bin_sparse_points
    LOGGER.debug('expect table binning done')

    count_func1 = functools.partial(map_partition_handle,
                                    feat_num=len(self.all_feature_list),
                                    max_bin_num=self.max_bin_num + 1,  # an additional bin for missing values
                                    missing_val=self.dense_missing_val,
                                    is_sparse=self.is_sparse(self.data_bin1))
    map_rs1 = self.data_bin1.applyPartitions(count_func1)
    count1 = count_rs_to_dict(map_rs1.reduce(map_partition_reduce))

    # bin the actual table with the split points fitted on the expect table
    data_bin2, bin_split_points2, bin_sparse_points2 = binning_obj.convert_feature_to_bin(actual_table)
    self.data_bin2 = data_bin2
    LOGGER.debug('actual table binning done')

    count_func2 = functools.partial(map_partition_handle,
                                    feat_num=len(self.all_feature_list),
                                    max_bin_num=self.max_bin_num + 1,  # an additional bin for missing values
                                    missing_val=self.dense_missing_val,
                                    is_sparse=self.is_sparse(self.data_bin2))
    map_rs2 = self.data_bin2.applyPartitions(count_func2)
    count2 = count_rs_to_dict(map_rs2.reduce(map_partition_reduce))

    self.count1, self.count2 = count1, count2
    LOGGER.info('psi counting done')

    # compute psi from the counting results
    psi_result = psi_computer(count1, count2, expect_table.count(), actual_table.count())
    self.psi_rs = psi_result

    # get the total psi score of each feature
    total_scores = {}
    for idx, rs in enumerate(self.psi_rs):
        feat_name = self.id_tag_mapping[idx]
        total_scores[feat_name] = rs['total_psi']
    self.total_scores = total_scores

    # id-feature mapping conversion, string interval computation
    self.str_intervals = self.get_string_interval(bin_split_points, self.id_tag_mapping,
                                                  missing_bin_idx=self.max_bin_num)
    self.interval_perc1 = self.count_dict_to_percentage(copy.deepcopy(count1),
                                                        expect_table.count())
    self.interval_perc2 = self.count_dict_to_percentage(copy.deepcopy(count2),
                                                        actual_table.count())
    self.set_summary(self.generate_summary())
    LOGGER.info('psi computation done')
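For reference, the per-feature score produced here follows the standard PSI definition: with expected bin proportions e_i and actual bin proportions a_i, PSI = sum_i (a_i - e_i) * ln(a_i / e_i). A self-contained plain-numpy sketch of that formula (not FATE's psi_computer; the `smooth` term guarding against empty bins is an assumed convention here):

import numpy as np

def psi_score(expect_counts, actual_counts, smooth=1e-6):
    """Standard PSI between two binned count vectors."""
    e = np.asarray(expect_counts, dtype=float)
    a = np.asarray(actual_counts, dtype=float)
    e = e / e.sum() + smooth  # expected bin proportions
    a = a / a.sum() + smooth  # actual bin proportions
    return float(np.sum((a - e) * np.log(a / e)))

# identical distributions give PSI ~ 0; a shifted one gives a larger score
print(psi_score([10, 20, 30, 40], [10, 20, 30, 40]))   # ~0.0
print(psi_score([10, 20, 30, 40], [40, 30, 20, 10]))   # noticeably > 0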
class HomoFeatureBinningClient(object):
    def __init__(self, bin_method=consts.QUANTILE):
        self.aggregator = secure_mean_aggregator.Client(enable_secure_aggregate=True)
        self.suffix = tuple()
        self.bin_method = bin_method
        self.bin_obj: QuantileBinning = None
        self.bin_param = None
        self.abnormal_list = None

    def set_suffix(self, suffix):
        self.suffix = suffix

    def average_run(self, data_instances, bin_num=10, abnormal_list=None):
        if self.bin_param is None:
            bin_param = FeatureBinningParam(bin_num=bin_num)
            self.bin_param = bin_param
        else:
            bin_param = self.bin_param

        if self.bin_method == consts.QUANTILE:
            bin_obj = QuantileBinning(params=bin_param, abnormal_list=abnormal_list,
                                      allow_duplicate=True)
        else:
            raise ValueError("Homo Split Point does not accept bin_method: {}".format(
                self.bin_method))

        abnormal_detection.empty_table_detection(data_instances)
        abnormal_detection.empty_feature_detection(data_instances)

        # fit local split points, then average them across parties
        # through the secure mean aggregator
        split_points = bin_obj.fit_split_points(data_instances)
        split_points = {k: np.array(v) for k, v in split_points.items()}
        split_points_weights = DictWeights(d=split_points)

        self.aggregator.send_model(split_points_weights, self.suffix)
        dict_split_points = self.aggregator.get_aggregated_model(self.suffix)
        split_points = {k: list(v) for k, v in dict_split_points.unboxed.items()}
        self.bin_obj = bin_obj
        return split_points

    def convert_feature_to_bin(self, data_instances, split_points=None):
        if self.bin_obj is None:
            return None, None, None
        return self.bin_obj.convert_feature_to_bin(data_instances, split_points)

    def set_bin_param(self, bin_param: FeatureBinningParam):
        if self.bin_param is not None:
            raise RuntimeError("Bin param has already been set and is immutable")
        self.bin_param = bin_param
        return self

    def set_abnormal_list(self, abnormal_list):
        self.abnormal_list = abnormal_list
        return self

    def fit(self, data_instances):
        if self.bin_obj is not None:
            return self
        if self.bin_param is None:
            self.bin_param = FeatureBinningParam()
        self.bin_obj = QuantileBinning(params=self.bin_param,
                                       abnormal_list=self.abnormal_list,
                                       allow_duplicate=True)
        self.bin_obj.fit_split_points(data_instances)
        return self

    def query_quantile_points(self, data_instances, quantile_points):
        if self.bin_obj is None:
            self.fit(data_instances)
        query_result = self.bin_obj.query_quantile_point(quantile_points)
        query_points = DictWeights(d=query_result)
        suffix = tuple(list(self.suffix) + [str(quantile_points)])
        self.aggregator.send_model(query_points, suffix)
        query_points = self.aggregator.get_aggregated_model(suffix)
        query_points = {k: v for k, v in query_points.unboxed.items()}
        return query_points
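What the aggregation step in average_run computes, stripped of the federation machinery, is a coordinate-wise mean of each party's local split points. A plain-numpy sketch of that semantics (the party dicts below are made-up data, and the real secure mean aggregator additionally masks the values so no single party's points are revealed):

import numpy as np

# Made-up local split points from two parties (same columns, same bin_num).
party_a = {"x0": np.array([1.0, 2.0, 3.0]), "x1": np.array([10.0, 20.0, 30.0])}
party_b = {"x0": np.array([1.5, 2.5, 3.5]), "x1": np.array([12.0, 18.0, 32.0])}

def mean_aggregate(*party_dicts):
    """Coordinate-wise mean over parties, mimicking what the
    secure mean aggregator returns to every client."""
    keys = party_dicts[0].keys()
    return {k: np.mean([d[k] for d in party_dicts], axis=0) for k in keys}

print(mean_aggregate(party_a, party_b))
# {'x0': array([1.25, 2.25, 3.25]), 'x1': array([11., 19., 31.])}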