def test_quantile_binning(self):
    """Time QuantileBinning.fit_split_points, then time a local numpy
    percentile over the collected data for comparison."""
    error = 0.01
    # NOTE(review): simplifies to int(1 / error); presumably the intended
    # compression threshold for the quantile summary -- confirm.
    compress_thres = int(self.data_num / (self.data_num * error))
    head_size = 5000
    bin_num = 10
    # Interior bin boundaries expressed in percent (10, 20, ..., 90).
    bin_percent = [int(i * (100.0 / bin_num)) for i in range(1, bin_num)]
    bin_param = FeatureBinningParam(method='quantile',
                                    compress_thres=compress_thres,
                                    head_size=head_size,
                                    error=error,
                                    cols=self.cols,
                                    bin_num=bin_num)
    quan_bin = QuantileBinning(bin_param)

    t0 = time.time()
    split_points = quan_bin.fit_split_points(self.table)
    t1 = time.time()
    print('Spend time: {}'.format(t1 - t0))

    # collect and test numpy quantile speed
    local_table = self.table.collect()
    total_data = np.array([inst.features for _, inst in local_table])
    for col in self.cols:
        col_idx = self.col_dict.get(col)
        feature_col = total_data[:, col_idx]
        # Result deliberately unused: this loop only measures numpy's speed.
        sk = np.percentile(feature_col, bin_percent, interpolation="midpoint")
    t2 = time.time()
    print('collect and use numpy time: {}'.format(t2 - t1))
def _get_quantile_median(self):
    """Compute each column's median via 2-bin quantile binning.

    With bin_num == 2 every column yields exactly one split point,
    which is its median.  Returns a dict of column name -> median.
    """
    two_bin_param = FeatureBinningParam(bin_num=2, cols=self.cols)
    binning_obj = QuantileBinning(two_bin_param)
    split_points = binning_obj.fit_split_points(self.data_instances)
    return {name: points[0] for name, points in split_points.items()}
def init_previous_model(self, **models):
    """Restore a previously saved binning model, when one is supplied.

    Looks for a 'binning_model' entry carrying the storage 'name' and
    'namespace' of the persisted model; does nothing if it is absent.
    """
    if 'binning_model' not in models:
        return
    binning_model_params = models['binning_model']
    binning_param = FeatureBinningParam()
    # Guest and host roles use different binning implementations.
    if self.party_name == consts.GUEST:
        binning_obj = HeteroFeatureBinningGuest(binning_param)
    else:
        binning_obj = HeteroFeatureBinningHost(binning_param)
    binning_obj.load_model(binning_model_params.get('name'),
                           binning_model_params.get('namespace'))
    self.binning_model = binning_obj
def filter(self, data_instances, bin_param=None):
    """Keep the columns whose queried quantile value is below the
    upper threshold; always retain at least one feature."""
    if bin_param is None:
        # Fall back to the default binning configuration.
        bin_param = FeatureBinningParam()
    quantile_values = QuantileBinning(bin_param).query_quantile_point(
        data_instances, self.select_cols, self.percentile)
    left_cols = [col for col, value in zip(self.select_cols, quantile_values)
                 if value < self.upper_threshold]
    # Guard against filtering everything out.
    left_cols = self._keep_one_feature(self.select_cols, left_cols)
    self.left_cols = left_cols
    return left_cols
def fit(self, data_instances, bin_param=None):
    """Record each column's quantile value and flag whether it passes
    the upper-threshold filter; always retain at least one feature."""
    if bin_param is None:
        # Fall back to the default binning configuration.
        bin_param = FeatureBinningParam()
    quantile_values = QuantileBinning(bin_param).query_quantile_point(
        data_instances, self.cols, self.percentile)
    for col_name, value in quantile_values.items():
        self.feature_values[col_name] = value
        # A column survives only while its quantile stays under the threshold.
        self.left_cols[col_name] = value < self.upper_threshold
    # Guard against filtering everything out.
    self.left_cols = self._keep_one_feature()
    return self.left_cols
def test_quantile_binning(self):
    """Check that every quantile split point lies within the rank-error
    window guaranteed by the approximate quantile algorithm.

    NOTE(review): the unconditional ``return`` below deliberately skips
    this test; it is kept so suite behavior is unchanged -- remove it
    (or use a proper skip decorator) to re-enable the checks.
    """
    return
    compress_thres = 10000
    head_size = 5000
    error = 0.01
    bin_num = 10
    bin_param = FeatureBinningParam(method='quantile',
                                    compress_thres=compress_thres,
                                    head_size=head_size,
                                    error=error,
                                    cols=self.cols,
                                    bin_num=bin_num)
    quan_bin = QuantileBinning(bin_param)
    split_points = quan_bin.fit_split_points(self.table)
    for col_idx, col in enumerate(self.cols):
        # Interior bin boundaries as fractions in (0, 1).
        bin_percent = [i * (1.0 / bin_num) for i in range(1, bin_num)]
        feature_idx = self.col_dict.get(col)
        x = sorted(self.numpy_table[:, feature_idx])
        for bin_idx, percent in enumerate(bin_percent):
            # Acceptable rank window: exact rank +/- data_num * error,
            # clamped to the valid index range of the sorted column.
            min_rank = int(
                math.floor(percent * self.data_num - self.data_num * error))
            max_rank = int(
                math.ceil(percent * self.data_num + self.data_num * error))
            min_rank = max(min_rank, 0)
            max_rank = min(max_rank, len(x) - 1)
            try:
                self.assertTrue(
                    x[min_rank] <= split_points[col_idx][bin_idx] <= x[max_rank])
            # Was a bare ``except:``, which also swallowed KeyboardInterrupt,
            # SystemExit and unrelated errors -- only a failed assertion
            # should trigger the diagnostic re-run below.
            except AssertionError:
                print(x[min_rank], x[max_rank], split_points[col_idx][bin_idx])
                found_index = x.index(split_points[col_idx][bin_idx])
                print("min_rank: {}, found_rank: {}, max_rank: {}".format(
                    min_rank, found_index, max_rank))
                self.assertTrue(
                    x[min_rank] <= split_points[col_idx][bin_idx] <= x[max_rank])
def _get_quantile_median(self, cols):
    """Return the median of each requested column.

    Uses 2-bin quantile binning: with bin_num == 2 each column produces
    exactly one split point, which is its median.
    """
    two_bin_param = FeatureBinningParam(bin_num=2)
    split_points = QuantileBinning(two_bin_param).binning(self.data_instances, cols)
    return [points[0] for points in split_points]
def test_bucket_binning(self):
    """Smoke-test bucket binning and print the resulting split points."""
    param = FeatureBinningParam(bin_num=self.bin_num, cols=self.cols)
    split_points = BucketBinning(param).fit_split_points(self.table)
    print(split_points)