def traverse_tree(self, data_inst: Instance, tree: List[Node], use_missing=True, zero_as_missing=True):
    """Route one sample down a single tree and return the reached leaf's weight.

    :param data_inst: sample whose ``features`` supports ``get_data(fid[, default])``
    :param tree: flat node list indexed by node id; traversal starts at node 0
    :param use_missing: when True, missing values follow the node's ``missing_dir``
    :param zero_as_missing: when True, absent sparse entries are treated as missing too
    :return: ``weight`` of the leaf the sample lands in
    """
    node_id = 0
    while True:
        node = tree[node_id]
        if node.is_leaf:
            return node.weight
        feat_id, split_val = node.fid, node.bid
        # child that receives missing values for this split
        missing_child = node.right_nodeid if node.missing_dir == 1 else node.left_nodeid
        if use_missing and zero_as_missing:
            # absent sparse entries count as missing alongside the explicit marker
            if data_inst.features.get_data(feat_id) == NoneType() \
                    or data_inst.features.get_data(feat_id, None) is None:
                node_id = missing_child
            elif data_inst.features.get_data(feat_id) <= split_val:
                node_id = node.left_nodeid
            else:
                node_id = node.right_nodeid
        elif data_inst.features.get_data(feat_id) == NoneType():
            # NOTE(review): this marker check runs even when use_missing is False — confirm intended
            node_id = missing_child
        else:
            # absent sparse entries default to 0 in this mode
            node_id = node.left_nodeid if data_inst.features.get_data(feat_id, 0) <= split_val \
                else node.right_nodeid
def host_local_traverse_tree(data_inst, tree_node, use_missing=True, zero_as_missing=True):
    """Route one sample down a host-side tree and return the id of the reached leaf.

    Same branching rules as guest traversal, but the result is the leaf *node id*,
    not a weight (the host does not hold leaf weights).
    """
    cur = 0
    while True:
        node = tree_node[cur]
        if node.is_leaf:
            return cur
        feat_id, split_val = node.fid, node.bid
        missing_child = node.right_nodeid if node.missing_dir == 1 else node.left_nodeid
        if use_missing and zero_as_missing:
            # both the explicit marker and an absent sparse entry count as missing
            if data_inst.features.get_data(feat_id) == NoneType() \
                    or data_inst.features.get_data(feat_id, None) is None:
                cur = missing_child
            elif data_inst.features.get_data(feat_id) <= split_val:
                cur = node.left_nodeid
            else:
                cur = node.right_nodeid
        elif data_inst.features.get_data(feat_id) == NoneType():
            # NOTE(review): marker check is not guarded by use_missing — confirm intended
            cur = missing_child
        else:
            cur = node.left_nodeid if data_inst.features.get_data(feat_id, 0) <= split_val \
                else node.right_nodeid
def data_format_transform(row):
    """Convert a dense instance to SparseVector form, replacing NaN with NoneType().

    NOTE: mutates ``row`` in place and returns the same object (no copy is made).
    """
    if type(row.features).__name__ != consts.SPARSE_VECTOR:
        # dense input: build a sparse vector, dropping (near-)zero entries
        feature_shape = row.features.shape[0]
        indices = []
        data = []
        for i in range(feature_shape):
            if np.isnan(row.features[i]):
                # NaN becomes the explicit missing-value marker
                indices.append(i)
                data.append(NoneType())
            elif np.abs(row.features[i]) < consts.FLOAT_ZERO:
                # treat near-zero as sparse zero: omit from the vector
                continue
            else:
                indices.append(i)
                data.append(row.features[i])
        row.features = SparseVector(indices, data, feature_shape)
    else:
        # already sparse: normalise NaN entries to the NoneType marker
        sparse_vec = row.features.get_sparse_vector()
        for key in sparse_vec:
            if sparse_vec.get(key) == NoneType() or np.isnan(
                    sparse_vec.get(key)):
                sparse_vec[key] = NoneType()
        row.features.set_sparse_vector(sparse_vec)
    return row
def assign_a_instance(row, tree: List[Node], bin_sparse_point, use_missing, use_zero_as_missing):
    """Advance one sample a single step down the tree.

    :param row: pair of (instance, (leaf_status, node_id))
    :param tree: flat node list indexed by node id
    :param bin_sparse_point: per-feature default bin for absent sparse entries
    :return: the leaf ``weight`` when the current node is a leaf,
             otherwise ``(1, next_node_id)``
    """
    _, cur_id = row[1]
    cur_node = tree[cur_id]
    if cur_node.is_leaf:
        return cur_node.weight
    features = row[0].features
    split_feat, split_bin = cur_node.fid, cur_node.bid
    # decide whether this sample's split feature counts as missing
    if use_zero_as_missing:
        is_missing = features.get_data(split_feat, None) is None or \
            features.get_data(split_feat) == NoneType()
    else:
        is_missing = bool(use_missing and features.get_data(split_feat) == NoneType())
    if is_missing:
        next_id = cur_node.right_nodeid if cur_node.missing_dir == 1 else cur_node.left_nodeid
    elif features.get_data(split_feat, bin_sparse_point[split_feat]) <= split_bin:
        next_id = cur_node.left_nodeid
    else:
        next_id = cur_node.right_nodeid
    return 1, next_id
def make_decision(data_inst, fid, bid, missing_dir, use_missing, zero_as_missing, zero_val=0):
    """Decide which branch one split sends a sample to: True = left, False = right.

    :param fid: split feature id
    :param bid: split bin/threshold value
    :param missing_dir: -1 routes missing values left, otherwise right
    :param zero_val: default used when the feature is absent from the sparse vector
    """
    LEFT, RIGHT = True, False
    missing_branch = LEFT if missing_dir == -1 else RIGHT
    if use_missing and zero_as_missing:
        # an absent sparse entry counts as missing alongside the explicit marker
        if data_inst.features.get_data(fid) == NoneType() \
                or data_inst.features.get_data(fid, None) is None:
            return missing_branch
    # explicit missing marker
    if data_inst.features.get_data(fid) == NoneType():
        return missing_branch
    # regular comparison, with a small tolerance on the threshold
    value = data_inst.features.get_data(fid, zero_val)
    return LEFT if value <= bid + consts.FLOAT_ZERO else RIGHT
def traverse_tree(predict_state, data_inst, tree_=None, decoder=None, sitename=consts.GUEST,
                  split_maskdict=None, use_missing=None, zero_as_missing=None,
                  missing_dir_maskdict=None):
    """Walk the tree while the current node belongs to this site.

    Split feature ids/values and missing directions are stored masked and are
    recovered through ``decoder``. Returns the leaf ``weight`` when a leaf owned
    by this site is reached, otherwise ``(node_id, 1)`` when control must pass
    to the other party.
    """
    nid, tag = predict_state
    while tree_[nid].sitename == sitename:
        if tree_[nid].is_leaf is True:
            return tree_[nid].weight
        fid = decoder("feature_idx", tree_[nid].fid, split_maskdict=split_maskdict)
        bid = decoder("feature_val", tree_[nid].bid, nid, split_maskdict=split_maskdict)
        if use_missing:
            missing_dir = decoder("missing_dir", 1, nid,
                                  missing_dir_maskdict=missing_dir_maskdict)
        else:
            missing_dir = 1
        if use_missing and zero_as_missing:
            # NOTE(review): decoded a second time here — redundant with the block above
            missing_dir = decoder("missing_dir", 1, nid,
                                  missing_dir_maskdict=missing_dir_maskdict)
            # marker value or absent sparse entry both count as missing
            if data_inst.features.get_data(fid) == NoneType(
            ) or data_inst.features.get_data(fid, None) is None:
                if missing_dir == 1:
                    nid = tree_[nid].right_nodeid
                else:
                    nid = tree_[nid].left_nodeid
            elif data_inst.features.get_data(fid) <= bid:
                nid = tree_[nid].left_nodeid
            else:
                nid = tree_[nid].right_nodeid
        elif data_inst.features.get_data(fid) == NoneType():
            # explicit missing marker follows missing_dir
            if missing_dir == 1:
                nid = tree_[nid].right_nodeid
            else:
                nid = tree_[nid].left_nodeid
        elif data_inst.features.get_data(fid, 0) <= bid:
            # absent sparse entries default to 0 in this mode
            nid = tree_[nid].left_nodeid
        else:
            nid = tree_[nid].right_nodeid
    # next node belongs to the other party; hand back (node id, unleaf flag)
    return nid, 1
def host_assign_an_instance(value, tree_, bin_sparse_points, use_missing, zero_as_missing,
                            dense_format=False):
    """One routing step on the host side.

    :param value: pair of (instance, (unleaf_state, node_id))
    :param dense_format: when True, ``features`` is a scipy sparse matrix row
        produced by the fast-histogram path
    :return: leaf node id when the current node is a leaf, else (1, next_node_id)
    """
    unleaf_state, nodeid = value[1]
    if tree_[nodeid].is_leaf is True:
        return nodeid
    fid = tree_[nodeid].fid
    bid = tree_[nodeid].bid
    if not dense_format:
        if not use_missing:
            # plain comparison; absent sparse entries default to the feature's sparse point
            if value[0].features.get_data(fid, bin_sparse_points[fid]) <= bid:
                return 1, tree_[nodeid].left_nodeid
            else:
                return 1, tree_[nodeid].right_nodeid
        else:
            missing_dir = tree_[nodeid].missing_dir
            missing_val = False
            if zero_as_missing:
                # absent entry or explicit marker both count as missing
                if value[0].features.get_data(fid, None) is None or \
                        value[0].features.get_data(fid) == NoneType():
                    missing_val = True
            elif use_missing and value[0].features.get_data(fid) == NoneType():
                missing_val = True
            if missing_val:
                if missing_dir == 1:
                    return 1, tree_[nodeid].right_nodeid
                else:
                    return 1, tree_[nodeid].left_nodeid
            else:
                if value[0].features.get_data(fid, bin_sparse_points[fid]) <= bid:
                    return 1, tree_[nodeid].left_nodeid
                else:
                    return 1, tree_[nodeid].right_nodeid
    else:
        # this branch is for fast histogram
        # will get scipy sparse matrix if using fast histogram
        if not use_missing:
            sample_feat = value[0].features[0, fid]  # value.features is a scipy sparse matrix
            return (1, tree_[nodeid].left_nodeid) if sample_feat <= bid \
                else (1, tree_[nodeid].right_nodeid)
        else:
            missing_dir = tree_[nodeid].missing_dir
            sample_feat = value[0].features[0, fid]
            if zero_as_missing:
                # zero_as_missing and use_missing: 0 and missing value are marked as -1
                # after removing the +1 storage offset
                sample_feat -= 1  # remove offset
            if sample_feat == -1:
                # -1 encodes a missing value; follow missing_dir
                return (1, tree_[nodeid].right_nodeid) if missing_dir == 1 \
                    else (1, tree_[nodeid].left_nodeid)
            else:
                return (1, tree_[nodeid].left_nodeid) if sample_feat <= bid \
                    else (1, tree_[nodeid].right_nodeid)
def assign_a_instance(value1, value2, sitename=None, decoder=None, split_maskdict=None,
                      bin_sparse_points=None, use_missing=False, zero_as_missing=False,
                      missing_dir_maskdict=None):
    """Route one instance across one federated split node.

    ``value1`` carries the pending split description
    ``(unleaf_state, fid, bid, node_sitename, nodeid, left_nodeid, right_nodeid)``;
    ``value2`` is the instance. Splits owned by another party are returned untouched.

    :return: ``value1`` unchanged when the node belongs to another site,
             otherwise ``(unleaf_state, chosen_child_node_id)``
    """
    unleaf_state, enc_fid, enc_bid, node_sitename, nodeid, left_nodeid, right_nodeid = value1
    if node_sitename != sitename:
        return value1
    # recover the masked split description
    fid = decoder("feature_idx", enc_fid, split_maskdict=split_maskdict)
    bid = decoder("feature_val", enc_bid, nodeid, split_maskdict=split_maskdict)
    feats = value2.features
    if not use_missing:
        go_left = feats.get_data(fid, bin_sparse_points[fid]) <= bid
        return (unleaf_state, left_nodeid) if go_left else (unleaf_state, right_nodeid)
    missing_dir = decoder("missing_dir", 1, nodeid, missing_dir_maskdict=missing_dir_maskdict)
    if zero_as_missing:
        # absent sparse entry or explicit marker both count as missing
        hit_missing = feats.get_data(fid, None) is None or feats.get_data(fid) == NoneType()
    else:
        hit_missing = use_missing and feats.get_data(fid) == NoneType()
    if hit_missing:
        return (unleaf_state, right_nodeid) if missing_dir == 1 else (unleaf_state, left_nodeid)
    go_left = feats.get_data(fid, bin_sparse_points[fid]) <= bid
    return (unleaf_state, left_nodeid) if go_left else (unleaf_state, right_nodeid)
def static_candidates_num(instances, select_cols, all_candidates):
    """
    Static number of candidates

    Counts, per selected column, how often each mode candidate occurs in the data
    and accumulates the counts into ``all_candidates`` in place.

    Parameters
    ----------
    instances: Data generator
        Original data, yielding (key, instance) pairs
    select_cols: list
        Indicates columns that need to be operated.
    all_candidates: dict
        Each key is col_index and value is a dict mapping candidate value -> count.
    """
    for _, instant in instances:
        for col_index in select_cols:
            candidate_dict = all_candidates[col_index]
            # NOTE(review): `is_sparse` is not defined inside this function —
            # it must come from an enclosing/module scope; confirm it exists at call time
            if is_sparse:
                feature_value = instant.features.get_data(
                    col_index, NoneType())
            else:
                feature_value = instant.features[col_index]
            if isinstance(feature_value, float):
                # round so float keys match the candidate keys reliably
                feature_value = round(feature_value, 8)
            if feature_value in candidate_dict:
                candidate_dict[feature_value] += 1
    return all_candidates
def _fill_nan(inst):
    """Replace NaN entries in a dense feature array with NoneType() markers.

    Re-binds ``inst.features`` to an object-dtype copy and returns ``inst``.
    """
    arr = copy.deepcopy(inst.features)
    nan_index = np.isnan(arr)
    # BUG FIX: the `np.object` alias was deprecated in NumPy 1.20 and removed in
    # 1.24 — use the builtin `object` dtype instead
    arr = arr.astype(object)
    arr[nan_index] = NoneType()
    inst.features = arr
    return inst
def assign_a_instance(value, tree_=None, decoder=None, sitename=consts.GUEST,
                      split_maskdict=None, bin_sparse_points=None, use_missing=False,
                      zero_as_missing=False, missing_dir_maskdict=None):
    """One guest-side routing step for a federated tree.

    :param value: pair of (instance, (unleaf_state, node_id))
    :return: leaf ``weight`` at a leaf; ``(1, next_node_id)`` for a split owned
        by this site; otherwise a 7-tuple describing the split so the owning
        party can resolve it
    """
    unleaf_state, nodeid = value[1]
    if tree_[nodeid].is_leaf is True:
        return tree_[nodeid].weight
    else:
        if tree_[nodeid].sitename == sitename:
            # split belongs to this site: unmask feature id and threshold
            fid = decoder("feature_idx", tree_[nodeid].fid, split_maskdict=split_maskdict)
            bid = decoder("feature_val", tree_[nodeid].bid, nodeid, split_maskdict=split_maskdict)
            if not use_missing:
                if value[0].features.get_data(fid, bin_sparse_points[fid]) <= bid:
                    return 1, tree_[nodeid].left_nodeid
                else:
                    return 1, tree_[nodeid].right_nodeid
            else:
                missing_dir = decoder("missing_dir", tree_[nodeid].missing_dir, nodeid,
                                      missing_dir_maskdict=missing_dir_maskdict)
                missing_val = False
                if zero_as_missing:
                    # absent sparse entry or explicit marker both count as missing
                    if value[0].features.get_data(fid, None) is None or \
                            value[0].features.get_data(fid) == NoneType():
                        missing_val = True
                elif use_missing and value[0].features.get_data(fid) == NoneType():
                    missing_val = True
                if missing_val:
                    if missing_dir == 1:
                        return 1, tree_[nodeid].right_nodeid
                    else:
                        return 1, tree_[nodeid].left_nodeid
                else:
                    LOGGER.debug("fid is {}, bid is {}, sitename is {}".format(fid, bid, sitename))
                    if value[0].features.get_data(fid, bin_sparse_points[fid]) <= bid:
                        return 1, tree_[nodeid].left_nodeid
                    else:
                        return 1, tree_[nodeid].right_nodeid
        else:
            # split owned by another party: hand back the (still masked) description
            return (1, tree_[nodeid].fid, tree_[nodeid].bid, tree_[nodeid].sitename,
                    nodeid, tree_[nodeid].left_nodeid, tree_[nodeid].right_nodeid)
def convert_feature_to_bin(self, data_instance):
    """Quantile-bin all features and store the binned data on self.

    Fills ``self.data_bin``, ``self.bin_split_points`` and
    ``self.bin_sparse_points``.
    """
    LOGGER.info("convert feature to bins")
    params = FeatureBinningParam(bin_num=self.bin_num)
    # register the NoneType marker as abnormal so missing values get their own bin
    if self.use_missing:
        quantile_binning = QuantileBinning(params, abnormal_list=[NoneType()])
    else:
        quantile_binning = QuantileBinning(params)
    quantile_binning.fit_split_points(data_instance)
    (self.data_bin,
     self.bin_split_points,
     self.bin_sparse_points) = quantile_binning.convert_feature_to_bin(data_instance)
def np_nan_to_nonetype(inst):
    """Replace NaN features with NoneType() markers, copying only when needed.

    Returns ``inst`` unchanged when its dense feature array holds no NaN;
    otherwise returns a deep copy whose NaN entries carry the marker.
    """
    features = inst.features
    nan_mask = np.isnan(features)
    if not nan_mask.any():
        # fast path: nothing to replace, keep the original object
        return inst
    patched = copy.deepcopy(inst)
    obj_arr = features.astype(object)  # astype copies, so the original array is untouched
    obj_arr[nan_mask] = NoneType()
    patched.features = obj_arr
    return patched
def _handle_zero_as_missing(inst, feat_num, missing_bin_idx):
    """Densify a sparse bin vector for the use_missing + zero_as_missing case.

    Every feature defaults to the missing bin; only entries present in the
    sparse vector with a real (non-marker) value keep their bin index.
    Re-binds ``inst.features`` to the dense array and returns ``inst``.
    """
    # start from "everything missing", then overwrite the observed bins
    bins = np.zeros(feat_num, dtype=np.uint8) + missing_bin_idx
    for feat_idx, bin_val in inst.features.sparse_vec.items():
        if bin_val != NoneType():
            bins[feat_idx] = bin_val
    inst.features = bins
    return inst
def _transform_nan(instance):
    """Return a deep copy of ``instance`` whose NaN features are NoneType() markers.

    The original instance is left untouched.
    """
    feats = instance.features
    # x != x is True only for NaN, so this swaps NaN entries for the marker
    converted = [NoneType() if feats[i] != feats[i] else feats[i]
                 for i in range(feats.shape[0])]
    new_instance = copy.deepcopy(instance)
    new_instance.features = np.array(converted)
    return new_instance
def data_format_transform(row):
    """
    transform data into sparse format

    Dense rows become SparseVector instances (NaN -> NoneType marker,
    near-zero values dropped). Sparse rows are returned unchanged when they
    contain no NaN/marker entries; otherwise a deep copy with normalised
    markers is returned. The input row itself is never mutated.
    """
    if type(row.features).__name__ != consts.SPARSE_VECTOR:
        feature_shape = row.features.shape[0]
        indices = []
        data = []
        for i in range(feature_shape):
            if np.isnan(row.features[i]):
                # NaN becomes the explicit missing-value marker
                indices.append(i)
                data.append(NoneType())
            elif np.abs(row.features[i]) < consts.FLOAT_ZERO:
                # near-zero treated as sparse zero: omit
                continue
            else:
                indices.append(i)
                data.append(row.features[i])
        new_row = copy.deepcopy(row)
        new_row.features = SparseVector(indices, data, feature_shape)
        return new_row
    else:
        # already sparse: collect keys whose values must be normalised
        sparse_vec = row.features.get_sparse_vector()
        replace_key = []
        for key in sparse_vec:
            if sparse_vec.get(key) == NoneType() or np.isnan(
                    sparse_vec.get(key)):
                replace_key.append(key)
        if len(replace_key) == 0:
            # nothing to replace: avoid the copy entirely
            return row
        else:
            new_row = copy.deepcopy(row)
            new_sparse_vec = new_row.features.get_sparse_vector()
            for key in replace_key:
                new_sparse_vec[key] = NoneType()
            return new_row
def federated_binning(self, data_instance):
    """Run the federated average binning and map features to bin indices."""
    # the NoneType marker is registered as abnormal when missing values are enabled
    if self.use_missing:
        binning_result = self.binning_obj.average_run(
            data_instances=data_instance, bin_num=self.bin_num,
            abnormal_list=[NoneType()])
    else:
        binning_result = self.binning_obj.average_run(
            data_instances=data_instance, bin_num=self.bin_num)
    return self.binning_obj.convert_feature_to_bin(data_instance, binning_result)
def federated_binning(self, ):
    """Server side of recursive-query homo binning: drive split-point fitting."""
    param = HomoFeatureBinningParam(method=consts.RECURSIVE_QUERY,
                                    bin_num=self.bin_num,
                                    error=self.binning_error)
    # register the missing marker as abnormal only when missing values are enabled
    abnormal = [NoneType()] if self.use_missing else []
    self.binning_obj = recursive_query_binning.Server(param, abnormal_list=abnormal)
    # the server coordinates only; it fits split points without local data
    self.binning_obj.fit_split_points(None)
def convert_feature_to_bin(self, data_instance, handle_missing_value=False):
    """
    convert bin index to real value

    Fits quantile split points on ``data_instance`` and returns the binned data.
    When ``handle_missing_value`` is True, the NoneType marker is registered as
    an abnormal value so missing entries get a dedicated bin.
    """
    LOGGER.info("convert feature to bins")
    param_obj = FeatureBinningParam(bin_num=self.bin_num, error=self.binning_error)
    extra = {"abnormal_list": [NoneType()]} if handle_missing_value else {}
    self.binning_obj = self.binning_class(param_obj, **extra)
    self.binning_obj.fit_split_points(data_instance)
    LOGGER.info("convert feature to bins over")
    return self.binning_obj.convert_feature_to_bin(data_instance)
def federated_binning(self, data_instance):
    """Run homo average binning with this party's parameters and bin the data."""
    self.binning_obj.bin_param = FeatureBinningParam(bin_num=self.bin_num,
                                                     error=self.binning_error)
    # only pass abnormal_list when missing values are enabled
    extra = {"abnormal_list": [NoneType()]} if self.use_missing else {}
    binning_result = self.binning_obj.average_run(data_instances=data_instance, **extra)
    return self.binning_obj.convert_feature_to_bin(data_instance, binning_result)
def sparse_to_array(data, feature_sparse_point_array, use_missing, zero_as_missing):
    """Densify one sparse sample into a csc_matrix row of bin indices.

    Missing markers become -1; when both ``use_missing`` and ``zero_as_missing``
    are set, every value is offset by +1 so the missing marker lands on 0 and
    the row stays sparse.
    """
    result = copy.deepcopy(data)
    dense_points = copy.deepcopy(feature_sparse_point_array)
    for feat_idx, feat_val in data.features.get_all_data():
        # missing markers become -1 so the offset below can fold them into bin 0
        dense_points[feat_idx] = -1 if feat_val == NoneType() else feat_val
    # only the use_missing + zero_as_missing combination needs the +1 offset
    offset = 1 if (use_missing and zero_as_missing) else 0
    result.features = sp.csc_matrix(np.array(dense_points) + offset)
    return result
def federated_binning(self, data_instance):
    """Client side of recursive-query homo binning; returns the binned data."""
    param = HomoFeatureBinningParam(method=consts.RECURSIVE_QUERY,
                                    bin_num=self.bin_num,
                                    error=self.binning_error)
    # register the missing marker as abnormal only when missing values are enabled
    extra = {"abnormal_list": [NoneType()]} if self.use_missing else {}
    self.binning_obj = recursive_query_binning.Client(params=param,
                                                      role=self.role, **extra)
    self.binning_obj.fit_split_points(data_instance)
    return self.binning_obj.convert_feature_to_bin(data_instance)
def __init__(self, missing_value_list=None):
    """
    Parameters
    ----------
    missing_value_list: list, the value to be replaced. Default None, if is
        None, it will be set to list of blank, none, null and na, which
        regarded as missing filled. If not, it can be outlier replace, and
        missing_value_list includes the outlier values
    """
    if missing_value_list is None:
        self.missing_value_list = [
            '', 'none', 'null', 'na', 'None', np.nan
        ]
    else:
        # BUG FIX: copy the caller's list — the NaN-normalisation loop below
        # rewrites elements in place and must not mutate the argument
        self.missing_value_list = list(missing_value_list)
    self.abnormal_value_list = copy.deepcopy(self.missing_value_list)
    for i, v in enumerate(self.missing_value_list):
        # v != v is True only for NaN: normalise any NaN spelling to np.nan
        # on the missing side and to the NoneType marker on the abnormal side
        if v != v:
            self.missing_value_list[i] = np.nan
            self.abnormal_value_list[i] = NoneType()
    # supported imputation strategies
    self.support_replace_method = [
        'min', 'max', 'mean', 'median', 'designated'
    ]
    # output casting options ('origin' keeps the input type)
    self.support_output_format = {
        'str': str, 'float': float, 'int': int, 'origin': None
    }
    # every supported method operates column-wise
    self.support_replace_area = {
        'min': 'col', 'max': 'col', 'mean': 'col', 'median': 'col',
        'designated': 'col'
    }
    # bookkeeping filled during fit/transform
    self.cols_fit_impute_rate = []
    self.cols_transform_impute_rate = []
    self.cols_replace_method = []
    self.skip_cols = []
def __init__(self):
    """Initialise PSI component state; real values are filled during fit."""
    super(PSI, self).__init__()
    self.model_param = PSIParam()
    # default bin count; overridden from params in _init_model
    self.max_bin_num = 20
    # bi-directional feature-name <-> column-index mappings
    self.tag_id_mapping = {}
    self.id_tag_mapping = {}
    # per-table bin occupancy counts (expect / actual)
    self.count1, self.count2 = None, None
    self.actual_table, self.expect_table = None, None
    # binned versions of the two input tables
    self.data_bin1, self.data_bin2 = None, None
    self.bin_split_points = None
    self.bin_sparse_points = None
    # computed PSI results and per-feature total scores
    self.psi_rs = None
    self.total_scores = None
    self.all_feature_list = None
    # marker used for missing values in dense input
    self.dense_missing_val = NoneType()
    self.binning_error = consts.DEFAULT_RELATIVE_ERROR
    # per-interval percentage breakdowns and printable interval labels
    self.interval_perc1 = None
    self.interval_perc2 = None
    self.str_intervals = None
    self.binning_obj = None
def map_partition_handle(iterable, feat_num=10, max_bin_num=20, is_sparse=False,
                         missing_val=NoneType()):
    """Count per-feature bin occupancy over one data partition.

    :param iterable: (key, instance) pairs whose features hold bin indices
    :return: (feat_num, max_bin_num) count matrix; the last bin collects
        missing values

    NOTE(review): in the dense branch the sample's feature array is modified in
    place (missing entries overwritten with the missing-bin index) — confirm
    callers do not reuse the raw arrays afterwards.
    """
    count_bin = np.zeros((feat_num, max_bin_num))
    # IDIOM FIX: np.arange replaces the hand-built list comprehension
    row_idx = np.arange(feat_num)
    for k, v in iterable:
        # last bin is for missing value
        if is_sparse:
            feature_dict = v.features.sparse_vec
            # absent sparse keys default to the missing bin (max_bin_num - 1)
            arr = np.zeros(
                feat_num, dtype=np.int64
            ) + max_bin_num - 1  # max_bin_num - 1 is the missing bin val
            arr[list(feature_dict.keys())] = list(feature_dict.values())
        else:
            arr = v.features
            arr[arr == missing_val] = max_bin_num - 1
        count_bin[row_idx, arr.astype(np.int64)] += 1
    return count_bin
def batch_calculate_histogram(kv_iterator, bin_split_points=None, bin_sparse_points=None,
                              valid_features=None, node_map=None, use_missing=False,
                              zero_as_missing=False, with_uuid=False):
    """Build per-node, per-feature, per-bin (grad, hess, count) histograms for
    one data partition.

    Sparse zeros do not appear in the feature vectors, so their aggregate
    contribution is reconstructed afterwards as
    (node total) - (node feature total) and added to either the sparse-point
    bin or the missing bin.
    """
    data_bins = []
    node_ids = []
    grad = []
    hess = []
    data_record = 0  # total instance number of this partition

    # go through iterator to collect g/h feature instances/ node positions
    for _, value in kv_iterator:
        data_bin, nodeid_state = value[0]
        unleaf_state, nodeid = nodeid_state
        # skip samples already in a leaf or assigned to nodes outside this batch
        if unleaf_state == 0 or nodeid not in node_map:
            continue
        g, h = value[1]  # encrypted text in host, plaintext in guest
        data_bins.append(data_bin)  # features
        node_ids.append(nodeid)  # current node position
        grad.append(g)
        hess.append(h)
        data_record += 1

    LOGGER.info("begin batch calculate histogram, data count is {}".format(
        data_record))
    node_num = len(node_map)
    missing_bin = 1 if use_missing else 0

    # if the value of a feature is 0, the corresponding bin index will not appear
    # in the sample sparse vector
    # need to compute correct sparse point g_sum and s_sum by:
    # (node total sum value) - (node feature total sum value) + (non 0 sparse point sum)
    # [0, 0, 0] -> g, h, sample count
    zero_optim = [[[0 for i in range(3)]
                   for j in range(bin_split_points.shape[0])]
                  for k in range(node_num)]
    zero_opt_node_sum = [[0 for i in range(3)] for j in range(node_num)]

    node_histograms = FeatureHistogram.generate_histogram_template(
        node_map, bin_split_points, valid_features, missing_bin)

    for rid in range(data_record):
        nid = node_map.get(node_ids[rid])
        # node total sum value
        zero_opt_node_sum[nid][0] += grad[rid]
        zero_opt_node_sum[nid][1] += hess[rid]
        zero_opt_node_sum[nid][2] += 1
        for fid, value in data_bins[rid].features.get_all_data():
            if valid_features is not None and valid_features[fid] is False:
                continue
            if use_missing and value == NoneType():
                # missing value is set as -1
                value = -1
            node_histograms[nid][fid][value][0] += grad[rid]
            node_histograms[nid][fid][value][1] += hess[rid]
            node_histograms[nid][fid][value][2] += 1
            # node feature total sum value
            zero_optim[nid][fid][0] += grad[rid]
            zero_optim[nid][fid][1] += hess[rid]
            zero_optim[nid][fid][2] += 1

    for nid in range(node_num):
        for fid in range(bin_split_points.shape[0]):
            if valid_features is not None and valid_features[fid] is True:
                if not use_missing or (use_missing and not zero_as_missing):
                    # add 0 g/h sum to sparse point
                    sparse_point = bin_sparse_points[fid]
                    node_histograms[nid][fid][sparse_point][
                        0] += zero_opt_node_sum[nid][0] - zero_optim[nid][
                            fid][0]
                    node_histograms[nid][fid][sparse_point][
                        1] += zero_opt_node_sum[nid][1] - zero_optim[nid][
                            fid][1]
                    node_histograms[nid][fid][sparse_point][
                        2] += zero_opt_node_sum[nid][2] - zero_optim[nid][
                            fid][2]
                else:
                    # if 0 is regarded as missing value, add to missing bin
                    node_histograms[nid][fid][-1][0] += zero_opt_node_sum[
                        nid][0] - zero_optim[nid][fid][0]
                    node_histograms[nid][fid][-1][1] += zero_opt_node_sum[
                        nid][1] - zero_optim[nid][fid][1]
                    node_histograms[nid][fid][-1][2] += zero_opt_node_sum[
                        nid][2] - zero_optim[nid][fid][2]

    ret = FeatureHistogram.generate_histogram_key_value_list(
        node_histograms, node_map, bin_split_points, with_uuid)
    return ret
def _batch_calculate_histogram(kv_iterator, bin_split_points=None, bin_sparse_points=None,
                               valid_features=None, node_map=None, use_missing=False,
                               zero_as_missing=False, parent_nid_map=None,
                               sibling_node_id_map=None, stable_reduce=False, mo_dim=None):
    """Build per-node/feature/bin (grad, hess, count) histograms for one partition.

    Variant of the batch histogram routine that supports stable reduce (keyed by
    the partition's first data key), multi-output gradients (``mo_dim``) and
    parent/sibling node maps used for histogram subtraction downstream. The
    per-feature totals (``zero_optim``) are derived from the finished histograms
    rather than accumulated per sample.
    """
    data_bins = []
    node_ids = []
    grad = []
    hess = []
    data_record = 0  # total instance number of this partition
    partition_key = None  # this var is for stable reduce

    # go through iterator to collect g/h feature instances/ node positions
    for data_id, value in kv_iterator:
        if partition_key is None and stable_reduce:  # first key of data is used as partition key
            partition_key = data_id
        data_bin, nodeid_state = value[0]
        unleaf_state, nodeid = nodeid_state
        # skip samples already in a leaf or assigned outside this batch
        if unleaf_state == 0 or nodeid not in node_map:
            continue
        g, h = value[1]  # encrypted text in host, plaintext in guest
        data_bins.append(data_bin)  # features
        node_ids.append(nodeid)  # current node position
        grad.append(g)
        hess.append(h)
        data_record += 1

    LOGGER.debug(
        "begin batch calculate histogram, data count is {}".format(
            data_record))
    node_num = len(node_map)
    missing_bin = 1 if use_missing else 0

    # if the value of a feature is 0, the corresponding bin index will not appear
    # in the sample sparse vector
    # need to compute correct sparse point g_sum and s_sum by:
    # (node total sum value) - (node feature total sum value) + (non 0 sparse point sum)
    # [0, 0, 0] -> g, h, sample count
    zero_optim = [[[0 for i in range(3)]
                   for j in range(bin_split_points.shape[0])]
                  for k in range(node_num)]
    zero_opt_node_sum = [[0 for i in range(3)] for j in range(node_num)]

    node_histograms = FeatureHistogram._generate_histogram_template(
        node_map, bin_split_points, valid_features, missing_bin,
        mo_dim=mo_dim)

    for rid in range(data_record):
        # node index is the position in the histogram list of a certain node
        node_idx = node_map.get(node_ids[rid])
        # node total sum value
        zero_opt_node_sum[node_idx][0] += grad[rid]
        zero_opt_node_sum[node_idx][1] += hess[rid]
        zero_opt_node_sum[node_idx][2] += 1
        for fid, value in data_bins[rid].features.get_all_data():
            if valid_features is not None and valid_features[fid] is False:
                continue
            if use_missing and value == NoneType():
                # missing value is set as -1
                value = -1
            node_histograms[node_idx][fid][value][0] += grad[rid]
            node_histograms[node_idx][fid][value][1] += hess[rid]
            node_histograms[node_idx][fid][value][2] += 1

    for nid in range(node_num):
        # cal feature level g_h incrementally
        for fid in range(bin_split_points.shape[0]):
            if valid_features is not None and valid_features[fid] is False:
                continue
            for bin_index in range(len(node_histograms[nid][fid])):
                zero_optim[nid][fid][0] += node_histograms[nid][fid][
                    bin_index][0]
                zero_optim[nid][fid][1] += node_histograms[nid][fid][
                    bin_index][1]
                zero_optim[nid][fid][2] += node_histograms[nid][fid][
                    bin_index][2]

    for node_idx in range(node_num):
        for fid in range(bin_split_points.shape[0]):
            if valid_features is not None and valid_features[fid] is True:
                if not use_missing or (use_missing and not zero_as_missing):
                    # add 0 g/h sum to sparse point
                    sparse_point = bin_sparse_points[fid]
                    node_histograms[node_idx][fid][sparse_point][0] += zero_opt_node_sum[node_idx][0] - \
                        zero_optim[node_idx][fid][0]
                    node_histograms[node_idx][fid][sparse_point][1] += zero_opt_node_sum[node_idx][1] - \
                        zero_optim[node_idx][fid][1]
                    node_histograms[node_idx][fid][sparse_point][2] += zero_opt_node_sum[node_idx][2] - \
                        zero_optim[node_idx][fid][2]
                else:
                    # if 0 is regarded as missing value, add to missing bin
                    node_histograms[node_idx][fid][-1][0] += zero_opt_node_sum[node_idx][0] - \
                        zero_optim[node_idx][fid][0]
                    node_histograms[node_idx][fid][-1][1] += zero_opt_node_sum[node_idx][1] - \
                        zero_optim[node_idx][fid][1]
                    node_histograms[node_idx][fid][-1][2] += zero_opt_node_sum[node_idx][2] - \
                        zero_optim[node_idx][fid][2]

    ret = FeatureHistogram._generate_histogram_key_value_list(
        node_histograms, node_map, bin_split_points, parent_nid_map,
        sibling_node_id_map, partition_key=partition_key)
    return ret
def _init_model(self, model: PSIParam):
    """Load PSI hyper-parameters from the parsed parameter object onto self."""
    self.max_bin_num = model.max_bin_num
    self.need_run = model.need_run
    # a None dense missing value falls back to the NoneType marker
    if model.dense_missing_val is None:
        self.dense_missing_val = NoneType()
    else:
        self.dense_missing_val = model.dense_missing_val
    self.binning_error = model.binning_error
def traverse_tree(predict_state, data_inst, tree_=None, decoder=None, sitename=consts.GUEST,
                  split_maskdict=None, use_missing=None, zero_as_missing=None,
                  missing_dir_maskdict=None, encrypted_weight_dict=None,
                  encrypted_zero_dict=None):
    """Traverse the tree, emitting an encrypted weight per leaf.

    Every leaf starts with its encrypted zero; the leaf actually reached (or any
    leaf reachable through splits owned by another site) is overwritten with its
    encrypted weight. Splits owned by other sites cannot be evaluated locally,
    so BOTH children are explored in that case.

    :return: dict mapping leaf node id -> encrypted value
    """
    nid, tag = predict_state
    weight_dict = {}
    # initialise every leaf with Encrypt(0)
    for i in range(len(tree_)):
        if tree_[i].is_leaf is True:
            weight_dict[i] = encrypted_zero_dict[i]
    node_queue = [nid]
    while len(node_queue) != 0:
        nid = node_queue[0]
        node_queue.remove(nid)
        # reached leaf: record its encrypted weight
        if tree_[nid].is_leaf is True:
            weight_dict[nid] = encrypted_weight_dict[nid]
        else:
            if tree_[nid].sitename == sitename:
                # split owned locally: unmask and evaluate it
                fid = decoder("feature_idx", tree_[nid].fid,
                              split_maskdict=split_maskdict)
                bid = decoder("feature_val", tree_[nid].bid, nid,
                              split_maskdict=split_maskdict)
                if use_missing:
                    missing_dir = decoder(
                        "missing_dir", 1, nid,
                        missing_dir_maskdict=missing_dir_maskdict)
                else:
                    missing_dir = 1
                if use_missing and zero_as_missing:
                    # NOTE(review): decoded again here — redundant with the block above
                    missing_dir = decoder(
                        "missing_dir", 1, nid,
                        missing_dir_maskdict=missing_dir_maskdict)
                    # marker value or absent sparse entry both count as missing
                    if data_inst.features.get_data(fid) == NoneType(
                    ) or data_inst.features.get_data(fid, None) is None:
                        if missing_dir == 1:
                            nid = tree_[nid].right_nodeid
                        else:
                            nid = tree_[nid].left_nodeid
                    elif data_inst.features.get_data(fid) <= bid:
                        nid = tree_[nid].left_nodeid
                    else:
                        nid = tree_[nid].right_nodeid
                elif data_inst.features.get_data(fid) == NoneType():
                    if missing_dir == 1:
                        nid = tree_[nid].right_nodeid
                    else:
                        nid = tree_[nid].left_nodeid
                elif data_inst.features.get_data(fid, 0) <= bid:
                    nid = tree_[nid].left_nodeid
                else:
                    nid = tree_[nid].right_nodeid
                node_queue.append(nid)
            else:
                # split owned by another site: cannot decide locally, explore both children
                node_queue.append(tree_[nid].left_nodeid)
                node_queue.append(tree_[nid].right_nodeid)
    return weight_dict
def fit(self, expect_table, actual_table):
    """Compute PSI between a baseline (expect) table and a new (actual) table.

    Bins are fitted on the baseline only, then both tables are binned with the
    same split points, bin occupancies are counted per partition and reduced,
    and PSI scores plus per-interval percentage breakdowns are stored on self.

    :raises ValueError: when headers differ or table contents are not Instances
    """
    LOGGER.info('start psi computing')
    header1 = expect_table.schema['header']
    header2 = actual_table.schema['header']
    if not set(header1) == set(header2):
        raise ValueError(
            'table header must be the same while computing psi values')

    # baseline table should not contain empty columns
    abnormal_detection.empty_column_detection(expect_table)

    self.all_feature_list = header1

    # make sure no duplicate features
    self.all_feature_list = self.check_duplicates(self.all_feature_list)

    # kv bi-directional mapping
    self.tag_id_mapping = {
        v: k for k, v in enumerate(self.all_feature_list)
    }
    self.id_tag_mapping = {
        k: v for k, v in enumerate(self.all_feature_list)
    }

    if not self.is_sparse(
            expect_table):  # convert missing value: nan to NoneType
        expect_table = self.convert_missing_val(expect_table)

    if not self.is_sparse(
            actual_table):  # convert missing value: nan to NoneType
        actual_table = self.convert_missing_val(actual_table)

    if not (self.check_table_content(expect_table) and
            self.check_table_content(actual_table)):
        raise ValueError(
            'contents of input table must be instances of class "Instance"'
        )

    # fit quantile split points on the baseline table only
    param = FeatureBinningParam(method=consts.QUANTILE,
                                bin_num=self.max_bin_num,
                                local_only=True,
                                error=self.binning_error)
    binning_obj = QuantileBinning(params=param,
                                  abnormal_list=[NoneType()],
                                  allow_duplicate=False)
    binning_obj.fit_split_points(expect_table)

    data_bin, bin_split_points, bin_sparse_points = binning_obj.convert_feature_to_bin(
        expect_table)
    LOGGER.debug('bin split points is {}, shape is {}'.format(
        bin_split_points, bin_split_points.shape))
    self.binning_obj = binning_obj
    self.data_bin1 = data_bin
    self.bin_split_points = bin_split_points
    self.bin_sparse_points = bin_sparse_points
    LOGGER.debug('expect table binning done')

    count_func1 = functools.partial(
        map_partition_handle,
        feat_num=len(self.all_feature_list),
        max_bin_num=self.max_bin_num + 1,  # an additional bin for missing value
        missing_val=self.dense_missing_val,
        is_sparse=self.is_sparse(self.data_bin1))

    map_rs1 = self.data_bin1.applyPartitions(count_func1)
    count1 = count_rs_to_dict(map_rs1.reduce(map_partition_reduce))

    # bin the actual table with the SAME split points (fitted on the baseline)
    data_bin2, bin_split_points2, bin_sparse_points2 = binning_obj.convert_feature_to_bin(
        actual_table)
    self.data_bin2 = data_bin2
    LOGGER.debug('actual table binning done')

    count_func2 = functools.partial(
        map_partition_handle,
        feat_num=len(self.all_feature_list),
        max_bin_num=self.max_bin_num + 1,  # an additional bin for missing value
        missing_val=self.dense_missing_val,
        is_sparse=self.is_sparse(self.data_bin2))

    map_rs2 = self.data_bin2.applyPartitions(count_func2)
    count2 = count_rs_to_dict(map_rs2.reduce(map_partition_reduce))

    self.count1, self.count2 = count1, count2
    LOGGER.info('psi counting done')

    # compute psi from counting result
    psi_result = psi_computer(count1, count2, expect_table.count(),
                              actual_table.count())
    self.psi_rs = psi_result

    # get total psi score of features
    total_scores = {}
    for idx, rs in enumerate(self.psi_rs):
        feat_name = self.id_tag_mapping[idx]
        total_scores[feat_name] = rs['total_psi']
    self.total_scores = total_scores

    # id-feature mapping convert, str interval computation
    self.str_intervals = self.get_string_interval(
        bin_split_points, self.id_tag_mapping,
        missing_bin_idx=self.max_bin_num)

    self.interval_perc1 = self.count_dict_to_percentage(
        copy.deepcopy(count1), expect_table.count())
    self.interval_perc2 = self.count_dict_to_percentage(
        copy.deepcopy(count2), actual_table.count())

    self.set_summary(self.generate_summary())
    LOGGER.info('psi computation done')