def _load_model(self, model_dict):
    model_param = list(model_dict.get('model').values())[0].get(MODEL_PARAM_NAME)
    model_meta = list(model_dict.get('model').values())[0].get(MODEL_META_NAME)

    self.cols = list(map(int, model_meta.cols))

    bin_method = str(model_meta.method)
    if bin_method == consts.QUANTILE:
        self.binning_obj = QuantileBinning(model_meta, self.party_name)
    else:
        self.binning_obj = BucketBinning(model_meta, self.party_name)

    binning_result_obj = dict(model_param.binning_result.binning_result)
    host_params = dict(model_param.host_results)

    self.binning_result = {}
    self.host_results = {}
    for col_name, iv_attr_obj in binning_result_obj.items():
        iv_attr = IVAttributes([], [], [], [], [], [])
        iv_attr.reconstruct(iv_attr_obj)
        self.binning_obj.reconstruct_by_iv_obj(col_name, iv_attr)
        self.binning_result[col_name] = iv_attr

    for host_name, host_result_pb in host_params.items():
        host_result = dict(host_result_pb.binning_result)
        for col_name, iv_attr_obj in host_result.items():
            iv_attr = IVAttributes([], [], [], [], [], [])
            iv_attr.reconstruct(iv_attr_obj)
            host_result[col_name] = iv_attr
        self.host_results[host_name] = host_result

def _init_model(self, params: FeatureBinningParam):
    self.model_param = params
    self.transform_type = self.model_param.transform_param.transform_type

    if self.role == consts.HOST and self.transform_type == "woe":
        raise ValueError("Host party does not support woe transform for now.")

    if self.model_param.method == consts.QUANTILE:
        self.binning_obj = QuantileBinning(self.model_param)
    elif self.model_param.method == consts.BUCKET:
        self.binning_obj = BucketBinning(self.model_param)
    elif self.model_param.method == consts.OPTIMAL:
        if self.role == consts.HOST:
            self.model_param.bin_num = self.model_param.optimal_binning_param.init_bin_nums
            self.binning_obj = QuantileBinning(self.model_param)
        else:
            self.binning_obj = OptimalBinning(self.model_param)
    else:
        raise ValueError("Binning method: {} is not supported yet".format(self.model_param.method))

    self.iv_calculator = IvCalculator(self.model_param.adjustment_factor,
                                      role=self.role,
                                      party_id=self.component_properties.local_partyid)

def _init_model(self, params: FeatureBinningParam):
    self.model_param = params
    self.transform_type = self.model_param.transform_param.transform_type

    if self.role == consts.HOST and self.transform_type == "woe":
        raise ValueError("Host party does not support woe transform for now.")

    if self.model_param.method == consts.QUANTILE:
        self.binning_obj = QuantileBinning(self.model_param)
    elif self.model_param.method == consts.BUCKET:
        self.binning_obj = BucketBinning(self.model_param)
    elif self.model_param.method == consts.OPTIMAL:
        if self.role == consts.HOST:
            self.model_param.bin_num = self.model_param.optimal_binning_param.init_bin_nums
            self.binning_obj = QuantileBinning(self.model_param)
        else:
            self.binning_obj = OptimalBinning(self.model_param)
    else:
        raise ValueError("Binning method: {} is not supported yet".format(self.model_param.method))

    LOGGER.debug("in _init_model, role: {}, local_partyid: {}".format(
        self.role, self.component_properties.local_partyid))
    self.binning_obj.set_role_party(self.role, self.component_properties.local_partyid)

def load_model(self, model_dict):
    model_param = list(model_dict.get('model').values())[0].get(MODEL_PARAM_NAME)
    model_meta = list(model_dict.get('model').values())[0].get(MODEL_META_NAME)

    self.bin_inner_param = BinInnerParam()
    assert isinstance(model_meta, feature_binning_meta_pb2.FeatureBinningMeta)
    assert isinstance(model_param, feature_binning_param_pb2.FeatureBinningParam)

    self.header = list(model_param.header)
    self.bin_inner_param.set_header(self.header)
    self.bin_inner_param.add_transform_bin_indexes(list(model_meta.transform_param.transform_cols))
    self.bin_inner_param.add_bin_names(list(model_meta.cols))
    self.transform_type = model_meta.transform_param.transform_type

    bin_method = str(model_meta.method)
    if bin_method == consts.QUANTILE:
        self.binning_obj = QuantileBinning(params=model_meta)
    else:
        self.binning_obj = BucketBinning(params=model_meta)

    self.binning_obj.set_role_party(self.role, self.component_properties.local_partyid)
    self.binning_obj.set_bin_inner_param(self.bin_inner_param)
    self.binning_obj.bin_results.reconstruct(model_param.binning_result)

    self.host_results = []
    for host_pb in model_param.host_results:
        host_bin_obj = BaseBinning()
        host_bin_obj.bin_results.reconstruct(host_pb)
        self.host_results.append(host_bin_obj)

def _init_model(self, params):
    self.model_param = params
    self.cols_index = params.cols
    if self.model_param.method == consts.QUANTILE:
        self.binning_obj = QuantileBinning(self.model_param, self.party_name)
    elif self.model_param.method == consts.BUCKET:
        self.binning_obj = BucketBinning(self.model_param, self.party_name)
    else:
        raise ValueError("Binning method: {} is not supported yet".format(self.model_param.method))

def test_bucket_binning(self):
    bin_param = FeatureBinningParam(bin_num=self.bin_num, cols=self.cols)
    bucket_bin = BucketBinning(bin_param)
    split_points = bucket_bin.fit_split_points(self.table)
    split_point = list(split_points.values())[0]
    for kth, s_p in enumerate(split_point):
        expect_s_p = (self.data_num - 1) / self.bin_num * (kth + 1)
        self.assertEqual(s_p, expect_s_p)
    iv_attrs = bucket_bin.cal_local_iv(self.table)
    for col_name, iv_attr in iv_attrs.items():
        print('col_name: {}, iv: {}, woe_array: {}'.format(
            col_name, iv_attr.iv, iv_attr.woe_array))

def test_bucket_binning(self):
    bin_param = FeatureBinningParam(bin_num=self.bin_num, bin_indexes=self.cols)
    bucket_bin = BucketBinning(bin_param)
    split_points = bucket_bin.fit_split_points(self.table)
    split_point = list(split_points.values())[0]
    for kth, s_p in enumerate(split_point):
        expect_s_p = (self.data_num - 1) / self.bin_num * (kth + 1)
        self.assertEqual(s_p, expect_s_p)
    bucket_bin.cal_local_iv(self.table)
    for col_name, iv_attr in bucket_bin.bin_results.all_cols_results.items():
        self.assertLess(abs(iv_attr.iv - 0.00364386529386804), 1e-6)

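# The expected-split-point arithmetic in the two tests above, shown standalone.
# A minimal sketch assuming the test table holds one feature whose values run
# uniformly from 0 to data_num - 1; data_num and bin_num below are illustrative,
# the real values come from the test's setUp.
data_num, bin_num = 1000, 10
expected_split_points = [(data_num - 1) / bin_num * (k + 1) for k in range(bin_num)]
print(expected_split_points[:3])  # approximately [99.9, 199.8, 299.7]
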
def _init_model(self, params: FeatureBinningParam):
    self.model_param = params
    self.transform_type = self.model_param.transform_param.transform_type
    if self.model_param.method == consts.QUANTILE:
        self.binning_obj = QuantileBinning(self.model_param)
    elif self.model_param.method == consts.BUCKET:
        self.binning_obj = BucketBinning(self.model_param)
    else:
        raise ValueError("Binning method: {} is not supported yet".format(self.model_param.method))
    LOGGER.debug("in _init_model, role: {}, local_partyid: {}".format(
        self.role, self.component_properties.local_partyid))
    self.binning_obj.set_role_party(self.role, self.component_properties.local_partyid)

def _init_binning_obj(self):
    if self.bin_param.method == consts.QUANTILE:
        self.binning_obj = QuantileBinning(self.bin_param, self.party_name)
    elif self.bin_param.method == consts.BUCKET:
        self.binning_obj = BucketBinning(self.bin_param, self.party_name)
    else:
        raise ValueError("Binning method: {} is not supported yet".format(self.bin_param.method))

def init_bucket(self, data_instances):
    header = data_overview.get_header(data_instances)
    self._default_setting(header)

    init_bucket_param = copy.deepcopy(self.params)
    init_bucket_param.bin_num = self.optimal_param.init_bin_nums
    if self.optimal_param.init_bucket_method == consts.QUANTILE:
        init_binning_obj = QuantileBinningTool(param_obj=init_bucket_param, allow_duplicate=False)
    else:
        init_binning_obj = BucketBinning(params=init_bucket_param)
    init_binning_obj.set_bin_inner_param(self.bin_inner_param)
    init_split_points = init_binning_obj.fit_split_points(data_instances)
    is_sparse = data_overview.is_sparse_data(data_instances)

    bucket_dict = dict()
    for col_name, sps in init_split_points.items():
        bucket_list = []
        for idx, sp in enumerate(sps):
            bucket = bucket_info.Bucket(idx, self.adjustment_factor, right_bound=sp)
            if idx == 0:
                bucket.left_bound = -math.inf
                bucket.set_left_neighbor(None)
            else:
                bucket.left_bound = sps[idx - 1]
            bucket.event_total = self.event_total
            bucket.non_event_total = self.non_event_total
            bucket_list.append(bucket)
        bucket_list[-1].set_right_neighbor(None)
        bucket_dict[col_name] = bucket_list

    convert_func = functools.partial(self.convert_data_to_bucket,
                                     split_points=init_split_points,
                                     headers=self.header,
                                     bucket_dict=copy.deepcopy(bucket_dict),
                                     is_sparse=is_sparse,
                                     get_bin_num_func=self.get_bin_num)
    bucket_table = data_instances.mapReducePartitions(convert_func, self.merge_bucket_list)
    return bucket_table

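# A minimal sketch of the bucket-chain initialization used in init_bucket above,
# with a stand-in Bucket class (hypothetical; the real bucket_info.Bucket also
# carries event counts and the adjustment factor). The first bucket's left bound
# is -inf, every later bucket reuses the previous split point as its left bound,
# and only the two end buckets get None neighbors.
import math


class Bucket:
    def __init__(self, idx, right_bound):
        self.idx = idx
        self.left_bound = None
        self.right_bound = right_bound
        self.left_neighbor = idx - 1   # replaced with None for the first bucket
        self.right_neighbor = idx + 1  # replaced with None for the last bucket


split_points = [1.5, 3.0, 7.2]  # illustrative split points for one feature
bucket_list = []
for idx, sp in enumerate(split_points):
    bucket = Bucket(idx, sp)
    if idx == 0:
        bucket.left_bound = -math.inf
        bucket.left_neighbor = None
    else:
        bucket.left_bound = split_points[idx - 1]
    bucket_list.append(bucket)
bucket_list[-1].right_neighbor = None

print([(b.left_bound, b.right_bound) for b in bucket_list])
# [(-inf, 1.5), (1.5, 3.0), (3.0, 7.2)]
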
class BaseFeatureBinning(ModelBase):
    """
    Do binning method through guest and host
    """

    def __init__(self):
        super(BaseFeatureBinning, self).__init__()
        self.transfer_variable = HeteroFeatureBinningTransferVariable()
        self.binning_obj: BaseBinning = None
        self.header = None
        self.header_anonymous = None
        self.schema = None
        self.host_results = []
        self.transform_type = None

        self.model_param = FeatureBinningParam()
        self.bin_inner_param = BinInnerParam()

    def _init_model(self, params: FeatureBinningParam):
        self.model_param = params
        self.transform_type = self.model_param.transform_param.transform_type

        if self.model_param.method == consts.QUANTILE:
            self.binning_obj = QuantileBinning(self.model_param)
        elif self.model_param.method == consts.BUCKET:
            self.binning_obj = BucketBinning(self.model_param)
        elif self.model_param.method == consts.OPTIMAL:
            if self.role == consts.HOST:
                self.model_param.bin_num = self.model_param.optimal_binning_param.init_bin_nums
                self.binning_obj = QuantileBinning(self.model_param)
            else:
                self.binning_obj = OptimalBinning(self.model_param)
        else:
            raise ValueError("Binning method: {} is not supported yet".format(self.model_param.method))

        LOGGER.debug("in _init_model, role: {}, local_partyid: {}".format(
            self.role, self.component_properties.local_partyid))
        self.binning_obj.set_role_party(self.role, self.component_properties.local_partyid)

    @staticmethod
    def data_format_transform(row):
        """
        Transform an instance into sparse format.
        """
        if type(row.features).__name__ != consts.SPARSE_VECTOR:
            feature_shape = row.features.shape[0]
            indices = []
            data = []
            for i in range(feature_shape):
                if np.isnan(row.features[i]):
                    indices.append(i)
                    data.append(NoneType())
                elif np.abs(row.features[i]) < consts.FLOAT_ZERO:
                    continue
                else:
                    indices.append(i)
                    data.append(row.features[i])
            new_row = copy.deepcopy(row)
            new_row.features = SparseVector(indices, data, feature_shape)
            return new_row

        sparse_vec = row.features.get_sparse_vector()
        replace_key = []
        for key in sparse_vec:
            if sparse_vec.get(key) == NoneType() or np.isnan(sparse_vec.get(key)):
                replace_key.append(key)
        if len(replace_key) == 0:
            return row
        new_row = copy.deepcopy(row)
        new_sparse_vec = new_row.features.get_sparse_vector()
        for key in replace_key:
            new_sparse_vec[key] = NoneType()
        return new_row

    def _setup_bin_inner_param(self, data_instances, params):
        if self.schema is not None:
            return
        self.header = get_header(data_instances)
        LOGGER.debug("_setup_bin_inner_param, get header length: {}".format(len(self.header)))
        self.schema = data_instances.schema
        self.bin_inner_param.set_header(self.header)
        if params.bin_indexes == -1:
            self.bin_inner_param.set_bin_all()
        else:
            self.bin_inner_param.add_bin_indexes(params.bin_indexes)
            self.bin_inner_param.add_bin_names(params.bin_names)
        self.bin_inner_param.add_category_indexes(params.category_indexes)
        self.bin_inner_param.add_category_names(params.category_names)
        if params.transform_param.transform_cols == -1:
            self.bin_inner_param.set_transform_all()
        else:
            self.bin_inner_param.add_transform_bin_indexes(params.transform_param.transform_cols)
            self.bin_inner_param.add_transform_bin_names(params.transform_param.transform_names)
        self.binning_obj.set_bin_inner_param(self.bin_inner_param)

    @assert_io_num_rows_equal
    @assert_schema_consistent
    def transform(self, data_instances):
        self._setup_bin_inner_param(data_instances, self.model_param)
        data_instances = self.binning_obj.transform(data_instances, self.transform_type)
        self.set_schema(data_instances)
        self.data_output = data_instances
        return data_instances

    def _get_meta(self):
        transform_param = feature_binning_meta_pb2.TransformMeta(
            transform_cols=self.bin_inner_param.transform_bin_indexes,
            transform_type=self.model_param.transform_param.transform_type)
        meta_protobuf_obj = feature_binning_meta_pb2.FeatureBinningMeta(
            method=self.model_param.method,
            compress_thres=self.model_param.compress_thres,
            head_size=self.model_param.head_size,
            error=self.model_param.error,
            bin_num=self.model_param.bin_num,
            cols=self.bin_inner_param.bin_names,
            adjustment_factor=self.model_param.adjustment_factor,
            local_only=self.model_param.local_only,
            need_run=self.need_run,
            transform_param=transform_param,
            skip_static=self.model_param.skip_static)
        return meta_protobuf_obj

    def _get_param(self):
        binning_result_obj = self.binning_obj.bin_results.generated_pb()
        host_results = [x.bin_results.generated_pb() for x in self.host_results]
        result_obj = feature_binning_param_pb2.FeatureBinningParam(
            binning_result=binning_result_obj,
            host_results=host_results,
            header=self.header,
            header_anonymous=self.header_anonymous,
            model_name=consts.BINNING_MODEL)
        return result_obj

    def load_model(self, model_dict):
        model_param = list(model_dict.get('model').values())[0].get(MODEL_PARAM_NAME)
        model_meta = list(model_dict.get('model').values())[0].get(MODEL_META_NAME)

        self.bin_inner_param = BinInnerParam()
        assert isinstance(model_meta, feature_binning_meta_pb2.FeatureBinningMeta)
        assert isinstance(model_param, feature_binning_param_pb2.FeatureBinningParam)

        self.header = list(model_param.header)
        self.bin_inner_param.set_header(self.header)
        self.bin_inner_param.add_transform_bin_indexes(list(model_meta.transform_param.transform_cols))
        self.bin_inner_param.add_bin_names(list(model_meta.cols))
        self.transform_type = model_meta.transform_param.transform_type

        bin_method = str(model_meta.method)
        if bin_method == consts.QUANTILE:
            self.binning_obj = QuantileBinning(params=model_meta)
        else:
            self.binning_obj = BucketBinning(params=model_meta)

        self.binning_obj.set_role_party(self.role, self.component_properties.local_partyid)
        self.binning_obj.set_bin_inner_param(self.bin_inner_param)
        self.binning_obj.bin_results.reconstruct(model_param.binning_result)

        self.host_results = []
        for host_pb in model_param.host_results:
            host_bin_obj = BaseBinning()
            host_bin_obj.bin_results.reconstruct(host_pb)
            self.host_results.append(host_bin_obj)

    def export_model(self):
        if self.model_output is not None:
            return self.model_output
        meta_obj = self._get_meta()
        param_obj = self._get_param()
        result = {MODEL_META_NAME: meta_obj, MODEL_PARAM_NAME: param_obj}
        self.model_output = result
        return result

    def save_data(self):
        return self.data_output

    def set_schema(self, data_instance):
        self.schema['header'] = self.header
        data_instance.schema = self.schema
        LOGGER.debug("After Binning, when setting schema, schema is : {}".format(data_instance.schema))

    def _abnormal_detection(self, data_instances):
        """
        Make sure input data_instances is valid.
        """
        abnormal_detection.empty_table_detection(data_instances)
        abnormal_detection.empty_feature_detection(data_instances)
        self.check_schema_content(data_instances.schema)

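# A minimal sketch of the dense branch of data_format_transform above, using
# plain Python types (the real code wraps the result in federatedml's
# SparseVector and uses a NoneType() sentinel; FLOAT_ZERO here is an assumed
# stand-in for consts.FLOAT_ZERO).
import numpy as np

FLOAT_ZERO = 1e-8


def to_sparse(features):
    indices, data = [], []
    for i, value in enumerate(features):
        if np.isnan(value):
            indices.append(i)
            data.append(None)  # NaN is kept as an explicit missing-value marker
        elif abs(value) < FLOAT_ZERO:
            continue           # near-zero values are dropped from the sparse form
        else:
            indices.append(i)
            data.append(value)
    return indices, data, len(features)


print(to_sparse(np.array([0.0, 1.5, np.nan])))  # ([1, 2], [1.5, None], 3)
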
class BaseHeteroFeatureBinning(ModelBase):
    """
    Do binning method through guest and host

    Attributes
    ----------
    header : list
        Record headers of input table.

    has_synchronized : bool
        Record whether the encryption information has been synchronized or not.

    flowid : str
        Used in cross validation.

    binning_result : dict
        Record binning result of guest party. The format is {'col_name': 'iv_attr', ... }

    host_results : dict
        Used to record host results. For a future version which may record multiple
        host results, the format is a dict of dict,
        e.g. host_results = {'host1': {'x1': iv1, 'x2': iv2}, 'host2': ...}
    """

    def __init__(self):
        super(BaseHeteroFeatureBinning, self).__init__()
        self.transfer_variable = HeteroFeatureBinningTransferVariable()
        self.cols = None
        self.cols_dict = {}
        self.binning_obj = None
        self.header = []
        self.schema = {}
        self.has_synchronized = False
        self.flowid = ''
        self.binning_result = {}  # dict of iv_attr
        self.host_results = {}  # dict of host results
        self.party_name = 'Base'
        self.model_param = FeatureBinningParam()

    def _init_model(self, params):
        self.model_param = params
        self.cols_index = params.cols
        if self.model_param.method == consts.QUANTILE:
            self.binning_obj = QuantileBinning(self.model_param, self.party_name)
        elif self.model_param.method == consts.BUCKET:
            self.binning_obj = BucketBinning(self.model_param, self.party_name)
        else:
            raise ValueError("Binning method: {} is not supported yet".format(self.model_param.method))

    def transform(self, data_instances):
        self._parse_cols(data_instances)
        transform_cols_idx = self.model_param.transform_param.transform_cols
        transform_type = self.model_param.transform_param.transform_type
        data_instances = self.binning_obj.transform(data_instances, transform_cols_idx, transform_type)
        self.set_schema(data_instances)
        self.data_output = data_instances
        return data_instances

    def _get_meta(self):
        col_list = [str(x) for x in self.cols]
        meta_protobuf_obj = feature_binning_meta_pb2.FeatureBinningMeta(
            method=self.model_param.method,
            compress_thres=self.model_param.compress_thres,
            head_size=self.model_param.head_size,
            error=self.model_param.error,
            bin_num=self.model_param.bin_num,
            cols=col_list,
            adjustment_factor=self.model_param.adjustment_factor,
            local_only=self.model_param.local_only,
            need_run=self.need_run)
        return meta_protobuf_obj

    def _get_param(self):
        binning_result = self.binning_result
        host_results = self.host_results

        iv_attrs = {}
        for col_name, iv_attr in binning_result.items():
            iv_result = iv_attr.result_dict()
            iv_object = feature_binning_param_pb2.IVParam(**iv_result)
            iv_attrs[col_name] = iv_object
        binning_result_obj = feature_binning_param_pb2.FeatureBinningResult(binning_result=iv_attrs)

        final_host_results = {}
        for host_id, this_host_results in host_results.items():
            host_result = {}
            for host_col_idx, iv_attr in this_host_results.items():
                iv_result = iv_attr.result_dict()
                iv_object = feature_binning_param_pb2.IVParam(**iv_result)
                host_result[str(host_col_idx)] = iv_object
            final_host_results[host_id] = feature_binning_param_pb2.FeatureBinningResult(
                binning_result=host_result)

        result_obj = feature_binning_param_pb2.FeatureBinningParam(
            binning_result=binning_result_obj, host_results=final_host_results)
        return result_obj

    def _load_model(self, model_dict):
        model_param = list(model_dict.get('model').values())[0].get(MODEL_PARAM_NAME)
        model_meta = list(model_dict.get('model').values())[0].get(MODEL_META_NAME)

        self.cols = list(map(int, model_meta.cols))

        bin_method = str(model_meta.method)
        if bin_method == consts.QUANTILE:
            self.binning_obj = QuantileBinning(model_meta, self.party_name)
        else:
            self.binning_obj = BucketBinning(model_meta, self.party_name)

        binning_result_obj = dict(model_param.binning_result.binning_result)
        host_params = dict(model_param.host_results)

        self.binning_result = {}
        self.host_results = {}
        for col_name, iv_attr_obj in binning_result_obj.items():
            iv_attr = IVAttributes([], [], [], [], [], [])
            iv_attr.reconstruct(iv_attr_obj)
            self.binning_obj.reconstruct_by_iv_obj(col_name, iv_attr)
            self.binning_result[col_name] = iv_attr

        for host_name, host_result_pb in host_params.items():
            host_result = dict(host_result_pb.binning_result)
            for col_name, iv_attr_obj in host_result.items():
                iv_attr = IVAttributes([], [], [], [], [], [])
                iv_attr.reconstruct(iv_attr_obj)
                host_result[col_name] = iv_attr
            self.host_results[host_name] = host_result

    def export_model(self):
        if self.model_output is not None:
            return self.model_output
        meta_obj = self._get_meta()
        param_obj = self._get_param()
        result = {MODEL_META_NAME: meta_obj, MODEL_PARAM_NAME: param_obj}
        self.model_output = result
        return result

    def save_data(self):
        return self.data_output

    def _parse_cols(self, data_instances):
        if self.header is not None and len(self.header) != 0:
            return

        LOGGER.debug("Before Binning, schema is : {}".format(data_instances.schema))
        header = get_header(data_instances)
        self.schema = data_instances.schema
        self.header = header
        if self.cols_index == -1:
            if header is None:
                raise RuntimeError('Cannot get feature header, please check input data')
            self.cols = [i for i in range(len(header))]
        else:
            self.cols = self.cols_index

        self.cols_dict = {}
        for col in self.cols:
            col_name = header[col]
            self.cols_dict[col_name] = col

    def set_schema(self, data_instance):
        self.schema['header'] = self.header
        data_instance.schema = self.schema
        LOGGER.debug("After Binning, when setting schema, schema is : {}".format(data_instance.schema))

    def _abnormal_detection(self, data_instances):
        """
        Make sure input data_instances is valid.
        """
        abnormal_detection.empty_table_detection(data_instances)
        abnormal_detection.empty_feature_detection(data_instances)

class BaseFeatureBinning(ModelBase):
    """
    Do binning method through guest and host
    """

    def __init__(self):
        super(BaseFeatureBinning, self).__init__()
        self.transfer_variable = HeteroFeatureBinningTransferVariable()
        self.binning_obj: BaseBinning = None
        self.header = None
        self.header_anonymous = None
        self.schema = None
        self.host_results = []
        self.transform_type = None

        self.model_param = FeatureBinningParam()
        self.bin_inner_param = BinInnerParam()
        self.bin_result = MultiClassBinResult(labels=[0, 1])
        self.has_missing_value = False
        self.labels = []

    def _init_model(self, params: FeatureBinningParam):
        self.model_param = params
        self.transform_type = self.model_param.transform_param.transform_type

        if self.role == consts.HOST and self.transform_type == "woe":
            raise ValueError("Host party does not support woe transform for now.")

        if self.model_param.method == consts.QUANTILE:
            self.binning_obj = QuantileBinning(self.model_param)
        elif self.model_param.method == consts.BUCKET:
            self.binning_obj = BucketBinning(self.model_param)
        elif self.model_param.method == consts.OPTIMAL:
            if self.role == consts.HOST:
                self.model_param.bin_num = self.model_param.optimal_binning_param.init_bin_nums
                self.binning_obj = QuantileBinning(self.model_param)
            else:
                self.binning_obj = OptimalBinning(self.model_param)
        else:
            raise ValueError("Binning method: {} is not supported yet".format(self.model_param.method))

        self.iv_calculator = IvCalculator(self.model_param.adjustment_factor,
                                          role=self.role,
                                          party_id=self.component_properties.local_partyid)

    @staticmethod
    def data_format_transform(row):
        """
        Transform an instance into sparse format.
        """
        if type(row.features).__name__ != consts.SPARSE_VECTOR:
            feature_shape = row.features.shape[0]
            indices = []
            data = []
            for i in range(feature_shape):
                if np.isnan(row.features[i]):
                    indices.append(i)
                    data.append(NoneType())
                elif np.abs(row.features[i]) < consts.FLOAT_ZERO:
                    continue
                else:
                    indices.append(i)
                    data.append(row.features[i])
            new_row = copy.deepcopy(row)
            new_row.features = SparseVector(indices, data, feature_shape)
            return new_row

        sparse_vec = row.features.get_sparse_vector()
        replace_key = []
        for key in sparse_vec:
            if sparse_vec.get(key) == NoneType() or np.isnan(sparse_vec.get(key)):
                replace_key.append(key)
        if len(replace_key) == 0:
            return row
        new_row = copy.deepcopy(row)
        new_sparse_vec = new_row.features.get_sparse_vector()
        for key in replace_key:
            new_sparse_vec[key] = NoneType()
        return new_row

    def _setup_bin_inner_param(self, data_instances, params):
        if self.schema is not None:
            return
        self.header = get_header(data_instances)
        LOGGER.debug("_setup_bin_inner_param, get header length: {}".format(len(self.header)))
        self.schema = data_instances.schema
        self.bin_inner_param.set_header(self.header)
        if params.bin_indexes == -1:
            self.bin_inner_param.set_bin_all()
        else:
            self.bin_inner_param.add_bin_indexes(params.bin_indexes)
            self.bin_inner_param.add_bin_names(params.bin_names)
        self.bin_inner_param.add_category_indexes(params.category_indexes)
        self.bin_inner_param.add_category_names(params.category_names)
        if params.transform_param.transform_cols == -1:
            self.bin_inner_param.set_transform_all()
        else:
            self.bin_inner_param.add_transform_bin_indexes(params.transform_param.transform_cols)
            self.bin_inner_param.add_transform_bin_names(params.transform_param.transform_names)
        self.binning_obj.set_bin_inner_param(self.bin_inner_param)

    @assert_io_num_rows_equal
    @assert_schema_consistent
    def transform(self, data_instances):
        self._setup_bin_inner_param(data_instances, self.model_param)
        if self.transform_type != "woe":
            data_instances = self.binning_obj.transform(data_instances, self.transform_type)
        elif self.role == consts.HOST:
            raise ValueError("Woe transform is not available for host parties.")
        else:
            data_instances = self.iv_calculator.woe_transformer(
                data_instances, self.bin_inner_param, self.bin_result)
        self.set_schema(data_instances)
        self.data_output = data_instances
        return data_instances

    def _get_meta(self):
        transform_param = feature_binning_meta_pb2.TransformMeta(
            transform_cols=self.bin_inner_param.transform_bin_indexes,
            transform_type=self.model_param.transform_param.transform_type)
        meta_protobuf_obj = feature_binning_meta_pb2.FeatureBinningMeta(
            method=self.model_param.method,
            compress_thres=self.model_param.compress_thres,
            head_size=self.model_param.head_size,
            error=self.model_param.error,
            bin_num=self.model_param.bin_num,
            cols=self.bin_inner_param.bin_names,
            adjustment_factor=self.model_param.adjustment_factor,
            local_only=self.model_param.local_only,
            need_run=self.need_run,
            transform_param=transform_param,
            skip_static=self.model_param.skip_static)
        return meta_protobuf_obj

    def _get_param(self):
        split_points_result = self.binning_obj.bin_results.split_results
        multi_class_result = self.bin_result.generated_pb_list(split_points_result)

        host_multi_class_result = []
        host_single_results = []
        for host_res in self.host_results:
            host_multi_class_result.extend(host_res.generated_pb_list())
            host_single_results.append(host_res.bin_results[0].generated_pb())

        has_host_result = len(host_multi_class_result) > 0
        multi_pb = feature_binning_param_pb2.MultiClassResult(
            results=multi_class_result,
            labels=[str(x) for x in self.labels],
            host_results=host_multi_class_result,
            host_party_ids=[str(x) for x in self.component_properties.host_party_idlist],
            has_host_result=has_host_result)
        result_obj = feature_binning_param_pb2.FeatureBinningParam(
            binning_result=multi_class_result[0],
            host_results=host_single_results,
            header=self.header,
            header_anonymous=self.header_anonymous,
            model_name=consts.BINNING_MODEL,
            multi_class_result=multi_pb)
        return result_obj

    def load_model(self, model_dict):
        model_param = list(model_dict.get('model').values())[0].get(MODEL_PARAM_NAME)
        model_meta = list(model_dict.get('model').values())[0].get(MODEL_META_NAME)

        self.bin_inner_param = BinInnerParam()
        multi_class_result = model_param.multi_class_result
        self.labels = list(multi_class_result.labels)
        if self.labels:
            self.bin_result = MultiClassBinResult.reconstruct(
                list(multi_class_result.results), self.labels)

        assert isinstance(model_meta, feature_binning_meta_pb2.FeatureBinningMeta)
        assert isinstance(model_param, feature_binning_param_pb2.FeatureBinningParam)

        self.header = list(model_param.header)
        self.bin_inner_param.set_header(self.header)
        self.bin_inner_param.add_transform_bin_indexes(list(model_meta.transform_param.transform_cols))
        self.bin_inner_param.add_bin_names(list(model_meta.cols))
        self.transform_type = model_meta.transform_param.transform_type

        bin_method = str(model_meta.method)
        if bin_method == consts.QUANTILE:
            self.binning_obj = QuantileBinning(params=model_meta)
        elif bin_method == consts.OPTIMAL:
            self.binning_obj = OptimalBinning(params=model_meta)
        else:
            self.binning_obj = BucketBinning(params=model_meta)

        self.binning_obj.set_bin_inner_param(self.bin_inner_param)
        split_results = dict(model_param.binning_result.binning_result)
        for col_name, sr_pb in split_results.items():
            split_points = list(sr_pb.split_points)
            self.binning_obj.bin_results.put_col_split_points(col_name, split_points)

        self.host_results = []
        host_pbs = list(model_param.multi_class_result.host_results)
        if len(host_pbs):
            if len(self.labels) == 2:
                for host_pb in host_pbs:
                    self.host_results.append(MultiClassBinResult.reconstruct(host_pb, self.labels))
            else:
                assert len(host_pbs) % len(self.labels) == 0
                i = 0
                while i < len(host_pbs):
                    this_pbs = host_pbs[i:i + len(self.labels)]
                    self.host_results.append(MultiClassBinResult.reconstruct(this_pbs, self.labels))
                    i += len(self.labels)

        if list(model_param.header_anonymous):
            self.header_anonymous = list(model_param.header_anonymous)

    def export_model(self):
        if self.model_output is not None:
            return self.model_output
        meta_obj = self._get_meta()
        param_obj = self._get_param()
        result = {MODEL_META_NAME: meta_obj, MODEL_PARAM_NAME: param_obj}
        self.model_output = result
        return result

    def save_data(self):
        return self.data_output

    def set_schema(self, data_instance):
        self.schema['header'] = self.header
        data_instance.schema = self.schema

    def _abnormal_detection(self, data_instances):
        """
        Make sure input data_instances is valid.
        """
        abnormal_detection.empty_table_detection(data_instances)
        abnormal_detection.empty_feature_detection(data_instances)
        self.check_schema_content(data_instances.schema)

def load_model(self, model_dict):
    model_param = list(model_dict.get('model').values())[0].get(MODEL_PARAM_NAME)
    model_meta = list(model_dict.get('model').values())[0].get(MODEL_META_NAME)

    self.bin_inner_param = BinInnerParam()
    multi_class_result = model_param.multi_class_result
    self.labels = list(multi_class_result.labels)
    if self.labels:
        self.bin_result = MultiClassBinResult.reconstruct(
            list(multi_class_result.results), self.labels)

    assert isinstance(model_meta, feature_binning_meta_pb2.FeatureBinningMeta)
    assert isinstance(model_param, feature_binning_param_pb2.FeatureBinningParam)

    self.header = list(model_param.header)
    self.bin_inner_param.set_header(self.header)
    self.bin_inner_param.add_transform_bin_indexes(list(model_meta.transform_param.transform_cols))
    self.bin_inner_param.add_bin_names(list(model_meta.cols))
    self.transform_type = model_meta.transform_param.transform_type

    bin_method = str(model_meta.method)
    if bin_method == consts.QUANTILE:
        self.binning_obj = QuantileBinning(params=model_meta)
    elif bin_method == consts.OPTIMAL:
        self.binning_obj = OptimalBinning(params=model_meta)
    else:
        self.binning_obj = BucketBinning(params=model_meta)

    self.binning_obj.set_bin_inner_param(self.bin_inner_param)
    split_results = dict(model_param.binning_result.binning_result)
    for col_name, sr_pb in split_results.items():
        split_points = list(sr_pb.split_points)
        self.binning_obj.bin_results.put_col_split_points(col_name, split_points)

    self.host_results = []
    host_pbs = list(model_param.multi_class_result.host_results)
    if len(host_pbs):
        if len(self.labels) == 2:
            for host_pb in host_pbs:
                self.host_results.append(MultiClassBinResult.reconstruct(host_pb, self.labels))
        else:
            assert len(host_pbs) % len(self.labels) == 0
            i = 0
            while i < len(host_pbs):
                this_pbs = host_pbs[i:i + len(self.labels)]
                self.host_results.append(MultiClassBinResult.reconstruct(this_pbs, self.labels))
                i += len(self.labels)

    if list(model_param.header_anonymous):
        self.header_anonymous = list(model_param.header_anonymous)

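# The host-result regrouping at the end of load_model above, shown in isolation
# with illustrative placeholder strings instead of protobuf objects: in the
# multi-label case the host results arrive flattened, one entry per
# (host, label) pair, and are sliced back into per-host groups of len(labels).
labels = ["0", "1", "2"]
host_pbs = ["h0_l0", "h0_l1", "h0_l2",  # host 0: one pb per label
            "h1_l0", "h1_l1", "h1_l2"]  # host 1: one pb per label
assert len(host_pbs) % len(labels) == 0

per_host = [host_pbs[i:i + len(labels)]
            for i in range(0, len(host_pbs), len(labels))]
print(per_host)  # [['h0_l0', 'h0_l1', 'h0_l2'], ['h1_l0', 'h1_l1', 'h1_l2']]
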
class BaseHeteroFeatureBinning(ModelBase):
    """
    Do binning method through guest and host
    """

    def __init__(self):
        super(BaseHeteroFeatureBinning, self).__init__()
        self.transfer_variable = HeteroFeatureBinningTransferVariable()
        self.binning_obj = None
        self.header = None
        self.schema = None
        self.host_results = []
        self.transform_type = None

        self.model_param = FeatureBinningParam()
        self.bin_inner_param = BinInnerParam()

    def _init_model(self, params: FeatureBinningParam):
        self.model_param = params
        self.transform_type = self.model_param.transform_param.transform_type
        if self.model_param.method == consts.QUANTILE:
            self.binning_obj = QuantileBinning(self.model_param)
        elif self.model_param.method == consts.BUCKET:
            self.binning_obj = BucketBinning(self.model_param)
        else:
            raise ValueError("Binning method: {} is not supported yet".format(self.model_param.method))
        LOGGER.debug("in _init_model, role: {}, local_partyid: {}".format(
            self.role, self.component_properties.local_partyid))
        self.binning_obj.set_role_party(self.role, self.component_properties.local_partyid)

    def _setup_bin_inner_param(self, data_instances, params: FeatureBinningParam):
        if self.schema is not None:
            return
        self.header = get_header(data_instances)
        self.schema = data_instances.schema
        self.bin_inner_param.set_header(self.header)
        if params.bin_indexes == -1:
            self.bin_inner_param.set_bin_all()
        else:
            self.bin_inner_param.add_bin_indexes(params.bin_indexes)
            self.bin_inner_param.add_bin_names(params.bin_names)
        self.bin_inner_param.add_category_indexes(params.category_indexes)
        self.bin_inner_param.add_category_names(params.category_names)
        if params.transform_param.transform_cols == -1:
            self.bin_inner_param.set_transform_all()
        else:
            self.bin_inner_param.add_transform_bin_indexes(params.transform_param.transform_cols)
            self.bin_inner_param.add_transform_bin_names(params.transform_param.transform_names)
        self.binning_obj.set_bin_inner_param(self.bin_inner_param)

    def transform(self, data_instances):
        self._setup_bin_inner_param(data_instances, self.model_param)
        data_instances = self.binning_obj.transform(data_instances, self.transform_type)
        self.set_schema(data_instances)
        self.data_output = data_instances
        return data_instances

    def _get_meta(self):
        transform_param = feature_binning_meta_pb2.TransformMeta(
            transform_cols=self.bin_inner_param.transform_bin_indexes,
            transform_type=self.model_param.transform_param.transform_type)
        meta_protobuf_obj = feature_binning_meta_pb2.FeatureBinningMeta(
            method=self.model_param.method,
            compress_thres=self.model_param.compress_thres,
            head_size=self.model_param.head_size,
            error=self.model_param.error,
            bin_num=self.model_param.bin_num,
            cols=self.bin_inner_param.bin_names,
            adjustment_factor=self.model_param.adjustment_factor,
            local_only=self.model_param.local_only,
            need_run=self.need_run,
            transform_param=transform_param)
        return meta_protobuf_obj

    def _get_param(self):
        binning_result_obj = self.binning_obj.bin_results.generated_pb()
        host_results = [x.bin_results.generated_pb() for x in self.host_results]
        result_obj = feature_binning_param_pb2.FeatureBinningParam(
            binning_result=binning_result_obj,
            host_results=host_results,
            header=self.header)
        return result_obj

    def load_model(self, model_dict):
        model_param = list(model_dict.get('model').values())[0].get(MODEL_PARAM_NAME)
        model_meta = list(model_dict.get('model').values())[0].get(MODEL_META_NAME)

        self.bin_inner_param = BinInnerParam()
        assert isinstance(model_meta, feature_binning_meta_pb2.FeatureBinningMeta)
        assert isinstance(model_param, feature_binning_param_pb2.FeatureBinningParam)

        self.header = list(model_param.header)
        self.bin_inner_param.set_header(self.header)
        self.bin_inner_param.add_transform_bin_indexes(list(model_meta.transform_param.transform_cols))
        self.bin_inner_param.add_bin_names(list(model_meta.cols))
        self.transform_type = model_meta.transform_param.transform_type

        bin_method = str(model_meta.method)
        if bin_method == consts.QUANTILE:
            self.binning_obj = QuantileBinning(params=model_meta)
        else:
            self.binning_obj = BucketBinning(params=model_meta)

        self.binning_obj.set_role_party(self.role, self.component_properties.local_partyid)
        self.binning_obj.set_bin_inner_param(self.bin_inner_param)
        self.binning_obj.bin_results.reconstruct(model_param.binning_result)

        self.host_results = []
        for host_pb in model_param.host_results:
            host_bin_obj = HostBaseBinning()
            host_bin_obj.bin_results.reconstruct(host_pb)
            self.host_results.append(host_bin_obj)

    def export_model(self):
        if self.model_output is not None:
            return self.model_output
        meta_obj = self._get_meta()
        param_obj = self._get_param()
        result = {MODEL_META_NAME: meta_obj, MODEL_PARAM_NAME: param_obj}
        self.model_output = result
        return result

    def save_data(self):
        return self.data_output

    def set_schema(self, data_instance):
        self.schema['header'] = self.header
        data_instance.schema = self.schema
        LOGGER.debug("After Binning, when setting schema, schema is : {}".format(data_instance.schema))

    def _abnormal_detection(self, data_instances):
        """
        Make sure input data_instances is valid.
        """
        abnormal_detection.empty_table_detection(data_instances)
        abnormal_detection.empty_feature_detection(data_instances)

def test_bucket_binning(self):
    bin_param = FeatureBinningParam(bin_num=self.bin_num, cols=self.cols)
    bucket_bin = BucketBinning(bin_param)
    split_points = bucket_bin.fit_split_points(self.table)
    print(split_points)

def init_bucket(self, data_instances):
    header = data_overview.get_header(data_instances)
    self._default_setting(header)

    init_bucket_param = copy.deepcopy(self.params)
    init_bucket_param.bin_num = self.optimal_param.init_bin_nums
    if self.optimal_param.init_bucket_method == consts.QUANTILE:
        init_binning_obj = QuantileBinningTool(param_obj=init_bucket_param, allow_duplicate=False)
    else:
        init_binning_obj = BucketBinning(params=init_bucket_param)
    init_binning_obj.set_bin_inner_param(self.bin_inner_param)
    init_split_points = init_binning_obj.fit_split_points(data_instances)
    is_sparse = data_overview.is_sparse_data(data_instances)

    bucket_dict = dict()
    for col_name, sps in init_split_points.items():
        bucket_list = []
        for idx, sp in enumerate(sps):
            bucket = bucket_info.Bucket(idx, self.adjustment_factor, right_bound=sp)
            if idx == 0:
                bucket.left_bound = -math.inf
                bucket.set_left_neighbor(None)
            else:
                bucket.left_bound = sps[idx - 1]
            bucket.event_total = self.event_total
            bucket.non_event_total = self.non_event_total
            bucket_list.append(bucket)
        bucket_list[-1].set_right_neighbor(None)
        bucket_dict[col_name] = bucket_list
        LOGGER.debug(f"col_name: {col_name}, length of sps: {len(sps)}, "
                     f"length of list: {len(bucket_list)}")

    from fate_arch.common.versions import get_eggroll_version
    version = get_eggroll_version()
    if version.startswith('2.0'):
        convert_func = functools.partial(self.convert_data_to_bucket_old,
                                         split_points=init_split_points,
                                         headers=self.header,
                                         bucket_dict=copy.deepcopy(bucket_dict),
                                         is_sparse=is_sparse,
                                         get_bin_num_func=self.get_bin_num)
        summary_dict = data_instances.mapPartitions(convert_func, use_previous_behavior=False)
        from federatedml.util.reduce_by_key import reduce
        bucket_table = reduce(summary_dict, self.merge_bucket_list, key_func=lambda key: key[1])
    elif version.startswith('2.2'):
        convert_func = functools.partial(self.convert_data_to_bucket,
                                         split_points=init_split_points,
                                         headers=self.header,
                                         bucket_dict=copy.deepcopy(bucket_dict),
                                         is_sparse=is_sparse,
                                         get_bin_num_func=self.get_bin_num)
        bucket_table = data_instances.mapReducePartitions(convert_func, self.merge_bucket_list)
        bucket_table = dict(bucket_table.collect())
    else:
        raise RuntimeError(f"Cannot recognize eggroll version: {version}")

    for k, v in bucket_table.items():
        LOGGER.debug(f"[feature] {k}, length of list: {len(v)}")
    LOGGER.debug("bucket_table: {}, length: {}".format(type(bucket_table), len(bucket_table)))
    bucket_table = [(k, v) for k, v in bucket_table.items()]
    LOGGER.debug("bucket_table: {}, length: {}".format(type(bucket_table), len(bucket_table)))

    bucket_table = session.parallelize(bucket_table, include_key=True, partition=data_instances.partitions)
    return bucket_table