def __get_min_max_value(self, data):
    min_value = None
    max_value = None
    summary_obj = MultivariateStatisticalSummary(data, -1)
    if self.feat_upper is not None:
        max_value = self.feat_upper
    if self.feat_lower is not None:
        min_value = self.feat_lower

    if min_value is None and max_value is not None:
        min_value_list = summary_obj.get_min()
        if isinstance(max_value, Iterable):
            if len(list(max_value)) != len(min_value_list):
                raise ValueError(
                    "Size of feat_upper is not equal to column of data, {} != {}".format(
                        len(list(max_value)), len(min_value_list)))
            max_value_list = max_value
        else:
            max_value_list = [max_value for _ in min_value_list]
    elif min_value is not None and max_value is None:
        max_value_list = summary_obj.get_max()
        if isinstance(min_value, Iterable):
            if len(list(min_value)) != len(max_value_list):
                raise ValueError(
                    "Size of feat_lower is not equal to column of data, {} != {}".format(
                        len(list(min_value)), len(max_value_list)))
            min_value_list = min_value
        else:
            min_value_list = [min_value for _ in max_value_list]
    elif min_value is None and max_value is None:
        min_value_list = summary_obj.get_min()
        max_value_list = summary_obj.get_max()
    else:
        shape = None
        if isinstance(max_value, Iterable):
            max_value_list = max_value
        else:
            shape = data_overview.get_features_shape(data)
            max_value_list = [max_value for _ in range(shape)]
        if isinstance(min_value, Iterable):
            min_value_list = min_value
        else:
            if not shape:
                shape = data_overview.get_features_shape(data)
            min_value_list = [min_value for _ in range(shape)]
        if len(list(max_value_list)) != len(min_value_list):
            raise ValueError(
                "Size of feat_upper is not equal to column of data, {} != {}".format(
                    len(list(max_value_list)), len(min_value_list)))

    return min_value_list, max_value_list
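# A minimal sketch (not the library's transform) of how the per-column bounds
# computed above are typically applied afterwards: cap each value into
# [min_value, max_value], then rescale to [0, 1].
def min_max_scale_sketch(value, min_value, max_value):
    value = min(max(value, min_value), max_value)
    if max_value == min_value:
        return 0.0  # constant column: nothing to scale
    return (value - min_value) / (max_value - min_value)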
def _parse_cols(self, data_instances):
    if self.cols == -1:
        features_shape = get_features_shape(data_instances)
        if features_shape is None:
            raise RuntimeError('Cannot get feature shape, please check input data')
        self.cols = [i for i in range(features_shape)]
def cal_local_iv(self, data_instances, cols, split_points=None, label_table=None):
    if cols == -1:
        features_shape = get_features_shape(data_instances)
        if features_shape is None:
            raise RuntimeError('Cannot get feature shape, please check input data')
        cols = [i for i in range(features_shape)]

    if split_points is None:
        split_points = self.binning(data_instances, cols=cols)

    data_bin_table = self.transform(data_instances, split_points, cols)
    if label_table is None:
        label_table = data_instances.mapValues(lambda x: x.label)

    event_count_table = label_table.mapValues(lambda x: (x, 1 - x))
    data_bin_with_label = data_bin_table.join(event_count_table, lambda x, y: (x, y))
    f = functools.partial(self.add_label_in_partition,
                          total_bin=self.bin_num,
                          cols=cols)
    result_sum = data_bin_with_label.mapPartitions(f)
    result_counts = result_sum.reduce(self.aggregate_partition_label)

    iv_attrs = self.cal_iv_woe(result_counts,
                               self.params.adjustment_factor,
                               split_points=split_points)
    return iv_attrs
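# A minimal sketch (assumption, not the library's cal_iv_woe) of the WOE/IV
# arithmetic performed per bin. `bin_counts` holds the aggregated
# (event, non-event) sums produced by the (label, 1 - label) mapping above,
# and `adjustment_factor` smooths empty bins so the log stays finite.
import math

def woe_iv_sketch(bin_counts, total_event, total_non_event, adjustment_factor=0.5):
    woe_list, iv = [], 0.0
    for event_count, non_event_count in bin_counts:
        event_rate = max(event_count, adjustment_factor) / total_event
        non_event_rate = max(non_event_count, adjustment_factor) / total_non_event
        woe = math.log(event_rate / non_event_rate)
        woe_list.append(woe)
        iv += (event_rate - non_event_rate) * woe
    return woe_list, iv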
def fit(self, data): if not self.with_mean and not self.with_std: shape = data_overview.get_features_shape(data) mean = [0 for _ in range(shape)] std = [1 for _ in range(shape)] return data, mean, std else: summary_obj = MultivariateStatisticalSummary(data, -1) mean = None std = None if self.with_mean: mean = summary_obj.get_mean() if self.with_std: std = summary_obj.get_std_variance() if not mean and std: mean = [0 for value in std] elif mean and not std: std = [1 for value in mean] if not mean or not std: raise ValueError("mean or std is None") f = functools.partial(self.__scale, mean=mean, std=std) data = data.mapValues(f) return data, mean, std
def _init_model_variables(self, data_instances):
    model_shape = data_overview.get_features_shape(data_instances)
    LOGGER.info("Initialized model shape is {}".format(model_shape))
    model_weights = self.initializer.init_model(model_shape,
                                                init_params=self.init_param_obj,
                                                data_instance=data_instances)
    return model_weights
def filter(self, data_instances):
    if self.select_cols == -1:
        features_shape = get_features_shape(data_instances)
        if features_shape is None:
            raise RuntimeError('Cannot get feature shape, please check input data')
        self.select_cols = [i for i in range(features_shape)]
    self.left_cols = self.select_cols.copy()
    for method in self.filter_methods:
        self.filter_one_method(data_instances, method)
    return self.left_cols
def initialize(self, data_instances):
    """
    Initialize the weights w and the bias b.
    """
    data_shape = data_overview.get_features_shape(data_instances)
    LOGGER.info("Feature dimension of the data, excluding the bias b: {}".format(data_shape))
    # Reserve one extra dimension for the bias b
    if isinstance(data_shape, int):
        data_shape += 1
    # Initialize the model parameters
    self.w = np.random.rand(data_shape)
    LOGGER.info("Initialized model parameters self.w: {}".format(self.w))
def _transfer_data(self, data_instances):
    if self.left_cols == -1:
        features_shape = get_features_shape(data_instances)
        if features_shape is None:
            raise RuntimeError('Cannot get feature shape, please check input data')
        self.left_cols = [i for i in range(features_shape)]
    f = functools.partial(self.select_cols, left_cols=self.left_cols)
    new_data = data_instances.mapValues(f)
    self._reset_header()
    return new_data
def __init__(self, data_instances, select_cols):
    self.finish_fit = False
    self.summary_statistics = []
    self.median = None
    self.data_instances = data_instances
    if select_cols == -1:
        features_shape = get_features_shape(data_instances)
        if features_shape is None:
            raise RuntimeError('Cannot get feature shape, please check input data')
        select_cols = [i for i in range(features_shape)]
    self.select_cols = select_cols
def transform(self, data_instances, split_points=None, cols=-1):
    """
    Apply the binning method

    Parameters
    ----------
    data_instances : DTable
        The input data

    split_points : list
        Each row represents the split points for one feature, and each element
        in a row is the corresponding split point. e.g.
        split_points = [[0.1, 0.2, 0.3, 0.4, ...],  # The first feature
                        [1, 2, 3, 4, ...],          # The second feature
                        ...]                        # Other features

    cols : int or list of int
        Specify which column(s) to apply binning to. -1 means bin all columns.

    Returns
    -------
    data_bin_table : DTable
        Each element in a row is the bin number the corresponding feature
        value falls into. e.g. a row could be:
        (1, 5, 2, 6, 0, ...)
        Each number is a bin index, in the same order as cols.
    """
    if cols == -1:
        features_shape = get_features_shape(data_instances)
        if features_shape is None:
            raise RuntimeError('Cannot get feature shape, please check input data')
        cols = [i for i in range(features_shape)]
    if isinstance(cols, int):
        cols = [cols]

    # Compute split points first if not provided; the length check must come
    # after, since len(None) would raise a TypeError.
    if split_points is None:
        split_points = self.binning(data_instances, cols)
    assert len(split_points) == len(cols)

    f = functools.partial(self.bin_data, split_points=split_points, cols=cols)
    data_bin_table = data_instances.mapValues(f)
    return data_bin_table
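# A minimal sketch (assumed helper, not the library's bin_data) of how one row
# is mapped to bin indices: bisect places each value among the sorted split
# points, so bin k means the value falls between split point k-1 and k.
import bisect

def bin_row_sketch(features, split_points, cols):
    # features: sequence of floats; split_points[i] are the sorted cut points
    # for column cols[i].
    return tuple(bisect.bisect_left(split_points[i], features[col])
                 for i, col in enumerate(cols))

# e.g. bin_row_sketch([0.15, 2.5], [[0.1, 0.2, 0.3]], [0]) -> (1,)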
def client_sync_data_info(self, data):
    n, j = data.count(), data_overview.get_features_shape(data)
    self.n_count = n
    if self.role == consts.HOST:
        self.transfer_variable.host_data_info.remote((n, j), role=consts.ARBITER, idx=0)
        self.transfer_variable.host_data_info.remote((n, j), role=consts.GUEST, idx=0)
        j_host = j
        n_guest, j_guest = self.transfer_variable.guest_data_info.get(idx=0)
    else:
        self.transfer_variable.guest_data_info.remote((n, j), role=consts.ARBITER, idx=0)
        self.transfer_variable.guest_data_info.remote((n, j), role=consts.HOST, idx=0)
        j_guest = j
        n_host, j_host = self.transfer_variable.host_data_info.get(idx=0)
    return j_host, j_guest
def __init_model(self, data_instances):
    model_shape = data_overview.get_features_shape(data_instances)
    w = self.initializer.init_model(model_shape, init_params=self.init_param_obj)
    w = self.encrypt_operator.encrypt_list(w)
    w = np.array(w)
    if self.fit_intercept:
        self.coef_ = w[:-1]
        self.intercept_ = w[-1]
    else:
        self.coef_ = w
        self.intercept_ = 0
    return w
def _init_model_variables(self, data_instances):
    model_shape = data_overview.get_features_shape(data_instances)
    LOGGER.info("Initialized model shape is {}".format(model_shape))

    fit_intercept = False
    if self.init_param_obj.fit_intercept:
        fit_intercept = True
        self.init_param_obj.fit_intercept = False

    w_ = self.initializer.init_model(model_shape, init_params=self.init_param_obj)
    embed_ = self.initializer.init_model([model_shape, self.init_param_obj.embed_size],
                                         init_params=self.init_param_obj)
    model_weights = FactorizationMachineWeights(w_, embed_, fit_intercept=fit_intercept)
    return model_weights
def __init_model(self, data_instances):
    model_shape = data_overview.get_features_shape(data_instances)
    LOGGER.info("Initialized model shape is {}".format(model_shape))
    w = self.initializer.init_model(model_shape, init_params=self.init_param_obj)
    if self.fit_intercept:
        self.coef_ = w[:-1]
        self.intercept_ = w[-1]
    else:
        self.coef_ = w
        self.intercept_ = 0
    return w
def compute_gradient(self, data_instances, fore_gradient, fit_intercept):
    """
    Compute hetero-regression gradient

    Parameters
    ----------
    data_instances: Table, input data
    fore_gradient: Table, fore_gradient
    fit_intercept: bool, if model has intercept or not

    Returns
    -------
    Table, the hetero regression model's gradient
    """
    feature_num = data_overview.get_features_shape(data_instances)
    data_count = data_instances.count()
    is_sparse = data_overview.is_sparse_data(data_instances)

    if data_count * feature_num > 100:
        LOGGER.debug("Use apply partitions")
        feat_join_grad = data_instances.join(fore_gradient, lambda d, g: (d.features, g))
        f = functools.partial(self.__apply_cal_gradient,
                              fixed_point_encoder=self.fixed_point_encoder,
                              is_sparse=is_sparse)
        gradient_sum = feat_join_grad.applyPartitions(f)
        gradient_sum = gradient_sum.reduce(lambda x, y: x + y)
        if fit_intercept:
            # bias_grad = np.sum(fore_gradient)
            bias_grad = fore_gradient.reduce(lambda x, y: x + y)
            gradient_sum = np.append(gradient_sum, bias_grad)
        gradient = gradient_sum / data_count
    else:
        LOGGER.debug("Use original method")
        feat_join_grad = data_instances.join(fore_gradient, lambda d, g: (d.features, g))
        f = functools.partial(self.__compute_partition_gradient,
                              fit_intercept=fit_intercept,
                              is_sparse=is_sparse)
        gradient_partition = feat_join_grad.applyPartitions(f)
        gradient_partition = gradient_partition.reduce(lambda x, y: x + y)
        gradient = gradient_partition / data_count
    return gradient
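# A minimal numpy sketch (an assumption, not the library's partition worker)
# of what one partition contributes: stacking the joined
# (features, fore_gradient) pairs, the dense gradient is X^T g, with the
# summed fore_gradient appended as the bias term.
import numpy as np

def partition_gradient_sketch(pairs, fit_intercept=True):
    # pairs: iterable of (feature_vector, fore_gradient_scalar)
    pairs = list(pairs)
    X = np.array([feat for feat, _ in pairs], dtype=float)
    g = np.array([grad for _, grad in pairs], dtype=float)
    gradient = X.T @ g  # shape: (feature_num,)
    if fit_intercept:
        gradient = np.append(gradient, g.sum())  # bias gradient
    return gradient  # the caller divides by the total data_count after reduce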
def fit(self, data):
    """
    Apply standard scale for input data

    Parameters
    ----------
    data: data_instance, input data

    Returns
    -------
    data: data_instance, data after scale
    mean: list, each column's mean value
    std: list, each column's standard deviation
    """
    if not self.with_mean and not self.with_std:
        shape = data_overview.get_features_shape(data)
        mean = [0 for _ in range(shape)]
        std = [1 for _ in range(shape)]
        return data, mean, std
    else:
        summary_obj = MultivariateStatisticalSummary(data, -1)
        mean = None
        std = None
        header = get_header(data)
        if self.with_mean:
            mean = summary_obj.get_mean()
            mean = [mean[key] for key in header]
        if self.with_std:
            std = summary_obj.get_std_variance()
            std = [std[key] for key in header]
        if not mean and std:
            mean = [0 for _ in std]
        elif mean and not std:
            std = [1 for _ in mean]
        if not mean or not std:
            raise ValueError("mean or std is None")
        f = functools.partial(self.__scale, mean=mean, std=std)
        data = data.mapValues(f)
        return data, mean, std
def predict(self, data_instances, predict_param):
    if not self.has_sychronized_encryption:
        self.__synchronize_encryption()
        self.__load_arbiter_model()
    else:
        LOGGER.info("in predict, has synchronized encryption information")

    from federatedml.statistic.data_overview import get_features_shape
    feature_shape = get_features_shape(data_instances)
    LOGGER.debug("Shape of coef_: {}, feature shape: {}".format(len(self.coef_), feature_shape))

    wx = self.compute_wx(data_instances, self.coef_, self.intercept_)

    if self.use_encrypt:
        encrypted_wx_id = self.transfer_variable.generate_transferid(self.transfer_variable.predict_wx)
        federation.remote(wx,
                          name=self.transfer_variable.predict_wx.name,
                          tag=encrypted_wx_id,
                          role=consts.ARBITER,
                          idx=0)
        predict_result_id = self.transfer_variable.generate_transferid(self.transfer_variable.predict_result)
        predict_result = federation.get(name=self.transfer_variable.predict_result.name,
                                        tag=predict_result_id,
                                        idx=0)
        predict_result_table = predict_result.join(data_instances, lambda p, d: (d.label, None, p))
    else:
        pred_prob = wx.mapValues(lambda x: activation.sigmoid(x))
        pred_label = self.classified(pred_prob, predict_param.threshold)
        if predict_param.with_proba:
            predict_result = data_instances.mapValues(lambda x: x.label)
            predict_result = predict_result.join(pred_prob, lambda x, y: (x, y))
        else:
            predict_result = data_instances.mapValues(lambda x: (x.label, None))
        predict_result_table = predict_result.join(pred_label, lambda x, y: (x[0], x[1], y))
    return predict_result_table
def approxiQuantile(data_instances, cols, params):
    # cols == -1 means all features
    if cols == -1:
        features_shape = get_features_shape(data_instances)
        if features_shape is None:
            raise RuntimeError('Cannot get feature shape, please check input data')
        cols = [i for i in range(features_shape)]
    if isinstance(cols, int):
        cols = [cols]

    num_of_qs = len(cols)
    summary_list = []
    for _ in range(num_of_qs):
        quantile_summaries = QuantileSummaries(compress_thres=params.compress_thres,
                                               head_size=params.head_size,
                                               error=params.error)
        summary_list.append(quantile_summaries)
    QuantileBinning.insert_datas(data_instances, summary_list, cols)
    return summary_list
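# A hedged sketch (assuming the GK-style QuantileSummaries exposes a
# query(percentile) method, which is typical of such summaries) of turning the
# fitted summaries into equal-frequency split points for quantile binning:
def split_points_sketch(summary_list, bin_num):
    percentiles = [i / bin_num for i in range(1, bin_num)]
    return [[summary.query(p) for p in percentiles] for summary in summary_list]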
def fit(self, data_instances): LOGGER.info("Enter hetero_lr_guest fit") self._abnormal_detection(data_instances) self.header = data_instances.schema.get("header") data_instances = data_instances.mapValues(HeteroLRGuest.load_data) public_key = federation.get( name=self.transfer_variable.paillier_pubkey.name, tag=self.transfer_variable.generate_transferid( self.transfer_variable.paillier_pubkey), idx=0) LOGGER.info("Get public_key from arbiter:{}".format(public_key)) self.encrypt_operator.set_public_key(public_key) LOGGER.info("Generate mini-batch from input data") mini_batch_obj = MiniBatch(data_instances, batch_size=self.batch_size) batch_num = mini_batch_obj.batch_nums if self.batch_size == -1: LOGGER.info( "batch size is -1, set it to the number of data in data_instances" ) self.batch_size = data_instances.count() batch_info = {"batch_size": self.batch_size, "batch_num": batch_num} LOGGER.info("batch_info:{}".format(batch_info)) federation.remote(batch_info, name=self.transfer_variable.batch_info.name, tag=self.transfer_variable.generate_transferid( self.transfer_variable.batch_info), role=consts.HOST, idx=0) LOGGER.info("Remote batch_info to Host") federation.remote(batch_info, name=self.transfer_variable.batch_info.name, tag=self.transfer_variable.generate_transferid( self.transfer_variable.batch_info), role=consts.ARBITER, idx=0) LOGGER.info("Remote batch_info to Arbiter") LOGGER.info("Start initialize model.") LOGGER.info("fit_intercept:{}".format( self.init_param_obj.fit_intercept)) model_shape = data_overview.get_features_shape(data_instances) weight = self.initializer.init_model(model_shape, init_params=self.init_param_obj) if self.init_param_obj.fit_intercept is True: self.coef_ = weight[:-1] self.intercept_ = weight[-1] else: self.coef_ = weight is_send_all_batch_index = False self.n_iter_ = 0 index_data_inst_map = {} while self.n_iter_ < self.max_iter: LOGGER.info("iter:{}".format(self.n_iter_)) # each iter will get the same batach_data_generator batch_data_generator = mini_batch_obj.mini_batch_data_generator( result='index') batch_index = 0 for batch_data_index in batch_data_generator: LOGGER.info("batch:{}".format(batch_index)) if not is_send_all_batch_index: LOGGER.info("remote mini-batch index to Host") federation.remote( batch_data_index, name=self.transfer_variable.batch_data_index.name, tag=self.transfer_variable.generate_transferid( self.transfer_variable.batch_data_index, self.n_iter_, batch_index), role=consts.HOST, idx=0) if batch_index >= mini_batch_obj.batch_nums - 1: is_send_all_batch_index = True # Get mini-batch train data if len(index_data_inst_map) < batch_num: batch_data_inst = data_instances.join( batch_data_index, lambda data_inst, index: data_inst) index_data_inst_map[batch_index] = batch_data_inst else: batch_data_inst = index_data_inst_map[batch_index] # transforms features of raw input 'batch_data_inst' into more representative features 'batch_feat_inst' batch_feat_inst = self.transform(batch_data_inst) # guest/host forward self.compute_forward(batch_feat_inst, self.coef_, self.intercept_) host_forward = federation.get( name=self.transfer_variable.host_forward_dict.name, tag=self.transfer_variable.generate_transferid( self.transfer_variable.host_forward_dict, self.n_iter_, batch_index), idx=0) LOGGER.info("Get host_forward from host") aggregate_forward_res = self.aggregate_forward(host_forward) en_aggregate_wx = aggregate_forward_res.mapValues( lambda v: v[0]) en_aggregate_wx_square = aggregate_forward_res.mapValues( lambda v: v[1]) # compute [[d]] if 
self.gradient_operator is None: self.gradient_operator = HeteroLogisticGradient( self.encrypt_operator) fore_gradient = self.gradient_operator.compute_fore_gradient( batch_feat_inst, en_aggregate_wx) federation.remote( fore_gradient, name=self.transfer_variable.fore_gradient.name, tag=self.transfer_variable.generate_transferid( self.transfer_variable.fore_gradient, self.n_iter_, batch_index), role=consts.HOST, idx=0) LOGGER.info("Remote fore_gradient to Host") # compute guest gradient and loss guest_gradient, loss = self.gradient_operator.compute_gradient_and_loss( batch_feat_inst, fore_gradient, en_aggregate_wx, en_aggregate_wx_square, self.fit_intercept) # loss regulation if necessary if self.updater is not None: guest_loss_regular = self.updater.loss_norm(self.coef_) loss += self.encrypt_operator.encrypt(guest_loss_regular) federation.remote( guest_gradient, name=self.transfer_variable.guest_gradient.name, tag=self.transfer_variable.generate_transferid( self.transfer_variable.guest_gradient, self.n_iter_, batch_index), role=consts.ARBITER, idx=0) LOGGER.info("Remote guest_gradient to arbiter") optim_guest_gradient = federation.get( name=self.transfer_variable.guest_optim_gradient.name, tag=self.transfer_variable.generate_transferid( self.transfer_variable.guest_optim_gradient, self.n_iter_, batch_index), idx=0) LOGGER.info("Get optim_guest_gradient from arbiter") # update model LOGGER.info("update_model") self.update_model(optim_guest_gradient) # update local model that transforms features of raw input 'batch_data_inst' training_info = { "iteration": self.n_iter_, "batch_index": batch_index } self.update_local_model(fore_gradient, batch_data_inst, self.coef_, **training_info) # Get loss regulation from Host if regulation is set if self.updater is not None: en_host_loss_regular = federation.get( name=self.transfer_variable.host_loss_regular.name, tag=self.transfer_variable.generate_transferid( self.transfer_variable.host_loss_regular, self.n_iter_, batch_index), idx=0) LOGGER.info("Get host_loss_regular from Host") loss += en_host_loss_regular federation.remote( loss, name=self.transfer_variable.loss.name, tag=self.transfer_variable.generate_transferid( self.transfer_variable.loss, self.n_iter_, batch_index), role=consts.ARBITER, idx=0) LOGGER.info("Remote loss to arbiter") # is converge of loss in arbiter batch_index += 1 is_stopped = federation.get( name=self.transfer_variable.is_stopped.name, tag=self.transfer_variable.generate_transferid( self.transfer_variable.is_stopped, self.n_iter_, batch_index), idx=0) LOGGER.info("Get is_stop flag from arbiter:{}".format(is_stopped)) self.n_iter_ += 1 if is_stopped: LOGGER.info( "Get stop signal from arbiter, model is converged, iter:{}" .format(self.n_iter_)) break LOGGER.info("Reach max iter {}, train model finish!".format( self.max_iter))
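# For reference, a plaintext sketch of the fore gradient [[d]] computed above.
# This is an assumption based on the Taylor-approximated hetero-LR loss with
# labels y in {-1, 1} (the encrypted version operates on Paillier ciphertexts):
#   d = 0.25 * wx - 0.5 * y
# where wx is the aggregated guest + host score; each party's gradient is then
# d * x averaged over the batch.
def fore_gradient_sketch(wx, y):
    return 0.25 * wx - 0.5 * y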
def fit(self, data):
    """
    Apply standard scale for input data

    Parameters
    ----------
    data: data_instance, input data

    Returns
    -------
    data: data_instance, data after scale
    standard_scale_cols_conf: list, [mean, std, scale_column_idx], where mean
        and std hold each column's mean and standard deviation
    """
    if not self.with_mean and not self.with_std:
        shape = data_overview.get_features_shape(data)
        mean = [0 for _ in range(shape)]
        std = [1 for _ in range(shape)]
        self.scale_column_idx = [i for i in range(shape)]
        standard_scale_cols_conf = [mean, std, self.scale_column_idx]
        return data, standard_scale_cols_conf
    else:
        data_shape = data_overview.get_features_shape(data)
        if self.area == 'col':
            if isinstance(self.scale_column_idx, list):
                max_col_idx = max(self.scale_column_idx)
                if max_col_idx >= data_shape:
                    raise ValueError(
                        "max column index in area is:{}, should be less than data shape:{}".format(
                            max_col_idx, data_shape))
                self.scale_column_idx.sort()
            else:
                LOGGER.warning(
                    "scale_column_idx should be a list, but not:{}, set scale column to all columns".format(
                        type(self.scale_column_idx)))
                self.scale_column_idx = [i for i in range(data_shape)]
        else:
            self.scale_column_idx = [i for i in range(data_shape)]

        self.scale_column_idx = list(set(self.scale_column_idx))

        summary_obj = MultivariateStatisticalSummary(data, -1)
        mean = None
        std = None
        header = get_header(data)
        if self.with_mean:
            mean = summary_obj.get_mean()
            mean = [mean[key] for key in header]
        if self.with_std:
            std = summary_obj.get_std_variance()
            std = [std[key] for key in header]
        if not mean and std:
            mean = [0 for _ in std]
        elif mean and not std:
            std = [1 for _ in mean]
        if not mean or not std:
            raise ValueError("mean or std is None")
        f = functools.partial(self.__scale,
                              mean=mean,
                              std=std,
                              process_cols_list=self.scale_column_idx)
        data = data.mapValues(f)
        standard_scale_cols_conf = [mean, std, self.scale_column_idx]
        return data, standard_scale_cols_conf
def predict(self, data_instances):
    if not self.need_run:
        return data_instances

    if not self.has_sychronized_encryption:
        self.__synchronize_encryption(mode='predict')
        self.__load_arbiter_model()
    else:
        LOGGER.info("in predict, has synchronized encryption information")

    feature_shape = get_features_shape(data_instances)
    LOGGER.debug("Shape of coef_: {}, feature shape: {}".format(len(self.coef_), feature_shape))
    local_data = data_instances.first()
    LOGGER.debug("One data, features: {}".format(local_data[1].features))

    wx = self.compute_wx(data_instances, self.coef_, self.intercept_)

    if self.use_encrypt:
        encrypted_wx_id = self.transfer_variable.generate_transferid(self.transfer_variable.predict_wx)
        LOGGER.debug("Host encrypted wx id: {}".format(encrypted_wx_id))
        LOGGER.debug("Start to remote wx: {}, transfer_id: {}".format(wx, encrypted_wx_id))
        federation.remote(wx,
                          name=self.transfer_variable.predict_wx.name,
                          tag=encrypted_wx_id,
                          role=consts.ARBITER,
                          idx=0)
        predict_result_id = self.transfer_variable.generate_transferid(self.transfer_variable.predict_result)
        LOGGER.debug("predict_result_id: {}".format(predict_result_id))
        predict_result = federation.get(name=self.transfer_variable.predict_result.name,
                                        tag=predict_result_id,
                                        idx=0)
        LOGGER.debug("predict_result count: {}, data_instances count: {}".format(
            predict_result.count(), data_instances.count()))
        predict_result_table = predict_result.join(
            data_instances, lambda p, d: [d.label, None, p, {"0": None, "1": None}])
    else:
        pred_prob = wx.mapValues(lambda x: activation.sigmoid(x))
        pred_label = self.classified(pred_prob, self.predict_param.threshold)
        if self.predict_param.with_proba:
            predict_result = data_instances.mapValues(lambda x: x.label)
            predict_result = predict_result.join(pred_prob, lambda x, y: (x, y))
        else:
            predict_result = data_instances.mapValues(lambda x: (x.label, None))
        predict_result_table = predict_result.join(
            pred_label, lambda x, y: [x[0], y, x[1], {"0": None, "1": None}])

    LOGGER.debug("Finish predict")
    LOGGER.debug("In host predict, predict_result_table is: {}".format(predict_result_table.first()))
    return predict_result_table
def fit(self, data_instances): LOGGER.info("Enter hetero_lr host") self._abnormal_detection(data_instances) self.header = data_instances.schema.get("header") public_key = federation.get( name=self.transfer_variable.paillier_pubkey.name, tag=self.transfer_variable.generate_transferid( self.transfer_variable.paillier_pubkey), idx=0) LOGGER.info("Get public_key from arbiter:{}".format(public_key)) self.encrypt_operator.set_public_key(public_key) batch_info = federation.get( name=self.transfer_variable.batch_info.name, tag=self.transfer_variable.generate_transferid( self.transfer_variable.batch_info), idx=0) LOGGER.info("Get batch_info from guest:" + str(batch_info)) self.batch_size = batch_info["batch_size"] self.batch_num = batch_info["batch_num"] LOGGER.info("Start initialize model.") model_shape = data_overview.get_features_shape(data_instances) if self.init_param_obj.fit_intercept: self.init_param_obj.fit_intercept = False if self.fit_intercept: self.fit_intercept = False self.coef_ = self.initializer.init_model( model_shape, init_params=self.init_param_obj) self.n_iter_ = 0 index_data_inst_map = {} while self.n_iter_ < self.max_iter: LOGGER.info("iter:" + str(self.n_iter_)) batch_index = 0 while batch_index < self.batch_num: LOGGER.info("batch:{}".format(batch_index)) # set batch_data if len(self.batch_index_list) < self.batch_num: batch_data_index = federation.get( name=self.transfer_variable.batch_data_index.name, tag=self.transfer_variable.generate_transferid( self.transfer_variable.batch_data_index, self.n_iter_, batch_index), idx=0) LOGGER.info("Get batch_index from Guest") batch_size = batch_data_index.count() if batch_size < consts.MIN_BATCH_SIZE and batch_size != -1: raise ValueError( "Batch size get from guest should not less than 10, except -1, batch_size is {}" .format(batch_size)) self.batch_index_list.append(batch_data_index) else: batch_data_index = self.batch_index_list[batch_index] # Get mini-batch train data if len(index_data_inst_map) < self.batch_num: batch_data_inst = batch_data_index.join( data_instances, lambda g, d: d) index_data_inst_map[batch_index] = batch_data_inst else: batch_data_inst = index_data_inst_map[batch_index] LOGGER.info("batch_data_inst size:{}".format( batch_data_inst.count())) # transforms features of raw input 'batch_data_inst' into more representative features 'batch_feat_inst' batch_feat_inst = self.transform(batch_data_inst) # compute forward host_forward = self.compute_forward(batch_feat_inst, self.coef_, self.intercept_) federation.remote( host_forward, name=self.transfer_variable.host_forward_dict.name, tag=self.transfer_variable.generate_transferid( self.transfer_variable.host_forward_dict, self.n_iter_, batch_index), role=consts.GUEST, idx=0) LOGGER.info("Remote host_forward to guest") # compute host gradient fore_gradient = federation.get( name=self.transfer_variable.fore_gradient.name, tag=self.transfer_variable.generate_transferid( self.transfer_variable.fore_gradient, self.n_iter_, batch_index), idx=0) LOGGER.info("Get fore_gradient from guest") if self.gradient_operator is None: self.gradient_operator = HeteroLogisticGradient( self.encrypt_operator) host_gradient = self.gradient_operator.compute_gradient( batch_feat_inst, fore_gradient, fit_intercept=False) # regulation if necessary if self.updater is not None: loss_regular = self.updater.loss_norm(self.coef_) en_loss_regular = self.encrypt_operator.encrypt( loss_regular) federation.remote( en_loss_regular, name=self.transfer_variable.host_loss_regular.name, 
tag=self.transfer_variable.generate_transferid( self.transfer_variable.host_loss_regular, self.n_iter_, batch_index), role=consts.GUEST, idx=0) LOGGER.info("Remote host_loss_regular to guest") federation.remote( host_gradient, name=self.transfer_variable.host_gradient.name, tag=self.transfer_variable.generate_transferid( self.transfer_variable.host_gradient, self.n_iter_, batch_index), role=consts.ARBITER, idx=0) LOGGER.info("Remote host_gradient to arbiter") # Get optimize host gradient and update model optim_host_gradient = federation.get( name=self.transfer_variable.host_optim_gradient.name, tag=self.transfer_variable.generate_transferid( self.transfer_variable.host_optim_gradient, self.n_iter_, batch_index), idx=0) LOGGER.info("Get optim_host_gradient from arbiter") LOGGER.info("update_model") self.update_model(optim_host_gradient) # update local model that transforms features of raw input 'batch_data_inst' training_info = { "iteration": self.n_iter_, "batch_index": batch_index } self.update_local_model(fore_gradient, batch_data_inst, self.coef_, **training_info) # is converge batch_index += 1 # if is_stopped: # break is_stopped = federation.get( name=self.transfer_variable.is_stopped.name, tag=self.transfer_variable.generate_transferid( self.transfer_variable.is_stopped, self.n_iter_, batch_index), idx=0) LOGGER.info("Get is_stop flag from arbiter:{}".format(is_stopped)) self.n_iter_ += 1 if is_stopped: LOGGER.info( "Get stop signal from arbiter, model is converged, iter:{}" .format(self.n_iter_)) break LOGGER.info("Reach max iter {}, train model finish!".format( self.max_iter))
def fit(self, data_instances): LOGGER.info("开始纵向逻辑回归") #检查数据 self._abnormal_detection(data_instances) #导入数据 data_instances = data_instances.mapValues(HdpVflHost.load_data) # 下面开始模型的初始化 data_shape = data_overview.get_features_shape(data_instances) LOGGER.info("数据的维度是:{}".format(data_shape)) self.model = LRModelWeightsHost() self.model.initialize(data_shape) #批处理模块初始化 self.batch_generator.register_batch_generator(self.transfer_variable) suffix = (data_instances.count(), self.r) self.batch_generator.initialize_batch_generator(data_instances, suffix=suffix) #传输变量初始化 self.register_gradient_sync(self.transfer_variable) #开始正式的循环迭代的阶段 iteration = 0 test_suffix = ("iter", ) while iteration <= self.e: for data_inst in self.batch_generator.generator_batch_data(): LOGGER.info("开始计算数据的内积") ir_b = self.model.compute_forwards(data_inst, self.model.w) LOGGER.info("开始生成高斯分布需要的:loc、sigma") loc, sigma = self.model.gaussian(self.delta, self.epsilon, self.L, self.e, int(self.r * self.e), self.learning_rate, data_inst.count(), self.k) LOGGER.info("开始对数据添加噪声") sec_ir_b = self.model.sec_intermediate_result(ir_b, loc, sigma) suffix_t = test_suffix + (iteration, ) LOGGER.info("当前的suffix_t值为:{}".format(suffix_t)) LOGGER.info("开始发送给guest端sec_it_b") # test_transfer.send(obj=sec_ir_b,role=consts.GUEST,suffix=suffix_t) self.ir_b.remote(obj=sec_ir_b, role=consts.GUEST, suffix=suffix_t) LOGGER.info("开始从guest端接收sec_ir_a") sec_ir_a = self.ir_a.get(suffix=suffix_t) LOGGER.info("开始计算gradient_b") gradient_b = self.model.compute_gradient( data_inst, sec_ir_a[0], data_inst.count()) LOGGER.info("开始更新模型参数") self.model.update_model(gradient_b, self.learning_rate, self.lamb) LOGGER.info("开始进行梯度剪切部分") self.model.norm_clip(self.k) iteration += 1 LOGGER.info("训练正式结束") LOGGER.info("host方的模型参数:{}".format(self.model.w)) return self.model.w
def _get_data_shape(self, data):
    if not self.data_shape:
        self.data_shape = data_overview.get_features_shape(data)
    return self.data_shape
def get_features_shape(self, data_instances):
    if self.feature_shape is not None:
        return self.feature_shape
    return data_overview.get_features_shape(data_instances)