class HeteroSecureBoostingTreeGuest(BoostingTree):
    def __init__(self, secureboost_tree_param):
        super(HeteroSecureBoostingTreeGuest, self).__init__(secureboost_tree_param)

        self.convergence = None
        self.y = None
        self.F = None
        self.data_bin = None
        self.loss = None
        self.init_score = None
        self.classes_dict = {}
        self.classes_ = []
        self.num_classes = 0
        self.classify_target = "binary"
        self.feature_num = None
        self.encrypter = None
        self.grad_and_hess = None
        self.flowid = 0
        self.tree_dim = 1
        self.tree_meta = None
        self.trees_ = []
        self.history_loss = []
        self.bin_split_points = None
        self.bin_sparse_points = None

        self.transfer_inst = HeteroSecureBoostingTreeTransferVariable()

    def set_loss(self, objective_param):
        loss_type = objective_param.objective
        params = objective_param.params
        LOGGER.info("set objective, objective is {}".format(loss_type))
        if self.task_type == consts.CLASSIFICATION:
            if loss_type == "cross_entropy":
                if self.num_classes == 2:
                    self.loss = SigmoidBinaryCrossEntropyLoss()
                else:
                    self.loss = SoftmaxCrossEntropyLoss()
            else:
                raise NotImplementedError("objective %s not supported yet" % loss_type)
        elif self.task_type == consts.REGRESSION:
            if loss_type == "lse":
                self.loss = LeastSquaredErrorLoss()
            elif loss_type == "lae":
                self.loss = LeastAbsoluteErrorLoss()
            elif loss_type == "huber":
                self.loss = HuberLoss(params[0])
            elif loss_type == "fair":
                self.loss = FairLoss(params[0])
            elif loss_type == "tweedie":
                self.loss = TweedieLoss(params[0])
            elif loss_type == "log_cosh":
                self.loss = LogCoshLoss()
            else:
                raise NotImplementedError("objective %s not supported yet" % loss_type)
        else:
            raise NotImplementedError("objective %s not supported yet" % loss_type)

    def convert_feature_to_bin(self, data_instance):
        LOGGER.info("convert feature to bins")
        self.data_bin, self.bin_split_points, self.bin_sparse_points = \
            Quantile.convert_feature_to_bin(data_instance, self.quantile_method, self.bin_num,
                                            self.bin_gap, self.bin_sample_num)
        LOGGER.info("convert feature to bins over")

    def set_y(self):
        LOGGER.info("set label from data and check label")
        self.y = self.data_bin.mapValues(lambda instance: instance.label)
        self.check_label()

    def set_flowid(self, flowid=0):
        LOGGER.info("set flowid, flowid is {}".format(flowid))
        self.flowid = flowid

    def generate_flowid(self, round_num, tree_num):
        LOGGER.info("generate flowid")
        return ".".join(map(str, [self.flowid, round_num, tree_num]))

    def check_label(self):
        LOGGER.info("check label")
        if self.task_type == consts.CLASSIFICATION:
            self.num_classes, self.classes_ = ClassifyLabelChecker.validate_y(self.y)
            if self.num_classes > 2:
                self.classify_target = "multinomial"
                self.tree_dim = self.num_classes

            # classification labels must be consecutive integers starting from 0,
            # otherwise they are remapped below
            range_from_zero = True
            for _class in self.classes_:
                try:
                    if _class >= 0 and _class < self.num_classes and isinstance(_class, int):
                        continue
                    else:
                        range_from_zero = False
                        break
                except:
                    range_from_zero = False

            self.classes_ = sorted(self.classes_)
            if not range_from_zero:
                class_mapping = dict(zip(self.classes_, range(self.num_classes)))
                self.y = self.y.mapValues(lambda _class: class_mapping[_class])
        else:
            RegressionLabelChecker.validate_y(self.y)

        self.set_loss(self.objective_param)

    def generate_encrypter(self):
        LOGGER.info("generate encrypter")
        if self.encrypt_param.method == consts.PAILLIER:
            self.encrypter = PaillierEncrypt()
            self.encrypter.generate_key(self.encrypt_param.key_length)
        else:
            raise NotImplementedError("encrypt method not supported yet")

    @staticmethod
    def accumulate_f(f_val, new_f_val, lr=0.1, idx=0):
        f_val[idx] += lr * new_f_val
        return f_val

    def update_f_value(self, new_f=None, tidx=-1):
        LOGGER.info("update tree f value, tree idx is {}".format(tidx))
        if self.F is None:
            if self.tree_dim > 1:
                self.F, self.init_score = self.loss.initialize(self.y, self.tree_dim)
            else:
                LOGGER.info("tree_dim is %d" % self.tree_dim)
                self.F, self.init_score = self.loss.initialize(self.y)
        else:
            accumulate_f = functools.partial(self.accumulate_f,
                                             lr=self.learning_rate,
                                             idx=tidx)
            self.F = self.F.join(new_f, accumulate_f)

    def compute_grad_and_hess(self):
        LOGGER.info("compute grad and hess")
        loss_method = self.loss
        if self.task_type == consts.CLASSIFICATION:
            self.grad_and_hess = self.y.join(
                self.F,
                lambda y, f_val: (loss_method.compute_grad(y, loss_method.predict(f_val)),
                                  loss_method.compute_hess(y, loss_method.predict(f_val))))
        else:
            self.grad_and_hess = self.y.join(
                self.F,
                lambda y, f_val: (loss_method.compute_grad(y, f_val),
                                  loss_method.compute_hess(y, f_val)))

    def compute_loss(self):
        LOGGER.info("compute loss")
        if self.task_type == consts.CLASSIFICATION:
            loss_method = self.loss
            y_predict = self.F.mapValues(lambda val: loss_method.predict(val))
            loss = loss_method.compute_loss(self.y, y_predict)
        elif self.task_type == consts.REGRESSION:
            if self.objective_param.objective in ["lse", "lae", "logcosh", "tweedie", "log_cosh", "huber"]:
                loss_method = self.loss
                loss = loss_method.compute_loss(self.y, self.F)
            else:
                loss_method = self.loss
                y_predict = self.F.mapValues(lambda val: loss_method.predict(val))
                loss = loss_method.compute_loss(self.y, y_predict)

        return loss

    def get_grad_and_hess(self, tree_idx):
        LOGGER.info("get grad and hess of tree {}".format(tree_idx))
        grad_and_hess_subtree = self.grad_and_hess.mapValues(
            lambda grad_and_hess: (grad_and_hess[0][tree_idx], grad_and_hess[1][tree_idx]))
        return grad_and_hess_subtree

    def check_convergence(self, loss):
        LOGGER.info("check convergence")
        if self.convergence is None:
            self.convergence = DiffConverge()

        return self.convergence.is_converge(loss)

    def sample_valid_features(self):
        LOGGER.info("sample valid features")
        if self.feature_num is None:
            self.feature_num = self.bin_split_points.shape[0]

        choose_feature = random.choice(range(0, self.feature_num),
                                       max(1, int(self.subsample_feature_rate * self.feature_num)),
                                       replace=False)

        valid_features = [False for i in range(self.feature_num)]
        for fid in choose_feature:
            valid_features[fid] = True
        return valid_features

    def sync_tree_dim(self):
        LOGGER.info("sync tree dim to host")
        federation.remote(obj=self.tree_dim,
                          name=self.transfer_inst.tree_dim.name,
                          tag=self.transfer_inst.generate_transferid(self.transfer_inst.tree_dim),
                          role=consts.HOST,
                          idx=0)

    def sync_stop_flag(self, stop_flag, num_round):
        LOGGER.info("sync stop flag to host, boosting round is {}".format(num_round))
        federation.remote(obj=stop_flag,
                          name=self.transfer_inst.stop_flag.name,
                          tag=self.transfer_inst.generate_transferid(self.transfer_inst.stop_flag, num_round),
                          role=consts.HOST,
                          idx=0)

    def fit(self, data_inst):
        LOGGER.info("begin to train secureboosting guest model")
        data_inst = self.data_alignment(data_inst)
        self.convert_feature_to_bin(data_inst)
        self.set_y()
        self.update_f_value()
        self.generate_encrypter()

        self.sync_tree_dim()

        for i in range(self.num_trees):
            # n_tree = []
            self.compute_grad_and_hess()
            for tidx in range(self.tree_dim):
                tree_inst = HeteroDecisionTreeGuest(self.tree_param)

                tree_inst.set_inputinfo(self.data_bin, self.get_grad_and_hess(tidx),
                                        self.bin_split_points, self.bin_sparse_points)

                valid_features = self.sample_valid_features()
                tree_inst.set_valid_features(valid_features)
                tree_inst.set_encrypter(self.encrypter)
                tree_inst.set_flowid(self.generate_flowid(i, tidx))

                tree_inst.fit()

                tree_meta, tree_param = tree_inst.get_model()
                self.trees_.append(tree_param)
                if self.tree_meta is None:
                    self.tree_meta = tree_meta
                # n_tree.append(tree_inst.get_tree_model())
                self.update_f_value(new_f=tree_inst.predict_weights, tidx=tidx)

            # self.trees_.append(n_tree)
            loss = self.compute_loss()
            self.history_loss.append(loss)
            LOGGER.info("round {} loss is {}".format(i, loss))

            if self.n_iter_no_change is True:
                if self.check_convergence(loss):
                    self.sync_stop_flag(True, i)
                    break
                else:
                    self.sync_stop_flag(False, i)

        LOGGER.info("end to train secureboosting guest model")

    def predict_f_value(self, data_inst):
        LOGGER.info("predict tree f value, there are {} trees".format(len(self.trees_)))
        tree_dim = self.tree_dim
        init_score = self.init_score
        self.F = data_inst.mapValues(lambda v: init_score)
        rounds = len(self.trees_) // self.tree_dim
        for i in range(rounds):
            for tidx in range(self.tree_dim):
                tree_inst = HeteroDecisionTreeGuest(self.tree_param)
                tree_inst.load_model(self.tree_meta, self.trees_[i * self.tree_dim + tidx])
                # tree_inst.set_tree_model(self.trees_[i * self.tree_dim + tidx])
                tree_inst.set_flowid(self.generate_flowid(i, tidx))

                predict_data = tree_inst.predict(data_inst)
                self.update_f_value(new_f=predict_data, tidx=tidx)

    def predict(self, data_inst, predict_param):
        LOGGER.info("start predict")
        data_inst = self.data_alignment(data_inst)
        self.predict_f_value(data_inst)
        if self.task_type == consts.CLASSIFICATION:
            loss_method = self.loss
            predicts = self.F.mapValues(lambda f: loss_method.predict(f))
        elif self.task_type == consts.REGRESSION:
            if self.objective_param.objective in ["lse", "lae", "huber", "log_cosh", "fair", "tweedie"]:
                predicts = self.F
            else:
                raise NotImplementedError("objective {} not supported yet".format(
                    self.objective_param.objective))

        if self.task_type == consts.CLASSIFICATION:
            classes_ = self.classes_
            if self.num_classes == 2:
                predict_label = predicts.mapValues(
                    lambda pred: classes_[1] if pred > predict_param.threshold else classes_[0])
            else:
                predict_label = predicts.mapValues(lambda preds: classes_[np.argmax(preds)])

            if predict_param.with_proba:
                predict_result = data_inst.join(predicts,
                                                lambda inst, predict_prob: (inst.label, predict_prob))
            else:
                predict_result = data_inst.mapValues(lambda inst: (inst.label, None))

            predict_result = predict_result.join(
                predict_label,
                lambda label_prob, predict_label: (label_prob[0], label_prob[1], predict_label))
        elif self.task_type == consts.REGRESSION:
            predict_result = data_inst.join(predicts, lambda inst, pred: (inst.label, pred, None))
        else:
            raise NotImplementedError("task type {} not supported yet".format(self.task_type))

        LOGGER.info("end predict")
        return predict_result

    def get_model_meta(self):
        model_meta = BoostingTreeModelMeta()

        model_meta.tree_meta.CopyFrom(self.tree_meta)
        model_meta.learning_rate = self.learning_rate
        model_meta.num_trees = self.num_trees
        model_meta.quantile_meta.CopyFrom(QuantileMeta(quantile_method=self.quantile_method,
                                                       bin_num=self.bin_num,
                                                       bin_gap=self.bin_gap,
                                                       bin_sample_num=self.bin_sample_num))
        # model_meta.objective.CopyFrom(ObjectiveParamMeta(objective=self.objective_param.objective,
        #                                                  param=self.objective_param.params))
        model_meta.objective_meta.CopyFrom(ObjectiveMeta(objective=self.objective_param.objective,
                                                         param=self.objective_param.params))
        model_meta.task_type = self.task_type
        model_meta.tree_dim = self.tree_dim
        model_meta.n_iter_no_change = self.n_iter_no_change
        model_meta.tol = self.tol
        model_meta.num_classes = self.num_classes
        model_meta.classes_.extend(map(str, self.classes_))

        meta_name = "HeteroSecureBoostingTreeGuest.meta"

        return meta_name, model_meta

    def set_model_meta(self, model_meta):
        self.tree_meta = model_meta.tree_meta
        self.learning_rate = model_meta.learning_rate
        self.num_trees = model_meta.num_trees
        self.quantile_method = model_meta.quantile_meta.quantile_method
        self.bin_num = model_meta.quantile_meta.bin_num
        self.bin_gap = model_meta.quantile_meta.bin_gap
        self.bin_sample_num = model_meta.quantile_meta.bin_sample_num
        self.objective_param.objective = model_meta.objective_meta.objective
        self.objective_param.params = list(model_meta.objective_meta.param)
        self.task_type = model_meta.task_type
        self.tree_dim = model_meta.tree_dim
        self.num_classes = model_meta.num_classes
        self.n_iter_no_change = model_meta.n_iter_no_change
        self.tol = model_meta.tol
        self.classes_ = list(model_meta.classes_)

        self.set_loss(self.objective_param)

    def get_model_param(self):
        model_param = BoostingTreeModelParam()

        model_param.tree_num = len(list(self.trees_))
        model_param.trees_.extend(self.trees_)
        model_param.init_score.extend(self.init_score)
        model_param.losses.extend(self.history_loss)

        param_name = "HeteroSecureBoostingTreeGuest.param"

        return param_name, model_param

    def set_model_param(self, model_param):
        self.trees_ = list(model_param.trees_)
        self.init_score = np.array(list(model_param.init_score))
        self.history_loss = list(model_param.losses)

    def save_model(self, model_table, model_namespace):
        LOGGER.info("save model")
        meta_name, meta_protobuf = self.get_model_meta()
        param_name, param_protobuf = self.get_model_param()

        manager.save_model(buffer_type=meta_name,
                           proto_buffer=meta_protobuf,
                           name=model_table,
                           namespace=model_namespace)

        manager.save_model(buffer_type=param_name,
                           proto_buffer=param_protobuf,
                           name=model_table,
                           namespace=model_namespace)

        return [(meta_name, param_name)]

    def load_model(self, model_table, model_namespace):
        LOGGER.info("load model")
        model_meta = BoostingTreeModelMeta()
        manager.read_model(buffer_type="HeteroSecureBoostingTreeGuest.meta",
                           proto_buffer=model_meta,
                           name=model_table,
                           namespace=model_namespace)
        self.set_model_meta(model_meta)

        model_param = BoostingTreeModelParam()
        manager.read_model(buffer_type="HeteroSecureBoostingTreeGuest.param",
                           proto_buffer=model_param,
                           name=model_table,
                           namespace=model_namespace)
        self.set_model_param(model_param)

    def evaluate(self, labels, pred_prob, pred_labels, evaluate_param):
        LOGGER.info("evaluate data")
        predict_res = None
        if self.task_type == consts.CLASSIFICATION:
            if evaluate_param.classi_type == consts.BINARY:
                predict_res = pred_prob
            elif evaluate_param.classi_type == consts.MULTY:
                predict_res = pred_labels
            else:
                LOGGER.warning("unknown classification type, return None as evaluation results")
        elif self.task_type == consts.REGRESSION:
            predict_res = pred_prob
        else:
            LOGGER.warning("unknown task type, return None as evaluation results")

        eva = Evaluation(evaluate_param.classi_type)
        return eva.report(labels, predict_res, evaluate_param.metrics,
                          evaluate_param.thresholds, evaluate_param.pos_label)
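# ----------------------------------------------------------------------------
# Usage sketch (illustrative, not part of the original module). Within the FATE
# job runtime this guest-side component is typically driven along these lines;
# the exact driver wiring and parameter objects depend on the framework version:
#
#     guest = HeteroSecureBoostingTreeGuest(secureboost_tree_param)
#     guest.set_flowid(flowid)
#     guest.fit(data_inst)                              # federated training with the host party
#     result = guest.predict(data_inst, predict_param)  # (label, prob, predicted label) per sample
#     guest.save_model(model_table, model_namespace)
#
# `data_inst` is assumed to be a DTable of Instance objects produced by the
# data-IO component; the host party must run its counterpart concurrently.
# ----------------------------------------------------------------------------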
class HeteroSecureBoostingTreeGuest(BoostingTree):
    def __init__(self):
        super(HeteroSecureBoostingTreeGuest, self).__init__()

        self.convergence = None
        self.y = None
        self.F = None
        self.predict_F = None
        self.data_bin = None
        self.loss = None
        self.init_score = None
        self.classes_dict = {}
        self.classes_ = []
        self.num_classes = 0
        self.classify_target = "binary"
        self.feature_num = None
        self.encrypter = None
        self.grad_and_hess = None
        self.tree_dim = 1
        self.tree_meta = None
        self.trees_ = []
        self.history_loss = []
        self.bin_split_points = None
        self.bin_sparse_points = None
        self.encrypted_mode_calculator = None
        self.feature_importances_ = {}
        self.role = consts.GUEST

        self.transfer_variable = HeteroSecureBoostingTreeTransferVariable()

    def set_loss(self, objective_param):
        loss_type = objective_param.objective
        params = objective_param.params
        LOGGER.info("set objective, objective is {}".format(loss_type))
        if self.task_type == consts.CLASSIFICATION:
            if loss_type == "cross_entropy":
                if self.num_classes == 2:
                    self.loss = SigmoidBinaryCrossEntropyLoss()
                else:
                    self.loss = SoftmaxCrossEntropyLoss()
            else:
                raise NotImplementedError("objective %s not supported yet" % loss_type)
        elif self.task_type == consts.REGRESSION:
            if loss_type == "lse":
                self.loss = LeastSquaredErrorLoss()
            elif loss_type == "lae":
                self.loss = LeastAbsoluteErrorLoss()
            elif loss_type == "huber":
                self.loss = HuberLoss(params[0])
            elif loss_type == "fair":
                self.loss = FairLoss(params[0])
            elif loss_type == "tweedie":
                self.loss = TweedieLoss(params[0])
            elif loss_type == "log_cosh":
                self.loss = LogCoshLoss()
            else:
                raise NotImplementedError("objective %s not supported yet" % loss_type)
        else:
            raise NotImplementedError("objective %s not supported yet" % loss_type)

    def convert_feature_to_bin(self, data_instance):
        LOGGER.info("convert feature to bins")
        param_obj = FeatureBinningParam(bin_num=self.bin_num)
        if self.use_missing:
            binning_obj = QuantileBinning(param_obj, abnormal_list=[NoneType()])
        else:
            binning_obj = QuantileBinning(param_obj)

        binning_obj.fit_split_points(data_instance)
        self.data_bin, self.bin_split_points, self.bin_sparse_points = \
            binning_obj.convert_feature_to_bin(data_instance)
        LOGGER.info("convert feature to bins over")

    def set_y(self):
        LOGGER.info("set label from data and check label")
        self.y = self.data_bin.mapValues(lambda instance: instance.label)
        self.check_label()

    def generate_flowid(self, round_num, tree_num):
        LOGGER.info("generate flowid, flowid {}".format(self.flowid))
        return ".".join(map(str, [self.flowid, round_num, tree_num]))

    def check_label(self):
        LOGGER.info("check label")
        if self.task_type == consts.CLASSIFICATION:
            self.num_classes, self.classes_ = ClassifyLabelChecker.validate_label(self.data_bin)
            if self.num_classes > 2:
                self.classify_target = "multinomial"
                self.tree_dim = self.num_classes

            range_from_zero = True
            for _class in self.classes_:
                try:
                    if _class >= 0 and _class < self.num_classes and isinstance(_class, int):
                        continue
                    else:
                        range_from_zero = False
                        break
                except:
                    range_from_zero = False

            self.classes_ = sorted(self.classes_)
            if not range_from_zero:
                class_mapping = dict(zip(self.classes_, range(self.num_classes)))
                self.y = self.y.mapValues(lambda _class: class_mapping[_class])
        else:
            RegressionLabelChecker.validate_label(self.data_bin)

        self.set_loss(self.objective_param)

    def generate_encrypter(self):
        LOGGER.info("generate encrypter")
        if self.encrypt_param.method.lower() == consts.PAILLIER.lower():
            self.encrypter = PaillierEncrypt()
            self.encrypter.generate_key(self.encrypt_param.key_length)
        elif self.encrypt_param.method.lower() == consts.ITERATIVEAFFINE.lower():
            self.encrypter = IterativeAffineEncrypt()
            self.encrypter.generate_key(self.encrypt_param.key_length)
        else:
            raise NotImplementedError("encrypt method not supported yet")

        self.encrypted_calculator = EncryptModeCalculator(self.encrypter,
                                                          self.calculated_mode,
                                                          self.re_encrypted_rate)

    @staticmethod
    def accumulate_f(f_val, new_f_val, lr=0.1, idx=0):
        f_val[idx] += lr * new_f_val
        return f_val

    def update_feature_importance(self, tree_feature_importance):
        for fid in tree_feature_importance:
            if fid not in self.feature_importances_:
                self.feature_importances_[fid] = 0
            self.feature_importances_[fid] += tree_feature_importance[fid]

    def update_f_value(self, new_f=None, tidx=-1, mode="train"):
        LOGGER.info("update tree f value, tree idx is {}".format(tidx))
        if mode == "train" and self.F is None:
            if self.tree_dim > 1:
                self.F, self.init_score = self.loss.initialize(self.y, self.tree_dim)
            else:
                self.F, self.init_score = self.loss.initialize(self.y)
        else:
            accumulate_f = functools.partial(self.accumulate_f,
                                             lr=self.learning_rate,
                                             idx=tidx)
            if mode == "train":
                self.F = self.F.join(new_f, accumulate_f)
            else:
                self.predict_F = self.predict_F.join(new_f, accumulate_f)

    def compute_grad_and_hess(self):
        LOGGER.info("compute grad and hess")
        loss_method = self.loss
        if self.task_type == consts.CLASSIFICATION:
            self.grad_and_hess = self.y.join(
                self.F,
                lambda y, f_val: (loss_method.compute_grad(y, loss_method.predict(f_val)),
                                  loss_method.compute_hess(y, loss_method.predict(f_val))))
        else:
            self.grad_and_hess = self.y.join(
                self.F,
                lambda y, f_val: (loss_method.compute_grad(y, f_val),
                                  loss_method.compute_hess(y, f_val)))

    def compute_loss(self):
        LOGGER.info("compute loss")
        if self.task_type == consts.CLASSIFICATION:
            loss_method = self.loss
            y_predict = self.F.mapValues(lambda val: loss_method.predict(val))
            loss = loss_method.compute_loss(self.y, y_predict)
        elif self.task_type == consts.REGRESSION:
            if self.objective_param.objective in ["lse", "lae", "logcosh", "tweedie", "log_cosh", "huber"]:
                loss_method = self.loss
                loss = loss_method.compute_loss(self.y, self.F)
            else:
                loss_method = self.loss
                y_predict = self.F.mapValues(lambda val: loss_method.predict(val))
                loss = loss_method.compute_loss(self.y, y_predict)

        return float(loss)

    def get_grad_and_hess(self, tree_idx):
        LOGGER.info("get grad and hess of tree {}".format(tree_idx))
        grad_and_hess_subtree = self.grad_and_hess.mapValues(
            lambda grad_and_hess: (grad_and_hess[0][tree_idx], grad_and_hess[1][tree_idx]))
        return grad_and_hess_subtree

    def check_convergence(self, loss):
        LOGGER.info("check convergence")
        if self.convergence is None:
            self.convergence = converge_func_factory("diff", self.tol)

        return self.convergence.is_converge(loss)

    def sample_valid_features(self):
        LOGGER.info("sample valid features")
        if self.feature_num is None:
            self.feature_num = self.bin_split_points.shape[0]

        choose_feature = random.choice(range(0, self.feature_num),
                                       max(1, int(self.subsample_feature_rate * self.feature_num)),
                                       replace=False)

        valid_features = [False for i in range(self.feature_num)]
        for fid in choose_feature:
            valid_features[fid] = True
        return valid_features

    def sync_tree_dim(self):
        LOGGER.info("sync tree dim to host")
        self.transfer_variable.tree_dim.remote(self.tree_dim, role=consts.HOST, idx=-1)

    def sync_stop_flag(self, stop_flag, num_round):
        LOGGER.info("sync stop flag to host, boosting round is {}".format(num_round))
        self.transfer_variable.stop_flag.remote(stop_flag,
                                                role=consts.HOST,
                                                idx=-1,
                                                suffix=(num_round,))

    def fit(self, data_inst, validate_data=None):
        LOGGER.info("begin to train secureboosting guest model")
        self.gen_feature_fid_mapping(data_inst.schema)
        data_inst = self.data_alignment(data_inst)
        self.convert_feature_to_bin(data_inst)
        self.set_y()
        self.update_f_value()
        self.generate_encrypter()

        self.sync_tree_dim()

        self.callback_meta("loss", "train",
                           MetricMeta(name="train",
                                      metric_type="LOSS",
                                      extra_metas={"unit_name": "iters"}))

        validation_strategy = self.init_validation_strategy(data_inst, validate_data)

        for i in range(self.num_trees):
            self.compute_grad_and_hess()
            for tidx in range(self.tree_dim):
                tree_inst = HeteroDecisionTreeGuest(self.tree_param)

                tree_inst.set_inputinfo(self.data_bin, self.get_grad_and_hess(tidx),
                                        self.bin_split_points, self.bin_sparse_points)

                valid_features = self.sample_valid_features()
                tree_inst.set_valid_features(valid_features)
                tree_inst.set_encrypter(self.encrypter)
                tree_inst.set_encrypted_mode_calculator(self.encrypted_calculator)
                tree_inst.set_flowid(self.generate_flowid(i, tidx))
                tree_inst.set_host_party_idlist(self.component_properties.host_party_idlist)
                tree_inst.set_runtime_idx(self.component_properties.local_partyid)

                tree_inst.fit()

                tree_meta, tree_param = tree_inst.get_model()
                self.trees_.append(tree_param)
                if self.tree_meta is None:
                    self.tree_meta = tree_meta
                self.update_f_value(new_f=tree_inst.predict_weights, tidx=tidx)
                self.update_feature_importance(tree_inst.get_feature_importance())

            loss = self.compute_loss()
            self.history_loss.append(loss)
            LOGGER.info("round {} loss is {}".format(i, loss))
            LOGGER.debug("type of loss is {}".format(type(loss).__name__))

            self.callback_metric("loss", "train", [Metric(i, loss)])

            if validation_strategy:
                validation_strategy.validate(self, i)

            if self.n_iter_no_change is True:
                if self.check_convergence(loss):
                    self.sync_stop_flag(True, i)
                    break
                else:
                    self.sync_stop_flag(False, i)

        LOGGER.debug("history loss is {}".format(min(self.history_loss)))
        self.callback_meta("loss", "train",
                           MetricMeta(name="train",
                                      metric_type="LOSS",
                                      extra_metas={"Best": min(self.history_loss)}))

        LOGGER.info("end to train secureboosting guest model")

    def predict_f_value(self, data_inst):
        LOGGER.info("predict tree f value, there are {} trees".format(len(self.trees_)))
        tree_dim = self.tree_dim
        init_score = self.init_score
        self.predict_F = data_inst.mapValues(lambda v: init_score)
        rounds = len(self.trees_) // self.tree_dim
        for i in range(rounds):
            for tidx in range(self.tree_dim):
                tree_inst = HeteroDecisionTreeGuest(self.tree_param)
                tree_inst.load_model(self.tree_meta, self.trees_[i * self.tree_dim + tidx])
                # tree_inst.set_tree_model(self.trees_[i * self.tree_dim + tidx])
                tree_inst.set_flowid(self.generate_flowid(i, tidx))
                tree_inst.set_runtime_idx(self.component_properties.local_partyid)
                tree_inst.set_host_party_idlist(self.component_properties.host_party_idlist)

                predict_data = tree_inst.predict(data_inst)
                self.update_f_value(new_f=predict_data, tidx=tidx, mode="predict")

    def predict(self, data_inst):
        LOGGER.info("start predict")
        data_inst = self.data_alignment(data_inst)
        self.predict_f_value(data_inst)
        if self.task_type == consts.CLASSIFICATION:
            loss_method = self.loss
            if self.num_classes == 2:
                predicts = self.predict_F.mapValues(lambda f: float(loss_method.predict(f)))
            else:
                predicts = self.predict_F.mapValues(lambda f: loss_method.predict(f).tolist())
        elif self.task_type == consts.REGRESSION:
            if self.objective_param.objective in ["lse", "lae", "huber", "log_cosh", "fair", "tweedie"]:
                predicts = self.predict_F
            else:
                raise NotImplementedError("objective {} not supported yet".format(
                    self.objective_param.objective))

        if self.task_type == consts.CLASSIFICATION:
            classes_ = self.classes_
            if self.num_classes == 2:
                threshold = self.predict_param.threshold
                predict_result = data_inst.join(
                    predicts,
                    lambda inst, pred: [inst.label,
                                        classes_[1] if pred > threshold else classes_[0],
                                        pred,
                                        {"0": 1 - pred, "1": pred}])
            else:
                predict_label = predicts.mapValues(lambda preds: classes_[np.argmax(preds)])
                predict_result = data_inst.join(
                    predicts,
                    lambda inst, preds: [inst.label,
                                         classes_[np.argmax(preds)],
                                         np.max(preds),
                                         dict(zip(map(str, classes_), preds))])
        elif self.task_type == consts.REGRESSION:
            predict_result = data_inst.join(
                predicts,
                lambda inst, pred: [inst.label, float(pred), float(pred), {"label": float(pred)}])
        else:
            raise NotImplementedError("task type {} not supported yet".format(self.task_type))

        LOGGER.info("end predict")
        return predict_result

    def get_feature_importance(self):
        return self.feature_importances_

    def get_model_meta(self):
        model_meta = BoostingTreeModelMeta()

        model_meta.tree_meta.CopyFrom(self.tree_meta)
        model_meta.learning_rate = self.learning_rate
        model_meta.num_trees = self.num_trees
        model_meta.quantile_meta.CopyFrom(QuantileMeta(bin_num=self.bin_num))
        model_meta.objective_meta.CopyFrom(ObjectiveMeta(objective=self.objective_param.objective,
                                                         param=self.objective_param.params))
        model_meta.task_type = self.task_type
        # model_meta.tree_dim = self.tree_dim
        model_meta.n_iter_no_change = self.n_iter_no_change
        model_meta.tol = self.tol
        # model_meta.num_classes = self.num_classes
        # model_meta.classes_.extend(map(str, self.classes_))
        # model_meta.need_run = self.need_run

        meta_name = "HeteroSecureBoostingTreeGuestMeta"

        return meta_name, model_meta

    def set_model_meta(self, model_meta):
        self.tree_meta = model_meta.tree_meta
        self.learning_rate = model_meta.learning_rate
        self.num_trees = model_meta.num_trees
        self.bin_num = model_meta.quantile_meta.bin_num
        self.objective_param.objective = model_meta.objective_meta.objective
        self.objective_param.params = list(model_meta.objective_meta.param)
        self.task_type = model_meta.task_type
        # self.tree_dim = model_meta.tree_dim
        # self.num_classes = model_meta.num_classes
        self.n_iter_no_change = model_meta.n_iter_no_change
        self.tol = model_meta.tol
        # self.classes_ = list(model_meta.classes_)
        # self.set_loss(self.objective_param)

    def get_model_param(self):
        model_param = BoostingTreeModelParam()

        model_param.tree_num = len(list(self.trees_))
        model_param.tree_dim = self.tree_dim
        model_param.trees_.extend(self.trees_)
        model_param.init_score.extend(self.init_score)
        model_param.losses.extend(self.history_loss)
        model_param.classes_.extend(map(str, self.classes_))
        model_param.num_classes = self.num_classes

        feature_importances = list(self.get_feature_importance().items())
        feature_importances = sorted(feature_importances, key=itemgetter(1), reverse=True)
        feature_importance_param = []
        for (sitename, fid), _importance in feature_importances:
            feature_importance_param.append(FeatureImportanceInfo(sitename=sitename,
                                                                  fid=fid,
                                                                  importance=_importance))
        model_param.feature_importances.extend(feature_importance_param)

        model_param.feature_name_fid_mapping.update(self.feature_name_fid_mapping)

        param_name = "HeteroSecureBoostingTreeGuestParam"

        return param_name, model_param

    def set_model_param(self, model_param):
        self.trees_ = list(model_param.trees_)
        self.init_score = np.array(list(model_param.init_score))
        self.history_loss = list(model_param.losses)
        self.classes_ = list(model_param.classes_)
        self.tree_dim = model_param.tree_dim
        self.num_classes = model_param.num_classes
        self.feature_name_fid_mapping.update(model_param.feature_name_fid_mapping)

    def get_metrics_param(self):
        if self.task_type == consts.CLASSIFICATION:
            if self.num_classes == 2:
                return EvaluateParam(eval_type="binary", pos_label=self.classes_[1])
            else:
                return EvaluateParam(eval_type="multi")
        else:
            return EvaluateParam(eval_type="regression")

    def export_model(self):
        if self.need_cv:
            return None

        meta_name, meta_protobuf = self.get_model_meta()
        param_name, param_protobuf = self.get_model_param()

        self.model_output = {meta_name: meta_protobuf,
                             param_name: param_protobuf}

        return self.model_output

    def load_model(self, model_dict):
        model_param = None
        model_meta = None
        for _, value in model_dict["model"].items():
            for model in value:
                if model.endswith("Meta"):
                    model_meta = value[model]
                if model.endswith("Param"):
                    model_param = value[model]

        LOGGER.info("load model")
        self.set_model_meta(model_meta)
        self.set_model_param(model_param)
        self.set_loss(self.objective_param)
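# ----------------------------------------------------------------------------
# Usage sketch (illustrative, not part of the original module). In this newer
# variant the component is constructed without arguments and configured by the
# surrounding framework before fit() is invoked, roughly:
#
#     guest = HeteroSecureBoostingTreeGuest()
#     guest.fit(data_inst, validate_data=validate_inst)   # federated training with the host
#     result = guest.predict(data_inst)                   # [label, predicted label, score, prob dict]
#     model_output = guest.export_model()                 # {meta_name: ..., param_name: ...}
#
# `data_inst` / `validate_inst` are assumed to be DTables of Instance objects;
# parameter injection and party wiring are handled by the component runtime.
# ----------------------------------------------------------------------------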
class HomoSecureBoostingTreeClient(BoostingTree):
    def __init__(self):
        super(HomoSecureBoostingTreeClient, self).__init__()

        self.mode = consts.HOMO
        self.validation_strategy = None
        self.loss_fn = None
        self.cur_sample_weights = None
        self.y = None
        self.y_hat = None
        self.y_hat_predict = None
        self.feature_num = None
        self.num_classes = 2
        self.tree_dim = 1
        self.trees = []
        self.feature_importance = {}
        self.transfer_inst = HomoSecureBoostingTreeTransferVariable()
        self.role = None
        self.data_bin = None
        self.bin_split_points = None
        self.bin_sparse_points = None
        self.init_score = None
        self.local_loss_history = []
        self.classes_ = []
        self.role = consts.GUEST

        # store learnt model param
        self.tree_meta = None
        self.learnt_tree_param = []

        self.aggregator = SecureBoostClientAggregator()
        self.binning_obj = HomoFeatureBinningClient()

    def set_loss_function(self, objective_param):
        loss_type = objective_param.objective
        params = objective_param.params
        LOGGER.info("set objective, objective is {}".format(loss_type))
        if self.task_type == consts.CLASSIFICATION:
            if loss_type == "cross_entropy":
                if self.num_classes == 2:
                    self.loss_fn = SigmoidBinaryCrossEntropyLoss()
                else:
                    self.loss_fn = SoftmaxCrossEntropyLoss()
            else:
                raise NotImplementedError("objective %s not supported yet" % loss_type)
        elif self.task_type == consts.REGRESSION:
            if loss_type == "lse":
                self.loss_fn = LeastSquaredErrorLoss()
            elif loss_type == "lae":
                self.loss_fn = LeastAbsoluteErrorLoss()
            elif loss_type == "huber":
                self.loss_fn = HuberLoss(params[0])
            elif loss_type == "fair":
                self.loss_fn = FairLoss(params[0])
            elif loss_type == "tweedie":
                self.loss_fn = TweedieLoss(params[0])
            elif loss_type == "log_cosh":
                self.loss_fn = LogCoshLoss()
            else:
                raise NotImplementedError("objective %s not supported yet" % loss_type)
        else:
            raise NotImplementedError("objective %s not supported yet" % loss_type)

    def federated_binning(self, data_instance):
        if self.use_missing:
            binning_result = self.binning_obj.average_run(data_instances=data_instance,
                                                          bin_num=self.bin_num,
                                                          abnormal_list=[NoneType()])
        else:
            binning_result = self.binning_obj.average_run(data_instances=data_instance,
                                                          bin_num=self.bin_num)

        return self.binning_obj.convert_feature_to_bin(data_instance, binning_result)

    def compute_local_grad_and_hess(self, y_hat):
        loss_method = self.loss_fn
        if self.task_type == consts.CLASSIFICATION:
            grad_and_hess = self.y.join(
                y_hat,
                lambda y, f_val: (loss_method.compute_grad(y, loss_method.predict(f_val)),
                                  loss_method.compute_hess(y, loss_method.predict(f_val))))
        else:
            grad_and_hess = self.y.join(
                y_hat,
                lambda y, f_val: (loss_method.compute_grad(y, f_val),
                                  loss_method.compute_hess(y, f_val)))

        return grad_and_hess

    def compute_local_loss(self, y, y_hat):
        LOGGER.info('computing local loss')

        loss_method = self.loss_fn
        if self.objective_param.objective in ["lse", "lae", "logcosh", "tweedie", "log_cosh", "huber"]:
            # regression tasks
            y_predict = y_hat
        else:
            # classification tasks
            y_predict = y_hat.mapValues(lambda val: loss_method.predict(val))

        loss = loss_method.compute_loss(y, y_predict)
        return float(loss)

    @staticmethod
    def get_subtree_grad_and_hess(g_h, t_idx: int):
        """
        Args:
            g_h: table of (grad, hess) values
            t_idx: tree index
        Returns:
            grad and hess of the sub tree
        """
        LOGGER.info("get grad and hess of tree {}".format(t_idx))
        grad_and_hess_subtree = g_h.mapValues(
            lambda grad_and_hess: (grad_and_hess[0][t_idx], grad_and_hess[1][t_idx]))
        return grad_and_hess_subtree

    def sample_valid_feature(self):
        if self.feature_num is None:
            self.feature_num = self.bin_split_points.shape[0]

        chosen_feature = random.choice(range(0, self.feature_num),
                                       max(1, int(self.subsample_feature_rate * self.feature_num)),
                                       replace=False)
        valid_features = [False for i in range(self.feature_num)]
        for fid in chosen_feature:
            valid_features[fid] = True
        return valid_features

    @staticmethod
    def add_y_hat(f_val, new_f_val, lr=0.1, idx=0):
        f_val[idx] += lr * new_f_val
        return f_val

    def update_y_hat_val(self, new_val=None, mode='train', tree_idx=0):
        LOGGER.debug('update y_hat value, current tree is {}'.format(tree_idx))
        add_func = functools.partial(self.add_y_hat, lr=self.learning_rate, idx=tree_idx)
        if mode == 'train':
            self.y_hat = self.y_hat.join(new_val, add_func)
        else:
            self.y_hat_predict = self.y_hat_predict.join(new_val, add_func)

    def update_feature_importance(self, tree_feature_importance):
        for fid in tree_feature_importance:
            if fid not in self.feature_importance:
                self.feature_importance[fid] = 0
            self.feature_importance[fid] += tree_feature_importance[fid]

    def sync_feature_num(self):
        self.transfer_inst.feature_number.remote(self.feature_num,
                                                 role=consts.ARBITER,
                                                 idx=-1,
                                                 suffix=('feat_num',))

    def sync_local_loss(self, cur_loss: float, sample_num: int, suffix):
        data = {'cur_loss': cur_loss, 'sample_num': sample_num}
        self.transfer_inst.loss_status.remote(data,
                                              role=consts.ARBITER,
                                              idx=-1,
                                              suffix=suffix)
        LOGGER.debug('loss status sent')

    def sync_tree_dim(self, tree_dim: int):
        self.transfer_inst.tree_dim.remote(tree_dim, suffix=('tree_dim',))
        LOGGER.debug('tree dim sent')

    def sync_stop_flag(self, suffix) -> bool:
        flag = self.transfer_inst.stop_flag.get(idx=0, suffix=suffix)
        return flag

    def check_labels(self, data_inst) -> List[int]:
        LOGGER.debug('checking labels')
        classes_ = None
        if self.task_type == consts.CLASSIFICATION:
            num_classes, classes_ = ClassifyLabelChecker.validate_label(data_inst)
        else:
            RegressionLabelChecker.validate_label(data_inst)

        return classes_

    def generate_flowid(self, round_num, tree_num):
        LOGGER.info("generate flowid, flowid {}".format(self.flowid))
        return ".".join(map(str, [self.flowid, round_num, tree_num]))

    def label_alignment(self, labels: List[int]):
        self.transfer_inst.local_labels.remote(labels, suffix=('label_align',))

    def get_valid_features(self, epoch_idx, t_idx):
        valid_feature = self.transfer_inst.valid_features.get(
            idx=0, suffix=('valid_features', epoch_idx, t_idx))
        return valid_feature

    def fit(self, data_inst, validate_data=None):
        # binning
        data_inst = self.data_alignment(data_inst)
        self.data_bin, self.bin_split_points, self.bin_sparse_points = \
            self.federated_binning(data_inst)

        # fid mapping
        self.gen_feature_fid_mapping(data_inst.schema)

        # set feature_num
        self.feature_num = self.bin_split_points.shape[0]

        # sync feature num
        self.sync_feature_num()

        # initialize validation strategy
        self.validation_strategy = self.init_validation_strategy(train_data=data_inst,
                                                                 validate_data=validate_data)

        # check labels
        local_classes = self.check_labels(self.data_bin)

        # sync label class and set y
        if self.task_type == consts.CLASSIFICATION:
            self.transfer_inst.local_labels.remote(local_classes,
                                                   role=consts.ARBITER,
                                                   suffix=('label_align',))
            new_label_mapping = self.transfer_inst.label_mapping.get(idx=0,
                                                                     suffix=('label_mapping',))
            self.classes_ = [new_label_mapping[k] for k in new_label_mapping]
            # set labels
            self.num_classes = len(new_label_mapping)
            LOGGER.debug('num_classes is {}'.format(self.num_classes))
            self.y = self.data_bin.mapValues(lambda instance: new_label_mapping[instance.label])
            # set tree dimension
            self.tree_dim = self.num_classes if self.num_classes > 2 else 1
        else:
            self.y = self.data_bin.mapValues(lambda instance: instance.label)

        # set loss function
        self.set_loss_function(self.objective_param)

        # set y_hat_val
        self.y_hat, self.init_score = self.loss_fn.initialize(self.y) if self.tree_dim == 1 else \
            self.loss_fn.initialize(self.y, self.tree_dim)

        for epoch_idx in range(self.num_trees):
            g_h = self.compute_local_grad_and_hess(self.y_hat)

            for t_idx in range(self.tree_dim):
                valid_features = self.get_valid_features(epoch_idx, t_idx)
                LOGGER.debug('valid features are {}'.format(valid_features))

                subtree_g_h = self.get_subtree_grad_and_hess(g_h, t_idx)
                flow_id = self.generate_flowid(epoch_idx, t_idx)
                new_tree = HomoDecisionTreeClient(self.tree_param, self.data_bin,
                                                  self.bin_split_points, self.bin_sparse_points,
                                                  subtree_g_h, valid_feature=valid_features,
                                                  epoch_idx=epoch_idx, role=self.role,
                                                  flow_id=flow_id, tree_idx=t_idx, mode='train')
                new_tree.fit()

                # update y_hat_val
                self.update_y_hat_val(new_val=new_tree.sample_weights, mode='train', tree_idx=t_idx)

                self.trees.append(new_tree)
                self.tree_meta, new_tree_param = new_tree.get_model()
                self.learnt_tree_param.append(new_tree_param)
                self.update_feature_importance(new_tree.get_feature_importance())

            # sync loss status
            loss = self.compute_local_loss(self.y, self.y_hat)
            LOGGER.debug('local loss of epoch {} is {}'.format(epoch_idx, loss))

            self.local_loss_history.append(loss)
            self.aggregator.send_local_loss(loss, self.data_bin.count(), suffix=(epoch_idx,))

            # validate
            if self.validation_strategy:
                self.validation_strategy.validate(self, epoch_idx)

            # check stop flag if n_iter_no_change is True
            if self.n_iter_no_change:
                should_stop = self.aggregator.get_converge_status(suffix=(str(epoch_idx),))
                LOGGER.debug('got stop flag {}'.format(should_stop))
                if should_stop:
                    LOGGER.debug('stop triggered')
                    break

            LOGGER.debug('fitting tree {}/{}'.format(epoch_idx, self.num_trees))

        LOGGER.debug('fitting homo decision tree done')

    def predict(self, data_inst):
        to_predict_data = self.data_alignment(data_inst)

        init_score = self.init_score
        self.y_hat_predict = data_inst.mapValues(lambda x: init_score)

        round_num = len(self.learnt_tree_param) // self.tree_dim
        idx = 0
        for round_idx in range(round_num):
            for tree_idx in range(self.tree_dim):
                tree_inst = HomoDecisionTreeClient(tree_param=self.tree_param, mode='predict')
                tree_inst.load_model(model_meta=self.tree_meta,
                                     model_param=self.learnt_tree_param[idx])
                idx += 1
                predict_val = tree_inst.predict(to_predict_data)
                self.update_y_hat_val(predict_val, mode='predict', tree_idx=tree_idx)

        predict_result = None
        if self.task_type == consts.REGRESSION and \
                self.objective_param.objective in ["lse", "lae", "huber", "log_cosh", "fair", "tweedie"]:
            predict_result = to_predict_data.join(
                self.y_hat_predict,
                lambda inst, pred: [inst.label, float(pred), float(pred), {"label": float(pred)}])
        elif self.task_type == consts.CLASSIFICATION:
            classes_ = self.classes_
            loss_func = self.loss_fn
            if self.num_classes == 2:
                predicts = self.y_hat_predict.mapValues(lambda f: float(loss_func.predict(f)))
                threshold = self.predict_param.threshold
                predict_result = to_predict_data.join(
                    predicts,
                    lambda inst, pred: [inst.label,
                                        classes_[1] if pred > threshold else classes_[0],
                                        pred,
                                        {"0": 1 - pred, "1": pred}])
            else:
                predicts = self.y_hat_predict.mapValues(lambda f: loss_func.predict(f).tolist())
                predict_result = to_predict_data.join(
                    predicts,
                    lambda inst, preds: [inst.label,
                                         classes_[np.argmax(preds)],
                                         np.max(preds),
                                         dict(zip(map(str, classes_), preds))])

        return predict_result

    def get_feature_importance(self):
        return self.feature_importance

    def get_model_meta(self):
        model_meta = BoostingTreeModelMeta()

        model_meta.tree_meta.CopyFrom(self.tree_meta)
        model_meta.learning_rate = self.learning_rate
        model_meta.num_trees = self.num_trees
        model_meta.quantile_meta.CopyFrom(QuantileMeta(bin_num=self.bin_num))
        model_meta.objective_meta.CopyFrom(ObjectiveMeta(objective=self.objective_param.objective,
                                                         param=self.objective_param.params))
        model_meta.task_type = self.task_type
        model_meta.n_iter_no_change = self.n_iter_no_change
        model_meta.tol = self.tol

        meta_name = "HomoSecureBoostingTreeGuestMeta"

        return meta_name, model_meta

    def set_model_meta(self, model_meta):
        self.tree_meta = model_meta.tree_meta
        self.learning_rate = model_meta.learning_rate
        self.num_trees = model_meta.num_trees
        self.bin_num = model_meta.quantile_meta.bin_num
        self.objective_param.objective = model_meta.objective_meta.objective
        self.objective_param.params = list(model_meta.objective_meta.param)
        self.task_type = model_meta.task_type
        self.n_iter_no_change = model_meta.n_iter_no_change
        self.tol = model_meta.tol

    def get_model_param(self):
        model_param = BoostingTreeModelParam()

        model_param.tree_num = len(list(self.learnt_tree_param))
        model_param.tree_dim = self.tree_dim
        model_param.trees_.extend(self.learnt_tree_param)
        model_param.init_score.extend(self.init_score)
        model_param.losses.extend(self.local_loss_history)
        model_param.classes_.extend(map(str, self.classes_))
        model_param.num_classes = self.num_classes
        model_param.best_iteration = -1

        feature_importance = list(self.get_feature_importance().items())
        feature_importance = sorted(feature_importance, key=itemgetter(1), reverse=True)
        feature_importance_param = []
        for fid, _importance in feature_importance:
            feature_importance_param.append(FeatureImportanceInfo(sitename=self.role,
                                                                  fid=fid,
                                                                  importance=_importance))
        model_param.feature_importances.extend(feature_importance_param)

        model_param.feature_name_fid_mapping.update(self.feature_name_fid_mapping)

        param_name = "HomoSecureBoostingTreeGuestParam"

        return param_name, model_param

    def get_cur_model(self):
        meta_name, meta_protobuf = self.get_model_meta()
        param_name, param_protobuf = self.get_model_param()
        return {meta_name: meta_protobuf,
                param_name: param_protobuf}

    def set_model_param(self, model_param):
        self.learnt_tree_param = list(model_param.trees_)
        self.init_score = np.array(list(model_param.init_score))
        self.local_loss_history = list(model_param.losses)
        self.classes_ = list(model_param.classes_)
        self.tree_dim = model_param.tree_dim
        self.num_classes = model_param.num_classes
        self.feature_name_fid_mapping.update(model_param.feature_name_fid_mapping)

    def get_metrics_param(self):
        if self.task_type == consts.CLASSIFICATION:
            if self.num_classes == 2:
                return EvaluateParam(eval_type="binary", pos_label=self.classes_[1])
            else:
                return EvaluateParam(eval_type="multi")
        else:
            return EvaluateParam(eval_type="regression")

    def export_model(self):
        if self.need_cv:
            return None

        return self.get_cur_model()

    def load_model(self, model_dict):
        model_param = None
        model_meta = None
        for _, value in model_dict["model"].items():
            for model in value:
                if model.endswith("Meta"):
                    model_meta = value[model]
                if model.endswith("Param"):
                    model_param = value[model]

        LOGGER.info("load model")
        self.set_model_meta(model_meta)
        self.set_model_param(model_param)
        self.set_loss_function(self.objective_param)

    def cross_validation(self, data_instances):
        if not self.need_run:
            return data_instances
        kfold_obj = KFold()
        cv_param = self._get_cv_param()
        kfold_obj.run(cv_param, data_instances, self, True)
        return data_instances
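# ----------------------------------------------------------------------------
# Usage sketch (illustrative, not part of the original module). The homo client
# follows the same fit/predict/export cycle, but coordinates binning, label
# alignment, loss aggregation and convergence checking with the arbiter party:
#
#     client = HomoSecureBoostingTreeClient()
#     client.fit(data_inst, validate_data=validate_inst)
#     result = client.predict(data_inst)
#     model_output = client.export_model()
#
# The arbiter counterpart must be running for the transfer-variable exchanges
# (feature_number, local_labels, valid_features, loss aggregation) to complete.
# ----------------------------------------------------------------------------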