Example #1
    def get_model_param(self):

        model_param = BoostingTreeModelParam()
        model_param.tree_num = len(self.boosting_model_list)
        model_param.tree_dim = self.booster_dim
        model_param.trees_.extend(self.boosting_model_list)
        model_param.init_score.extend(self.init_score)
        model_param.losses.extend(self.history_loss)
        model_param.classes_.extend(map(str, self.classes_))
        model_param.num_classes = self.num_classes
        model_param.model_name = consts.HETERO_SBT
        model_param.best_iteration = -1 if self.validation_strategy is None else self.validation_strategy.best_iteration

        feature_importances = list(self.feature_importances_.items())
        feature_importances = sorted(feature_importances, key=itemgetter(1), reverse=True)
        feature_importance_param = []
        
        for (sitename, fid), importance in feature_importances:
            if consts.GUEST in sitename:
                fullname = self.feature_name_fid_mapping[fid]
            else:
                role_name, party_id = sitename.split(':')
                fullname = generate_anonymous(fid=fid, party_id=party_id, role=role_name)

            feature_importance_param.append(FeatureImportanceInfo(sitename=sitename,
                                                                  fid=fid,
                                                                  importance=importance.importance,
                                                                  fullname=fullname,
                                                                  importance2=importance.importance_2,
                                                                  main=importance.main_type
                                                                  ))
        model_param.feature_importances.extend(feature_importance_param)
        LOGGER.debug('feat importance param {}'.format(feature_importance_param))
        model_param.feature_name_fid_mapping.update(self.feature_name_fid_mapping)

        param_name = consts.HETERO_SBT_GUEST_MODEL + "Param"

        return param_name, model_param
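The ranking step above sorts ((sitename, fid), importance) pairs by their second element, descending. A minimal standalone sketch of the same itemgetter pattern, with made-up site names and plain floats standing in for FATE's importance objects (whose comparison operators the real code relies on):

    from operator import itemgetter

    # hypothetical ((sitename, fid), importance) pairs
    feature_importances = [(("guest:9999", 0), 0.12),
                           (("host:10000", 1), 0.55),
                           (("guest:9999", 2), 0.33)]
    # sort by the importance value (element at index 1), largest first
    ranked = sorted(feature_importances, key=itemgetter(1), reverse=True)
    print(ranked[0])  # (('host:10000', 1), 0.55)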
Example #2
    def run(self, data):
        LOGGER.info("Start repeated id processing.")
        id_map_federation = self.transfer_variable.id_map_from_guest
        party_role = consts.HOST
        if self.repeated_id_owner == consts.HOST:
            id_map_federation = self.transfer_variable.id_map_from_host
            party_role = consts.GUEST

        LOGGER.info("repeated_id_owner:{}".format(self.repeated_id_owner))

        original_schema = data.schema
        if self.repeated_id_owner == self.role:
            id_map = self.__generate_id_map(data)
            LOGGER.info("finish generate id_map, id_map:{}".format(id_map))

            id_map_federation.remote(id_map,
                                     role=party_role,
                                     idx=-1)

            one_feature = data.first()
            if isinstance(one_feature[1], Instance):
                data = data.mapValues(
                    lambda v: Instance(features=np.array(v.features[1:], dtype=np.float64), label=v.label,
                                       inst_id=v.inst_id, weight=v.weight))
            else:
                data = data.mapValues(lambda v: v[1:])
            data.schema = original_schema
            if data.schema.get('header') is not None:
                data.schema['header'] = data.schema['header'][1:]
        else:
            id_map = id_map_federation.get(idx=0)
            LOGGER.info("Get id_map from owner.")
            data = data.flatMap(functools.partial(self.__func_restructure_id, id_map=id_map))
            data.schema = original_schema

        LOGGER.info("Finish repeated id process for owner")

        return data
Example #3
    def convert(self, model_meta, model_param):
        local_vif = model_param.local_vif
        col_names = list(model_param.names)
        local_corr = np.array(model_param.local_corr).reshape(
            model_param.shape, model_param.shape)

        from federatedml.util import LOGGER
        for idx in range(local_corr.shape[0]):
            corr_col = local_corr[idx, :]
            LOGGER.debug(f"local_col_idx: {idx}, corr_col: {corr_col}")

        if model_param.corr:
            corr = np.array(model_param.corr).reshape(*model_param.shapes)

            for idx in range(corr.shape[1]):
                corr_col = corr[:, idx]
                LOGGER.debug(f"col_idx: {idx}, corr_col: {corr_col}")

            host_names = list(list(model_param.all_names)[1].names)
            parties = list(model_param.parties)
        else:
            corr = None
            host_names = None
            parties = None
        pearson_metric = PearsonMetricInfo(local_corr=local_corr,
                                           col_names=col_names,
                                           corr=corr,
                                           host_col_names=host_names,
                                           parties=parties)

        single_info = isometric_model.SingleMetricInfo(values=local_vif,
                                                       col_names=col_names)
        result = isometric_model.IsometricModel()
        result.add_metric_value(metric_name=consts.VIF,
                                metric_info=single_info)
        result.add_metric_value(metric_name=consts.PEARSON,
                                metric_info=pearson_metric)
        return result
Example #4
 def transform_data_label(data, label_encoder):
     data_type = data.schema.get("content_type")
     if data_type == "cluster_result":
         return data.mapValues(
             lambda v: LabelTransformer.replace_predict_label_cluster(
                 v, label_encoder))
     elif data_type == "predict_result":
         predict_detail = data.first()[1].features[3]
         if len(predict_detail) == 1 and list(
                 predict_detail.keys())[0] == "label":
             LOGGER.info(
                 f"Regression prediction result provided. Original data returned."
             )
             return data
         return data.mapValues(lambda v: LabelTransformer.
                               replace_predict_label(v, label_encoder))
     elif data_type is None:
         return data.mapValues(lambda v: LabelTransformer.
                               replace_instance_label(v, label_encoder))
     else:
         raise ValueError(
             f"unknown data type: {data_type} encountered. Label transform aborted."
         )
Example #5
    def backward(self, output_gradient, epoch, batch):
        LOGGER.debug(
            "interactive layer starts backward propagation of epoch {} batch {}"
            .format(epoch, batch))
        activation_backward = self.host_model.backward_activation()[0]

        activation_gradient = output_gradient * activation_backward

        LOGGER.debug(
            "interactive layer updates guest weights of epoch {} batch {}".
            format(epoch, batch))
        guest_input_gradient = self.update_guest(activation_gradient)

        host_weight_gradient, acc_noise = self.backward_interactive(
            activation_gradient, epoch, batch)

        host_input_gradient = self.update_host(activation_gradient,
                                               host_weight_gradient, acc_noise)

        self.send_host_backward_to_host(host_input_gradient.get_obj(), epoch,
                                        batch)

        return guest_input_gradient
Example #6
    def mini_batch_data_generator(self, result='data'):
        """
        Generate mini-batch data or index

        Parameters
        ----------
        result : str, 'data' or 'index', default: 'data'
            Specify whether to yield batch data or batch indices; any other
            value yields (batch_data, index_table) pairs.

        Returns
        -------
        A generator over batch data, batch indices, or (data, index) pairs.
        """
        LOGGER.debug("Currently, batch_num is: {}".format(self.batch_nums))
        if result == 'index':
            for index_table in self.all_index_data:
                yield index_table
        elif result == "data":
            for batch_data in self.all_batch_data:
                yield batch_data
        else:
            for batch_data, index_table in zip(self.all_batch_data, self.all_index_data):
                yield batch_data, index_table
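A simplified, self-contained version of the same dispatch with toy data; mini_batches is a hypothetical top-level helper written for illustration, not FATE API:

    def mini_batches(all_batch_data, all_index_data, result='data'):
        # simplified stand-in for the method above
        if result == 'index':
            yield from all_index_data
        elif result == 'data':
            yield from all_batch_data
        else:
            yield from zip(all_batch_data, all_index_data)

    for batch, index in mini_batches(['b0', 'b1'], ['i0', 'i1'], result='both'):
        print(batch, index)  # b0 i0 / b1 i1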
Example #7
File: sir_param.py  Project: yubo1993/FATE
 def check(self):
     descr = "secure information retrieval param's "
     self.check_decimal_float(self.security_level, descr + "security_level")
     self.oblivious_transfer_protocol = self.check_and_change_lower(self.oblivious_transfer_protocol,
                                                                    [consts.OT_HAUCK.lower()],
                                                                    descr + "oblivious_transfer_protocol")
     self.commutative_encryption = self.check_and_change_lower(self.commutative_encryption,
                                                               [consts.CE_PH.lower()],
                                                               descr + "commutative_encryption")
     self.non_committing_encryption = self.check_and_change_lower(self.non_committing_encryption,
                                                                  [consts.AES.lower()],
                                                                  descr + "non_committing_encryption")
     if self._warn_to_deprecate_param("key_size", descr, "dh_param's key_length"):
         self.dh_params.key_length = self.key_size
     self.dh_params.check()
     if self._warn_to_deprecate_param("raw_retrieval", descr, "dh_param's security_level = 0"):
         self.check_boolean(self.raw_retrieval, descr)
     if not isinstance(self.target_cols, list):
         self.target_cols = [self.target_cols]
     for col in self.target_cols:
         self.check_string(col, descr + "target_cols")
     if len(self.target_cols) == 0:
         LOGGER.warning(f"Both 'target_cols' and 'target_indexes' are empty. Label will be retrieved.")
Example #8
    def __generate_id_map(self, data) -> dict:
        if not self.repeated_id_owner:
            LOGGER.warning("Not a repeated id owner, will not generate id map")
            return {}

        one_feature = data.first()
        if isinstance(one_feature[1], Instance):
            data = data.mapValues(lambda v: v.features[0])
        else:
            data = data.mapValues(lambda v: v[0])

        local_data = data.collect()
        all_id_map = defaultdict(list)
        final_id_map = {}

        for _data in local_data:
            all_id_map[str(_data[1])].append(_data[0])

        for k, v in all_id_map.items():
            if len(v) >= 2:
                final_id_map[k] = v

        return final_id_map
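The grouping logic reduces to: bucket record keys by their repeated-id value, then keep only buckets with two or more members. A plain-Python sketch with toy data:

    from collections import defaultdict

    records = [("k1", "idA"), ("k2", "idB"), ("k3", "idA")]  # (key, repeated-id value)
    all_id_map = defaultdict(list)
    for key, rid in records:
        all_id_map[str(rid)].append(key)

    final_id_map = {rid: keys for rid, keys in all_id_map.items() if len(keys) >= 2}
    print(final_id_map)  # {'idA': ['k1', 'k3']}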
Example #9
    def _func(*args, **kwargs):
        input_with_inst_id = None
        all_args = []
        all_args.extend(args)
        all_args.extend(kwargs.values())
        for arg in all_args:
            if is_table(arg):
                input_with_inst_id = check_with_inst_id(arg)
                break

        result = func(*args, **kwargs)

        if input_with_inst_id is not None and is_table(result):
            if check_is_instance(result):
                result_with_inst_id = check_with_inst_id(result)
                LOGGER.debug(
                    f"Input with match id: {input_with_inst_id} -> output with match id: {result_with_inst_id}"
                )
                if input_with_inst_id and not result_with_inst_id:
                    raise EnvironmentError(
                        f"Input with match id: {input_with_inst_id} -> output with match id: {result_with_inst_id}, "
                        f"func: {func}")
        return result
Example #10
    def get_intersect_doubly_encrypted_id(self, data_instances):
        self._sync_commutative_cipher_public_knowledge()
        self.commutative_cipher.init()

        # 1st ID encrypt: (Eh, (h, Instance))
        self.id_list_local_first = self._encrypt_id(
            data_instances,
            self.commutative_cipher,
            reserve_original_key=True,
            hash_operator=self.hash_operator,
            salt=self.salt,
            reserve_original_value=True)
        LOGGER.info("encrypted local id for the 1st time")
        # send (Eh, -1), get (Eg, -1)
        id_list_remote_first = self._exchange_id_list(self.id_list_local_first)

        # 2nd ID encrypt & send doubly encrypted guest ID list to guest
        id_list_remote_second = self._encrypt_id(
            id_list_remote_first,
            self.commutative_cipher,
            reserve_original_key=True)  # (EEg, Eg)
        LOGGER.info("encrypted guest id for the 2nd time")
        self._sync_doubly_encrypted_id_list(id_list_remote_second)
Example #11
    def decrypt_intersect_doubly_encrypted_id(self,
                                              id_list_intersect_cipher_cipher):
        # EEi -> Ei from Eg
        id_list_intersect_cipher = self.get_intersect_cipher(
            id_list_intersect_cipher_cipher)

        # find intersect ids: (Ei, original key)
        encrypt_intersect_ids = [
            self.extract_intersect_ids(id_list_intersect_cipher[i],
                                       self.id_list_local_first[i])
            for i in range(len(self.id_list_local_first))
        ]
        # map encrypted intersect ids to original ids
        intersect_ids = self.filter_intersect_ids(encrypt_intersect_ids,
                                                  keep_encrypt_ids=True)
        LOGGER.info(f"intersection found")

        if self.sync_intersect_ids:
            self.send_intersect_ids(intersect_ids)
        else:
            LOGGER.info("Skip sync intersect ids with Host(s).")

        return intersect_ids
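Examples #10 and #11 depend on the cipher being commutative: encrypting with the local key and then the remote key yields the same ciphertext as the reverse order, so both parties can match doubly encrypted IDs without revealing raw ones. A toy Pohlig-Hellman-style sketch (modular exponentiation with exponents coprime to p - 1; illustrative only, not the cipher FATE ships):

    import math

    p = 2 ** 61 - 1            # a Mersenne prime used as the shared modulus
    key_a, key_b = 65537, 257  # toy private exponents
    assert math.gcd(key_a, p - 1) == 1 and math.gcd(key_b, p - 1) == 1

    def encrypt(x, key):
        return pow(x, key, p)

    hashed_id = 123456789      # stand-in for a hashed entity ID
    # encryption order does not matter: E_a(E_b(x)) == E_b(E_a(x))
    assert (encrypt(encrypt(hashed_id, key_a), key_b)
            == encrypt(encrypt(hashed_id, key_b), key_a))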
Example #12
    def _init_model(self, boosting_param: BoostingParam):

        self.task_type = boosting_param.task_type
        self.objective_param = boosting_param.objective_param
        self.learning_rate = boosting_param.learning_rate
        self.boosting_round = boosting_param.num_trees
        self.n_iter_no_change = boosting_param.n_iter_no_change
        self.tol = boosting_param.tol
        self.bin_num = boosting_param.bin_num
        self.predict_param = boosting_param.predict_param
        self.cv_param = boosting_param.cv_param
        self.validation_freqs = boosting_param.validation_freqs
        self.metrics = boosting_param.metrics
        self.subsample_feature_rate = boosting_param.subsample_feature_rate
        self.binning_error = boosting_param.binning_error

        if boosting_param.random_seed is not None:
            self.random_seed = boosting_param.random_seed

        # initialize the global numpy random seed here
        np.random.seed(self.random_seed)
        LOGGER.debug('setting random seed done, random seed is {}'.format(
            self.random_seed))
Example #13
    def _init_model(self, params: FeatureBinningParam):
        self.model_param = params

        self.transform_type = self.model_param.transform_param.transform_type

        if self.model_param.method == consts.QUANTILE:
            self.binning_obj = QuantileBinning(self.model_param)
        elif self.model_param.method == consts.BUCKET:
            self.binning_obj = BucketBinning(self.model_param)
        elif self.model_param.method == consts.OPTIMAL:
            if self.role == consts.HOST:
                self.model_param.bin_num = self.model_param.optimal_binning_param.init_bin_nums
                self.binning_obj = QuantileBinning(self.model_param)
            else:
                self.binning_obj = OptimalBinning(self.model_param)
        else:
            # self.binning_obj = QuantileBinning(self.bin_param)
            raise ValueError("Binning method: {} is not supported yet".format(
                self.model_param.method))
        LOGGER.debug("in _init_model, role: {}, local_partyid: {}".format(
            self.role, self.component_properties))
        self.binning_obj.set_role_party(
            self.role, self.component_properties.local_partyid)
Example #14
    def compute_best_splits(self, cur_to_split_nodes, node_map, dep, batch_idx):

        acc_histograms = self.get_local_histograms(dep, self.data_with_node_assignments, self.grad_and_hess,
                                                   None, cur_to_split_nodes, node_map, ret='tensor',
                                                   hist_sub=False)

        best_split_info_guest = self.splitter.find_split(acc_histograms, self.valid_features,
                                                         self.data_bin.partitions, self.sitename,
                                                         self.use_missing, self.zero_as_missing)
        LOGGER.debug('computing local splits done')

        if self.complete_secure_tree:
            return best_split_info_guest

        self.federated_find_split(dep, batch_idx)
        host_split_info = self.sync_final_split_host(dep, batch_idx)

        # compare host best split points with guest split points
        cur_best_split = self.merge_splitinfo(splitinfo_guest=best_split_info_guest,
                                              splitinfo_host=host_split_info,
                                              merge_host_split_only=False)

        return cur_best_split
Example #15
    def convert_bin_to_real(self):
        LOGGER.info("convert tree node bins to real value")
        split_nid_used = []
        for i in range(len(self.tree_node)):
            if self.tree_node[i].is_leaf is True:
                continue

            if self.tree_node[i].sitename == self.sitename:
                fid = self.decode("feature_idx",
                                  self.tree_node[i].fid,
                                  split_maskdict=self.split_maskdict)
                bid = self.decode("feature_val", self.tree_node[i].bid,
                                  self.tree_node[i].id, self.split_maskdict)
                LOGGER.debug("shape of bin_split_points is {}".format(
                    len(self.bin_split_points[fid])))
                real_splitval = self.encode("feature_val",
                                            self.bin_split_points[fid][bid],
                                            self.tree_node[i].id)
                self.tree_node[i].bid = real_splitval

                split_nid_used.append(self.tree_node[i].id)

        self.remove_duplicated_split_nodes(split_nid_used)
Example #16
    def _get_param(self):
        header = self.header
        LOGGER.debug("In get_param, header: {}".format(header))
        if header is None:
            param_protobuf_obj = poisson_model_param_pb2.PoissonModelParam(
                best_iteration=-1)
            return param_protobuf_obj

        weight_dict = {}
        for idx, header_name in enumerate(header):
            coef_i = self.model_weights.coef_[idx]
            weight_dict[header_name] = coef_i
        intercept_ = self.model_weights.intercept_
        best_iteration = -1 if self.validation_strategy is None else self.validation_strategy.best_iteration
        param_protobuf_obj = poisson_model_param_pb2.PoissonModelParam(
            iters=self.n_iter_,
            loss_history=self.loss_history,
            is_converged=self.is_converged,
            weight=weight_dict,
            intercept=intercept_,
            header=header,
            best_iteration=best_iteration)
        return param_protobuf_obj
Example #17
def get_default_target_framework(model_contents: dict, module_name: str):
    """
    Returns the name of a supported ML framework based on the
    original FATE model module name and model contents.

    :param model_contents: the model content of the FATE model
    :param module_name: the module name, typically of the form HomoXXXX.
    :return: the corresponding framework name that this model can be converted to.
    """
    framework_name = None
    if module_name == "HomoLR":
        framework_name = "sklearn"
    elif module_name == 'HomoNN':
        if model_contents['HomoNNModelMeta'].params.config_type == "pytorch":
            framework_name = "pytorch"
        else:
            framework_name = "tf_keras"
    elif module_name.lower() == 'homosecureboost':
        framework_name = 'lightgbm'
    else:
        LOGGER.debug(
            f"Module {module_name} is not a supported homogeneous model")
    return framework_name
Example #18
    def predict(self, data_inst):

        LOGGER.info('running prediction')

        processed_data = self.data_and_header_alignment(data_inst)

        predict_start_round = self.sync_predict_start_round()

        rounds = len(self.boosting_model_list) // self.booster_dim
        trees = []
        for idx in range(predict_start_round, rounds):
            for booster_idx in range(self.booster_dim):
                tree = self.load_booster(
                    self.booster_meta,
                    self.boosting_model_list[idx * self.booster_dim +
                                             booster_idx], idx, booster_idx)
                trees.append(tree)

        # if len(trees) == 0:
        #     LOGGER.info('no tree for predicting, prediction done')
        #     return

        self.boosting_fast_predict(processed_data, trees=trees)
Example #19
    def display_cv_result(self, cv_results):
        LOGGER.debug("cv_result: {}".format(cv_results))
        if self.role == consts.GUEST or (self.role == consts.HOST
                                         and self.mode == consts.HOMO):
            format_cv_result = {}
            for eval_result in cv_results:
                for eval_name, eval_r in eval_result.items():
                    if not isinstance(eval_r, list):
                        if eval_name not in format_cv_result:
                            format_cv_result[eval_name] = []
                        format_cv_result[eval_name].append(eval_r)
                    else:
                        for e_r in eval_r:
                            e_name = "{}_thres_{}".format(eval_name, e_r[0])
                            if e_name not in format_cv_result:
                                format_cv_result[e_name] = []
                            format_cv_result[e_name].append(e_r[1])

            for eval_name, eva_result_list in format_cv_result.items():
                mean_value = np.around(np.mean(eva_result_list), 4)
                std_value = np.around(np.std(eva_result_list), 4)
                LOGGER.info("{},evaluate name: {}, mean: {}, std: {}".format(
                    self.role, eval_name, mean_value, std_value))
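The final loop above is a plain per-metric mean/std over folds. The same aggregation with fabricated fold scores:

    import numpy as np

    format_cv_result = {"auc": [0.81, 0.79, 0.83], "ks": [0.42, 0.40, 0.44]}
    for eval_name, scores in format_cv_result.items():
        print(eval_name, np.around(np.mean(scores), 4), np.around(np.std(scores), 4))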
Example #20
    def fast_homo_tree_predict(self, data_inst):

        LOGGER.info('running fast homo tree predict')
        to_predict_data = self.data_and_header_alignment(data_inst)
        tree_list = []
        rounds = len(self.boosting_model_list) // self.booster_dim
        for idx in range(0, rounds):
            for booster_idx in range(self.booster_dim):
                model = self.load_booster(
                    self.booster_meta,
                    self.boosting_model_list[idx * self.booster_dim +
                                             booster_idx], idx, booster_idx)
                tree_list.append(model)

        func = functools.partial(self.predict_helper,
                                 tree_list=tree_list,
                                 init_score=self.init_score,
                                 zero_as_missing=self.zero_as_missing,
                                 use_missing=self.use_missing,
                                 learning_rate=self.learning_rate,
                                 class_num=self.booster_dim)
        predict_rs = to_predict_data.mapValues(func)
        return self.score_to_predict_result(data_inst, predict_rs)
Example #21
    def compute_gradient_procedure(self, *args):
        data_instances = args[0]
        encrypted_calculator = args[1]
        model_weights = args[2]
        optimizer = args[3]
        self.n_iter = args[4]
        self.batch_index = args[5]
        cipher_operator = encrypted_calculator[0].encrypter
        # one_data = data_instances.first()
        # LOGGER.debug("data shape: {}, model weights shape: {}, model weights coef: {}, intercept: {}".format(
        #     one_data[1].features.shape, model_weights.unboxed.shape, model_weights.coef_, model_weights.intercept_
        # ))

        gradient_results = self.gradient_computer.compute_gradient_procedure(
            *args)
        self._update_w_tilde(model_weights)

        if self.iter_k % self.update_interval_L == 0:
            self.count_t += 1
            # LOGGER.debug("Before division, this_w_tilde: {}".format(self.this_w_tilde.unboxed))
            self.this_w_tilde /= self.update_interval_L
            # LOGGER.debug("After division, this_w_tilde: {}".format(self.this_w_tilde.unboxed))

            if self.count_t > 0:
                LOGGER.info(
                    "iter_k: {}, count_t: {}, start to update hessian".format(
                        self.iter_k, self.count_t))
                self._update_hessian(data_instances, optimizer,
                                     cipher_operator)
            self.last_w_tilde = self.this_w_tilde
            self.this_w_tilde = LinearModelWeights(
                np.zeros_like(self.last_w_tilde.unboxed),
                self.last_w_tilde.fit_intercept)
            # LOGGER.debug("After replace, last_w_tilde: {}, this_w_tilde: {}".format(self.last_w_tilde.unboxed,
            #                                                                         self.this_w_tilde.unboxed))

        return gradient_results
Example #22
File: evaluation.py  Project: yubo1993/FATE
    def _evaluate_clustering_metrics(self, mode, data):

        eval_result = defaultdict(list)
        rs0, rs1, run_outer_metric = self._clustering_extract(data)
        if rs0 is None and rs1 is None:  # skip evaluation computation if get this input format
            LOGGER.debug(
                'skip computing, this clustering format is not for metric computation'
            )
            return eval_result

        if not run_outer_metric:
            no_label = set(rs0) == {None}
            if no_label:
                LOGGER.debug(
                    'no label found in clustering result, skip metric computation'
                )
                return eval_result

        for eval_metric in self.metrics:

            # run a metric only when its type agrees with the input format;
            # the original guard is an XNOR written with '+' as a logical OR,
            # which reduces to an equality test on the two booleans
            is_intra_metric = eval_metric in self.clustering_intra_metric_list
            if is_intra_metric != run_outer_metric:
                LOGGER.warning(
                    'input data format does not match current clustering metric: {}'
                    .format(eval_metric))
                continue

            LOGGER.debug('clustering_metrics is {}'.format(eval_metric))

            if run_outer_metric:

                if eval_metric == consts.DISTANCE_MEASURE:
                    res = getattr(self.metric_interface,
                                  eval_metric)(rs0['avg_dist'], rs1,
                                               rs0['max_radius'])
                else:
                    res = getattr(self.metric_interface,
                                  eval_metric)(rs0['avg_dist'], rs1)
            else:
                res = getattr(self.metric_interface, eval_metric)(rs0, rs1)
            eval_result[eval_metric].append(mode)
            eval_result[eval_metric].append(res)

        return eval_result
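The rewritten guard above is equivalent to the original XNOR expression; a truth-table check of that equivalence:

    # A: metric is in the intra-metric list, B: run_outer_metric
    for A in (False, True):
        for B in (False, True):
            original_skip = not ((not A and not B) + (A and B))  # original form
            assert original_skip == (A != B)  # identical to the equality test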
Example #23
    def train_and_get_backward_gradient(self, x, y):
        LOGGER.debug("top model start to forward propagation")

        selective_id = []
        input_gradient = []
        if self.selector:
            losses = self._model.get_forward_loss_from_input(x, y)
            loss = sum(losses) / len(losses)
            selective_strategy = self.selector.select_batch_sample(losses)
            for idx, select in enumerate(selective_strategy):
                if select:
                    selective_id.append(idx)
                    self.batch_data_cached_X.append(x[idx])
                    self.batch_data_cached_y.append(y[idx])

            if len(self.batch_data_cached_X) >= self.batch_size:
                data = self.data_converter.convert_data(
                    np.array(self.batch_data_cached_X[:self.batch_size]),
                    np.array(self.batch_data_cached_y[:self.batch_size]))
                input_gradient = self._model.get_input_gradients(
                    np.array(self.batch_data_cached_X[:self.batch_size]),
                    np.array(self.batch_data_cached_y[:self.batch_size]))[0]

                self._model.train(data)

                self.batch_data_cached_X = self.batch_data_cached_X[
                    self.batch_size:]
                self.batch_data_cached_y = self.batch_data_cached_y[
                    self.batch_size:]

        else:
            input_gradient = self._model.get_input_gradients(x, y)[0]
            data = self.data_converter.convert_data(x, y)
            self._model.train(data)
            loss = self._model.get_loss()[0]

        return selective_id, input_gradient, loss
Example #24
    def fit(self, data_inst, validate_data=None):
        self.validation_strategy = self.init_validation_strategy(
            data_inst, validate_data)
        self._build_model()
        self.prepare_batch_data(self.batch_generator, data_inst)

        cur_epoch = 0

        while cur_epoch < self.epochs:
            for batch_idx in range(len(self.data_x)):
                self.model.train(self.data_x[batch_idx], cur_epoch, batch_idx)

                self.reset_flowid()
                self.model.evaluate(self.data_x[batch_idx], cur_epoch,
                                    batch_idx)
                self.recovery_flowid()

            if self.validation_strategy:
                self.validation_strategy.validate(self, cur_epoch)
                if self.validation_strategy.need_stop():
                    LOGGER.debug('early stopping triggered')
                    break

            is_converge = self.transfer_variable.is_converge.get(
                idx=0, suffix=(cur_epoch, ))

            if is_converge:
                LOGGER.debug(
                    "Training process is converged in epoch {}".format(
                        cur_epoch))
                break

            cur_epoch += 1

        if self.validation_strategy and self.validation_strategy.has_saved_best_model():
            self.load_model(self.validation_strategy.cur_best_model)
Example #25
 def transform(self, data):
     LOGGER.info(f"Enter Column Expand transform")
     if self.method == consts.MANUAL and len(self.append_header) == 0:
         LOGGER.info(
             f"Finish Column Expand transform. Original data returned.")
         return data
     new_data, self.header = self._append_column(data)
     LOGGER.info(f"Finish Column Expand transform")
     return new_data
Example #26
    def compute_loss(self,
                     data_instances,
                     n_iter_,
                     batch_index,
                     loss_norm=None):
        '''
        Compute the hetero linear regression loss:
            loss = (1/(2N)) * \sum (wx - y)^2, where y is the label, w the model weights and x the features.
        With the linear predictor split between host and guest as wx = wx_h + wx_g:
            (wx - y)^2 = (wx_h)^2 + (wx_g - y)^2 + 2 * wx_h * (wx_g - y)
        '''
        current_suffix = (n_iter_, batch_index)
        n = data_instances.count()
        loss_list = []
        host_wx_squares = self.get_host_loss_intermediate(current_suffix)

        if loss_norm is not None:
            host_loss_regular = self.get_host_loss_regular(
                suffix=current_suffix)
        else:
            host_loss_regular = []
        if len(self.host_forwards) > 1:
            LOGGER.info("More than one host exist, loss is not available")
        else:
            host_forward = self.host_forwards[0]
            host_wx_square = host_wx_squares[0]

            wxy_square = self.half_d.mapValues(lambda x: np.square(x)).reduce(
                reduce_add)

            loss_gh = self.half_d.join(host_forward,
                                       lambda g, h: g * h).reduce(reduce_add)
            loss = (wxy_square + host_wx_square + 2 * loss_gh) / (2 * n)
            if loss_norm is not None:
                loss = loss + loss_norm + host_loss_regular[0]
            loss_list.append(loss)
        # LOGGER.debug("In compute_loss, loss list are: {}".format(loss_list))
        self.sync_loss_info(loss_list, suffix=current_suffix)
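A quick numeric check of the square expansion used above, with toy scalars standing in for the guest score, the host score, and the label:

    wx_g, wx_h, y = 0.7, 0.2, 1.0  # toy guest score, host score, label
    lhs = (wx_g + wx_h - y) ** 2
    rhs = wx_h ** 2 + (wx_g - y) ** 2 + 2 * wx_h * (wx_g - y)
    assert abs(lhs - rhs) < 1e-12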
Example #27
    def fit(self, data_inst):
        LOGGER.debug(f"Enter Hetero {self.role} Data Split fit")
        if self.need_run is False:
            return
        self.param_validator(data_inst)

        ids = self._get_ids(data_inst)
        y = self._get_y(data_inst)

        id_train, id_test_validate, y_train, y_test_validate = self._split(
            ids,
            y,
            test_size=self.test_size + self.validate_size,
            train_size=self.train_size)

        validate_size, test_size = DataSplitter.get_train_test_size(
            self.validate_size, self.test_size)
        id_validate, id_test, y_validate, y_test = self._split(
            id_test_validate,
            y_test_validate,
            test_size=test_size,
            train_size=validate_size)

        train_data, validate_data, test_data = self.split_data(
            data_inst, id_train, id_validate, id_test)

        all_metas = {}

        all_metas = self.callback_count_info(id_train, id_validate, id_test,
                                             all_metas)
        if self.stratified:
            all_metas = self.callback_label_info(y_train, y_validate, y_test,
                                                 all_metas)
        self.callback(all_metas)
        self.set_summary(all_metas)

        return [train_data, validate_data, test_data]
Example #28
    def predict(self, data_instances):
        """
        Prediction of Poisson
        Parameters
        ----------
        data_instances: Table of Instance, input data

        Returns
        ----------
        Table
            include input data label, predict results
        """
        LOGGER.info("Start predict ...")

        self._abnormal_detection(data_instances)
        header = data_instances.schema.get("header")
        self.exposure_index = self.get_exposure_index(header,
                                                      self.exposure_colname)
        exposure_index = self.exposure_index

        # extract the exposure value from each instance
        exposure = data_instances.mapValues(
            lambda v: HeteroPoissonBase.load_exposure(v, exposure_index))

        data_instances = self.align_data_header(data_instances, self.header)

        pred_guest = self.compute_mu(data_instances, self.model_weights.coef_,
                                     self.model_weights.intercept_, exposure)
        pred_host = self.transfer_variable.host_partial_prediction.get(idx=0)

        LOGGER.info("Get prediction from Host")

        pred = pred_guest.join(pred_host, lambda g, h: g * h)
        # predict_result = data_instances.join(pred, lambda d, p: [d.label, p, p, {"label": p}])
        predict_result = self.predict_score_to_output(
            data_instances=data_instances, predict_score=pred, classes=None)
        return predict_result
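Guest and host partial predictions combine by multiplication (pred_guest.join(pred_host, lambda g, h: g * h)) because the Poisson mean is the exponential of the full linear predictor, so the guest/host split of wx turns into a product of factors. A toy check, assuming each party's compute_mu contributes exp of its partial predictor:

    import numpy as np

    wx_g, wx_h = 0.3, -0.8  # toy partial linear predictors
    assert np.isclose(np.exp(wx_g) * np.exp(wx_h), np.exp(wx_g + wx_h))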
Example #29
    def check(self):
        descr = "intersect preprocess param's false_positive_rate "
        self.check_decimal_float(self.false_positive_rate, descr)
        self.check_positive_number(self.false_positive_rate, descr)
        if self.false_positive_rate > 0.5:
            raise ValueError(
                f"{descr} must be positive float no greater than 0.5")

        descr = "intersect preprocess param's encrypt_method "
        self.encrypt_method = self.check_and_change_lower(
            self.encrypt_method, [consts.RSA], descr)

        descr = "intersect preprocess param's random_state "
        if self.random_state:
            self.check_nonnegative_number(self.random_state, descr)

        descr = "intersect preprocess param's hash_method "
        self.hash_method = self.check_and_change_lower(self.hash_method, [
            consts.MD5, consts.SHA1, consts.SHA224, consts.SHA256,
            consts.SHA384, consts.SHA512, consts.SM3
        ], descr)
        descr = "intersect preprocess param's preprocess_salt "
        self.check_string(self.preprocess_salt, descr)

        descr = "intersect preprocess param's preprocess_method "
        self.preprocess_method = self.check_and_change_lower(
            self.preprocess_method, [
                consts.MD5, consts.SHA1, consts.SHA224, consts.SHA256,
                consts.SHA384, consts.SHA512, consts.SM3
            ], descr)

        descr = "intersect preprocess param's filter_owner "
        self.filter_owner = self.check_and_change_lower(
            self.filter_owner, [consts.GUEST, consts.HOST], descr)

        LOGGER.debug("Finish IntersectPreProcessParam parameter check!")
        return True
Example #30
    def load_model(self, model_dict):
        LOGGER.debug(f"Start to load model")
        if 'model' in model_dict:
            LOGGER.debug("Loading selection model")
            self._load_selection_model(model_dict)

        if 'isometric_model' in model_dict:
            LOGGER.debug("Loading isometric_model")
            self._load_isometric_model(model_dict['isometric_model'])