Example #1
    def __get_min_max_value(self, data):
        min_value = None
        max_value = None
        summary_obj = MultivariateStatisticalSummary(data, -1)

        if self.feat_upper is not None:
            max_value = self.feat_upper

        if self.feat_lower is not None:
            min_value = self.feat_lower

        if min_value is None and max_value is not None:
            min_value_list = summary_obj.get_min()

            if isinstance(max_value, Iterable):
                if len(list(max_value)) != len(min_value_list):
                    raise ValueError(
                        "Size of feat_upper is not equal to column of data, {} != {}"
                        .format(len(list(max_value)), len(min_value_list)))
                max_value_list = max_value
            else:
                max_value_list = [max_value for _ in min_value_list]

        elif min_value is not None and max_value is None:
            max_value_list = summary_obj.get_max()

            if isinstance(min_value, Iterable):
                if len(list(min_value)) != len(max_value_list):
                    raise ValueError(
                        "Size of feat_lower is not equal to column of data, {} != {}"
                        .format(len(list(max_value)), len(max_value_list)))
                min_value_list = min_value
            else:
                min_value_list = [min_value for _ in max_value_list]

        elif min_value is None and max_value is None:
            min_value_list = summary_obj.get_min()
            max_value_list = summary_obj.get_max()
        else:
            shape = None
            if isinstance(max_value, Iterable):
                max_value_list = max_value
            else:
                shape = data_overview.get_features_shape(data)
                max_value_list = [max_value for _ in range(shape)]

            if isinstance(min_value, Iterable):
                min_value_list = min_value
            else:
                if not shape:
                    shape = data_overview.get_features_shape(data)

                min_value_list = [min_value for _ in range(shape)]

            if len(list(max_value_list)) != len(min_value_list):
                raise ValueError(
                    "Size of feat_upper is not equal to column of data, {} != {}"
                    .format(len(list(max_value)), len(min_value_list)))

        return min_value_list, max_value_list
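
A quick illustration of the broadcast/validation branches above; the values are made up and MultivariateStatisticalSummary is replaced by a plain list, so this is a sketch of the logic rather than FATE code:

min_value_list = [0.0, -1.5, 2.0]        # stand-in for summary_obj.get_min()

feat_upper = 10                          # scalar: broadcast to every column
max_value_list = [feat_upper for _ in min_value_list]
assert max_value_list == [10, 10, 10]

feat_upper = [5, 6]                      # iterable of the wrong width
assert len(list(feat_upper)) != len(min_value_list)   # would raise ValueError
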
Example #2
 def _parse_cols(self, data_instances):
     if self.cols == -1:
         features_shape = get_features_shape(data_instances)
         if features_shape is None:
             raise RuntimeError(
                 'Cannot get feature shape, please check input data')
         self.cols = [i for i in range(features_shape)]
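
Every example on this page funnels through the same helper. For orientation, here is a minimal sketch of what get_features_shape plausibly does, assuming data_instances.first() returns a (key, instance) pair and that instance.features is either a dense numpy array or a sparse vector exposing get_shape(); both are illustrative assumptions, not the verified FATE source:

import numpy as np


def get_features_shape(data_instances):
    # Peek at a single (key, instance) pair; an empty table yields None,
    # which is why every caller checks for None and raises.
    first = data_instances.first()
    if first is None or first[1] is None:
        return None
    features = first[1].features
    # Assumed duck-typing: sparse vectors expose get_shape(),
    # dense features are numpy arrays with .shape.
    if hasattr(features, "get_shape"):
        return features.get_shape()
    return np.asarray(features).shape[0]
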
Example #3
    def cal_local_iv(self,
                     data_instances,
                     cols,
                     split_points=None,
                     label_table=None):
        if cols == -1:
            features_shape = get_features_shape(data_instances)
            if features_shape is None:
                raise RuntimeError(
                    'Cannot get feature shape, please check input data')
            cols = [i for i in range(features_shape)]

        if split_points is None:
            split_points = self.binning(data_instances, cols=cols)

        data_bin_table = self.transform(data_instances, split_points, cols)
        if label_table is None:
            label_table = data_instances.mapValues(lambda x: x.label)
        event_count_table = label_table.mapValues(lambda x: (x, 1 - x))
        data_bin_with_label = data_bin_table.join(event_count_table,
                                                  lambda x, y: (x, y))
        f = functools.partial(self.add_label_in_partition,
                              total_bin=self.bin_num,
                              cols=cols)

        result_sum = data_bin_with_label.mapPartitions(f)
        result_counts = result_sum.reduce(self.aggregate_partition_label)

        iv_attrs = self.cal_iv_woe(result_counts,
                                   self.params.adjustment_factor,
                                   split_points=split_points)
        return iv_attrs
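
For context, cal_iv_woe presumably aggregates the standard WOE/IV statistics from the per-bin event counts collected above. A self-contained sketch of those formulas (textbook definitions with smoothing; the sign convention and the exact use of adjustment_factor may differ from FATE's implementation):

import math


def woe_iv(event_counts, non_event_counts, adjustment_factor=0.5):
    # Standard per-bin Weight of Evidence and total Information Value;
    # adjustment_factor smooths bins with zero counts.
    total_event = sum(event_counts)
    total_non_event = sum(non_event_counts)
    woe_list, iv = [], 0.0
    for event, non_event in zip(event_counts, non_event_counts):
        event_rate = (event or adjustment_factor) / total_event
        non_event_rate = (non_event or adjustment_factor) / total_non_event
        woe = math.log(non_event_rate / event_rate)
        woe_list.append(woe)
        iv += (non_event_rate - event_rate) * woe
    return woe_list, iv
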
Example #4
    def fit(self, data):
        if not self.with_mean and not self.with_std:
            shape = data_overview.get_features_shape(data)
            mean = [0 for _ in range(shape)]
            std = [1 for _ in range(shape)]
            return data, mean, std

        else:
            summary_obj = MultivariateStatisticalSummary(data, -1)
            mean = None
            std = None

            if self.with_mean:
                mean = summary_obj.get_mean()

            if self.with_std:
                std = summary_obj.get_std_variance()

            if not mean and std:
                mean = [0 for _ in std]
            elif mean and not std:
                std = [1 for _ in mean]

            if not mean or not std:
                raise ValueError("mean or std is None")

            f = functools.partial(self.__scale, mean=mean, std=std)
            data = data.mapValues(f)

            return data, mean, std
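
The __scale helper is not shown in this example. A plausible stand-in, assuming each table value is an instance carrying a numpy features array and that scaling is the usual (x - mean) / std transform; both are assumptions, not the source implementation:

import numpy as np


def scale_instance(instance, mean, std):
    # Stand-in for self.__scale: standardize every feature column.
    std = np.asarray(std, dtype=float)
    std[std == 0] = 1.0                  # guard against zero variance
    instance.features = (instance.features - np.asarray(mean)) / std
    return instance
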
Example #5
    def _init_model_variables(self, data_instances):
        model_shape = data_overview.get_features_shape(data_instances)

        LOGGER.info("Initialized model shape is {}".format(model_shape))

        model_weights = self.initializer.init_model(model_shape,
                                                    init_params=self.init_param_obj,
                                                    data_instance=data_instances)
        return model_weights
Example #6
    def filter(self, data_instances):
        if self.select_cols == -1:
            features_shape = get_features_shape(data_instances)
            if features_shape is None:
                raise RuntimeError(
                    'Cannot get feature shape, please check input data')
            self.select_cols = [i for i in range(features_shape)]
        self.left_cols = self.select_cols.copy()

        for method in self.filter_methods:
            self.filter_one_method(data_instances, method)
        return self.left_cols
Example #7
 def initialize(self, data_instances):
     """
     对w和偏置b进行初始化
     """
     data_shape = data_overview.get_features_shape(data_instances)
     LOGGER.info("除去偏置b,数据的维度属性是:{}".format(data_shape))
     # 将偏置b也加进来
     if isinstance(data_shape, int):
         data_shape += 1
     # 初始化模型参数
     self.w = np.random.rand(data_shape)
     LOGGER.info("初始化模型参数self.w是:{}".format(self.w))
Example #8
    def _transfer_data(self, data_instances):
        if self.left_cols == -1:
            features_shape = get_features_shape(data_instances)
            if features_shape is None:
                raise RuntimeError(
                    'Cannot get feature shape, please check input data')
            self.left_cols = [i for i in range(features_shape)]

        f = functools.partial(self.select_cols, left_cols=self.left_cols)

        new_data = data_instances.mapValues(f)
        self._reset_header()
        return new_data
Example #9
    def __init__(self, data_instances, select_cols):
        self.finish_fit = False
        self.summary_statistics = []
        self.median = None
        self.data_instances = data_instances

        if select_cols == -1:
            features_shape = get_features_shape(data_instances)
            if features_shape is None:
                raise RuntimeError(
                    'Cannot get feature shape, please check input data')
            select_cols = [i for i in range(features_shape)]

        self.select_cols = select_cols
Example #10
    def transform(self, data_instances, split_points=None, cols=-1):
        """
        Apply the binning method

        Parameters
        ----------
        data_instances : DTable
            The input data

        split_points : list.
            Each row represent for the split points for a feature. The element in each row represent for
            the corresponding split point.
            e.g.
            split_points = [[0.1, 0.2, 0.3, 0.4 ...],    # The first feature
                            [1, 2, 3, 4, ...],           # The second feature
                            ...]                         # Other features

        cols : int or list of int
            Specify which column(s) need to apply binning. -1 means do binning for all columns.

        Returns
        -------
        data_bin_table : DTable.
            The element in each row represent for the corresponding bin number this feature belongs to.
            e.g. for each row, it could be:
            (1, 5, 2, 6, 0, ...)    # Each number represent for the bin number it belongs to. The order is the
                                # same as the order of cols.


        """
        if cols == -1:
            features_shape = get_features_shape(data_instances)
            if features_shape is None:
                raise RuntimeError(
                    'Cannot get feature shape, please check input data')
            cols = [i for i in range(features_shape)]

        if isinstance(cols, int):
            cols = [cols]

        if split_points is None:
            split_points = self.binning(data_instances, cols)

        assert len(split_points) == len(cols)

        f = functools.partial(self.bin_data,
                              split_points=split_points,
                              cols=cols)
        data_bin_table = data_instances.mapValues(f)
        return data_bin_table
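
A hedged call sketch for the docstring above; binner and data_instances are placeholders for a fitted binning object and a two-column DTable, and the split points follow the layout the docstring describes:

split_points = [[0.1, 0.2, 0.3],    # split points for column 0
                [1, 2, 3]]          # split points for column 1
data_bin_table = binner.transform(data_instances,
                                  split_points=split_points,
                                  cols=[0, 1])
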
Example #11
 def client_sync_data_info(self, data):
     n, j = data.count(), data_overview.get_features_shape(data)
     self.n_count = n
     if self.role == consts.HOST:
         self.transfer_variable.host_data_info.remote((n, j), role=consts.ARBITER, idx=0)
         self.transfer_variable.host_data_info.remote((n, j), role=consts.GUEST, idx=0)
         j_host = j
         n_guest, j_guest = self.transfer_variable.guest_data_info.get(idx=0)
     else:
         self.transfer_variable.guest_data_info.remote((n, j), role=consts.ARBITER, idx=0)
         self.transfer_variable.guest_data_info.remote((n, j), role=consts.HOST, idx=0)
         j_guest = j
         n_host, j_host = self.transfer_variable.host_data_info.get(idx=0)
     return j_host, j_guest
Example #12
    def __init_model(self, data_instances):
        model_shape = data_overview.get_features_shape(data_instances)
        w = self.initializer.init_model(model_shape,
                                        init_params=self.init_param_obj)

        w = self.encrypt_operator.encrypt_list(w)
        w = np.array(w)

        if self.fit_intercept:
            self.coef_ = w[:-1]
            self.intercept_ = w[-1]
        else:
            self.coef_ = w
            self.intercept_ = 0
        return w
Example #13
    def _init_model_variables(self, data_instances):
        model_shape = data_overview.get_features_shape(data_instances)

        LOGGER.info("Initialized model shape is {}".format(model_shape))

        fit_intercept = False
        if self.init_param_obj.fit_intercept:
            fit_intercept = True
            self.init_param_obj.fit_intercept = False
        w_ = self.initializer.init_model(model_shape, init_params=self.init_param_obj)
        embed_ = self.initializer.init_model([model_shape, self.init_param_obj.embed_size],
                                             init_params=self.init_param_obj)
        model_weights = \
            FactorizationMachineWeights(w_, embed_, fit_intercept=fit_intercept)
        return model_weights
Example #14
    def __init_model(self, data_instances):
        model_shape = data_overview.get_features_shape(data_instances)

        LOGGER.info("Initialized model shape is {}".format(model_shape))

        w = self.initializer.init_model(model_shape,
                                        init_params=self.init_param_obj)
        if self.fit_intercept:
            self.coef_ = w[:-1]
            self.intercept_ = w[-1]
        else:
            self.coef_ = w
            self.intercept_ = 0

        # LOGGER.debug("Initialized model")
        return w
Example #15
    def compute_gradient(self, data_instances, fore_gradient, fit_intercept):
        """
        Compute hetero-regression gradient
        Parameters
        ----------
        data_instances: Table, input data
        fore_gradient: Table, fore_gradient
        fit_intercept: bool, if model has intercept or not

        Returns
        ----------
        Table
            the hetero regression model's gradient
        """

        feature_num = data_overview.get_features_shape(data_instances)
        data_count = data_instances.count()
        is_sparse = data_overview.is_sparse_data(data_instances)

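        # Assumed intent of the branch below: for all but tiny workloads,
        # sum the gradient once per partition and reduce, instead of the
        # original per-partition gradient computation in the else branch.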
        if data_count * feature_num > 100:
            LOGGER.debug("Use apply partitions")
            feat_join_grad = data_instances.join(fore_gradient,
                                                 lambda d, g: (d.features, g))
            f = functools.partial(self.__apply_cal_gradient,
                                  fixed_point_encoder=self.fixed_point_encoder,
                                  is_sparse=is_sparse)
            gradient_sum = feat_join_grad.applyPartitions(f)
            gradient_sum = gradient_sum.reduce(lambda x, y: x + y)
            if fit_intercept:
                # bias_grad = np.sum(fore_gradient)
                bias_grad = fore_gradient.reduce(lambda x, y: x + y)
                gradient_sum = np.append(gradient_sum, bias_grad)
            gradient = gradient_sum / data_count

        else:
            LOGGER.debug("Use original method")
            feat_join_grad = data_instances.join(fore_gradient,
                                                 lambda d, g: (d.features, g))
            f = functools.partial(self.__compute_partition_gradient,
                                  fit_intercept=fit_intercept,
                                  is_sparse=is_sparse)
            gradient_partition = feat_join_grad.applyPartitions(f)
            gradient_partition = gradient_partition.reduce(lambda x, y: x + y)

            gradient = gradient_partition / data_count

        return gradient
Example #16
    def fit(self, data):
        """
         Apply standard scale for input data
         Parameters
         ----------
         data: data_instance, input data

         Returns
         ----------
         data:data_instance, data after scale
         mean: list, each column mean value
         std: list, each column standard deviation
         """
        if not self.with_mean and not self.with_std:
            shape = data_overview.get_features_shape(data)
            mean = [0 for _ in range(shape)]
            std = [1 for _ in range(shape)]
            return data, mean, std

        else:
            summary_obj = MultivariateStatisticalSummary(data, -1)
            mean = None
            std = None
            header = get_header(data)

            if self.with_mean:
                mean = summary_obj.get_mean()
                mean = [mean[key] for key in header]

            if self.with_std:
                std = summary_obj.get_std_variance()
                std = [std[key] for key in header]

            if not mean and std:
                mean = [0 for _ in std]
            elif mean and not std:
                std = [1 for _ in mean]

            if not mean or not std:
                raise ValueError("mean or std is None")

            f = functools.partial(self.__scale, mean=mean, std=std)
            data = data.mapValues(f)

            return data, mean, std
Example #17
    def predict(self, data_instances, predict_param):
        if not self.has_sychronized_encryption:
            self.__synchronize_encryption()
            self.__load_arbiter_model()
        else:
            LOGGER.info("in predict, has synchronize encryption information")

        from federatedml.statistic.data_overview import get_features_shape
        feature_shape = get_features_shape(data_instances)
        LOGGER.debug("Shape of coef_ : {}, feature shape: {}".format(
            len(self.coef_), feature_shape))

        wx = self.compute_wx(data_instances, self.coef_, self.intercept_)

        if self.use_encrypt:
            encrypted_wx_id = self.transfer_variable.generate_transferid(
                self.transfer_variable.predict_wx)
            federation.remote(wx,
                              name=self.transfer_variable.predict_wx.name,
                              tag=encrypted_wx_id,
                              role=consts.ARBITER,
                              idx=0)
            predict_result_id = self.transfer_variable.generate_transferid(
                self.transfer_variable.predict_result)
            predict_result = federation.get(
                name=self.transfer_variable.predict_result.name,
                tag=predict_result_id,
                idx=0)
            # local_predict_table = predict_result.collect()
            predict_result_table = predict_result.join(
                data_instances, lambda p, d: (d.label, None, p))
        else:
            pred_prob = wx.mapValues(lambda x: activation.sigmoid(x))
            pred_label = self.classified(pred_prob, predict_param.threshold)
            if predict_param.with_proba:
                predict_result = data_instances.mapValues(lambda x: x.label)
                predict_result = predict_result.join(pred_prob, lambda x, y:
                                                     (x, y))
            else:
                predict_result = data_instances.mapValues(lambda x:
                                                          (x.label, None))
            predict_result_table = predict_result.join(
                pred_label, lambda x, y: (x[0], x[1], y))
        return predict_result_table
Example #18
    def approxiQuantile(data_instances, cols, params):
        # cols == -1 means all features
        if cols == -1:
            features_shape = get_features_shape(data_instances)
            if features_shape is None:
                raise RuntimeError(
                    'Cannot get feature shape, please check input data')
            cols = [i for i in range(features_shape)]

        if isinstance(cols, int):
            cols = [cols]

        num_of_qs = len(cols)
        summary_list = []
        for _ in range(num_of_qs):
            quantile_summaries = QuantileSummaries(
                compress_thres=params.compress_thres,
                head_size=params.head_size,
                error=params.error)
            summary_list.append(quantile_summaries)
        QuantileBinning.insert_datas(data_instances, summary_list, cols)
        return summary_list
Example #19
    def fit(self, data_instances):
        LOGGER.info("Enter hetero_lr_guest fit")
        self._abnormal_detection(data_instances)

        self.header = data_instances.schema.get("header")
        data_instances = data_instances.mapValues(HeteroLRGuest.load_data)

        public_key = federation.get(
            name=self.transfer_variable.paillier_pubkey.name,
            tag=self.transfer_variable.generate_transferid(
                self.transfer_variable.paillier_pubkey),
            idx=0)
        LOGGER.info("Get public_key from arbiter:{}".format(public_key))
        self.encrypt_operator.set_public_key(public_key)

        LOGGER.info("Generate mini-batch from input data")
        mini_batch_obj = MiniBatch(data_instances, batch_size=self.batch_size)
        batch_num = mini_batch_obj.batch_nums
        if self.batch_size == -1:
            LOGGER.info(
                "batch size is -1, set it to the number of data in data_instances"
            )
            self.batch_size = data_instances.count()

        batch_info = {"batch_size": self.batch_size, "batch_num": batch_num}
        LOGGER.info("batch_info:{}".format(batch_info))
        federation.remote(batch_info,
                          name=self.transfer_variable.batch_info.name,
                          tag=self.transfer_variable.generate_transferid(
                              self.transfer_variable.batch_info),
                          role=consts.HOST,
                          idx=0)
        LOGGER.info("Remote batch_info to Host")
        federation.remote(batch_info,
                          name=self.transfer_variable.batch_info.name,
                          tag=self.transfer_variable.generate_transferid(
                              self.transfer_variable.batch_info),
                          role=consts.ARBITER,
                          idx=0)
        LOGGER.info("Remote batch_info to Arbiter")

        LOGGER.info("Start initialize model.")
        LOGGER.info("fit_intercept:{}".format(
            self.init_param_obj.fit_intercept))
        model_shape = data_overview.get_features_shape(data_instances)
        weight = self.initializer.init_model(model_shape,
                                             init_params=self.init_param_obj)
        if self.init_param_obj.fit_intercept is True:
            self.coef_ = weight[:-1]
            self.intercept_ = weight[-1]
        else:
            self.coef_ = weight

        is_send_all_batch_index = False
        self.n_iter_ = 0
        index_data_inst_map = {}

        while self.n_iter_ < self.max_iter:
            LOGGER.info("iter:{}".format(self.n_iter_))
            # each iter will get the same batch_data_generator
            batch_data_generator = mini_batch_obj.mini_batch_data_generator(
                result='index')

            batch_index = 0
            for batch_data_index in batch_data_generator:
                LOGGER.info("batch:{}".format(batch_index))
                if not is_send_all_batch_index:
                    LOGGER.info("remote mini-batch index to Host")
                    federation.remote(
                        batch_data_index,
                        name=self.transfer_variable.batch_data_index.name,
                        tag=self.transfer_variable.generate_transferid(
                            self.transfer_variable.batch_data_index,
                            self.n_iter_, batch_index),
                        role=consts.HOST,
                        idx=0)
                    if batch_index >= mini_batch_obj.batch_nums - 1:
                        is_send_all_batch_index = True

                # Get mini-batch train data
                if len(index_data_inst_map) < batch_num:
                    batch_data_inst = data_instances.join(
                        batch_data_index, lambda data_inst, index: data_inst)
                    index_data_inst_map[batch_index] = batch_data_inst
                else:
                    batch_data_inst = index_data_inst_map[batch_index]

                # transforms features of raw input 'batch_data_inst' into more representative features 'batch_feat_inst'
                batch_feat_inst = self.transform(batch_data_inst)

                # guest/host forward
                self.compute_forward(batch_feat_inst, self.coef_,
                                     self.intercept_)
                host_forward = federation.get(
                    name=self.transfer_variable.host_forward_dict.name,
                    tag=self.transfer_variable.generate_transferid(
                        self.transfer_variable.host_forward_dict, self.n_iter_,
                        batch_index),
                    idx=0)
                LOGGER.info("Get host_forward from host")
                aggregate_forward_res = self.aggregate_forward(host_forward)
                en_aggregate_wx = aggregate_forward_res.mapValues(
                    lambda v: v[0])
                en_aggregate_wx_square = aggregate_forward_res.mapValues(
                    lambda v: v[1])

                # compute [[d]]
                if self.gradient_operator is None:
                    self.gradient_operator = HeteroLogisticGradient(
                        self.encrypt_operator)
                fore_gradient = self.gradient_operator.compute_fore_gradient(
                    batch_feat_inst, en_aggregate_wx)
                federation.remote(
                    fore_gradient,
                    name=self.transfer_variable.fore_gradient.name,
                    tag=self.transfer_variable.generate_transferid(
                        self.transfer_variable.fore_gradient, self.n_iter_,
                        batch_index),
                    role=consts.HOST,
                    idx=0)

                LOGGER.info("Remote fore_gradient to Host")
                # compute guest gradient and loss
                guest_gradient, loss = self.gradient_operator.compute_gradient_and_loss(
                    batch_feat_inst, fore_gradient, en_aggregate_wx,
                    en_aggregate_wx_square, self.fit_intercept)

                # loss regulation if necessary
                if self.updater is not None:
                    guest_loss_regular = self.updater.loss_norm(self.coef_)
                    loss += self.encrypt_operator.encrypt(guest_loss_regular)

                federation.remote(
                    guest_gradient,
                    name=self.transfer_variable.guest_gradient.name,
                    tag=self.transfer_variable.generate_transferid(
                        self.transfer_variable.guest_gradient, self.n_iter_,
                        batch_index),
                    role=consts.ARBITER,
                    idx=0)
                LOGGER.info("Remote guest_gradient to arbiter")

                optim_guest_gradient = federation.get(
                    name=self.transfer_variable.guest_optim_gradient.name,
                    tag=self.transfer_variable.generate_transferid(
                        self.transfer_variable.guest_optim_gradient,
                        self.n_iter_, batch_index),
                    idx=0)
                LOGGER.info("Get optim_guest_gradient from arbiter")

                # update model
                LOGGER.info("update_model")
                self.update_model(optim_guest_gradient)

                # update local model that transforms features of raw input 'batch_data_inst'
                training_info = {
                    "iteration": self.n_iter_,
                    "batch_index": batch_index
                }
                self.update_local_model(fore_gradient, batch_data_inst,
                                        self.coef_, **training_info)

                # Get loss regulation from Host if regulation is set
                if self.updater is not None:
                    en_host_loss_regular = federation.get(
                        name=self.transfer_variable.host_loss_regular.name,
                        tag=self.transfer_variable.generate_transferid(
                            self.transfer_variable.host_loss_regular,
                            self.n_iter_, batch_index),
                        idx=0)
                    LOGGER.info("Get host_loss_regular from Host")
                    loss += en_host_loss_regular

                federation.remote(
                    loss,
                    name=self.transfer_variable.loss.name,
                    tag=self.transfer_variable.generate_transferid(
                        self.transfer_variable.loss, self.n_iter_,
                        batch_index),
                    role=consts.ARBITER,
                    idx=0)
                LOGGER.info("Remote loss to arbiter")

                # is converge of loss in arbiter
                batch_index += 1

            is_stopped = federation.get(
                name=self.transfer_variable.is_stopped.name,
                tag=self.transfer_variable.generate_transferid(
                    self.transfer_variable.is_stopped, self.n_iter_,
                    batch_index),
                idx=0)
            LOGGER.info("Get is_stop flag from arbiter:{}".format(is_stopped))

            self.n_iter_ += 1
            if is_stopped:
                LOGGER.info(
                    "Get stop signal from arbiter, model is converged, iter:{}"
                    .format(self.n_iter_))
                break

        LOGGER.info("Reach max iter {}, train model finish!".format(
            self.max_iter))
Example #20
    def fit(self, data):
        """
         Apply standard scale for input data
         Parameters
         ----------
         data: data_instance, input data

         Returns
         ----------
         data:data_instance, data after scale
         mean: list, each column mean value
         std: list, each column standard deviation
         """
        if not self.with_mean and not self.with_std:
            shape = data_overview.get_features_shape(data)
            mean = [0 for _ in range(shape)]
            std = [1 for _ in range(shape)]
            self.scale_column_idx = [i for i in range(shape)]
            standard_scale_cols_conf = [mean, std, self.scale_column_idx]
            return data, standard_scale_cols_conf
        else:
            data_shape = data_overview.get_features_shape(data)
            if self.area == 'col':
                if isinstance(self.scale_column_idx, list):
                    max_col_idx = max(self.scale_column_idx)
                    if max_col_idx >= data_shape:
                        raise ValueError(
                            "max column index in area is: {}, should be less than data shape: {}"
                            .format(max_col_idx, data_shape))
                    self.scale_column_idx.sort()
                else:
                    LOGGER.warning(
                        "scale_column_idx should be a list, but got {}; scaling all columns instead"
                        .format(type(self.scale_column_idx)))
                    self.scale_column_idx = [i for i in range(data_shape)]
            else:
                self.scale_column_idx = [i for i in range(data_shape)]

            self.scale_column_idx = list(set(self.scale_column_idx))

            summary_obj = MultivariateStatisticalSummary(data, -1)
            mean = None
            std = None
            header = get_header(data)

            if self.with_mean:
                mean = summary_obj.get_mean()
                mean = [mean[key] for key in header]

            if self.with_std:
                std = summary_obj.get_std_variance()
                std = [std[key] for key in header]

            if not mean and std:
                mean = [0 for _ in std]
            elif mean and not std:
                std = [1 for _ in mean]

            if not mean or not std:
                raise ValueError("mean or std is None")

            f = functools.partial(self.__scale,
                                  mean=mean,
                                  std=std,
                                  process_cols_list=self.scale_column_idx)
            data = data.mapValues(f)

            standard_scale_cols_conf = [mean, std, self.scale_column_idx]

            return data, standard_scale_cols_conf
Example #21
    def predict(self, data_instances):
        if not self.need_run:
            return data_instances

        if not self.has_sychronized_encryption:
            self.__synchronize_encryption(mode='predict')
            self.__load_arbiter_model()
        else:
            LOGGER.info("in predict, has synchronize encryption information")

        feature_shape = get_features_shape(data_instances)
        LOGGER.debug("Shape of coef_ : {}, feature shape: {}".format(
            len(self.coef_), feature_shape))
        local_data = data_instances.first()
        LOGGER.debug("One data, features: {}".format(local_data[1].features))
        wx = self.compute_wx(data_instances, self.coef_, self.intercept_)

        if self.use_encrypt:
            encrypted_wx_id = self.transfer_variable.generate_transferid(
                self.transfer_variable.predict_wx)
            LOGGER.debug("Host encrypted wx id: {}".format(encrypted_wx_id))
            LOGGER.debug("Start to remote wx: {}, transfer_id: {}".format(
                wx, encrypted_wx_id))
            federation.remote(wx,
                              name=self.transfer_variable.predict_wx.name,
                              tag=encrypted_wx_id,
                              role=consts.ARBITER,
                              idx=0)
            predict_result_id = self.transfer_variable.generate_transferid(
                self.transfer_variable.predict_result)
            LOGGER.debug("predict_result_id: {}".format(predict_result_id))

            predict_result = federation.get(
                name=self.transfer_variable.predict_result.name,
                tag=predict_result_id,
                idx=0)
            # local_predict_table = predict_result.collect()
            LOGGER.debug(
                "predict_result count: {}, data_instances count: {}".format(
                    predict_result.count(), data_instances.count()))

            predict_result_table = predict_result.join(
                data_instances,
                lambda p, d: [d.label, None, p, {
                    "0": None,
                    "1": None
                }])

        else:
            pred_prob = wx.mapValues(lambda x: activation.sigmoid(x))
            pred_label = self.classified(pred_prob,
                                         self.predict_param.threshold)
            if self.predict_param.with_proba:
                predict_result = data_instances.mapValues(lambda x: x.label)
                predict_result = predict_result.join(pred_prob, lambda x, y:
                                                     (x, y))
            else:
                predict_result = data_instances.mapValues(lambda x:
                                                          (x.label, None))
            predict_result_table = predict_result.join(
                pred_label,
                lambda x, y: [x[0], y, x[1], {
                    "0": None,
                    "1": None
                }])

        LOGGER.debug("Finish predict")

        LOGGER.debug("In host predict, predict_result_table is : {}".format(
            predict_result_table.first()))
        return predict_result_table
Example #22
    def fit(self, data_instances):
        LOGGER.info("Enter hetero_lr host")
        self._abnormal_detection(data_instances)

        self.header = data_instances.schema.get("header")
        public_key = federation.get(
            name=self.transfer_variable.paillier_pubkey.name,
            tag=self.transfer_variable.generate_transferid(
                self.transfer_variable.paillier_pubkey),
            idx=0)

        LOGGER.info("Get public_key from arbiter:{}".format(public_key))
        self.encrypt_operator.set_public_key(public_key)

        batch_info = federation.get(
            name=self.transfer_variable.batch_info.name,
            tag=self.transfer_variable.generate_transferid(
                self.transfer_variable.batch_info),
            idx=0)
        LOGGER.info("Get batch_info from guest:" + str(batch_info))
        self.batch_size = batch_info["batch_size"]
        self.batch_num = batch_info["batch_num"]

        LOGGER.info("Start initialize model.")
        model_shape = data_overview.get_features_shape(data_instances)

        if self.init_param_obj.fit_intercept:
            self.init_param_obj.fit_intercept = False

        if self.fit_intercept:
            self.fit_intercept = False

        self.coef_ = self.initializer.init_model(
            model_shape, init_params=self.init_param_obj)

        self.n_iter_ = 0
        index_data_inst_map = {}

        while self.n_iter_ < self.max_iter:
            LOGGER.info("iter:" + str(self.n_iter_))
            batch_index = 0
            while batch_index < self.batch_num:
                LOGGER.info("batch:{}".format(batch_index))
                # set batch_data
                if len(self.batch_index_list) < self.batch_num:
                    batch_data_index = federation.get(
                        name=self.transfer_variable.batch_data_index.name,
                        tag=self.transfer_variable.generate_transferid(
                            self.transfer_variable.batch_data_index,
                            self.n_iter_, batch_index),
                        idx=0)
                    LOGGER.info("Get batch_index from Guest")

                    batch_size = batch_data_index.count()
                    if batch_size < consts.MIN_BATCH_SIZE and batch_size != -1:
                        raise ValueError(
                            "Batch size from guest should not be less than {}, except -1; got {}"
                            .format(consts.MIN_BATCH_SIZE, batch_size))

                    self.batch_index_list.append(batch_data_index)
                else:
                    batch_data_index = self.batch_index_list[batch_index]

                # Get mini-batch train data
                if len(index_data_inst_map) < self.batch_num:
                    batch_data_inst = batch_data_index.join(
                        data_instances, lambda g, d: d)
                    index_data_inst_map[batch_index] = batch_data_inst
                else:
                    batch_data_inst = index_data_inst_map[batch_index]

                LOGGER.info("batch_data_inst size:{}".format(
                    batch_data_inst.count()))
                # transforms features of raw input 'batch_data_inst' into more representative features 'batch_feat_inst'
                batch_feat_inst = self.transform(batch_data_inst)

                # compute forward
                host_forward = self.compute_forward(batch_feat_inst,
                                                    self.coef_,
                                                    self.intercept_)
                federation.remote(
                    host_forward,
                    name=self.transfer_variable.host_forward_dict.name,
                    tag=self.transfer_variable.generate_transferid(
                        self.transfer_variable.host_forward_dict, self.n_iter_,
                        batch_index),
                    role=consts.GUEST,
                    idx=0)
                LOGGER.info("Remote host_forward to guest")

                # compute host gradient
                fore_gradient = federation.get(
                    name=self.transfer_variable.fore_gradient.name,
                    tag=self.transfer_variable.generate_transferid(
                        self.transfer_variable.fore_gradient, self.n_iter_,
                        batch_index),
                    idx=0)
                LOGGER.info("Get fore_gradient from guest")
                if self.gradient_operator is None:
                    self.gradient_operator = HeteroLogisticGradient(
                        self.encrypt_operator)
                host_gradient = self.gradient_operator.compute_gradient(
                    batch_feat_inst, fore_gradient, fit_intercept=False)
                # regulation if necessary
                if self.updater is not None:
                    loss_regular = self.updater.loss_norm(self.coef_)
                    en_loss_regular = self.encrypt_operator.encrypt(
                        loss_regular)
                    federation.remote(
                        en_loss_regular,
                        name=self.transfer_variable.host_loss_regular.name,
                        tag=self.transfer_variable.generate_transferid(
                            self.transfer_variable.host_loss_regular,
                            self.n_iter_, batch_index),
                        role=consts.GUEST,
                        idx=0)
                    LOGGER.info("Remote host_loss_regular to guest")

                federation.remote(
                    host_gradient,
                    name=self.transfer_variable.host_gradient.name,
                    tag=self.transfer_variable.generate_transferid(
                        self.transfer_variable.host_gradient, self.n_iter_,
                        batch_index),
                    role=consts.ARBITER,
                    idx=0)
                LOGGER.info("Remote host_gradient to arbiter")

                # Get optimize host gradient and update model
                optim_host_gradient = federation.get(
                    name=self.transfer_variable.host_optim_gradient.name,
                    tag=self.transfer_variable.generate_transferid(
                        self.transfer_variable.host_optim_gradient,
                        self.n_iter_, batch_index),
                    idx=0)
                LOGGER.info("Get optim_host_gradient from arbiter")

                LOGGER.info("update_model")
                self.update_model(optim_host_gradient)

                # update local model that transforms features of raw input 'batch_data_inst'
                training_info = {
                    "iteration": self.n_iter_,
                    "batch_index": batch_index
                }
                self.update_local_model(fore_gradient, batch_data_inst,
                                        self.coef_, **training_info)

                # is converge

                batch_index += 1
                # if is_stopped:
                #    break

            is_stopped = federation.get(
                name=self.transfer_variable.is_stopped.name,
                tag=self.transfer_variable.generate_transferid(
                    self.transfer_variable.is_stopped, self.n_iter_,
                    batch_index),
                idx=0)
            LOGGER.info("Get is_stop flag from arbiter:{}".format(is_stopped))

            self.n_iter_ += 1
            if is_stopped:
                LOGGER.info(
                    "Get stop signal from arbiter, model is converged, iter:{}"
                    .format(self.n_iter_))
                break

        LOGGER.info("Reach max iter {}, train model finish!".format(
            self.max_iter))
Example #23
    def fit(self, data_instances):
        LOGGER.info("开始纵向逻辑回归")
        #检查数据
        self._abnormal_detection(data_instances)
        #导入数据
        data_instances = data_instances.mapValues(HdpVflHost.load_data)

        # 下面开始模型的初始化
        data_shape = data_overview.get_features_shape(data_instances)
        LOGGER.info("数据的维度是:{}".format(data_shape))
        self.model = LRModelWeightsHost()
        self.model.initialize(data_shape)

        # Initialize the mini-batch generator
        self.batch_generator.register_batch_generator(self.transfer_variable)
        suffix = (data_instances.count(), self.r)
        self.batch_generator.initialize_batch_generator(data_instances,
                                                        suffix=suffix)

        # Initialize the transfer variables
        self.register_gradient_sync(self.transfer_variable)

        # Start the iterative training loop
        iteration = 0
        test_suffix = ("iter", )
        while iteration <= self.e:
            for data_inst in self.batch_generator.generator_batch_data():
                LOGGER.info("开始计算数据的内积")
                ir_b = self.model.compute_forwards(data_inst, self.model.w)

                LOGGER.info("开始生成高斯分布需要的:loc、sigma")
                loc, sigma = self.model.gaussian(self.delta, self.epsilon,
                                                 self.L, self.e,
                                                 int(self.r * self.e),
                                                 self.learning_rate,
                                                 data_inst.count(), self.k)

                LOGGER.info("开始对数据添加噪声")
                sec_ir_b = self.model.sec_intermediate_result(ir_b, loc, sigma)
                suffix_t = test_suffix + (iteration, )
                LOGGER.info("当前的suffix_t值为:{}".format(suffix_t))
                LOGGER.info("开始发送给guest端sec_it_b")
                # test_transfer.send(obj=sec_ir_b,role=consts.GUEST,suffix=suffix_t)
                self.ir_b.remote(obj=sec_ir_b,
                                 role=consts.GUEST,
                                 suffix=suffix_t)

                LOGGER.info("开始从guest端接收sec_ir_a")
                sec_ir_a = self.ir_a.get(suffix=suffix_t)

                LOGGER.info("开始计算gradient_b")
                gradient_b = self.model.compute_gradient(
                    data_inst, sec_ir_a[0], data_inst.count())

                LOGGER.info("开始更新模型参数")
                self.model.update_model(gradient_b, self.learning_rate,
                                        self.lamb)

                LOGGER.info("开始进行梯度剪切部分")
                self.model.norm_clip(self.k)

                iteration += 1

        LOGGER.info("训练正式结束")
        LOGGER.info("host方的模型参数:{}".format(self.model.w))

        return self.model.w
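
sec_intermediate_result is not shown above. A minimal sketch of the noise step it presumably performs, assuming ir is a DTable of per-sample forward values and the perturbation is element-wise Gaussian noise with the loc and sigma computed above (an illustrative assumption, not the source implementation):

import numpy as np


def sec_intermediate_result(ir, loc, sigma):
    # Hypothetical: add independent Gaussian noise to each intermediate
    # forward value, the differential-privacy step used in the loop above.
    return ir.mapValues(lambda value: value + np.random.normal(loc, sigma))
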
Example #24
    def _get_data_shape(self, data):
        if not self.data_shape:
            self.data_shape = data_overview.get_features_shape(data)

        return self.data_shape
Example #25
 def get_features_shape(self, data_instances):
     if self.feature_shape is not None:
         return self.feature_shape
     return data_overview.get_features_shape(data_instances)
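
Example #24 memoizes the computed shape, while Example #25 prefers an explicitly set feature_shape and otherwise delegates to the helper on every call. A sketch of the memoization idiom from Example #24 outside a class, since get_features_shape inspects the table each time it runs (illustrative only):

_cached_shape = None


def cached_features_shape(data_instances):
    # Compute once, then reuse the cached value on later calls.
    global _cached_shape
    if _cached_shape is None:
        _cached_shape = get_features_shape(data_instances)
    return _cached_shape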