Example #1
def compute_gradient(data_instances, fore_gradient, fit_intercept):
    """
    Compute hetero-regression gradient
    Parameters
    ----------
    data_instances: DTable, input data
    fore_gradient: DTable, fore_gradient
    fit_intercept: bool, whether the model has an intercept

    Returns
    ----------
    DTable
        the hetero regression model's gradient
    """
    feat_join_grad = data_instances.join(fore_gradient, lambda d, g:
                                         (d.features, g))
    is_sparse = data_overview.is_sparse_data(data_instances)
    f = functools.partial(__compute_partition_gradient,
                          fit_intercept=fit_intercept,
                          is_sparse=is_sparse)
    gradient_partition = feat_join_grad.applyPartitions(f)
    gradient_partition = gradient_partition.reduce(lambda x, y: x + y)

    gradient = gradient_partition / data_instances.count()

    return gradient
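
The per-partition helper `__compute_partition_gradient` is not shown above. A minimal dense-only sketch of what such a helper might do, based solely on how it is called here (the iteration protocol and behaviour are assumptions), is:

import numpy as np

def _compute_partition_gradient(kv_iterator, fit_intercept=True, is_sparse=False):
    # Sketch only: accumulate sum_i g_i * x_i over one partition of
    # (features, fore_gradient) pairs produced by the join above.
    feature_sum = None
    bias_sum = 0.0
    for _, (features, g) in kv_iterator:
        term = np.asarray(features) * g  # dense case; sparse handling omitted
        feature_sum = term if feature_sum is None else feature_sum + term
        bias_sum += g
    if fit_intercept:
        # append the intercept gradient as the last component
        return np.append(feature_sum, bias_sum)
    return feature_sum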
Example #2
    def query_quantile_point(self, data_instances, cols, query_points):
        # self.cols = cols
        # self._init_cols(data_instances)

        is_sparse = data_overview.is_sparse_data(data_instances)
        if self.summary_dict is None:
            f = functools.partial(self.approxi_quantile,
                                  cols_dict=self.bin_inner_param.bin_cols_map,
                                  params=self.params,
                                  header=self.header,
                                  abnormal_list=self.abnormal_list,
                                  is_sparse=is_sparse)
            summary_dict = data_instances.mapPartitions(f)
            summary_dict = summary_dict.reduce(self.merge_summary_dict)
            self.summary_dict = summary_dict
        else:
            summary_dict = self.summary_dict

        if isinstance(query_points, (int, float)):
            query_dict = {}
            for col_name in cols:
                query_dict[col_name] = query_points
        elif isinstance(query_points, dict):
            query_dict = query_points
        else:
            raise ValueError(
                "query_points has wrong type, should be a float, int or dict")

        result = {}
        for col_name, query_point in query_dict.items():
            summary = summary_dict[col_name]
            result[col_name] = summary.query(query_point)
        return result
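
A hedged usage sketch (the object and column names are hypothetical, for illustration only): querying the median of two columns after the quantile summaries have been built.

# Hypothetical usage: query the 50th percentile of two fitted columns.
medians = binning_obj.query_quantile_point(data_instances,
                                           cols=['x1', 'x2'],
                                           query_points=0.5)
# e.g. {'x1': 0.31, 'x2': 42.0}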
Example #3
    def fit_split_points(self, data_instances):
        """
        Apply the binning method

        Parameters
        ----------
        data_instances : DTable
            The input data

        Returns
        -------
        split_points : dict
            Each value is the list of split points for a feature; each element
            is one split point.
            e.g.
            split_points = {'x1': [0.1, 0.2, 0.3, 0.4 ...],    # The first feature
                            'x2': [1, 2, 3, 4, ...],           # The second feature
                            ...                                # Other features
                            }
        """
        header = data_overview.get_header(data_instances)
        self._default_setting(header)
        # self._init_cols(data_instances)
        percent_value = 1.0 / self.bin_num

        # calculate the split points
        percentile_rate = [i * percent_value for i in range(1, self.bin_num)]
        percentile_rate.append(1.0)
        is_sparse = data_overview.is_sparse_data(data_instances)

        # self._fit_split_point_deprecate(data_instances, is_sparse, percentile_rate)
        self._fit_split_point(data_instances, is_sparse, percentile_rate)

        self.fit_category_features(data_instances)
        return self.bin_results.all_split_points
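
For intuition, the same percentile grid applied to a plain NumPy column yields split points as sketched below (a local, non-federated illustration; the method above works on distributed quantile summaries instead):

import numpy as np

bin_num = 4
percent_value = 1.0 / bin_num
percentile_rate = [i * percent_value for i in range(1, bin_num)] + [1.0]
# -> [0.25, 0.5, 0.75, 1.0]

x = np.array([1, 2, 3, 4, 5, 6, 7, 8], dtype=float)
split_points = [float(np.quantile(x, q)) for q in percentile_rate]
# -> [2.75, 4.5, 6.25, 8.0] for this column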
Example #4
    def convert_feature_to_bin(self, data_instances, split_points=None):
        is_sparse = data_overview.is_sparse_data(data_instances)
        schema = data_instances.schema

        if split_points is None:
            split_points = self.bin_results.all_split_points
        else:
            for col_name, sp in split_points.items():
                self.bin_results.put_col_split_points(col_name, sp)

        if is_sparse:
            f = functools.partial(self._convert_sparse_data,
                                  bin_inner_param=self.bin_inner_param,
                                  bin_results=self.bin_results,
                                  abnormal_list=self.abnormal_list,
                                  convert_type='bin_num')
            new_data = data_instances.mapValues(f)
        else:
            f = functools.partial(self._convert_dense_data,
                                  bin_inner_param=self.bin_inner_param,
                                  bin_results=self.bin_results,
                                  abnormal_list=self.abnormal_list,
                                  convert_type='bin_num')
            new_data = data_instances.mapValues(f)
        new_data.schema = schema
        header = get_header(data_instances)
        bin_sparse = self.get_sparse_bin(
            self.bin_inner_param.transform_bin_indexes, split_points, header)
        split_points_result = self.bin_results.get_split_points_array(
            self.bin_inner_param.transform_bin_names)

        return new_data, split_points_result, bin_sparse
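
A hedged usage sketch of the return values (the fitted object name is hypothetical):

# Hypothetical usage on an already-fitted binning object.
new_data, split_points_arr, bin_sparse = binning_obj.convert_feature_to_bin(data_instances)
# new_data         : table whose features now hold bin numbers instead of raw values
# split_points_arr : per-column arrays of the split points actually used
# bin_sparse       : bin index assigned to the implicit zero of each sparse column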
Example #5
    def fit_split_points(self, data_instances):
        """
        Apply the binning method

        Parameters
        ----------
        data_instances : DTable
            The input data

        Returns
        -------
        split_points : dict
            Each value is the list of split points for a feature; each element
            is one split point.
            e.g.
            split_points = {'x1': [0.1, 0.2, 0.3, 0.4 ...],    # The first feature
                            'x2': [1, 2, 3, 4, ...],           # The second feature
                            ...                                # Other features
                            }

        """
        header = data_overview.get_header(data_instances)
        self._default_setting(header)
        # self._init_cols(data_instances)
        percent_value = 1.0 / self.bin_num

        # calculate the split points
        percentile_rate = [i * percent_value for i in range(1, self.bin_num)]
        percentile_rate.append(1.0)
        is_sparse = data_overview.is_sparse_data(data_instances)

        if self.summary_dict is None:
            f = functools.partial(self.approxi_quantile,
                                  params=self.params,
                                  abnormal_list=self.abnormal_list,
                                  cols_dict=self.bin_inner_param.bin_cols_map,
                                  header=self.header,
                                  is_sparse=is_sparse)
            summary_dict = data_instances.mapPartitions(f)
            summary_dict = summary_dict.reduce(self.merge_summary_dict)
            if is_sparse:
                total_count = data_instances.count()
                for _, summary_obj in summary_dict.items():
                    summary_obj.set_total_count(total_count)

            self.summary_dict = summary_dict
        else:
            summary_dict = self.summary_dict
        # split_points = {}
        for col_name, summary in summary_dict.items():
            split_point = []
            for percen_rate in percentile_rate:
                s_p = summary.query(percen_rate)
                if s_p not in split_point:
                    split_point.append(s_p)
            self.bin_results.put_col_split_points(col_name, split_point)

        self.fit_category_features(data_instances)
        return self.bin_results.all_split_points
Example #6
    def convert_feature_to_bin(self, data_instances, transform_cols_idx=-1, split_points=None):
        self._init_cols(data_instances)
        if transform_cols_idx is None:
            return data_instances, None, None

        if transform_cols_idx == -1:
            transform_cols_idx = self.cols_index
        else:
            assert isinstance(transform_cols_idx, (list, tuple))
            LOGGER.debug('In convert_feature_to_bin, transform_cols_idx: {}, col_index: {}, cols: {}'.format(
                transform_cols_idx, self.cols_index, self.cols
            ))
            for col in transform_cols_idx:
                if col not in self.cols_index:
                    raise RuntimeError("Binning Transform cols: {} should be fit before transform".format(col))

        transform_cols_idx = list(map(int, transform_cols_idx))
        if split_points is None:
            split_points = self.split_points

        is_sparse = data_overview.is_sparse_data(data_instances)

        LOGGER.debug("In convert_feature_to_bin, split_points: {}, header: {}, transform_cols_idx: {}".format(
            split_points, self.header, transform_cols_idx
        ))

        if is_sparse:
            f = functools.partial(self._convert_sparse_data,
                                  transform_cols_idx=transform_cols_idx,
                                  split_points_dict=split_points,
                                  header=self.header,
                                  abnormal_list=self.abnormal_list
                                  )
            new_data = data_instances.mapValues(f)
        else:
            f = functools.partial(self._convert_dense_data,
                                  transform_cols_idx=transform_cols_idx,
                                  split_points_dict=split_points,
                                  header=self.header,
                                  abnormal_list=self.abnormal_list)
            new_data = data_instances.mapValues(f)
        new_data.schema = {"header": self.header}
        bin_sparse = self.get_sparse_bin(transform_cols_idx, split_points)
        split_points_result = []
        for idx, col_name in enumerate(self.header):
            if col_name not in self.split_points:
                continue
            s_ps = self.split_points[col_name]
            s_ps = np.array(s_ps)
            split_points_result.append(s_ps)
        split_points_result = np.array(split_points_result)
        assert len(split_points_result) == len(self.split_points)
        LOGGER.debug("Original split_points: {}, changed split_point: {}".format(self.split_points, split_points_result))
        LOGGER.debug("In convert_feature_to_bin, new_data: {}, split_point_result: {}, bin_sparse: {}".format(
            new_data, split_points_result, bin_sparse
        ))
        return new_data, split_points_result, bin_sparse
Example #7
    def init_bucket(self, data_instances):
        header = data_overview.get_header(data_instances)
        self._default_setting(header)

        init_bucket_param = copy.deepcopy(self.params)
        init_bucket_param.bin_num = self.optimal_param.init_bin_nums
        if self.optimal_param.init_bucket_method == consts.QUANTILE:
            init_binning_obj = QuantileBinningTool(param_obj=init_bucket_param,
                                                   allow_duplicate=False)
        else:
            init_binning_obj = BucketBinning(params=init_bucket_param)
        init_binning_obj.set_bin_inner_param(self.bin_inner_param)
        init_split_points = init_binning_obj.fit_split_points(data_instances)
        is_sparse = data_overview.is_sparse_data(data_instances)

        bucket_dict = dict()
        for col_name, sps in init_split_points.items():

            bucket_list = []
            for idx, sp in enumerate(sps):
                bucket = bucket_info.Bucket(idx,
                                            self.adjustment_factor,
                                            right_bound=sp)
                if idx == 0:
                    bucket.left_bound = -math.inf
                    bucket.set_left_neighbor(None)
                else:
                    bucket.left_bound = sps[idx - 1]
                bucket.event_total = self.event_total
                bucket.non_event_total = self.non_event_total
                bucket_list.append(bucket)
            bucket_list[-1].set_right_neighbor(None)
            bucket_dict[col_name] = bucket_list
            # LOGGER.debug(f"col_name: {col_name}, length of sps: {len(sps)}, "
            #              f"length of list: {len(bucket_list)}")

        convert_func = functools.partial(
            self.convert_data_to_bucket,
            split_points=init_split_points,
            headers=self.header,
            bucket_dict=copy.deepcopy(bucket_dict),
            is_sparse=is_sparse,
            get_bin_num_func=self.get_bin_num)
        bucket_table = data_instances.mapReducePartitions(
            convert_func, self.merge_bucket_list)
        # bucket_table = dict(bucket_table.collect())

        # for k, v in bucket_table.items():
        #     LOGGER.debug(f"[feature] {k}, length of list: {len(v)}")

        # LOGGER.debug("bucket_table: {}, length: {}".format(type(bucket_table), len(bucket_table)))
        # bucket_table = [(k, v) for k, v in bucket_table.items()]
        # LOGGER.debug("bucket_table: {}, length: {}".format(type(bucket_table), len(bucket_table)))

        # bucket_table = session.parallelize(bucket_table, include_key=True, partition=data_instances.partitions)

        return bucket_table
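
`self.get_bin_num` is passed in as `get_bin_num_func` but not shown. A minimal sketch of such a helper, assuming the split points are sorted right bounds as built above, could be a binary search:

import bisect

def get_bin_num(value, split_points):
    # Sketch: index of the first bucket whose right bound is >= value,
    # clamped so values above the last bound fall into the last bucket.
    idx = bisect.bisect_left(split_points, value)
    return min(idx, len(split_points) - 1)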
Example #8
    def fit_category_features(self, data_instances):
        is_sparse = data_overview.is_sparse_data(data_instances)

        if len(self.bin_inner_param.category_indexes) > 0:
            statics_obj = data_overview.DataStatistics()
            category_col_values = statics_obj.static_all_values(data_instances,
                                                                self.bin_inner_param.category_indexes,
                                                                is_sparse)
            for col_name, split_points in zip(self.bin_inner_param.category_names, category_col_values):
                self.bin_results.put_col_split_points(col_name, split_points)
Example #9
    def compute_gradient(self,
                         data_instances,
                         fore_gradient,
                         fit_intercept,
                         need_average=True):
        """
        Compute hetero-regression gradient
        Parameters
        ----------
        data_instances: Table, input data
        fore_gradient: Table, fore_gradient
        fit_intercept: bool, whether the model has an intercept
        need_average: bool, whether the gradient should be averaged

        Returns
        ----------
        Table
            the hetero regression model's gradient
        """

        # feature_num = data_overview.get_features_shape(data_instances)
        # data_count = data_instances.count()
        is_sparse = data_overview.is_sparse_data(data_instances)

        LOGGER.debug("Use apply partitions")
        feat_join_grad = data_instances.join(fore_gradient, lambda d, g:
                                             (d.features, g))
        f = functools.partial(self.__apply_cal_gradient,
                              fixed_point_encoder=self.fixed_point_encoder,
                              is_sparse=is_sparse)
        gradient_sum = feat_join_grad.applyPartitions(f)
        gradient_sum = gradient_sum.reduce(lambda x, y: x + y)
        if fit_intercept:
            # bias_grad = np.sum(fore_gradient)
            bias_grad = fore_gradient.reduce(lambda x, y: x + y)
            gradient_sum = np.append(gradient_sum, bias_grad)

        if need_average:
            gradient = gradient_sum / data_instances.count()
        else:
            gradient = gradient_sum
        """
        else:
            LOGGER.debug(f"Original_method")
            feat_join_grad = data_instances.join(fore_gradient,
                                                 lambda d, g: (d.features, g))
            f = functools.partial(self.__compute_partition_gradient,
                                  fit_intercept=fit_intercept,
                                  is_sparse=is_sparse)
            gradient_partition = feat_join_grad.applyPartitions(f)
            gradient_partition = gradient_partition.reduce(lambda x, y: x + y)

            gradient = gradient_partition / data_count
        """
        return gradient
Example #10
    def woe_transformer(data_instances, bin_inner_param, multi_class_bin_res: MultiClassBinResult,
                        abnormal_list=None):
        if abnormal_list is None:
            abnormal_list = []
        bin_res = multi_class_bin_res.bin_results[0]
        transform_cols_idx = bin_inner_param.transform_bin_indexes
        split_points_dict = bin_res.all_split_points
        is_sparse = data_overview.is_sparse_data(data_instances)

        def convert(instances):
            if is_sparse:
                all_data = instances.features.get_all_data()
                indice = []
                sparse_value = []
                data_shape = instances.features.get_shape()
                for col_idx, col_value in all_data:
                    if col_idx in transform_cols_idx:
                        if col_value in abnormal_list:
                            indice.append(col_idx)
                            sparse_value.append(col_value)
                            continue
                        # Missing values may have been added as explicit sparse entries here
                        col_name = bin_inner_param.header[col_idx]
                        split_points = split_points_dict[col_name]
                        bin_num = BaseBinning.get_bin_num(col_value, split_points)
                        indice.append(col_idx)
                        col_results = bin_res.all_cols_results.get(col_name)
                        woe_value = col_results.woe_array[bin_num]
                        sparse_value.append(woe_value)
                    else:
                        indice.append(col_idx)
                        sparse_value.append(col_value)
                sparse_vector = SparseVector(indice, sparse_value, data_shape)
                instances.features = sparse_vector
            else:
                features = instances.features
                assert isinstance(features, np.ndarray)
                transform_cols_idx_set = set(transform_cols_idx)

                for col_idx, col_value in enumerate(features):
                    if col_idx in transform_cols_idx_set:
                        if col_value in abnormal_list:
                            features[col_idx] = col_value
                            continue
                        col_name = bin_inner_param.header[col_idx]
                        split_points = split_points_dict[col_name]
                        bin_num = BaseBinning.get_bin_num(col_value, split_points)
                        col_results = bin_res.all_cols_results.get(col_name)
                        woe_value = col_results.woe_array[bin_num]
                        features[col_idx] = woe_value
                instances.features = features
            return instances

        return data_instances.mapValues(convert)
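
The `woe_array` consulted above is assumed to be precomputed per bin elsewhere. For reference, a common weight-of-evidence formula for a single bin is sketched below (a textbook definition, not necessarily the exact convention or smoothing used here):

import math

def woe(event_count, non_event_count, event_total, non_event_total, eps=1e-10):
    # Standard WOE for one bin; eps guards against empty bins.
    event_rate = max(event_count / event_total, eps)
    non_event_rate = max(non_event_count / non_event_total, eps)
    return math.log(event_rate / non_event_rate)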
Example #11
    def _static_sums(self):
        """
        Compute the sum, sum of squares, max and min values,
        so that the variance can be derived.
        """
        is_sparse = data_overview.is_sparse_data(self.data_instances)
        partition_cal = functools.partial(self.static_in_partition,
                                          cols_index=self.cols_index,
                                          summary_statistics=copy.deepcopy(self.summary_statistics),
                                          is_sparse=is_sparse)
        self.summary_statistics = self.data_instances.applyPartitions(partition_cal). \
            reduce(lambda x, y: self.copy_merge(x, y))
        # self.summary_statistics = summary_statistic_dict.reduce(self.aggregate_statics)
        self.finish_fit_statics = True
Example #12
    def fit_split_points(self, data_instances):
        """
        Apply the binning method

        Parameters
        ----------
        data_instances : DTable
            The input data

        Returns
        -------
        split_points : dict
            Each value is the list of split points for a feature; each element
            is one split point.
            e.g.
            split_points = {'x1': [0.1, 0.2, 0.3, 0.4 ...],    # The first feature
                            'x2': [1, 2, 3, 4, ...],           # The second feature
                            ...                                # Other features
                            }

        """
        header = data_overview.get_header(data_instances)
        self._default_setting(header)

        is_sparse = data_overview.is_sparse_data(data_instances)
        if is_sparse:
            raise RuntimeError(
                "Bucket binning does not support sparse data yet.")

        # self._init_cols(data_instances)

        statistics = MultivariateStatisticalSummary(
            data_instances,
            self.bin_inner_param.bin_indexes,
            abnormal_list=self.abnormal_list)
        max_dict = statistics.get_max()
        min_dict = statistics.get_min()
        for col_name, max_value in max_dict.items():
            min_value = min_dict.get(col_name)
            split_points = []
            L = (max_value - min_value) / self.bin_num
            for k in range(self.bin_num - 1):
                s_p = min_value + (k + 1) * L
                split_points.append(s_p)
            split_points.append(max_value)
            # final_split_points[col_name] = split_point
            self.bin_results.put_col_split_points(col_name, split_points)
        self.fit_category_features(data_instances)
        return self.bin_results.all_split_points
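
Because the split points are equal-width, they are easy to verify by hand; a tiny self-contained sketch for one column with min 0, max 10 and bin_num 5:

bin_num = 5
min_value, max_value = 0.0, 10.0
width = (max_value - min_value) / bin_num
split_points = [min_value + (k + 1) * width for k in range(bin_num - 1)]
split_points.append(max_value)
# -> [2.0, 4.0, 6.0, 8.0, 10.0]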
Example #13
    def check_containing_missing_value(data_instances):
        is_sparse = data_overview.is_sparse_data(data_instances)

        def _sparse_check(instance):
            result = set()
            sparse_data = instance.features.get_all_data()
            for col_idx, col_value in sparse_data:
                if np.isnan(col_value):
                    result.add(col_idx)
            return result

        if is_sparse:
            has_missing_value = data_instances.mapValues(_sparse_check).reduce(
                lambda a, b: a.union(b)
            )
        else:
            has_missing_value = data_instances.mapValues(lambda x: x.features).reduce(operator.add)
            has_missing_value = {idx for idx, value in enumerate(has_missing_value) if np.isnan(value)}
        return has_missing_value
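
A hedged usage sketch (calling context and variable names are hypothetical): the result is the set of column indexes that contain at least one NaN anywhere in the table.

# Hypothetical usage, assuming the method is exposed as a static helper.
missing_cols = check_containing_missing_value(data_instances)
if missing_cols:
    print("columns with missing values:", sorted(missing_cols))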
Example #14
    def convert_feature_to_woe(self, data_instances):
        is_sparse = data_overview.is_sparse_data(data_instances)
        schema = data_instances.schema

        if is_sparse:
            f = functools.partial(self._convert_sparse_data,
                                  bin_inner_param=self.bin_inner_param,
                                  bin_results=self.bin_results,
                                  abnormal_list=self.abnormal_list,
                                  convert_type='woe')
            new_data = data_instances.mapValues(f)
        else:
            f = functools.partial(self._convert_dense_data,
                                  bin_inner_param=self.bin_inner_param,
                                  bin_results=self.bin_results,
                                  abnormal_list=self.abnormal_list,
                                  convert_type='woe')
            new_data = data_instances.mapValues(f)
        new_data.schema = schema
        return new_data
Example #15
    def fit_summary(self, data_instances, is_sparse=None):
        if is_sparse is None:
            is_sparse = data_overview.is_sparse_data(data_instances)
            LOGGER.debug(f"is_sparse: {is_sparse}")

        f = functools.partial(self.feature_summary,
                              params=self.params,
                              abnormal_list=self.abnormal_list,
                              cols_dict=self.bin_inner_param.bin_cols_map,
                              header=self.header,
                              is_sparse=is_sparse)
        summary_dict_table = data_instances.mapReducePartitions(
            f, self.copy_merge)
        # summary_dict = dict(summary_dict.collect())

        if is_sparse:
            total_count = data_instances.count()
            summary_dict_table = summary_dict_table.mapValues(
                lambda x: x.set_total_count(total_count))
        return summary_dict_table
Example #16
    def get_data_bin(self, data_instances, split_points=None):
        """
        Apply the binning method

        Parameters
        ----------
        data_instances : DTable
            The input data

        split_points : dict
            Each value is the list of split points for a feature; each element
            is one split point.
            e.g.
            split_points = {'x1': [0.1, 0.2, 0.3, 0.4 ...],    # The first feature
                            'x2': [1, 2, 3, 4, ...],           # The second feature
                            ...                                # Other features
                            }

        Returns
        -------
        data_bin_table : DTable
            Each element holds, for every transformed feature, the bin number
            that feature value falls into.
            e.g. one element could be:
            {'x1': 1, 'x2': 5, 'x3': 2}
        """
        # self._init_cols(data_instances)
        is_sparse = data_overview.is_sparse_data(data_instances)

        if split_points is None:
            split_points = self.fit_split_points(data_instances)

        f = functools.partial(self.bin_data,
                              split_points=split_points,
                              cols_dict=self.bin_inner_param.bin_cols_map,
                              header=self.header,
                              is_sparse=is_sparse)
        data_bin_dict = data_instances.mapValues(f)
        return data_bin_dict
Example #17
def empty_column_detection(data_instance):

    contains_empty_columns = False
    lost_feat = []
    is_sparse = data_overview.is_sparse_data(data_instance)
    if is_sparse:
        raise ValueError(
            'sparse format empty column detection is not supported for now')
    map_func = functools.partial(column_gathering)
    map_rs = data_instance.applyPartitions(map_func)
    reduce_rs = map_rs.reduce(merge_column_sets)

    # transform col index to col name
    reduce_rs = np.array(data_instance.schema['header'])[list(reduce_rs)]
    reduce_rs = set(reduce_rs)

    if reduce_rs != set(data_instance.schema['header']):
        lost_feat = list(
            set(data_instance.schema['header']).difference(reduce_rs))
        contains_empty_columns = True

    if contains_empty_columns:
        raise ValueError('column(s) {} contain(s) no values'.format(lost_feat))
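
`column_gathering` and `merge_column_sets` are not shown. A dense-only sketch of what they might look like, inferred from the call sites (names reused, bodies are assumptions):

import numpy as np

def column_gathering(iterable):
    # Sketch: collect the indexes of columns holding at least one
    # non-missing value within this partition.
    non_empty_columns = set()
    for _, instance in iterable:
        non_empty_columns.update(
            idx for idx, v in enumerate(instance.features) if not np.isnan(v))
    return non_empty_columns

def merge_column_sets(set_a, set_b):
    # Sketch: union the per-partition results.
    return set_a.union(set_b)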
Example #18
    def init_bucket(self, data_instances):
        header = data_overview.get_header(data_instances)
        self._default_setting(header)

        init_bucket_param = copy.deepcopy(self.params)
        init_bucket_param.bin_num = self.optimal_param.init_bin_nums
        if self.optimal_param.init_bucket_method == consts.QUANTILE:
            init_binning_obj = QuantileBinningTool(param_obj=init_bucket_param, allow_duplicate=False)
        else:
            init_binning_obj = BucketBinning(params=init_bucket_param)
        init_binning_obj.set_bin_inner_param(self.bin_inner_param)
        init_split_points = init_binning_obj.fit_split_points(data_instances)
        is_sparse = data_overview.is_sparse_data(data_instances)

        bucket_dict = dict()
        for col_name, sps in init_split_points.items():

            # bucket_list = [bucket_info.Bucket(idx, self.adjustment_factor, right_bound=sp)
            #                for idx, sp in enumerate(sps)]
            bucket_list = []
            for idx, sp in enumerate(sps):
                bucket = bucket_info.Bucket(idx, self.adjustment_factor, right_bound=sp)
                if idx == 0:
                    bucket.left_bound = -math.inf
                    bucket.set_left_neighbor(None)
                else:
                    bucket.left_bound = sps[idx - 1]
                bucket.event_total = self.event_total
                bucket.non_event_total = self.non_event_total
                bucket_list.append(bucket)
            bucket_list[-1].set_right_neighbor(None)
            bucket_dict[col_name] = bucket_list
            LOGGER.debug(f"col_name: {col_name}, length of sps: {len(sps)}, "
                         f"length of list: {len(bucket_list)}")

        # bucket_table = data_instances.mapPartitions2(convert_func)
        # bucket_table = bucket_table.reduce(self.merge_bucket_list, key_func=lambda key: key[1])
        from fate_arch.common.versions import get_eggroll_version
        version = get_eggroll_version()
        if version.startswith('2.0'):
            convert_func = functools.partial(self.convert_data_to_bucket_old,
                                             split_points=init_split_points,
                                             headers=self.header,
                                             bucket_dict=copy.deepcopy(bucket_dict),
                                             is_sparse=is_sparse,
                                             get_bin_num_func=self.get_bin_num)
            summary_dict = data_instances.mapPartitions(convert_func, use_previous_behavior=False)
            # summary_dict = summary_dict.reduce(self.copy_merge, key_func=lambda key: key[1])
            from federatedml.util.reduce_by_key import reduce
            bucket_table = reduce(summary_dict, self.merge_bucket_list, key_func=lambda key: key[1])
        elif version.startswith('2.2'):
            convert_func = functools.partial(self.convert_data_to_bucket,
                                             split_points=init_split_points,
                                             headers=self.header,
                                             bucket_dict=copy.deepcopy(bucket_dict),
                                             is_sparse=is_sparse,
                                             get_bin_num_func=self.get_bin_num)
            bucket_table = data_instances.mapReducePartitions(convert_func, self.merge_bucket_list)
            bucket_table = dict(bucket_table.collect())
        else:
            raise RuntimeError(f"Cannot recognized eggroll version: {version}")

        for k, v in bucket_table.items():
            LOGGER.debug(f"[feature] {k}, length of list: {len(v)}")

        LOGGER.debug("bucket_table: {}, length: {}".format(type(bucket_table), len(bucket_table)))
        bucket_table = [(k, v) for k, v in bucket_table.items()]
        LOGGER.debug("bucket_table: {}, length: {}".format(type(bucket_table), len(bucket_table)))

        bucket_table = session.parallelize(bucket_table, include_key=True, partition=data_instances.partitions)

        return bucket_table

    def _find_kth_mode(self, data_instances, k):
        """
        Find the 1/k mode. If some value occurs in more than 1/k of the total
        rows of a column, return that value and its count for that column;
        otherwise return None for that column.

        Parameters
        ----------
        data_instances: DTable
            Original data

        k: int
        """
        is_sparse = is_sparse_data(data_instances)

        def find_mode_candidate(instances, select_cols):
            """
            Find at most k - 1 mode candidates.
            Parameters
            ----------
            instances: Data generator
                Original data
            k: int

            select_cols: list
                Indicates columns that need to be operated.

            is_sparse: bool
                Whether input data format is sparse

            Returns
            -------
            all_candidates: dict
                Each key is col_index and value is a list that contains mode candidates.
            """
            all_candidates = {}
            for col_index in select_cols:
                all_candidates[col_index] = {}

            for _, instant in instances:
                for col_index in select_cols:
                    candidate_dict = all_candidates[col_index]
                    if is_sparse:
                        feature_value = instant.features.get_data(col_index, 0)
                    else:
                        feature_value = instant.features[col_index]
                    if isinstance(feature_value, float):
                        feature_value = round(feature_value, 8)

                    if feature_value in candidate_dict:
                        candidate_dict[feature_value] += 1
                    elif len(candidate_dict) < k - 1:
                        candidate_dict[feature_value] = 1
                    else:
                        to_delete_col = []
                        for key in candidate_dict:
                            candidate_dict[key] -= 1
                            if candidate_dict[key] == 0:
                                to_delete_col.append(key)
                        for d_k in to_delete_col:
                            del candidate_dict[d_k]
            for col_index, candidate_dict in all_candidates.items():
                candidate_dict = {key: 0 for key, _ in candidate_dict.items()}
                all_candidates[col_index] = candidate_dict

            return all_candidates

        def merge_mode_candidate(d1, d2):
            assert len(d1) == len(d2)
            for col_idx, d in d1.items():
                d.update(d2[col_idx])
            return d1

        def merge_candidates_num(candi_1, candi_2):
            assert len(candi_1) == len(candi_2)
            for col_idx, candidate_dict in candi_1.items():
                candi_dict_2 = candi_2[col_idx]
                for feature_value, num in candi_dict_2.items():
                    if feature_value in candidate_dict:
                        candidate_dict[feature_value] += num
                    else:
                        candidate_dict[feature_value] = num
            return candi_1

        def static_candidates_num(instances, select_cols, all_candidates):
            """
            Static number of candidates
            Parameters
            ----------
            instances: Data generator
                Original data

            select_cols: list
                Indicates columns that need to be operated.

            all_candidates: dict
                Each key is col_index and value is a list that contains mode candidates.
            """

            for _, instant in instances:
                for col_index in select_cols:
                    candidate_dict = all_candidates[col_index]
                    if is_sparse:
                        feature_value = instant.features.get_data(
                            col_index, NoneType())
                    else:
                        feature_value = instant.features[col_index]
                    if isinstance(feature_value, float):
                        feature_value = round(feature_value, 8)

                    if feature_value in candidate_dict:
                        candidate_dict[feature_value] += 1

            # mode_result = {}
            # for col_index, candidate_dict in all_candidates.items():
            #     feature_value, nums = sorted(candidate_dict.items(), key=operator.itemgetter(1), reverse=False)[0]
            #     mode_result[col_index] = (feature_value, nums)
            return all_candidates

        find_func = functools.partial(
            find_mode_candidate,
            select_cols=self.selection_properties.select_col_indexes)
        all_candidates = data_instances.mapPartitions(find_func).reduce(
            merge_mode_candidate)
        static_func = functools.partial(
            static_candidates_num,
            select_cols=self.selection_properties.select_col_indexes,
            all_candidates=all_candidates)
        mode_candidate_statics = data_instances.mapPartitions(
            static_func).reduce(merge_candidates_num)
        result = {}
        for col_index, candidate_dict in mode_candidate_statics.items():
            if len(candidate_dict) > 0:
                res = sorted(candidate_dict.items(),
                             key=operator.itemgetter(1),
                             reverse=True)[0]
            else:
                res = None
            result[col_index] = res

        return result
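
The candidate-finding pass in `find_mode_candidate` is essentially the Misra-Gries heavy-hitters algorithm. A self-contained sketch on a plain Python list, using the same k - 1 counter budget:

def misra_gries_candidates(values, k):
    # Sketch: keep at most k - 1 counters; any value occurring in more than
    # 1/k of the stream is guaranteed to survive as a candidate.
    counters = {}
    for v in values:
        if v in counters:
            counters[v] += 1
        elif len(counters) < k - 1:
            counters[v] = 1
        else:
            for key in list(counters):
                counters[key] -= 1
                if counters[key] == 0:
                    del counters[key]
    return set(counters)

# e.g. misra_gries_candidates([1, 1, 1, 2, 3, 1, 2], k=2) -> {1}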