def compute_gradient(data_instances, fore_gradient, fit_intercept):
    """
    Compute hetero-regression gradient

    Parameters
    ----------
    data_instances: DTable, input data
    fore_gradient: DTable, fore_gradient
    fit_intercept: bool, whether the model has an intercept

    Returns
    -------
    DTable
        the hetero-regression model's gradient
    """
    feat_join_grad = data_instances.join(fore_gradient,
                                         lambda d, g: (d.features, g))
    is_sparse = data_overview.is_sparse_data(data_instances)
    f = functools.partial(__compute_partition_gradient,
                          fit_intercept=fit_intercept,
                          is_sparse=is_sparse)
    gradient_partition = feat_join_grad.applyPartitions(f)
    gradient_partition = gradient_partition.reduce(lambda x, y: x + y)
    gradient = gradient_partition / data_instances.count()
    return gradient
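# A minimal sketch of the per-partition helper used above; the real
# `__compute_partition_gradient` is not shown in this excerpt, so this body is
# an assumption inferred from the joined (features, fore_gradient) pairs: each
# partition accumulates grad * features, plus the raw grad for the intercept.
import numpy as np

def __compute_partition_gradient(data, fit_intercept=True, is_sparse=False):
    feature_sum = None
    bias_sum = 0
    for _, (features, grad) in data:
        # Dense path only; a sparse row would be walked through
        # features.get_all_data() instead (assumed, not shown).
        term = grad * features
        feature_sum = term if feature_sum is None else feature_sum + term
        bias_sum += grad
    # Assumes the partition is non-empty; the caller reduces these partial
    # sums with `lambda x, y: x + y` and divides by the instance count.
    if fit_intercept:
        return np.append(feature_sum, bias_sum)
    return feature_sum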
def query_quantile_point(self, data_instances, cols, query_points):
    is_sparse = data_overview.is_sparse_data(data_instances)
    if self.summary_dict is None:
        f = functools.partial(self.approxi_quantile,
                              cols_dict=self.bin_inner_param.bin_cols_map,
                              params=self.params,
                              header=self.header,
                              abnormal_list=self.abnormal_list,
                              is_sparse=is_sparse)
        summary_dict = data_instances.mapPartitions(f)
        summary_dict = summary_dict.reduce(self.merge_summary_dict)
        self.summary_dict = summary_dict
    else:
        summary_dict = self.summary_dict

    if isinstance(query_points, (int, float)):
        query_dict = {col_name: query_points for col_name in cols}
    elif isinstance(query_points, dict):
        query_dict = query_points
    else:
        raise ValueError("query_points has wrong type, should be a float, int or dict")

    result = {}
    for col_name, query_point in query_dict.items():
        summary = summary_dict[col_name]
        result[col_name] = summary.query(query_point)
    return result
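# Hypothetical usage (object and column names are illustrative): a float
# applies the same quantile to every listed column, while a dict maps each
# column to its own quantile.
medians = quantile_binning.query_quantile_point(data_instances, ['x1', 'x2'], 0.5)
# -> {'x1': <median of x1>, 'x2': <median of x2>}
quartiles = quantile_binning.query_quantile_point(
    data_instances, ['x1', 'x2'], {'x1': 0.25, 'x2': 0.75})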
def fit_split_points(self, data_instances):
    """
    Apply the binning method

    Parameters
    ----------
    data_instances : DTable
        The input data

    Returns
    -------
    split_points : dict.
        Each value represents the split points of one feature, e.g.
        split_points = {'x1': [0.1, 0.2, 0.3, 0.4 ...],    # The first feature
                        'x2': [1, 2, 3, 4, ...],           # The second feature
                        ...}                               # Other features
    """
    header = data_overview.get_header(data_instances)
    self._default_setting(header)

    # The quantiles to query: bin_num - 1 interior points plus 1.0 for the
    # right-most bound.
    percent_value = 1.0 / self.bin_num
    percentile_rate = [i * percent_value for i in range(1, self.bin_num)]
    percentile_rate.append(1.0)

    is_sparse = data_overview.is_sparse_data(data_instances)
    self._fit_split_point(data_instances, is_sparse, percentile_rate)

    self.fit_category_features(data_instances)
    return self.bin_results.all_split_points
def convert_feature_to_bin(self, data_instances, split_points=None):
    is_sparse = data_overview.is_sparse_data(data_instances)
    schema = data_instances.schema

    if split_points is None:
        split_points = self.bin_results.all_split_points
    else:
        for col_name, sp in split_points.items():
            self.bin_results.put_col_split_points(col_name, sp)

    if is_sparse:
        f = functools.partial(self._convert_sparse_data,
                              bin_inner_param=self.bin_inner_param,
                              bin_results=self.bin_results,
                              abnormal_list=self.abnormal_list,
                              convert_type='bin_num')
    else:
        f = functools.partial(self._convert_dense_data,
                              bin_inner_param=self.bin_inner_param,
                              bin_results=self.bin_results,
                              abnormal_list=self.abnormal_list,
                              convert_type='bin_num')
    new_data = data_instances.mapValues(f)
    new_data.schema = schema
    header = get_header(data_instances)
    bin_sparse = self.get_sparse_bin(self.bin_inner_param.transform_bin_indexes,
                                     split_points, header)
    split_points_result = self.bin_results.get_split_points_array(
        self.bin_inner_param.transform_bin_names)
    return new_data, split_points_result, bin_sparse
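# Hypothetical call pattern (object name is illustrative): the method returns
# the binned table together with the split-point array and the sparse-bin
# mapping for the transformed columns, so the three are usually unpacked
# together.
new_data, split_points_result, bin_sparse = binning_obj.convert_feature_to_bin(data_instances)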
def fit_split_points(self, data_instances):
    """
    Apply the binning method

    Parameters
    ----------
    data_instances : DTable
        The input data

    Returns
    -------
    split_points : dict.
        Each value represents the split points of one feature, e.g.
        split_points = {'x1': [0.1, 0.2, 0.3, 0.4 ...],    # The first feature
                        'x2': [1, 2, 3, 4, ...],           # The second feature
                        ...}                               # Other features
    """
    header = data_overview.get_header(data_instances)
    self._default_setting(header)

    # The quantiles to query: bin_num - 1 interior points plus 1.0 for the
    # right-most bound.
    percent_value = 1.0 / self.bin_num
    percentile_rate = [i * percent_value for i in range(1, self.bin_num)]
    percentile_rate.append(1.0)
    is_sparse = data_overview.is_sparse_data(data_instances)

    if self.summary_dict is None:
        f = functools.partial(self.approxi_quantile,
                              params=self.params,
                              abnormal_list=self.abnormal_list,
                              cols_dict=self.bin_inner_param.bin_cols_map,
                              header=self.header,
                              is_sparse=is_sparse)
        summary_dict = data_instances.mapPartitions(f)
        summary_dict = summary_dict.reduce(self.merge_summary_dict)
        if is_sparse:
            total_count = data_instances.count()
            for _, summary_obj in summary_dict.items():
                summary_obj.set_total_count(total_count)
        self.summary_dict = summary_dict
    else:
        summary_dict = self.summary_dict

    for col_name, summary in summary_dict.items():
        split_point = []
        for percen_rate in percentile_rate:
            s_p = summary.query(percen_rate)
            if s_p not in split_point:
                split_point.append(s_p)
        self.bin_results.put_col_split_points(col_name, split_point)

    self.fit_category_features(data_instances)
    return self.bin_results.all_split_points
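# Worked example of the de-duplication loop above, assuming bin_num = 4 so
# percentile_rate = [0.25, 0.5, 0.75, 1.0]. If a skewed feature's quantile
# summary answers those queries with [2.0, 2.0, 5.0, 9.0], the duplicate is
# dropped and the feature keeps only three split points, i.e. fewer than
# bin_num bins.
queried = [2.0, 2.0, 5.0, 9.0]
split_point = []
for s_p in queried:
    if s_p not in split_point:
        split_point.append(s_p)
assert split_point == [2.0, 5.0, 9.0]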
def convert_feature_to_bin(self, data_instances, transform_cols_idx=-1, split_points=None):
    self._init_cols(data_instances)
    if transform_cols_idx is None:
        return data_instances, None, None

    if transform_cols_idx == -1:
        transform_cols_idx = self.cols_index
    else:
        assert isinstance(transform_cols_idx, (list, tuple))

    LOGGER.debug('In convert_feature_to_bin, transform_cols_idx: {}, col_index: {}, cols: {}'.format(
        transform_cols_idx, self.cols_index, self.cols))
    for col in transform_cols_idx:
        if col not in self.cols_index:
            raise RuntimeError("Binning Transform cols: {} should be fit before transform".format(col))
    transform_cols_idx = list(map(int, transform_cols_idx))

    if split_points is None:
        split_points = self.split_points
    is_sparse = data_overview.is_sparse_data(data_instances)
    LOGGER.debug("In convert_feature_to_bin, split_points: {}, header: {}, transform_cols_idx: {}".format(
        split_points, self.header, transform_cols_idx))

    if is_sparse:
        f = functools.partial(self._convert_sparse_data,
                              transform_cols_idx=transform_cols_idx,
                              split_points_dict=split_points,
                              header=self.header,
                              abnormal_list=self.abnormal_list)
    else:
        f = functools.partial(self._convert_dense_data,
                              transform_cols_idx=transform_cols_idx,
                              split_points_dict=split_points,
                              header=self.header,
                              abnormal_list=self.abnormal_list)
    new_data = data_instances.mapValues(f)
    new_data.schema = {"header": self.header}
    bin_sparse = self.get_sparse_bin(transform_cols_idx, split_points)

    split_points_result = []
    for col_name in self.header:
        if col_name not in self.split_points:
            continue
        split_points_result.append(np.array(self.split_points[col_name]))
    split_points_result = np.array(split_points_result)
    assert len(split_points_result) == len(self.split_points)

    LOGGER.debug("Original split_points: {}, changed split_point: {}".format(
        self.split_points, split_points_result))
    LOGGER.debug("In convert_feature_to_bin, new_data: {}, split_point_result: {}, bin_sparse: {}".format(
        new_data, split_points_result, bin_sparse))
    return new_data, split_points_result, bin_sparse
def init_bucket(self, data_instances):
    header = data_overview.get_header(data_instances)
    self._default_setting(header)

    init_bucket_param = copy.deepcopy(self.params)
    init_bucket_param.bin_num = self.optimal_param.init_bin_nums
    if self.optimal_param.init_bucket_method == consts.QUANTILE:
        init_binning_obj = QuantileBinningTool(param_obj=init_bucket_param, allow_duplicate=False)
    else:
        init_binning_obj = BucketBinning(params=init_bucket_param)
    init_binning_obj.set_bin_inner_param(self.bin_inner_param)
    init_split_points = init_binning_obj.fit_split_points(data_instances)
    is_sparse = data_overview.is_sparse_data(data_instances)

    bucket_dict = dict()
    for col_name, sps in init_split_points.items():
        bucket_list = []
        for idx, sp in enumerate(sps):
            bucket = bucket_info.Bucket(idx, self.adjustment_factor, right_bound=sp)
            if idx == 0:
                bucket.left_bound = -math.inf
                bucket.set_left_neighbor(None)
            else:
                bucket.left_bound = sps[idx - 1]
            bucket.event_total = self.event_total
            bucket.non_event_total = self.non_event_total
            bucket_list.append(bucket)
        bucket_list[-1].set_right_neighbor(None)
        bucket_dict[col_name] = bucket_list

    convert_func = functools.partial(self.convert_data_to_bucket,
                                     split_points=init_split_points,
                                     headers=self.header,
                                     bucket_dict=copy.deepcopy(bucket_dict),
                                     is_sparse=is_sparse,
                                     get_bin_num_func=self.get_bin_num)
    bucket_table = data_instances.mapReducePartitions(convert_func, self.merge_bucket_list)
    return bucket_table
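# Illustration of the bucket chain built above (numbers invented): for initial
# split points sps = [1.0, 2.0, 3.0], three buckets are produced covering
#   bucket 0: (-inf, 1.0]   (left neighbor None)
#   bucket 1: (1.0, 2.0]
#   bucket 2: (2.0, 3.0]    (right neighbor None)
# i.e. each bucket's left bound is the previous split point, and the two chain
# endpoints are marked explicitly so later merges know where the chain stops.
# The right-closed interval notation is an assumption about get_bin_num.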
def fit_category_features(self, data_instances):
    is_sparse = data_overview.is_sparse_data(data_instances)
    if len(self.bin_inner_param.category_indexes) > 0:
        statics_obj = data_overview.DataStatistics()
        category_col_values = statics_obj.static_all_values(
            data_instances, self.bin_inner_param.category_indexes, is_sparse)
        for col_name, split_points in zip(self.bin_inner_param.category_names,
                                          category_col_values):
            self.bin_results.put_col_split_points(col_name, split_points)
def compute_gradient(self, data_instances, fore_gradient, fit_intercept, need_average=True):
    """
    Compute hetero-regression gradient

    Parameters
    ----------
    data_instances: Table, input data
    fore_gradient: Table, fore_gradient
    fit_intercept: bool, whether the model has an intercept
    need_average: bool, whether the gradient should be averaged over the instance count

    Returns
    -------
    Table
        the hetero regression model's gradient
    """
    is_sparse = data_overview.is_sparse_data(data_instances)
    LOGGER.debug("Use apply partitions")

    feat_join_grad = data_instances.join(fore_gradient,
                                         lambda d, g: (d.features, g))
    f = functools.partial(self.__apply_cal_gradient,
                          fixed_point_encoder=self.fixed_point_encoder,
                          is_sparse=is_sparse)
    gradient_sum = feat_join_grad.applyPartitions(f)
    gradient_sum = gradient_sum.reduce(lambda x, y: x + y)
    if fit_intercept:
        # The bias gradient is the sum of all fore_gradient values.
        bias_grad = fore_gradient.reduce(lambda x, y: x + y)
        gradient_sum = np.append(gradient_sum, bias_grad)

    if need_average:
        gradient = gradient_sum / data_instances.count()
    else:
        gradient = gradient_sum
    return gradient
def woe_transformer(data_instances, bin_inner_param, multi_class_bin_res: MultiClassBinResult,
                    abnormal_list=None):
    if abnormal_list is None:
        abnormal_list = []
    bin_res = multi_class_bin_res.bin_results[0]
    transform_cols_idx = bin_inner_param.transform_bin_indexes
    split_points_dict = bin_res.all_split_points
    is_sparse = data_overview.is_sparse_data(data_instances)

    def convert(instances):
        if is_sparse:
            all_data = instances.features.get_all_data()
            indices = []
            sparse_value = []
            data_shape = instances.features.get_shape()
            for col_idx, col_value in all_data:
                if col_idx in transform_cols_idx:
                    if col_value in abnormal_list:
                        # Abnormal (e.g. missing) values are kept as-is.
                        indices.append(col_idx)
                        sparse_value.append(col_value)
                        continue
                    col_name = bin_inner_param.header[col_idx]
                    split_points = split_points_dict[col_name]
                    bin_num = BaseBinning.get_bin_num(col_value, split_points)
                    indices.append(col_idx)
                    col_results = bin_res.all_cols_results.get(col_name)
                    woe_value = col_results.woe_array[bin_num]
                    sparse_value.append(woe_value)
                else:
                    indices.append(col_idx)
                    sparse_value.append(col_value)
            sparse_vector = SparseVector(indices, sparse_value, data_shape)
            instances.features = sparse_vector
        else:
            features = instances.features
            assert isinstance(features, np.ndarray)
            transform_cols_idx_set = set(transform_cols_idx)
            for col_idx, col_value in enumerate(features):
                if col_idx in transform_cols_idx_set:
                    if col_value in abnormal_list:
                        features[col_idx] = col_value
                        continue
                    col_name = bin_inner_param.header[col_idx]
                    split_points = split_points_dict[col_name]
                    bin_num = BaseBinning.get_bin_num(col_value, split_points)
                    col_results = bin_res.all_cols_results.get(col_name)
                    woe_value = col_results.woe_array[bin_num]
                    features[col_idx] = woe_value
            instances.features = features
        return instances

    return data_instances.mapValues(convert)
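# For reference, a WOE value such as those looked up from `woe_array` above is
# conventionally defined per bin as ln(event_rate / non_event_rate). The exact
# formula used to fill `woe_array` (including any smoothing via the adjustment
# factor) is not part of this excerpt, so the helper below is the textbook
# definition, not a quote of the library code.
import math

def woe(event_count, non_event_count, event_total, non_event_total):
    # Assumes all four counts are positive; real implementations smooth the
    # zero-count cases.
    return math.log((event_count / event_total) /
                    (non_event_count / non_event_total))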
def _static_sums(self):
    """
    Compute sum, sum of squares, max and min of each column, so that the
    variance can be derived later.
    """
    is_sparse = data_overview.is_sparse_data(self.data_instances)
    partition_cal = functools.partial(self.static_in_partition,
                                      cols_index=self.cols_index,
                                      summary_statistics=copy.deepcopy(self.summary_statistics),
                                      is_sparse=is_sparse)
    self.summary_statistics = self.data_instances.applyPartitions(partition_cal). \
        reduce(lambda x, y: self.copy_merge(x, y))
    self.finish_fit_statics = True
def fit_split_points(self, data_instances):
    """
    Apply the binning method

    Parameters
    ----------
    data_instances : DTable
        The input data

    Returns
    -------
    split_points : dict.
        Each value represents the split points of one feature, e.g.
        split_points = {'x1': [0.1, 0.2, 0.3, 0.4 ...],    # The first feature
                        'x2': [1, 2, 3, 4, ...],           # The second feature
                        ...}                               # Other features
    """
    header = data_overview.get_header(data_instances)
    self._default_setting(header)
    is_sparse = data_overview.is_sparse_data(data_instances)
    if is_sparse:
        raise RuntimeError("Bucket Binning method does not support sparse data yet.")

    statistics = MultivariateStatisticalSummary(data_instances,
                                                self.bin_inner_param.bin_indexes,
                                                abnormal_list=self.abnormal_list)
    max_dict = statistics.get_max()
    min_dict = statistics.get_min()
    for col_name, max_value in max_dict.items():
        min_value = min_dict.get(col_name)
        split_points = []
        step = (max_value - min_value) / self.bin_num
        for k in range(self.bin_num - 1):
            s_p = min_value + (k + 1) * step
            split_points.append(s_p)
        split_points.append(max_value)
        self.bin_results.put_col_split_points(col_name, split_points)

    self.fit_category_features(data_instances)
    return self.bin_results.all_split_points
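# Worked example of the even-width computation above (numbers invented):
# with min_value = 0.0, max_value = 10.0 and bin_num = 5, the step is
# (10.0 - 0.0) / 5 = 2.0 and the stored split points are the interior bounds
# plus the max as the last one.
bin_num, min_value, max_value = 5, 0.0, 10.0
step = (max_value - min_value) / bin_num
split_points = [min_value + (k + 1) * step for k in range(bin_num - 1)]
split_points.append(max_value)
assert split_points == [2.0, 4.0, 6.0, 8.0, 10.0]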
def check_containing_missing_value(data_instances):
    is_sparse = data_overview.is_sparse_data(data_instances)

    def _sparse_check(instance):
        result = set()
        sparse_data = instance.features.get_all_data()
        for col_idx, col_value in sparse_data:
            if np.isnan(col_value):
                result.add(col_idx)
        return result

    if is_sparse:
        has_missing_value = data_instances.mapValues(_sparse_check).reduce(
            lambda a, b: a.union(b))
    else:
        # Summing the feature arrays propagates NaN: any column with a NaN in
        # any row ends up NaN in the reduced sum.
        has_missing_value = data_instances.mapValues(lambda x: x.features).reduce(operator.add)
        has_missing_value = {idx for idx, value in enumerate(has_missing_value)
                             if np.isnan(value)}
    return has_missing_value
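# Hypothetical usage (variable names are illustrative): the helper returns the
# set of column indexes that contain at least one NaN anywhere in the table.
missing_cols = check_containing_missing_value(data_instances)
if missing_cols:
    raise ValueError(f"columns {sorted(missing_cols)} contain missing values")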
def convert_feature_to_woe(self, data_instances):
    is_sparse = data_overview.is_sparse_data(data_instances)
    schema = data_instances.schema

    if is_sparse:
        f = functools.partial(self._convert_sparse_data,
                              bin_inner_param=self.bin_inner_param,
                              bin_results=self.bin_results,
                              abnormal_list=self.abnormal_list,
                              convert_type='woe')
    else:
        f = functools.partial(self._convert_dense_data,
                              bin_inner_param=self.bin_inner_param,
                              bin_results=self.bin_results,
                              abnormal_list=self.abnormal_list,
                              convert_type='woe')
    new_data = data_instances.mapValues(f)
    new_data.schema = schema
    return new_data
def fit_summary(self, data_instances, is_sparse=None):
    if is_sparse is None:
        is_sparse = data_overview.is_sparse_data(data_instances)
        LOGGER.debug(f"is_sparse: {is_sparse}")

    f = functools.partial(self.feature_summary,
                          params=self.params,
                          abnormal_list=self.abnormal_list,
                          cols_dict=self.bin_inner_param.bin_cols_map,
                          header=self.header,
                          is_sparse=is_sparse)
    summary_dict_table = data_instances.mapReducePartitions(f, self.copy_merge)

    if is_sparse:
        total_count = data_instances.count()
        summary_dict_table = summary_dict_table.mapValues(
            lambda x: x.set_total_count(total_count))
    return summary_dict_table
def get_data_bin(self, data_instances, split_points=None):
    """
    Apply the binning method

    Parameters
    ----------
    data_instances : DTable
        The input data
    split_points : dict.
        Each value represents the split points of one feature, e.g.
        split_points = {'x1': [0.1, 0.2, 0.3, 0.4 ...],    # The first feature
                        'x2': [1, 2, 3, 4, ...],           # The second feature
                        ...}                               # Other features

    Returns
    -------
    data_bin_table : DTable.
        Each element represents the bin number each feature of that instance
        belongs to, e.g. it could be:
        [{'x1': 1, 'x2': 5, 'x3': 2} ... ]
    """
    is_sparse = data_overview.is_sparse_data(data_instances)
    if split_points is None:
        split_points = self.fit_split_points(data_instances)

    f = functools.partial(self.bin_data,
                          split_points=split_points,
                          cols_dict=self.bin_inner_param.bin_cols_map,
                          header=self.header,
                          is_sparse=is_sparse)
    data_bin_dict = data_instances.mapValues(f)
    return data_bin_dict
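# Hypothetical usage (object names are illustrative): omitting split_points
# fits and transforms in one call; passing previously fitted points reuses
# them on new data instead of re-fitting.
data_bin_table = binning_obj.get_data_bin(data_instances)
data_bin_table_2 = binning_obj.get_data_bin(
    new_instances, split_points=binning_obj.bin_results.all_split_points)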
def empty_column_detection(data_instance):
    contains_empty_columns = False
    lost_feat = []
    is_sparse = data_overview.is_sparse_data(data_instance)
    if is_sparse:
        raise ValueError('sparse format empty column detection is not supported for now')

    map_func = functools.partial(column_gathering)
    map_rs = data_instance.applyPartitions(map_func)
    reduce_rs = map_rs.reduce(merge_column_sets)

    # transform col indexes to col names
    reduce_rs = np.array(data_instance.schema['header'])[list(reduce_rs)]
    reduce_rs = set(reduce_rs)
    if reduce_rs != set(data_instance.schema['header']):
        lost_feat = list(set(data_instance.schema['header']).difference(reduce_rs))
        contains_empty_columns = True

    if contains_empty_columns:
        raise ValueError('column(s) {} contain(s) no values'.format(lost_feat))
def init_bucket(self, data_instances):
    header = data_overview.get_header(data_instances)
    self._default_setting(header)

    init_bucket_param = copy.deepcopy(self.params)
    init_bucket_param.bin_num = self.optimal_param.init_bin_nums
    if self.optimal_param.init_bucket_method == consts.QUANTILE:
        init_binning_obj = QuantileBinningTool(param_obj=init_bucket_param, allow_duplicate=False)
    else:
        init_binning_obj = BucketBinning(params=init_bucket_param)
    init_binning_obj.set_bin_inner_param(self.bin_inner_param)
    init_split_points = init_binning_obj.fit_split_points(data_instances)
    is_sparse = data_overview.is_sparse_data(data_instances)

    bucket_dict = dict()
    for col_name, sps in init_split_points.items():
        bucket_list = []
        for idx, sp in enumerate(sps):
            bucket = bucket_info.Bucket(idx, self.adjustment_factor, right_bound=sp)
            if idx == 0:
                bucket.left_bound = -math.inf
                bucket.set_left_neighbor(None)
            else:
                bucket.left_bound = sps[idx - 1]
            bucket.event_total = self.event_total
            bucket.non_event_total = self.non_event_total
            bucket_list.append(bucket)
        bucket_list[-1].set_right_neighbor(None)
        bucket_dict[col_name] = bucket_list
        LOGGER.debug(f"col_name: {col_name}, length of sps: {len(sps)}, "
                     f"length of list: {len(bucket_list)}")

    # The map-reduce API differs between eggroll versions, so dispatch on the
    # installed version.
    from fate_arch.common.versions import get_eggroll_version
    version = get_eggroll_version()
    if version.startswith('2.0'):
        convert_func = functools.partial(self.convert_data_to_bucket_old,
                                         split_points=init_split_points,
                                         headers=self.header,
                                         bucket_dict=copy.deepcopy(bucket_dict),
                                         is_sparse=is_sparse,
                                         get_bin_num_func=self.get_bin_num)
        summary_dict = data_instances.mapPartitions(convert_func, use_previous_behavior=False)
        from federatedml.util.reduce_by_key import reduce
        bucket_table = reduce(summary_dict, self.merge_bucket_list, key_func=lambda key: key[1])
    elif version.startswith('2.2'):
        convert_func = functools.partial(self.convert_data_to_bucket,
                                         split_points=init_split_points,
                                         headers=self.header,
                                         bucket_dict=copy.deepcopy(bucket_dict),
                                         is_sparse=is_sparse,
                                         get_bin_num_func=self.get_bin_num)
        bucket_table = data_instances.mapReducePartitions(convert_func, self.merge_bucket_list)
        bucket_table = dict(bucket_table.collect())
    else:
        raise RuntimeError(f"Cannot recognize eggroll version: {version}")

    for k, v in bucket_table.items():
        LOGGER.debug(f"[feature] {k}, length of list: {len(v)}")
    LOGGER.debug("bucket_table: {}, length: {}".format(type(bucket_table), len(bucket_table)))
    bucket_table = [(k, v) for k, v in bucket_table.items()]
    bucket_table = session.parallelize(bucket_table,
                                       include_key=True,
                                       partition=data_instances.partitions)
    return bucket_table
def _find_kth_mode(self, data_instances, k):
    """
    Find the 1/k mode of each selected column. If a column has a mode whose
    count is larger than 1/k of the total number of instances, return that
    mode and its count; otherwise return None for that column.

    Parameters
    ----------
    data_instances: DTable
        Original data

    k: int
    """
    is_sparse = is_sparse_data(data_instances)

    def find_mode_candidate(instances, select_cols):
        """
        Find at most k - 1 mode candidates per column. `k` and `is_sparse`
        are captured from the enclosing scope.

        Parameters
        ----------
        instances: Data generator
            Original data

        select_cols: list
            Indicates columns that need to be operated on.

        Returns
        -------
        all_candidates: dict
            Each key is a col_index and each value is a dict whose keys are
            that column's mode candidates.
        """
        all_candidates = {}
        for col_index in select_cols:
            all_candidates[col_index] = {}

        for _, instant in instances:
            for col_index in select_cols:
                candidate_dict = all_candidates[col_index]
                if is_sparse:
                    feature_value = instant.features.get_data(col_index, 0)
                else:
                    feature_value = instant.features[col_index]
                if isinstance(feature_value, float):
                    feature_value = round(feature_value, 8)

                if feature_value in candidate_dict:
                    candidate_dict[feature_value] += 1
                elif len(candidate_dict) < k - 1:
                    candidate_dict[feature_value] = 1
                else:
                    # No free counter slot: decrement every candidate and
                    # drop the ones that reach zero.
                    to_delete_col = []
                    for key in candidate_dict:
                        candidate_dict[key] -= 1
                        if candidate_dict[key] == 0:
                            to_delete_col.append(key)
                    for d_k in to_delete_col:
                        del candidate_dict[d_k]

        # Reset counts to zero; exact counts are collected in a second pass.
        for col_index, candidate_dict in all_candidates.items():
            candidate_dict = {key: 0 for key, _ in candidate_dict.items()}
            all_candidates[col_index] = candidate_dict
        return all_candidates

    def merge_mode_candidate(d1, d2):
        assert len(d1) == len(d2)
        for col_idx, d in d1.items():
            d.update(d2[col_idx])
        return d1

    def merge_candidates_num(candi_1, candi_2):
        assert len(candi_1) == len(candi_2)
        for col_idx, candidate_dict in candi_1.items():
            candi_dict_2 = candi_2[col_idx]
            for feature_value, num in candi_dict_2.items():
                if feature_value in candidate_dict:
                    candidate_dict[feature_value] += num
                else:
                    candidate_dict[feature_value] = num
        return candi_1

    def static_candidates_num(instances, select_cols, all_candidates):
        """
        Count the exact number of occurrences of each mode candidate.

        Parameters
        ----------
        instances: Data generator
            Original data

        select_cols: list
            Indicates columns that need to be operated on.

        all_candidates: dict
            Each key is a col_index and each value is a dict that contains
            that column's mode candidates.
        """
        for _, instant in instances:
            for col_index in select_cols:
                candidate_dict = all_candidates[col_index]
                if is_sparse:
                    feature_value = instant.features.get_data(col_index, NoneType())
                else:
                    feature_value = instant.features[col_index]
                if isinstance(feature_value, float):
                    feature_value = round(feature_value, 8)
                if feature_value in candidate_dict:
                    candidate_dict[feature_value] += 1
        return all_candidates

    find_func = functools.partial(find_mode_candidate,
                                  select_cols=self.selection_properties.select_col_indexes)
    all_candidates = data_instances.mapPartitions(find_func).reduce(merge_mode_candidate)
    static_func = functools.partial(static_candidates_num,
                                    select_cols=self.selection_properties.select_col_indexes,
                                    all_candidates=all_candidates)
    mode_candidate_statics = data_instances.mapPartitions(static_func).reduce(merge_candidates_num)

    result = {}
    for col_index, candidate_dict in mode_candidate_statics.items():
        if len(candidate_dict) > 0:
            res = sorted(candidate_dict.items(), key=operator.itemgetter(1), reverse=True)[0]
        else:
            res = None
        result[col_index] = res
    return result