예제 #1
0
파일: data_split.py 프로젝트: yubo1993/FATE
 def transform_regression_label(self, data_inst):
     """Map every instance label to a bin index and return the indices.

     The bin boundaries are ``self.split_points`` extended by one extra
     edge equal to the last split point plus one.

     Args:
         data_inst: Table-like object providing ``mapValues`` and
             ``collect``; each value must expose a ``label`` attribute.

     Returns:
         list: The bin number of each label, in the order yielded by
         ``collect()``.
     """
     # Extra upper edge, one past the last split point.
     # NOTE(review): presumably so labels above the last split point still
     # land in a final bin -- confirm against BaseBinning.get_bin_num.
     upper_edge = self.split_points[-1] + 1
     extended_points = self.split_points + [upper_edge]

     def label_to_bin(inst):
         return BaseBinning.get_bin_num(inst.label, extended_points)

     binned_table = data_inst.mapValues(label_to_bin)
     # Keys are dropped; only the bin numbers are returned.
     return [bin_idx for _, bin_idx in binned_table.collect()]
예제 #2
0
        def convert(instances):
            """Replace the selected feature values of one instance with WOE values.

            For every column whose index is in ``transform_cols_idx`` and whose
            value is not in ``abnormal_list``, the value is assigned a bin via
            ``BaseBinning.get_bin_num`` and replaced by the matching entry of
            that column's ``woe_array``. Every other value is kept unchanged.

            Args:
                instances: Object exposing a ``features`` attribute. When
                    ``is_sparse`` is true, ``features`` must provide
                    ``get_all_data()`` and ``get_shape()``; otherwise it must
                    be a ``np.ndarray``.

            Returns:
                The same ``instances`` object. In the sparse case ``features``
                is rebound to a new ``SparseVector``; in the dense case the
                array is modified in place.

            Raises:
                AssertionError: If the dense ``features`` is not a
                    ``np.ndarray``.
            """
            # Built once and shared by both branches so membership tests do
            # not scan the raw index collection for every feature.
            transform_cols_idx_set = set(transform_cols_idx)

            def woe_of(col_idx, col_value):
                # Return the woe_array entry of the bin that col_value falls into.
                col_name = bin_inner_param.header[col_idx]
                split_points = split_points_dict[col_name]
                bin_num = BaseBinning.get_bin_num(col_value, split_points)
                col_results = bin_res.all_cols_results.get(col_name)
                return col_results.woe_array[bin_num]

            if is_sparse:
                indice = []
                sparse_value = []
                data_shape = instances.features.get_shape()
                for col_idx, col_value in instances.features.get_all_data():
                    # Untouched columns and abnormal values are copied through
                    # as they are.
                    # NOTE(review): the original comment here was truncated
                    # ("Maybe it is because missing value add in sparse value,
                    # but"); presumably abnormal entries can appear because
                    # missing values are stored explicitly in the sparse
                    # vector -- confirm.
                    if (col_idx in transform_cols_idx_set
                            and col_value not in abnormal_list):
                        col_value = woe_of(col_idx, col_value)
                    indice.append(col_idx)
                    sparse_value.append(col_value)
                instances.features = SparseVector(indice, sparse_value, data_shape)
            else:
                features = instances.features
                assert isinstance(features, np.ndarray)

                for col_idx, col_value in enumerate(features):
                    if col_idx not in transform_cols_idx_set:
                        continue
                    if col_value in abnormal_list:
                        # Abnormal values stay as they are; nothing to write.
                        continue
                    features[col_idx] = woe_of(col_idx, col_value)
                instances.features = features
            return instances