def transform_regression_label(self, data_inst):
    """Map every instance's label to a bin number and return them as a list.

    One extra edge, ``self.split_points[-1] + 1``, is appended to the split
    points before binning, and each label is looked up with
    ``BaseBinning.get_bin_num``.

    Args:
        data_inst: table-like object providing ``mapValues`` and ``collect``;
            every value must expose a ``label`` attribute.

    Returns:
        list: the bin numbers, in the order ``collect()`` yields them.

    Raises:
        IndexError: if ``self.split_points`` is empty.
    """
    # assumes self.split_points is a plain list so `+` concatenates — TODO confirm
    upper_edge = self.split_points[-1] + 1
    extended_points = self.split_points + [upper_edge]

    def label_to_bin(inst):
        # Bin index of this instance's label against the extended edges.
        return BaseBinning.get_bin_num(inst.label, extended_points)

    binned_table = data_inst.mapValues(label_to_bin)

    # Keys are discarded; only the computed bin numbers are returned.
    binned_y = []
    for _key, bin_idx in binned_table.collect():
        binned_y.append(bin_idx)
    return binned_y
def convert(instances):
    """Replace the selected feature columns of one instance with WOE values.

    Reads these names from the enclosing scope: ``is_sparse``,
    ``transform_cols_idx``, ``abnormal_list``, ``bin_inner_param``,
    ``split_points_dict`` and ``bin_res``.

    For every column whose index is in ``transform_cols_idx`` and whose value
    is not in ``abnormal_list``, the value is replaced by
    ``woe_array[bin_num]`` of that column's binning result, where ``bin_num``
    comes from ``BaseBinning.get_bin_num``. All other values are kept as-is.

    Args:
        instances: a single instance whose ``features`` attribute is a sparse
            vector (when ``is_sparse`` is true) or a ``numpy.ndarray``.

    Returns:
        The same ``instances`` object, with ``features`` updated. In the
        sparse case ``features`` is rebound to a new ``SparseVector``; in the
        dense case the existing array is modified in place.

    Raises:
        AssertionError: in the dense case, if ``features`` is not an ndarray.
    """
    # Built once and shared by both branches so each membership test is O(1);
    # the sparse branch previously searched the raw collection per entry.
    transform_cols_idx_set = set(transform_cols_idx)

    if is_sparse:
        all_data = instances.features.get_all_data()
        data_shape = instances.features.get_shape()
        indice = []
        sparse_value = []
        for col_idx, col_value in all_data:
            # Only selected columns holding a normal value are converted;
            # abnormal values and unselected columns pass through unchanged.
            # NOTE(review): the original comment at this point was truncated
            # ("Maybe it is because missing value add in sparse value, but");
            # its intent could not be recovered.
            if col_idx in transform_cols_idx_set and col_value not in abnormal_list:
                col_name = bin_inner_param.header[col_idx]
                split_points = split_points_dict[col_name]
                bin_num = BaseBinning.get_bin_num(col_value, split_points)
                col_results = bin_res.all_cols_results.get(col_name)
                col_value = col_results.woe_array[bin_num]
            indice.append(col_idx)
            sparse_value.append(col_value)
        instances.features = SparseVector(indice, sparse_value, data_shape)
    else:
        features = instances.features
        # Kept as an assert so the exception type callers may see is unchanged.
        assert isinstance(features, np.ndarray)
        for col_idx, col_value in enumerate(features):
            # Skip unselected columns and abnormal values; the array already
            # holds the value that should stay, so nothing is written back.
            if col_idx not in transform_cols_idx_set or col_value in abnormal_list:
                continue
            col_name = bin_inner_param.header[col_idx]
            split_points = split_points_dict[col_name]
            bin_num = BaseBinning.get_bin_num(col_value, split_points)
            col_results = bin_res.all_cols_results.get(col_name)
            features[col_idx] = col_results.woe_array[bin_num]
        instances.features = features
    return instances