def _encode_categorical(self, X, y):
    """Encode the categorical features and stack them after the continuous ones.

    The categorical columns are one-hot encoded, then every stored entry of
    the encoding is replaced by half of the median standard deviation of the
    continuous features, measured on the minority class only.

    Parameters
    ----------
    X : ndarray or sparse matrix
        Input samples; must support ``X[:, columns]`` indexing.
    y : array-like
        Target labels; compared element-wise against the minority label.

    Returns
    -------
    X_encoded : ndarray
        Dense array with the continuous features first, followed by the
        rescaled one-hot encoded categorical features.

    Notes
    -----
    Sets ``median_std_`` and ``ohe_`` on the estimator. When the median is
    zero, the plain one-hot encoding is additionally kept in
    ``_X_categorical_encoded`` because the rescaling wipes out the 1 entries.
    """
    # The minority class is the least frequent label.
    class_counts = Counter(y)
    class_minority = min(class_counts, key=class_counts.get)

    # Split the input into its continuous and categorical parts.
    X_continuous = check_array(
        X[:, self.continuous_features_], accept_sparse=["csr", "csc"]
    )
    X_categorical = X[:, self.categorical_features_].copy()

    minority_rows = np.flatnonzero(y == class_minority)
    X_minority = X_continuous[minority_rows]

    # Per-feature variance of the continuous part of the minority class.
    if not sparse.issparse(X):
        var = X_minority.var(axis=0)
    elif X.format == "csr":
        _, var = csr_mean_variance_axis0(X_minority)
    else:
        _, var = csc_mean_variance_axis0(X_minority)
    self.median_std_ = np.median(np.sqrt(var))

    # Object arrays cannot lend their dtype to the encoder; use float64.
    dtype_ohe = (
        np.float64 if X_continuous.dtype.name == "object" else X_continuous.dtype
    )
    self.ohe_ = OneHotEncoder(sparse=True, handle_unknown="ignore", dtype=dtype_ohe)

    # The encoder is always fed a dense array.
    if sparse.issparse(X_categorical):
        X_categorical = X_categorical.toarray()
    X_ohe = self.ohe_.fit_transform(X_categorical)

    # A zero median nullifies the 1 entries below, so keep the untouched
    # encoding around for reversing the one-hot encoding later on.
    if math.isclose(self.median_std_, 0):
        self._X_categorical_encoded = X_ohe.toarray()

    # Replace the 1 entries by half of the median standard deviation.
    unit_entries = np.ones_like(X_ohe.data, dtype=X_ohe.dtype)
    X_ohe.data = unit_entries * self.median_std_ / 2

    dense_continuous = X_continuous.toarray() if self._issparse else X_continuous
    return np.hstack([dense_continuous, X_ohe.toarray()])
def _fit_resample(self, X, y):
    """Resample a dataset mixing continuous and categorical features.

    The categorical columns are one-hot encoded and rescaled, the parent
    class resamples the encoded data, and the encoding is reversed before
    the columns are put back in their original order.

    Parameters
    ----------
    X : ndarray or sparse matrix
        Input samples; must support ``X[:, columns]`` indexing.
    y : array-like
        Target labels; compared element-wise against the minority label.

    Returns
    -------
    X_resampled : ndarray or sparse matrix
        Resampled samples; a CSR matrix when `X` is sparse, a dense array
        otherwise.
    y_resampled : array-like
        Labels returned by the parent ``_fit_resample``.
    """
    self.n_features_ = X.shape[1]
    self._validate_estimator()

    # The minority class is the least frequent label.
    class_counts = Counter(y)
    class_minority = min(class_counts, key=class_counts.get)

    X_continuous = check_array(
        X[:, self.continuous_features_], accept_sparse=['csr', 'csc'])
    X_minority = safe_indexing(
        X_continuous, np.flatnonzero(y == class_minority))

    # Per-feature variance of the continuous part of the minority class.
    input_is_sparse = sparse.issparse(X)
    if not input_is_sparse:
        var = X_minority.var(axis=0)
    elif X.format == 'csr':
        _, var = csr_mean_variance_axis0(X_minority)
    else:
        _, var = csc_mean_variance_axis0(X_minority)
    self.median_std_ = np.median(np.sqrt(var))

    X_categorical = X[:, self.categorical_features_]
    # Object arrays cannot lend their dtype to the encoder; use float64.
    dtype_ohe = (np.float64 if X_continuous.dtype.name == 'object'
                 else X_continuous.dtype)
    self.ohe_ = OneHotEncoder(sparse=True, handle_unknown='ignore',
                              dtype=dtype_ohe)

    # The encoder is always fed a dense array.
    if sparse.issparse(X_categorical):
        X_categorical = X_categorical.toarray()
    X_ohe = self.ohe_.fit_transform(X_categorical)

    # Replace the 1 entries of the encoding by half of the median standard
    # deviation so that categorical mismatches weigh in the distances on
    # the same scale as the continuous features.
    unit_entries = np.ones_like(X_ohe.data, dtype=X_ohe.dtype)
    X_ohe.data = unit_entries * self.median_std_ / 2
    X_encoded = sparse.hstack((X_continuous, X_ohe), format='csr')

    X_resampled, y_resampled = super(SMOTENC, self)._fit_resample(
        X_encoded, y)

    # Reverse the encoding: stored categorical entries go back to 1 before
    # being decoded by the fitted encoder.
    n_continuous = self.continuous_features_.size
    X_res_cat = X_resampled[:, n_continuous:]
    X_res_cat.data = np.ones_like(X_res_cat.data)
    X_res_cat_dec = self.ohe_.inverse_transform(X_res_cat)

    X_res_cont = X_resampled[:, :n_continuous]
    if input_is_sparse:
        X_resampled = sparse.hstack(
            (X_res_cont, X_res_cat_dec), format='csr')
    else:
        X_resampled = np.hstack((X_res_cont.toarray(), X_res_cat_dec))

    # Position each column would have in the original feature order.
    indices_reordered = np.argsort(
        np.hstack((self.continuous_features_, self.categorical_features_)))

    if not sparse.issparse(X_resampled):
        return X_resampled[:, indices_reordered], y_resampled

    # The matrix is CSR after the stacking: relabel the column index of
    # every stored entry instead of moving the data around.
    old_cols = X_resampled.indices
    new_cols = old_cols.copy()
    for new_pos, old_pos in enumerate(indices_reordered):
        new_cols[old_cols == old_pos] = new_pos
    X_resampled.indices = new_cols
    return X_resampled, y_resampled
def _mutual_proximity_gammai_sparse(S, sample_size=0, train_set_ind=None,
                                    verbose=0, log=None, mv=None, n_jobs=-1):
    """MP gammai for sparse similarity matrices.

    Please do not directly use this function, but invoke it via the public
    gammai entry point (presumably ``mutual_proximity_gammai()``).

    Parameters
    ----------
    S : sparse matrix
        Similarity matrix whose self similarities (diagonal) are all 1.
        It is mirrored with its transpose, so it has to be square.
    sample_size : int, optional
        Not used by this function; kept for interface compatibility.
    train_set_ind : index array or None, optional
        Rows of `S` used to estimate the per-column mean and variance.
        ``None`` selects all rows.
    verbose : int, optional
        If truthy (and `log` is given), progress messages are emitted.
    log : object or None, optional
        Logger providing ``message(text, flush=...)`` and
        ``error(text, flush=...)``. Nothing is logged when it is None.
    mv : None or 0, optional
        ``None`` includes zeros in the mean/variance; ``0`` treats zeros as
        missing values and excludes them.
    n_jobs : int, optional
        Number of worker processes; ``-1`` uses ``mp.cpu_count()``.

    Returns
    -------
    S_mp : CSR matrix
        Matrix assembled from the worker results, mirrored with its
        transpose, with the diagonal set to 1.

    Raises
    ------
    ValueError
        If the diagonal of `S` is not all 1, or `mv` is neither None nor 0.
    """
    self_value = 1.  # similarity matrix
    diagonal = S.diagonal()
    if diagonal.max() != self_value or diagonal.min() != self_value:
        raise ValueError("Self similarities must be 1.")

    def _say(text):
        # Progress output is optional: needs both verbosity and a logger.
        if verbose and log is not None:
            log.message(text, flush=True)

    # ``None`` means "use every row"; indexing with None would fail.
    S_param = S if train_set_ind is None else S[train_set_ind]

    if mv is None:
        # mean, variance WITH zero values:
        from sklearn.utils.sparsefuncs_fast import csr_mean_variance_axis0  # @UnresolvedImport
        mu, va = csr_mean_variance_axis0(S_param)
    elif mv == 0:
        # mean, variance WITHOUT zero values (missing values), ddof=1
        # the -1 accounts for self sim that must be excluded from the calc
        mu = np.array((S_param.sum(0) - 1) / (S_param.getnnz(0) - 1)).ravel()
        E2 = mu**2
        X = S_param.copy()
        X.data **= 2
        n_x = (X.getnnz(0) - 1)
        E1 = np.array((X.sum(0) - 1) / n_x).ravel()
        del X
        # for an unbiased sample variance
        va = n_x / (n_x - 1) * (E1 - E2)
        del E1
    else:
        if log is not None:
            log.error("MP only supports missing values as zeros.", flush=True)
        raise ValueError("mv must be None or 0.")

    # Method-of-moments Gamma parameters: shape = mu^2 / var, scale = var / mu.
    A = (mu**2) / va
    B = va / mu
    del mu, va
    A[A < 0] = np.nan
    B[B <= 0] = np.nan

    S_mp = lil_matrix(S.shape, dtype=np.float32)
    n = S.shape[0]

    # Parallelization
    n_processes = mp.cpu_count() if n_jobs == -1 else n_jobs
    batches = _get_weighted_batches(n, n_processes)

    # create jobs
    tasks = [(_partial_mp_gammai_sparse, (batch, S, idx, n, A, B, verbose))
             for idx, batch in enumerate(batches)]
    task_queue = mp.Queue()
    done_queue = mp.Queue()
    for task in tasks:
        task_queue.put(task)

    # start jobs
    processes = [mp.Process(target=_worker, args=(task_queue, done_queue))
                 for _ in range(n_processes)]
    for process in processes:
        process.start()

    # collect results
    for i in range(len(tasks)):
        rows, Dmp_part = done_queue.get()
        if rows.size > 0:
            # Only non-empty batches have a first/last row to report.
            _say("Merging submatrix {} (rows {}..{})".
                 format(i, rows[0], rows[-1]))
            row_slice = slice(rows[0], rows[-1]+1)
        else:  # for very small matrices, some batches might be empty
            row_slice = slice(0, 0)
        S_mp[row_slice] = Dmp_part

    # One sentinel per worker (not per task), so that every process
    # terminates even when there are fewer tasks than processes.
    # NOTE(review): assumes `_worker` exits after reading one 'STOP'.
    for _ in processes:
        task_queue.put('STOP')
    for process in processes:
        process.join()

    _say("Mirroring distance matrix")
    S_mp += S_mp.T

    _say("Setting self distances")
    S_mp.setdiag(self_value)

    _say("Converting to CSR matrix")
    return S_mp.tocsr()
def _fit_resample(self, X, y):
    """Resample a dataset mixing continuous and categorical features.

    The categorical columns are one-hot encoded and rescaled, the parent
    class resamples the encoded data, and the encoding is reversed before
    the columns are put back in their original order.

    Parameters
    ----------
    X : ndarray or sparse matrix
        Input samples; must support ``X[:, columns]`` indexing.
    y : array-like
        Target labels; compared element-wise against the minority label.

    Returns
    -------
    X_resampled : ndarray or sparse matrix
        Resampled samples; a CSR matrix when `X` is sparse, a dense array
        otherwise.
    y_resampled : array-like
        Labels returned by the parent ``_fit_resample``.

    Warns
    -----
    FutureWarning
        If the deprecated ``n_jobs`` parameter is set.
    """
    # FIXME: to be removed in 0.12
    if self.n_jobs is not None:
        deprecation_msg = (
            "The parameter `n_jobs` has been deprecated in 0.10 and will be "
            "removed in 0.12. You can pass an nearest neighbors estimator where "
            "`n_jobs` is already set instead."
        )
        warnings.warn(deprecation_msg, FutureWarning)

    self.n_features_ = X.shape[1]
    self._validate_estimator()

    # The minority class is the least frequent label.
    class_counts = Counter(y)
    class_minority = min(class_counts, key=class_counts.get)

    X_continuous = check_array(
        X[:, self.continuous_features_], accept_sparse=["csr", "csc"]
    )
    minority_rows = np.flatnonzero(y == class_minority)
    X_minority = _safe_indexing(X_continuous, minority_rows)

    # Per-feature variance of the continuous part of the minority class.
    input_is_sparse = sparse.issparse(X)
    if not input_is_sparse:
        var = X_minority.var(axis=0)
    elif X.format == "csr":
        _, var = csr_mean_variance_axis0(X_minority)
    else:
        _, var = csc_mean_variance_axis0(X_minority)
    self.median_std_ = np.median(np.sqrt(var))

    X_categorical = X[:, self.categorical_features_]
    # Object arrays cannot lend their dtype to the encoder; use float64.
    dtype_ohe = (
        np.float64 if X_continuous.dtype.name == "object" else X_continuous.dtype
    )
    self.ohe_ = OneHotEncoder(sparse=True, handle_unknown="ignore", dtype=dtype_ohe)

    # The encoder is always fed a dense array.
    if sparse.issparse(X_categorical):
        X_categorical = X_categorical.toarray()
    X_ohe = self.ohe_.fit_transform(X_categorical)

    # A zero median nullifies the 1 entries below, so keep the untouched
    # encoding of the minority class for reversing the one-hot encoding.
    if math.isclose(self.median_std_, 0):
        self._X_categorical_minority_encoded = _safe_indexing(
            X_ohe.toarray(), minority_rows
        )

    # Replace the 1 entries of the encoding by half of the median standard
    # deviation so that categorical mismatches weigh in the distances on
    # the same scale as the continuous features.
    unit_entries = np.ones_like(X_ohe.data, dtype=X_ohe.dtype)
    X_ohe.data = unit_entries * self.median_std_ / 2
    X_encoded = sparse.hstack((X_continuous, X_ohe), format="csr")

    X_resampled, y_resampled = super()._fit_resample(X_encoded, y)

    # Reverse the encoding: stored categorical entries go back to 1 before
    # being decoded by the fitted encoder.
    n_continuous = self.continuous_features_.size
    X_res_cat = X_resampled[:, n_continuous:]
    X_res_cat.data = np.ones_like(X_res_cat.data)
    X_res_cat_dec = self.ohe_.inverse_transform(X_res_cat)

    X_res_cont = X_resampled[:, :n_continuous]
    if input_is_sparse:
        X_resampled = sparse.hstack((X_res_cont, X_res_cat_dec), format="csr")
    else:
        X_resampled = np.hstack((X_res_cont.toarray(), X_res_cat_dec))

    # Position each column would have in the original feature order.
    indices_reordered = np.argsort(
        np.hstack((self.continuous_features_, self.categorical_features_))
    )

    if not sparse.issparse(X_resampled):
        return X_resampled[:, indices_reordered], y_resampled

    # The matrix is CSR after the stacking: relabel the column index of
    # every stored entry instead of moving the data around.
    old_cols = X_resampled.indices
    new_cols = old_cols.copy()
    for new_pos, old_pos in enumerate(indices_reordered):
        new_cols[old_cols == old_pos] = new_pos
    X_resampled.indices = new_cols
    return X_resampled, y_resampled
def _mutual_proximity_gaussi_sparse(S, sample_size, train_set_ind,
                                    verbose, log, mv, n_jobs):
    """MP gaussi for sparse similarity matrices.

    Please do not directly use this function, but invoke via
    mutual_proximity_gaussi()

    Parameters
    ----------
    S : sparse matrix
        Similarity matrix. It is mirrored with its transpose, so it has to
        be square.
    sample_size : int
        Not used by this function; kept for interface compatibility.
    train_set_ind : index array or None
        Rows of `S` used to estimate the per-column mean and variance when
        `mv` is None. ``None`` selects all rows.
    verbose : int
        If truthy (and `log` is given), progress messages are emitted.
    log : object or None
        Logger providing ``message(text, flush=...)`` and
        ``error(text, flush=...)``. Nothing is logged when it is None.
    mv : None or 0
        ``None`` includes zeros in the mean/variance; ``0`` treats zeros as
        missing values and excludes them.
    n_jobs : int
        Number of worker processes; ``-1`` uses ``mp.cpu_count()``.

    Returns
    -------
    Dmp : CSR matrix
        Matrix assembled from the worker results, mirrored with its
        transpose, with the diagonal set to 1.

    Raises
    ------
    ValueError
        If `mv` is neither None nor 0.
    """
    n = S.shape[0]
    self_value = 1.  # similarity matrix

    def _say(text):
        # Progress output is optional: needs both verbosity and a logger.
        if verbose and log is not None:
            log.message(text, flush=True)

    if mv is None:
        # mean, variance WITH zero values:
        from sklearn.utils.sparsefuncs_fast \
            import csr_mean_variance_axis0  # @UnresolvedImport
        # ``None`` means "use every row"; indexing with None would fail.
        S_param = S if train_set_ind is None else S[train_set_ind]
        mu, va = csr_mean_variance_axis0(S_param)
    elif mv == 0:
        # mean, variance WITHOUT zero values (missing values)
        # NOTE(review): this branch uses all of S rather than the
        # train_set_ind rows used above -- confirm this is intended.
        mu = np.array((S.sum(0) - 1) / (S.getnnz(0) - 1)).ravel()
        X = S.copy()
        X.data **= 2
        E1 = np.array((X.sum(0) - 1) / (X.getnnz(0) - 1)).ravel()
        del X
        va = E1 - mu**2
        del E1
    else:
        if log is not None:
            log.error("MP only supports missing values as zeros.", flush=True)
        raise ValueError("mv must be None or 0.")
    sd = np.sqrt(va)
    del va

    Dmp = lil_matrix(S.shape)

    # Parallelization
    n_processes = mp.cpu_count() if n_jobs == -1 else n_jobs
    batches = _get_weighted_batches(n, n_processes)

    # create jobs
    tasks = [(_partial_mp_gaussi_sparse, (batch, S, idx, n, mu, sd, verbose))
             for idx, batch in enumerate(batches)]
    task_queue = mp.Queue()
    done_queue = mp.Queue()
    for task in tasks:
        task_queue.put(task)

    # start jobs
    processes = [mp.Process(target=_worker, args=(task_queue, done_queue))
                 for _ in range(n_processes)]
    for process in processes:
        process.start()

    # collect results
    for i in range(len(tasks)):
        rows, Dmp_part = done_queue.get()
        if rows.size > 0:
            # Only non-empty batches have a first/last row to report.
            _say("Merging submatrix {} (rows {}..{})".
                 format(i, rows[0], rows[-1]))
            rows_slice = slice(rows[0], rows[-1]+1)
        else:  # for very small matrices, some batches might be empty
            rows_slice = slice(0, 0)
        Dmp[rows_slice, :] = Dmp_part

    # One sentinel per worker (not per task), so that every process
    # terminates even when there are fewer tasks than processes.
    # NOTE(review): assumes `_worker` exits after reading one 'STOP'.
    for _ in processes:
        task_queue.put('STOP')
    for process in processes:
        process.join()

    _say("Mirroring distance matrix")
    Dmp += Dmp.T

    _say("Setting self distances")
    Dmp.setdiag(self_value)

    _say("Converting to CSR matrix")
    return Dmp.tocsr()