def _spearman_r(a, b, weights, axis, skipna):
    """ndarray implementation of scipy.stats.spearmanr.

    Parameters
    ----------
    a : ndarray
        Input array.
    b : ndarray
        Input array.
    weights : ndarray
        Input array of weights.
    axis : int
        The axis to apply the correlation along.
    skipna : bool
        If True, skip NaNs when computing function.

    Returns
    -------
    res : ndarray
        Spearman's rank correlation coefficient.

    See Also
    --------
    scipy.stats.spearmanr
    """
    if skipna:
        a, b, weights = _match_nans(a, b, weights)
    _a = bn.nanrankdata(a, axis=axis)
    _b = bn.nanrankdata(b, axis=axis)
    return _pearson_r(_a, _b, weights, axis, skipna)
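# A minimal, self-contained check of the rank-then-Pearson idea used above,
# on 1-d data (hypothetical example; scipy is used only for comparison).
import numpy as np
import bottleneck as bn
from scipy import stats

rng = np.random.default_rng(0)
a = rng.normal(size=50)
b = a + rng.normal(size=50)
r_from_ranks = np.corrcoef(bn.rankdata(a), bn.rankdata(b))[0, 1]
print(r_from_ranks, stats.spearmanr(a, b)[0])  # should agree up to float error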
def getStatsTS(X, Y, quantile=10, window=500, minCnt=250):
    """
    X: input factor, shape should be 40320 * 1082
    Y: existing factor (price)

    Computes the 10-, 20-, 30-step returns via the standardized return
        Return_i = (Price_{t+i} - Price_t) / Price_t / i
    and reports the time-series IC and long-short (LS) spread of X against it.
    """
    def calcFwdRet(price, window=30):
        """Vectorized forward return (Price_{t+window} / Price_t - 1); note it is
        not divided by the horizon, unlike the loop below."""
        fwd = np.roll(price, -window, axis=0)
        fwd[-window:, :] = np.nan
        return fwd / price - 1

    print('Now Calculating IC and IR matrix, start counting...')
    t0 = time.time()
    X = np.asarray(X)
    Y = np.asarray(Y)

    # Standardized 30-step forward return; the last 30 rows are left as zeros.
    Y_ = np.zeros(Y.shape)
    for i in range(len(Y) - 30):
        for j in range(Y.shape[1]):
            Y_[i, j] = (Y[i + 30, j] - Y[i, j]) / Y[i, j] / 30
    Y = Y_

    if X.shape != Y.shape:
        print(X.shape)
        print(Y.shape)
        raise ValueError("X and Y must have the same shape")

    N = len(X)
    IC = np.zeros((N, ))
    bottom = 1.0 / quantile
    top = 1 - bottom

    # Time-series rank of X over a rolling window, then normalize to [0, 1].
    X = bn.move_rank(X, window=window, min_count=minCnt, axis=0)
    print(np.isnan(X).sum())
    X = 0.5 * (X + 1)

    # Keep only entries where both X and Y are available.
    valid = ~np.isnan(X) & ~np.isnan(Y)
    X = np.where(valid, X, np.nan)
    Y = np.where(valid, Y, np.nan)

    # Cross-sectional rank of Y, scaled to (0, 1].
    Y_rk = bn.nanrankdata(Y, axis=1)
    Y_rk /= bn.nanmax(Y_rk, axis=1)[:, np.newaxis]

    # Long-short spread: mean return of the top quantile minus the bottom quantile.
    LS = np.nanmean(np.where(X > top, Y, np.nan), axis=1) \
        - np.nanmean(np.where(X < bottom, Y, np.nan), axis=1)

    # Per-date information coefficient between the factor and the return ranks.
    for ii in range(N):
        IC[ii] = np.corrcoef(X[ii][~np.isnan(X[ii])],
                             Y_rk[ii][~np.isnan(Y_rk[ii])])[0, 1]

    t1 = time.time()
    print("total time used for IC and LS matrix calculation is:", (t1 - t0))
    return IC, LS
def gev_from_samples(arr_ams, n_sample, shape_param):
    """Draw samples with replacement and find GEV parameters using the
    Probability-Weighted Moments method.
    """
    assert arr_ams.ndim == 1
    # Remove NaN
    arr_ams = arr_ams[np.isfinite(arr_ams)]
    # Record length is exclusive of NaN
    n_obs = len(arr_ams)
    # print('n_obs', n_obs)
    # Random sampling with replacement of indices
    sampling_idx = helper.get_sampling_idx(n_sample, n_obs)
    # Draw samples. Add dimension n_sample in first position.
    arr_samples = arr_ams[sampling_idx]
    # print(arr_samples.shape)
    ax_year = 1
    # Rank samples along the year axis
    rank = bottleneck.nanrankdata(arr_samples, axis=ax_year).astype(fscalar)
    # Fit distribution. ev_params is a tuple of ndarrays.
    ecdf = ecdf_jit(rank, n_obs)
    gev_pwm_njit = nb.njit(gev_pwm)
    ev_params = gev_pwm_njit(arr_samples, ecdf, n_obs, ax_year,
                             shape=shape_param)
    # Add one axis. Changes shape to (ev_params, samples).
    return np.array(ev_params)
def hsa(gefs_sprd, subset, debug=False):
    '''Standardizes, scales the min and max to between -1 and 1, and takes
    the arctanh to derive a "normal" distribution that ascribes more
    statistical relevance to the z-score values. Known as the historical
    spread anomaly, or HSA.'''
    try:
        gefs_sprd = gefs_sprd.rename({'time': 'fhour'})
    except Exception:
        pass
    try:
        gefs_sprd = gefs_sprd.assign_coords(fhour=subset.fhour)
    except Exception:
        pass
    subset_vals = (gefs_sprd['Pressure'] - subset.mean('time', skipna=True)) \
        / subset.std('time', skipna=True)
    new_stacked = xr.concat(
        [subset.drop('timestr').to_dataset(), gefs_sprd.expand_dims('time')],
        'time')
    # Percentile of the current spread within the reforecast ensemble.
    percentile = bottleneck.nanrankdata(
        new_stacked['Pressure'], axis=0) / np.count_nonzero(
            ~np.isnan(new_stacked['Pressure'][:, 0, 0, 0]))
    perc_ds = xr.Dataset(
        data_vars=dict(
            spread_percentile=(["fhour", "lat", "lon"], percentile[-1])),
        coords=dict(lon=new_stacked.lon.values,
                    lat=new_stacked.lat.values,
                    fhour=new_stacked.fhour.values),
        attrs=dict(description="Spread percentile based on reforecast "
                               "of similar mean anomalies by gridpoint."),
    )
    if debug:
        return subset_vals
    # subset_vals = (0.99 - (-0.99)) * (subset_vals - subset_vals.min(['lat', 'lon'])) \
    #     / (subset_vals.max(['lat', 'lon']) - subset_vals.min(['lat', 'lon'])) + -0.99
    # subset_vals = np.arctanh(subset_vals)
    return subset_vals, perc_ds
def _nanrankdata(self, X, axis=1):
    """
    Replaces bottleneck's nanrankdata with a scipy/numpy alternative; the
    scipy path is left commented out and bottleneck is used here, with zero
    ranks mapped back to NaN.
    """
    # ranks = sp.stats.mstats.rankdata(np.ma.masked_invalid(X), axis=axis)
    ranks = bn.nanrankdata(X, axis=axis)
    ranks[ranks == 0] = np.nan
    return ranks
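# A sketch of the scipy/numpy fallback the docstring above alludes to, for
# environments without bottleneck (hypothetical helper; assumes scipy is
# available). Masked (NaN) entries come back as rank 0 and are remapped to NaN.
import numpy as np
import scipy.stats as sp_stats

def _nanrankdata_scipy(X, axis=1):
    ranks = np.asarray(
        sp_stats.mstats.rankdata(np.ma.masked_invalid(X), axis=axis))
    ranks[ranks == 0] = np.nan
    return ranks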
def rank(data):
    """Rank normalize data

    Rank standardize data to make it nonparametric

    Arguments:
        data {np.array} -- 2-D coexpression network

    Returns:
        np.array -- Array rank-normalized between 0 and 1
    """
    orig_shape = data.shape
    # Rank the flattened array (NaN-aware) and shift to start at 0...
    data = bottleneck.nanrankdata(data) - 1
    # ...then scale by the number of non-NaN entries and restore the shape.
    return (data / np.sum(~np.isnan(data))).reshape(orig_shape)
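# Hypothetical usage of rank() on a small coexpression matrix with missing
# entries; outputs fall in [0, 1) and NaNs stay NaN.
import numpy as np

net = np.array([[1.0, 0.3, np.nan],
                [0.3, 1.0, 0.8],
                [np.nan, 0.8, 1.0]])
print(rank(net))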
def compute_aurocs_default(sum_in, study_ct_uniq, pheno, study_col, ct_col,
                           compute_p):
    """Helper function to compute AUROCs from the votes matrix of cells

    Arguments:
        sum_in {np.ndarray} -- votes matrix, cells x cell-type votes
        study_ct_uniq {vector} -- vector of study_id|cell_type labels
        pheno {pd.DataFrame} -- dataframe with study_ct, study_id and ct_col
            for all cells
        study_col {str} -- String name of study_col in pheno
        ct_col {str} -- String name of cell type col in pheno

    Returns:
        pd.DataFrame -- AUROCs for cell type x cell type labels (and a
            matching DataFrame of p-values when compute_p is True)
    """
    cell_nv = pd.DataFrame(index=study_ct_uniq)
    if compute_p:
        cell_p = pd.DataFrame(index=study_ct_uniq)
    for ct in study_ct_uniq:
        predicts_tmp = sum_in.copy()
        # Look up study and cell type from pheno rather than splitting the
        # string, in case of character issues.
        study, cellT = (pheno[pheno.study_ct == ct]
                        .drop_duplicates()[[study_col, ct_col]].values[0])
        slicer = pheno[study_col] == study
        pheno2 = pheno[slicer]
        predicts_tmp = predicts_tmp[slicer]
        predicts_tmp = bottleneck.nanrankdata(predicts_tmp, axis=0)

        filter_mat = np.zeros_like(predicts_tmp)
        filter_mat[pheno2.study_ct == ct] = 1

        predicts_tmp[filter_mat == 0] = 0

        n_p = bottleneck.nansum(filter_mat, axis=0)
        nn = filter_mat.shape[0] - n_p
        p = bottleneck.nansum(predicts_tmp, axis=0)

        # AUROC from the rank-sum (Mann-Whitney U) statistic.
        roc = (p / n_p - (n_p + 1) / 2) / nn
        cell_nv[ct] = roc
        if compute_p:
            # Normal approximation to the Mann-Whitney U test.
            U = roc * n_p * nn
            Z = (np.abs(U - (n_p * nn / 2))) / np.sqrt(
                n_p * nn * (n_p + nn + 1) / 12)
            P = stats.norm.sf(Z)
            cell_p[ct] = P
        del predicts_tmp, filter_mat
        gc.collect()
    if compute_p:
        return cell_nv, cell_p
    return cell_nv
def _fit(self, X, y):
    # check input params
    self._check_params(X, y)

    # setup variables for Boruta
    n_sample, n_feat = X.shape
    _iter = 1
    # holds the decision about each feature:
    #  0 - default state = tentative in original code
    #  1 - accepted in original code
    # -1 - rejected in original code
    dec_reg = np.zeros(n_feat, dtype=int)
    # counts how many times a given feature was more important than
    # the best of the shadow features
    hit_reg = np.zeros(n_feat, dtype=int)
    # these record the history of the iterations
    imp_history = np.zeros(n_feat, dtype=float)
    sha_max_history = []

    # set n_estimators
    if self.n_estimators != 'auto':
        self.estimator.set_params(n_estimators=self.n_estimators)

    # main feature selection loop
    while np.any(dec_reg == 0) and _iter < self.max_iter:
        # find optimal number of trees and depth
        if self.n_estimators == 'auto':
            # number of features that aren't rejected
            not_rejected = np.where(dec_reg >= 0)[0].shape[0]
            n_tree = self._get_tree_num(not_rejected)
            self.estimator.set_params(n_estimators=n_tree)

        # make sure we start with a new tree in each iteration
        rnd_st = np.random.randint(1, 1e6, 1)[0]
        self.estimator.set_params(random_state=rnd_st)

        # add shadow attributes, shuffle them and train estimator, get imps
        cur_imp = self._add_shadows_get_imps(X, y, dec_reg)

        # get the threshold of shadow importances we will use for rejection
        imp_sha_max = np.percentile(cur_imp[1], self.perc)

        # record importance history
        sha_max_history.append(imp_sha_max)
        imp_history = np.vstack((imp_history, cur_imp[0]))

        # register which feature is more imp than the max of shadows
        hit_reg = self._assign_hits(hit_reg, cur_imp, imp_sha_max)

        # based on hit_reg we check if a feature is doing better than
        # expected by chance
        dec_reg = self._do_tests(dec_reg, hit_reg, _iter)

        # print out confirmed features
        if self.verbose > 0 and _iter < self.max_iter:
            self._print_results(dec_reg, _iter, 0)
        if _iter < self.max_iter:
            _iter += 1

    # we automatically apply R package's rough fix for tentative ones
    confirmed = np.where(dec_reg == 1)[0]
    tentative = np.where(dec_reg == 0)[0]
    # ignore the first row of zeros
    tentative_median = np.median(imp_history[1:, tentative], axis=0)
    # which tentative to keep
    tentative_confirmed = np.where(
        tentative_median > np.median(sha_max_history))[0]
    tentative = tentative[tentative_confirmed]

    # basic result variables
    self.n_features_ = confirmed.shape[0]
    self.support_ = np.zeros(n_feat, dtype=bool)
    self.support_[confirmed] = 1
    self.support_weak_ = np.zeros(n_feat, dtype=bool)
    self.support_weak_[tentative] = 1

    # ranking, confirmed variables are rank 1
    self.ranking_ = np.ones(n_feat, dtype=int)
    # tentative variables are rank 2
    self.ranking_[tentative] = 2

    # selected = confirmed and tentative
    selected = np.hstack((confirmed, tentative))
    # all rejected features are sorted by importance history
    not_selected = np.setdiff1d(np.arange(n_feat), selected)
    # large importance values should rank higher = lower ranks -> *(-1)
    imp_history_rejected = imp_history[1:, not_selected] * -1

    # calculate ranks in each iteration, then median of ranks across feats
    iter_ranks = bn.nanrankdata(imp_history_rejected, axis=1)
    # iter_ranks = self._nanrankdata(imp_history_rejected, axis=1)
    rank_medians = np.nanmedian(iter_ranks, axis=0)
    ranks = bn.nanrankdata(rank_medians)
    # ranks = self._nanrankdata(rank_medians, axis=0)

    # update rank for not_selected features
    if not_selected.shape[0] > 0:
        # set smallest rank to 3 if there are tentative feats
        if tentative.shape[0] > 0:
            ranks = ranks - np.min(ranks) + 3
        else:
            # and 2 otherwise
            ranks = ranks - np.min(ranks) + 2
        self.ranking_[not_selected] = ranks

    # notify user
    if self.verbose > 0:
        self._print_results(dec_reg, _iter, 1)
    return self
def rank_alpha(self, alpha):
    # Convert the alpha factor into cross-sectional ranks scaled to between 0 and 1.
    rankAlpha = bk.nanrankdata(alpha, axis=1)
    rankAlpha = (rankAlpha.T / bk.nanmax(rankAlpha, axis=1)).T
    return rankAlpha
def time_nanrankdata(self, dtype, shape):
    bn.nanrankdata(self.arr)
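# Hypothetical asv-style setup the benchmark above appears to rely on:
# `self.arr` is built from the parametrized dtype and shape before timing.
def setup(self, dtype, shape):
    self.arr = np.random.rand(*shape).astype(dtype)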
def ranking(x, axis=0, norm='-1,1'):
    """
    Normalized ranking treating NaN as missing and averaging ties.

    Parameters
    ----------
    x : ndarray
        Data to be ranked.
    axis : {int, None}, optional
        Axis to rank over. Default axis is 0.
    norm : str, optional
        A string that specifies the normalization:

        ==========  ================================================
        '0,N-1'     Zero to N-1 ranking
        '-1,1'      Scale zero to N-1 ranking to be between -1 and 1
        'gaussian'  Rank data then scale to a Gaussian distribution
        ==========  ================================================

        The default ranking is '-1,1'.

    Returns
    -------
    idx : ndarray
        The ranked data. The dtype of the output is always np.float even
        if the dtype of the input is int.

    Notes
    -----
    If there is only one non-NaN value along the given axis, then that
    value is set to the midpoint of the specified normalization method.
    For example, if the input is array([1.0, nan]), then 1.0 is set to
    zero for the '-1,1' and 'gaussian' normalizations and is set to 0.5
    (mean of 0 and 1) for the '0,N-1' normalization.

    For '0,N-1' normalization, note that N is x.shape[axis] even if there
    are NaNs. That ensures that when ranking along the columns of a 2d
    array, for example, the output will have the same min and max along
    all columns.
    """
    if axis is None:
        ranked_x = ranking(x.reshape(-1), norm=norm)
        return ranked_x.reshape(*x.shape)
    ax = axis
    if ax < 0:
        # This converts a negative axis to the equivalent positive axis
        ax = range(x.ndim)[ax]
    masknan = np.isnan(x)
    countnan = np.expand_dims(masknan.sum(ax), ax)
    countnotnan = x.shape[ax] - countnan
    idx = bn.nanrankdata(x, ax)
    idx -= 1
    if norm == '-1,1':
        idx /= (countnotnan - 1)
        idx *= 2
        idx -= 1
        middle = 0.0
    elif norm == '0,N-1':
        idx *= (1.0 * (x.shape[ax] - 1) / (countnotnan - 1))
        middle = (idx.shape[ax] + 1.0) / 2.0 - 1.0
    elif norm == 'gaussian':
        global ndtri
        if ndtri is None:
            try:
                from scipy.special import ndtri
            except ImportError:
                msg = "SciPy required for `gaussian` normalization"
                raise ImportError(msg)
        idx *= (1.0 * (x.shape[ax] - 1) / (countnotnan - 1))
        idx = ndtri((idx + 1.0) / (x.shape[ax] + 1.0))
        middle = 0.0
    else:
        msg = "norm must be '-1,1', '0,N-1', or 'gaussian'."
        raise ValueError(msg)
    np.putmask(idx, (countnotnan == 1) * (~masknan), middle)
    return idx
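# Illustrative call (made-up data): ranking a vector that contains a NaN
# under each of the three supported normalizations.
import numpy as np

x = np.array([3.0, np.nan, 1.0, 2.0])
print(ranking(x, norm='-1,1'))      # [ 1. nan -1.  0.]
print(ranking(x, norm='0,N-1'))     # ranks spread over 0..N-1 with N = 4
print(ranking(x, norm='gaussian'))  # requires SciPy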
def score_default(X, S, C, node_degree_normalization, means=True):
    """Compute ROCs according to the default procedure

    The default procedure computes the ranked cell similarity matrix and
    then uses neighbor voting.

    Arguments:
        X {array} -- Array (sparse or dense) of geneset x cells
        S {vector} -- Study labels, length cells
        C {vector} -- Cell type labels, length cells
        node_degree_normalization {bool} -- Flag for whether to normalize
            votes by node degree

    Returns:
        pd.Series -- Series containing AUROCs for each cell type for the
            given gene set
    """
    nw = create_nw_spearman(X.T)
    nw = (nw + nw.T) / 2

    cell_labels = design_matrix(C)
    x1 = cell_labels.shape[1]
    x2 = cell_labels.shape[0]

    studies = np.unique(S)
    exp_cols = np.repeat(studies, x1)

    test_cell_labels = np.tile(cell_labels.values, studies.shape[0])
    for study in studies:
        # Hide testing labels
        d = np.where(study == S)[0]
        a = np.where(study == exp_cols)[0]
        for i in a:
            test_cell_labels[d, i] = 0

    predicts = nw @ test_cell_labels

    if node_degree_normalization:
        sum_all = np.sum(nw, axis=0)
        predicts /= sum_all[:, None]

    predicts[test_cell_labels == 1] = np.nan

    exp_cols = np.repeat(studies, x1)
    filter_mat = np.tile(cell_labels.values, studies.shape[0])
    for study in studies:
        mask = (study != S).astype(float)[:, None] @ (
            study == exp_cols).astype(float)[:, None].T
        mask = mask.astype(bool)
        filter_mat[mask] = np.nan
        predicts[mask] = np.nan

    predicts = bottleneck.nanrankdata(np.abs(predicts), axis=0)
    predicts[filter_mat == 0] = 0

    n_p = bottleneck.nansum(filter_mat, axis=0)
    n_n = bottleneck.nansum((filter_mat == 0).astype(float), axis=0)
    p = bottleneck.nansum(predicts, axis=0)
    # AUROC from the rank-sum statistic.
    rocNV = (p / n_p - (n_p + 1) / 2) / n_n
    # C (row-major) ordering here, the opposite of F (column-major) in R.
    rocNV = rocNV.reshape([studies.shape[0], x1]).T

    if means:
        return pd.Series(bottleneck.nanmean(rocNV, axis=1),
                         index=cell_labels.columns)
    else:
        return pd.DataFrame(rocNV, index=cell_labels.columns, columns=studies)
def Rank(A):
    '''Cross-sectional ranking (rank within each row).'''
    return bk.nanrankdata(A, axis=1)
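# Tiny hypothetical factor panel (rows = dates, columns = assets): each row
# is ranked independently and NaNs are left in place.
import numpy as np
import bottleneck as bk

A = np.array([[0.5, np.nan, -0.2],
              [1.0, 0.1, 0.3]])
print(Rank(A))  # [[ 2. nan  1.]  [ 3.  1.  2.]]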
def spearman_correlation(x, y, axis=0):
    import bottleneck as bn
    x_ranks = bn.nanrankdata(x, axis=axis)
    y_ranks = bn.nanrankdata(y, axis=axis)
    return pearson_correlation(x_ranks, y_ranks, axis=axis)
def _fit(self, X, y):
    # check input params
    self._check_params(X, y)

    # setup variables for Boruta
    n_sample, n_feat = X.shape
    _iter = 1
    # holds the decision about each feature:
    #  0 - default state = tentative in original code
    #  1 - accepted in original code
    # -1 - rejected in original code
    dec_reg = np.zeros(n_feat, dtype=int)
    # counts how many times a given feature was more important than
    # the best of the shadow features
    hit_reg = np.zeros(n_feat, dtype=int)
    # these record the history of the iterations
    imp_history = np.zeros(n_feat, dtype=float)
    sha_max_history = []

    # set n_estimators
    if self.n_estimators != 'auto':
        self.estimator.set_params(n_estimators=self.n_estimators)

    # main feature selection loop
    while np.any(dec_reg == 0) and _iter < self.max_iter:
        # find optimal number of trees and depth
        if self.n_estimators == 'auto':
            # number of features that aren't rejected
            not_rejected = np.where(dec_reg >= 0)[0].shape[0]
            n_tree = self._get_tree_num(not_rejected)
            self.estimator.set_params(n_estimators=n_tree)

        # make sure we start with a new tree in each iteration
        rnd_st = np.random.randint(1, 1e6, 1)[0]
        self.estimator.set_params(random_state=rnd_st)

        # add shadow attributes, shuffle them and train estimator, get imps
        cur_imp = self._add_shadows_get_imps(X, y, dec_reg)

        # get the threshold of shadow importances we will use for rejection
        imp_sha_max = np.percentile(cur_imp[1], self.perc)

        # record importance history
        sha_max_history.append(imp_sha_max)
        imp_history = np.vstack((imp_history, cur_imp[0]))

        # register which feature is more imp than the max of shadows
        hit_reg = self._assign_hits(hit_reg, cur_imp, imp_sha_max)

        # based on hit_reg we check if a feature is doing better than
        # expected by chance
        dec_reg = self._do_tests(dec_reg, hit_reg, _iter)

        # print out confirmed features
        if self.verbose > 0 and _iter < self.max_iter:
            self._print_results(dec_reg, _iter, 0)
        if _iter < self.max_iter:
            _iter += 1

    # we automatically apply R package's rough fix for tentative ones
    confirmed = np.where(dec_reg == 1)[0]
    tentative = np.where(dec_reg == 0)[0]
    # ignore the first row of zeros
    tentative_median = np.median(imp_history[1:, tentative], axis=0)
    # which tentative to keep
    tentative_confirmed = np.where(
        tentative_median > np.median(sha_max_history))[0]
    tentative = tentative[tentative_confirmed]

    # basic result variables
    self.n_features_ = confirmed.shape[0]
    self.support_ = np.zeros(n_feat, dtype=bool)
    self.support_[confirmed] = 1
    self.support_weak_ = np.zeros(n_feat, dtype=bool)
    self.support_weak_[tentative] = 1

    # ranking, confirmed variables are rank 1
    self.ranking_ = np.ones(n_feat, dtype=int)
    # tentative variables are rank 2
    self.ranking_[tentative] = 2

    # selected = confirmed and tentative
    selected = np.hstack((confirmed, tentative))
    # all rejected features are sorted by importance history
    not_selected = np.setdiff1d(np.arange(n_feat), selected)
    # large importance values should rank higher = lower ranks -> *(-1)
    imp_history_rejected = imp_history[1:, not_selected] * -1

    # calculate ranks in each iteration, then median of ranks across feats
    iter_ranks = bn.nanrankdata(imp_history_rejected, axis=1)
    # iter_ranks = self._nanrankdata(imp_history_rejected, axis=1)
    rank_medians = np.nanmedian(iter_ranks, axis=0)
    ranks = bn.nanrankdata(rank_medians)
    # ranks = self._nanrankdata(rank_medians, axis=0)

    # update rank for not_selected features
    if not_selected.shape[0] > 0:
        # set smallest rank to 3 if there are tentative feats
        if tentative.shape[0] > 0:
            ranks = ranks - np.min(ranks) + 3
        else:
            # and 2 otherwise
            ranks = ranks - np.min(ranks) + 2
        self.ranking_[not_selected] = ranks

    # notify user
    if self.verbose > 0:
        self._print_results(dec_reg, _iter, 1)
    return self
def ranking(x, axis=0, norm='-1,1'):
    """
    Normalized ranking treating NaN as missing and averaging ties.

    Parameters
    ----------
    x : ndarray
        Data to be ranked.
    axis : {int, None}, optional
        Axis to rank over. Default axis is 0.
    norm : str, optional
        A string that specifies the normalization:

        ==========  ================================================
        '0,N-1'     Zero to N-1 ranking
        '-1,1'      Scale zero to N-1 ranking to be between -1 and 1
        'gaussian'  Rank data then scale to a Gaussian distribution
        ==========  ================================================

        The default ranking is '-1,1'.

    Returns
    -------
    idx : ndarray
        The ranked data. The dtype of the output is always np.float even
        if the dtype of the input is int.

    Notes
    -----
    If there is only one non-NaN value along the given axis, then that
    value is set to the midpoint of the specified normalization method.
    For example, if the input is array([1.0, nan]), then 1.0 is set to
    zero for the '-1,1' and 'gaussian' normalizations and is set to 0.5
    (mean of 0 and 1) for the '0,N-1' normalization.

    For '0,N-1' normalization, note that N is x.shape[axis] even if there
    are NaNs. That ensures that when ranking along the columns of a 2d
    array, for example, the output will have the same min and max along
    all columns.
    """
    if axis is None:
        ranked_x = ranking(x.reshape(-1), norm=norm)
        return ranked_x.reshape(*x.shape)
    ax = axis
    if ax < 0:
        # This converts a negative axis to the equivalent positive axis
        ax = range(x.ndim)[ax]
    masknan = np.isnan(x)
    countnan = np.expand_dims(masknan.sum(ax), ax)
    countnotnan = x.shape[ax] - countnan
    idx = bn.nanrankdata(x, ax)
    idx -= 1
    if norm == '-1,1':
        idx /= (countnotnan - 1)
        idx *= 2
        idx -= 1
        middle = 0.0
    elif norm == '0,N-1':
        idx *= (1.0 * (x.shape[ax] - 1) / (countnotnan - 1))
        middle = (idx.shape[ax] + 1.0) / 2.0 - 1.0
    elif norm == 'gaussian':
        global ndtri
        if ndtri is None:
            try:
                from scipy.special import ndtri
            except ImportError:
                msg = "SciPy required for `gaussian` normalization"
                raise ImportError(msg)
        idx *= (1.0 * (x.shape[ax] - 1) / (countnotnan - 1))
        idx = ndtri((idx + 1.0) / (x.shape[ax] + 1.0))
        middle = 0.0
    else:
        msg = "norm must be '-1,1', '0,N-1', or 'gaussian'."
        raise ValueError(msg)
    np.putmask(idx, (countnotnan == 1) * (~masknan), middle)
    return idx
def spearman_rho(x, y, axis=-1):
    """Spearman rho.

    Pearson's r computed on the ranks.
    """
    rank_x = bottleneck.nanrankdata(x, axis=axis)
    rank_y = bottleneck.nanrankdata(y, axis=axis)
    return pearson_r(rank_x, rank_y, axis=axis)
def _rank_first(x, y):
    """Concatenates x and y and returns the rank of the first element
    along the last axis."""
    xy = np.concatenate((x[..., np.newaxis], y), axis=-1)
    return bn.nanrankdata(xy, axis=-1)[..., 0]
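# Small made-up check: the rank of each forecast in x within the pool formed
# by that forecast and its row of y, taken along the last axis.
import numpy as np

x = np.array([0.2, 0.9])
y = np.array([[0.1, 0.5, 0.7],
              [0.1, 0.5, 0.7]])
print(_rank_first(x, y))  # [2. 4.]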
def _spearman_correlation(x, y):
    x_ranks = bottleneck.nanrankdata(x, axis=-1)
    y_ranks = bottleneck.nanrankdata(y, axis=-1)
    return _pearson_correlation(x_ranks, y_ranks)