def summary(self, yname=None, xname=None, title=None, alpha=0.05):
    """
    Build a summary table of the fitted parameters.

    Parameters
    ----------
    yname, xname : optional
        Accepted but not used by this method.
    title : str, optional
        Title of the summary.  Defaults to
        "Gaussian process regression results".
    alpha : float, optional
        Level used for the two confidence-bound columns, which are
        labelled with ``alpha / 2`` and ``1 - alpha / 2``.

    Returns
    -------
    summary2.Summary
        Summary holding the title and one table row per parameter.
    """
    from scipy.stats.distributions import norm

    # One label per parameter, grouped in the order mean, scale,
    # smooth, noise.
    kinds = []
    for label, count in (("Mean", self.k_exog),
                         ("Scale", self.k_scale),
                         ("Smooth", self.k_smooth),
                         ("SD", self.k_noise)):
        kinds += [label] * count

    table = pd.DataFrame()
    table["Type"] = kinds
    table["coef"] = self.params

    # Standard errors are best effort: any failure leaves the column NaN.
    try:
        table["std err"] = np.sqrt(np.diag(self.cov_params()))
    except Exception:
        table["std err"] = np.nan

    coef = table["coef"]
    std_err = table["std err"]

    table["tvalues"] = coef / std_err
    table["P>|t|"] = 2 * norm.sf(np.abs(table["tvalues"]))

    # Normal-quantile confidence bounds around each coefficient.
    half_alpha = alpha / 2
    crit = norm.ppf(1 - half_alpha)
    table["[%.3f" % half_alpha] = coef - crit * std_err
    table["%.3f]" % (1 - half_alpha)] = coef + crit * std_err

    table.index = self.model.data.param_names

    if title is None:
        title = "Gaussian process regression results"

    result = summary2.Summary()
    result.add_title(title)
    result.add_df(table)
    return result
def mannwhitneyu(x, y, use_continuity=True):
    """
    Compute a signed Mann-Whitney z-score for samples x and y.

    Parameters
    ----------
    x, y : array_like
        Array of samples, should be one-dimensional.
    use_continuity : bool, optional
        Whether a continuity correction (1/2.) should be taken into
        account. Default is True.

    Returns
    -------
    z : float
        Tie-corrected normal-approximation z-score of the larger of the
        two U statistics, multiplied by a sign: negative when the ranks
        of `x` are the lower ones (the U computed for `x` is the larger
        one), positive in the opposite case and zero when both U
        statistics are equal.  Note that this is NOT the U statistic.
    prob : float
        One-sided p-value ``norm.sf(abs(z))`` assuming an asymptotic
        normal distribution.

    Raises
    ------
    ValueError
        If all pooled values are identical (the tie correction is zero).

    Notes
    -----
    Use only when the number of observation in each sample is > 20 and
    you have 2 independent samples of ranks.

    This test corrects for ties and by default uses a continuity
    correction. The reported p-value is for a one-sided hypothesis, to
    get the two-sided p-value multiply the returned p-value by 2.

    """
    x = np.asarray(x)
    y = np.asarray(y)
    n1 = len(x)
    n2 = len(y)
    ranked = rankdata(np.concatenate((x, y)))
    rankx = ranked[0:n1]  # the first n1 pooled ranks belong to x
    u1 = n1 * n2 + (n1 * (n1 + 1)) / 2.0 - np.sum(rankx, axis=0)  # U for x
    u2 = n1 * n2 - u1  # remainder is U for y
    bigu = max(u1, u2)
    T = tiecorrect(ranked)  # correction factor for tied scores
    if T == 0:
        raise ValueError('All numbers are identical in mannwhitneyu')
    sd = np.sqrt(T * n1 * n2 * (n1 + n2 + 1) / 12.0)

    # Normal approximation, always evaluated on the larger U.
    if use_continuity:
        z = (bigu - 0.5 - n1 * n2 / 2.0) / sd
    else:
        z = (bigu - n1 * n2 / 2.0) / sd

    # Attach the direction: -1 if u1 is the larger U, +1 if u2 is, else 0.
    z *= int(u1 < u2) - int(u1 > u2)
    return z, norm.sf(abs(z))
def mannwhitneyu(x, y, use_continuity=True):
    """
    Mann-Whitney rank test on samples x and y (normal approximation).

    Parameters
    ----------
    x, y : array_like
        Array of samples, should be one-dimensional.
    use_continuity : bool, optional
        Whether a continuity correction (1/2.) is subtracted from the
        larger U statistic. Default is True.

    Returns
    -------
    z : float
        Signed, tie-corrected z-score of the larger U statistic.  The
        sign is ``int(u1 < u2) - int(u1 > u2)`` where ``u1`` is the U
        computed for `x` and ``u2`` the one for `y`.
    prob : float
        One-sided p-value, ``norm.sf(abs(z))``; multiply by 2 for a
        two-sided test.

    Raises
    ------
    ValueError
        If the tie correction is zero, i.e. all values are identical.

    Notes
    -----
    Use only when the number of observation in each sample is > 20 and
    you have 2 independent samples of ranks.

    """
    sample_x = np.asarray(x)
    sample_y = np.asarray(y)
    n1, n2 = len(sample_x), len(sample_y)

    # Rank the pooled data; the leading n1 ranks are those of x.
    pooled_ranks = rankdata(np.concatenate((sample_x, sample_y)))
    u1 = n1*n2 + (n1*(n1+1))/2.0 - np.sum(pooled_ranks[0:n1], axis=0)
    u2 = n1*n2 - u1

    tie_factor = tiecorrect(pooled_ranks)
    if tie_factor == 0:
        raise ValueError('All numbers are identical in amannwhitneyu')
    sd = np.sqrt(tie_factor*n1*n2*(n1+n2+1)/12.0)

    larger_u = max(u1, u2)
    if use_continuity:
        deviation = larger_u-0.5-n1*n2/2.0
    else:
        deviation = larger_u-n1*n2/2.0
    z = deviation / sd

    direction = int(u1 < u2)-int(u1 > u2)
    z *= direction
    return z, norm.sf(abs(z))
def mannwhitneyu(self, x, y, use_continuity=True):
    """
    Mann-Whitney rank test on samples x and y (normal approximation).

    Parameters
    ----------
    x, y : array_like
        Samples to compare.
    use_continuity : bool, optional
        Whether 0.5 is subtracted from the larger U statistic before it
        is standardised. Default is True.

    Returns
    -------
    smallu, bigu : float
        The smaller and the larger of the two U statistics.
    z : float
        Absolute value of the tie-corrected z-score of ``bigu``.
    p : float
        ``norm.sf(z)``.

    Notes
    -----
    No check is made for a zero tie correction (all values identical).
    """
    first = asarray(x)
    second = asarray(y)
    n1, n2 = len(first), len(second)

    # Rank the pooled data; the leading n1 ranks are those of x.
    pooled_ranks = rankdata(np.concatenate((first, second)))
    u_first = n1 * n2 + (n1 * (n1 + 1)) / 2.0 - np.sum(pooled_ranks[0:n1], axis=0)
    u_second = n1 * n2 - u_first
    bigu = max(u_first, u_second)
    smallu = min(u_first, u_second)

    tie_factor = tiecorrect(pooled_ranks)
    sd = np.sqrt(tie_factor * n1 * n2 * (n1 + n2 + 1) / 12.0)

    if use_continuity:
        deviation = bigu - 0.5 - n1 * n2 / 2.0
    else:
        deviation = bigu - n1 * n2 / 2.0
    z = abs(deviation / sd)

    return smallu, bigu, z, norm.sf(z)
def _nbhd_adjacency(nbhds, n_obs):
    """Return the sparse (len(nbhds) x n_obs) membership matrix of the NN graph."""
    col_ind = [item for sublist in nbhds for item in sublist]
    row_ind = [i for i, sublist in enumerate(nbhds) for _ in sublist]
    return csr_matrix((np.ones(len(col_ind)), (row_ind, col_ind)),
                      shape=(len(nbhds), n_obs))


def info_score(X, nbhds, max_bins=float('inf'), entropy_normalize=False, fast_version=True,
               binom_scores=None, gene_bins=None, return_bin_info=False, verbose=True,
               n_tests='auto', model='wilcoxon', chunk_size=1000, **kwargs):
    """
    Score every gene in every neighborhood against its global behaviour.

    :param X: sparse count matrix; rows are the observations indexed by nbhds,
        columns are genes
    :param nbhds: list with indices of nearest neighbors for each obs in X,
        e.g. from kneighbors() in sklearn. A numpy array is converted to a list
        of rows. The neighborhood size k is taken from the first entry.
    :param max_bins: (binomial only) Resolution at which global gene
        probabilities are computed. if inf, genes get their own probabilities.
        Otherwise, the unit interval is split into max_bins pieces and they are
        rounded. This makes it faster with little performance difference
    :param entropy_normalize: (binomial only) if True, divide each gene's
        column by the binary entropy of that gene's detection frequency
    :param fast_version: (binomial only) if True, use matrix multiplication
        instead of iteration. Fast, but memory-intensive.
    :param binom_scores: pass in binomial scores for each gene/bin, if
        pre-computed. Allows saving for future iterations.
    :param gene_bins: pass in gene bins from previous run. Speeds up iteration
    :param return_bin_info: for iteration: keep information about gene bins and
        binomial probs (binomial only).
    :param verbose: print progress (and, for 'log_likelihood', the smallest
        variance)
    :param n_tests: number of tests used for the multiple-testing correction
        ``1 - (1 - p) ** n_tests`` of the 'ttest' and 'wilcoxon' p-values;
        'auto' obtains it from ``bootstrapped_ntests``. For 'binomial' it is
        forwarded to ``get_binom_scores`` (None means the number of genes).
    :param model: one of 'ttest', 'wilcoxon', 'log_likelihood', 'binomial'
    :param chunk_size: (wilcoxon only) number of genes processed per chunk
    :param kwargs: forwarded to ``get_binom_scores``
    :return: sparse matrix of gene/cell weightings with one row per
        neighborhood and one column per gene. With model='binomial' and
        return_bin_info=True, the tuple (weights, gene_bins, binom_scores).
    :raises ValueError: if model is not one of the supported names
    """
    supported_models = ('ttest', 'wilcoxon', 'log_likelihood', 'binomial')
    if model not in supported_models:
        raise ValueError('unknown model {!r}; expected one of {}'.format(model, supported_models))

    if isinstance(nbhds, np.ndarray):
        nbhds = list(nbhds)
    k = len(nbhds[0])  # how many neighbors?
    n_nbhds = len(nbhds)
    n_genes = X.shape[1]

    if n_tests == 'auto':  # determine by bootstrapping
        n_tests = bootstrapped_ntests(X, k=k, model=model)

    if model == 'ttest':
        nn_matrix = _nbhd_adjacency(nbhds, X.shape[0])

        # mean gene expressions within each neighborhood (the neighborhood sums
        # are cast to int before dividing); this matrix may be less sparse
        mean_nbhd_exprs = (nn_matrix * X).astype('int').multiply(1/nn_matrix.sum(axis=1)).tocsr()

        nbhd_vars = np.zeros((n_nbhds, n_genes))
        for i, nbhd in enumerate(nbhds):  # gotta go cell by cell
            nbrs = np.array(nbhd).flatten()
            # squared deviations from the neighborhood mean
            gene_diffs = np.power((X[nbrs, :].todense() - mean_nbhd_exprs[i, :].todense()), 2)
            nbhd_vars[i, :] = gene_diffs.mean(axis=0)
        nbhd_vars = csr_matrix(nbhd_vars)

        global_means = np.tile(X.mean(axis=0), (n_nbhds, 1))

        # sign is pos if mean is higher, negative otherwise.
        signs = 2*(mean_nbhd_exprs.todense() >= global_means).astype('int') - 1

        global_var = np.tile(np.var(X.todense(), axis=0), (n_nbhds, 1))
        nobs_global = np.tile(X.shape[0], (n_nbhds, n_genes))
        nobs_local = np.tile(k, (n_nbhds, n_genes))

        wts = ttest_ind_from_stats(mean1=mean_nbhd_exprs.todense().flatten(),
                                   std1=np.array(np.sqrt(nbhd_vars.todense()).flatten()),
                                   nobs1=np.array(nobs_local).flatten(),
                                   mean2=np.array(global_means).flatten(),
                                   std2=np.array(np.sqrt(global_var)).flatten(),
                                   nobs2=np.array(nobs_global).flatten()).pvalue.reshape((n_nbhds, n_genes))

        np.nan_to_num(wts, copy=False, nan=1.0)  # nans become pval 1
        wts[wts == 0] = sys.float_info.min  # remove zeros so the log stays finite

        if n_tests > 1:
            # use FWER to correct for testing many genes
            wts_corrected = 1 - np.power(1 - wts, n_tests)
            # tiny p-values go through taylor_exp instead (noted as more accurate)
            wts_corrected[wts < 1e-10] = taylor_exp(wts[wts < 1e-10], n_tests)
            wts = wts_corrected

        wts = -1*np.log(wts)  # convert to info
        np.nan_to_num(wts, copy=False, nan=1.0)  # any remaining nan becomes 1.0
        wts = np.multiply(signs, wts)  # negative if underexpressed
        return csr_matrix(wts)

    if model == 'wilcoxon':
        from scipy.stats import rankdata

        # Wilcoxon rank sum test, computed chunk_size genes at a time
        chunk_ends = [0] + list(np.arange(chunk_size, n_genes, chunk_size)) + [n_genes]
        gene_chunks = [np.arange(start, stop)
                       for start, stop in zip(chunk_ends[:-1], chunk_ends[1:])]

        nn_matrix = _nbhd_adjacency(nbhds, X.shape[0])

        # the sample sizes do not depend on the chunk
        n1 = k
        n2 = X.shape[0] - k
        sd = np.sqrt(n1 * n2 * (n1 + n2 + 1) / 12.0)
        meanrank = n1 * n2 / 2.0

        wt_blocks = []  # list of sparse matrices to concatenate horizontally
        for i, chunk in enumerate(gene_chunks):
            if verbose:
                print('chunk {}/{}'.format(i+1, len(gene_chunks)), end='\r')

            X_chunk = X[:, chunk]

            # `wts` is deliberately rebound at every step so that only one big
            # dense matrix per chunk is kept alive.
            wts = rankdata(X_chunk.todense(), axis=0)  # gene rankings
            wts = nn_matrix @ wts  # nbhd rank sums
            wts = wts - ((n1 * (n1 + 1)) / 2.0)  # U statistic of the neighborhood
            is_neg = (wts < meanrank)  # remember where it was negative
            wts = np.maximum(wts, n1 * n2 - wts)  # bigu
            wts = ((wts - meanrank) / sd)  # z values
            wts = 2 * norm.sf(np.abs(wts))  # two-sided p values

            if n_tests > 1:
                # use FWER to correct for testing many genes; tiny p-values go
                # through taylor_exp instead
                wts[wts > 1e-10] = 1-np.power(1-wts[wts > 1e-10], n_tests)
                wts[wts <= 1e-10] = taylor_exp(wts[wts <= 1e-10], n_tests)

            wts = -1*np.log(wts)  # convert to info scores
            wts[is_neg] *= -1  # sign them

            wt_blocks.append(csr_matrix(wts))

        return hstack(wt_blocks)

    if model == 'log_likelihood':
        means = X.mean(axis=0)
        variances = sparse_vars(X, axis=0)/float(k)

        nn_matrix = _nbhd_adjacency(nbhds, X.shape[0])
        nbhd_means = ((nn_matrix * X)/float(k)).todense()

        if verbose:
            print(np.min(variances))

        wts = 1 / 2. * np.log(2 * np.pi) + np.power((nbhd_means - means), 2) / (2 * variances) + np.log(variances)/2.
        signs = 2*(nbhd_means >= means).astype('int') - 1
        wts = np.multiply(wts, signs)
        return csr_matrix(wts)

    # model == 'binomial'
    # first compute frequencies of all genes:
    gene_probs = np.array((X > 0).sum(axis=0) / float(X.shape[0])).flatten()

    X = csr_matrix((X > 0).astype('float'))  # convert to sparse binarized matrix

    if binom_scores is None or gene_bins is None:
        if n_tests is None:
            n_tests = n_genes  # multi-correct per cell
        gene_bins, binom_scores = get_binom_scores(gene_probs, k, max_bins=max_bins, verbose=verbose,
                                                   n_tests=n_tests, **kwargs)

    if fast_version:
        # compute significance of gene expression in each cell's neighborhood
        nn_matrix = _nbhd_adjacency(nbhds, X.shape[0])

        # get gene expressions within each neighborhood; this matrix may be less sparse
        nbhd_exprs = (nn_matrix * X).astype('int').todense()

        # apply binomial scores: look up (gene bin, neighborhood count) for every entry
        rows, cols = np.indices((n_nbhds, n_genes))
        rows = rows.flatten()
        cols = cols.flatten()

        wts = binom_scores[gene_bins[cols], np.array(nbhd_exprs[rows, cols]).flatten()].reshape(
            (n_nbhds, n_genes))
    else:
        wts = np.zeros((n_nbhds, n_genes))
        for i, nbhd in enumerate(nbhds):
            if verbose:
                if i < n_nbhds - 1:
                    print('\r computing counts for cell {}/{}'.format(i, X.shape[0]), end='         ')
                else:
                    print('\r computing counts for cell {}/{}'.format(i, X.shape[0]), end='         \n')

            # number of neighbors in which each gene is detected
            nbhd_gene_counts = np.array((X[nbhd, :] > 0).sum(axis=0)).flatten()

            if max_bins < float('inf'):
                # look up the binomial score in the nearest bins
                gene_scores = binom_scores[gene_bins, nbhd_gene_counts]
            else:
                gene_scores = [binom_scores[j, count] for j, count in enumerate(nbhd_gene_counts)]

            wts[i, :] = gene_scores

    if entropy_normalize:  # divide each column by the entropy of the corresponding gene
        gene_entropies = -1 * (np.multiply(gene_probs, np.log(gene_probs)) +
                               np.multiply((1 - gene_probs), np.log(1 - gene_probs)))
        # zeros out non-expressed or everywhere-expressed genes
        gene_entropies[np.logical_not(np.isfinite(gene_entropies))] = float('inf')

        wts = np.divide(wts, gene_entropies)

    wts = csr_matrix(wts)
    if return_bin_info:  # for iteration
        return (wts, gene_bins, binom_scores)
    return wts
def diffrank(adata, smp='groups', names='all', sig_level=0.05,
             correction='Bonferroni', log=False):
    """
    Compare groups by ranking genes according to differential expression.

    Every pair of selected groups is compared with a per-gene z-score,
    ``(mean_i - mean_j) / sqrt(var_i / n_i + var_j / n_j)``.  Genes whose
    denominator is zero get a NaN z-score and are placed at the end of the
    ranking.

    Parameters
    ----------
    adata : AnnData
        Annotated data matrix.
    smp : str, optional (default: 'groups')
        Specify the name of the grouping to consider.
    names : str, list, np.ndarray, optional (default: 'all')
        Subset of categories - e.g. 'C1,C2,C3' or ['C1', 'C2', 'C3'] - to which
        comparison shall be restricted. If not provided all categories will be
        compared to all other categories.
    sig_level : float, optional (default: 0.05)
        Currently unused; no p-values are computed.
    correction : str, optional (default: 'Bonferroni')
        Currently unused; no p-values are computed.
    log : bool, optional (default: False)
        If True, replace the data by the base-2 logarithm of its absolute
        value before computing the statistics.

    Returns
    -------
    adata : AnnData
        The object passed in, with the entries below written to it.

    Writes to adata
    ---------------
    diffrank_groups : str
        The value of `smp`.
    diffrank_groups_names :
        The group names returned by ``utils.select_groups``.
    diffrank_zscores : np.ndarray
        Array of shape (number of comparisons) x (number of genes) storing the
        zscore of the each gene for each test.
    diffrank_rankings_names : list of str
        One label per comparison, for example "C1 vs C2" when comparing
        category 'C1' with 'C2'.
    diffrank_rankings_geneidcs : np.ndarray
        Array of shape (number of comparisons) x (number of genes) storing gene
        indices that sort them according to decreasing absolute value of the
        zscore.
    diffrank_scoreskey : str
        Always 'zscores'.
    """
    groups_names, groups_masks = utils.select_groups(adata, names, smp)
    adata['diffrank_groups'] = smp
    adata['diffrank_groups_names'] = groups_names
    X = adata.X
    if log:
        # TODO: treat negativity explicitly
        X = np.abs(X)
        X = np.log(X) / np.log(2)

    # loop over all masks and compute means, variances and sample numbers
    nr_groups = groups_masks.shape[0]
    nr_genes = X.shape[1]
    means = np.zeros((nr_groups, nr_genes))
    variances = np.zeros((nr_groups, nr_genes))
    ns = np.zeros(nr_groups, dtype=int)
    for imask, mask in enumerate(groups_masks):
        means[imask] = X[mask].mean(axis=0)
        variances[imask] = X[mask].var(axis=0)
        ns[imask] = np.where(mask)[0].size
    sett.m(0, 'testing', smp, groups_names, 'with sample numbers', ns)
    sett.m(2, 'means', means)
    sett.m(2, 'variances', variances)

    igroups_masks = np.arange(len(groups_masks), dtype=int)
    pairs = list(combinations(igroups_masks, 2))
    zscores_all = np.zeros((len(pairs), nr_genes))
    rankings_geneidcs = np.zeros((len(pairs), nr_genes), dtype=int)

    # each test provides a ranking of genes
    # we store the name of the ranking, i.e. the name of the test,
    # in the following list
    adata['diffrank_rankings_names'] = []

    # test all combinations of groups against each other
    for ipair, (i, j) in enumerate(pairs):
        # z-scores; a zero denominator is turned into NaN instead of
        # producing an infinite score
        denom = np.sqrt(variances[i] / ns[i] + variances[j] / ns[j])
        denom[denom == 0] = np.nan
        zscores = (means[i] - means[j]) / denom
        # the following is equivalent with
        # zscores = np.ma.masked_invalid(zscores)
        zscores = np.ma.masked_array(zscores, mask=np.isnan(zscores))
        zscores_all[ipair] = zscores
        abs_zscores = np.abs(zscores)

        # sort genes according to decreasing absolute score
        ranking_geneidcs = np.argsort(abs_zscores)[::-1]
        # move masked values to the end of the index array, keeping the
        # relative order within the masked and the unmasked genes
        masked = np.ma.getmaskarray(abs_zscores)[ranking_geneidcs]
        ranking_geneidcs = np.concatenate((ranking_geneidcs[~masked],
                                           ranking_geneidcs[masked]))
        # write to global rankings_geneidcs
        rankings_geneidcs[ipair] = ranking_geneidcs

        # names
        ranking_name = groups_names[i] + ' vs ' + groups_names[j]
        adata['diffrank_rankings_names'].append(ranking_name)

    adata['diffrank_zscores'] = zscores_all
    adata['diffrank_rankings_geneidcs'] = rankings_geneidcs
    adata['diffrank_scoreskey'] = 'zscores'

    return adata