def _identify_doublets_fisher(cluster_labels: Union[pd.Categorical, List[int]], pred_dbl: List[bool], alpha: float = 0.05) -> pd.DataFrame:
    """Find clusters enriched for predicted doublets using Fisher's exact test.

    For every cluster a 2x2 table (in cluster / not in cluster vs. predicted
    doublet / not predicted doublet) is tested, the p-values are FDR-corrected
    at ``alpha``, and only clusters that pass the correction AND whose doublet
    fraction exceeds the overall doublet fraction are reported.

    Parameters
    ----------
    cluster_labels
        Cluster label of each cell.
    pred_dbl
        Per-cell boolean flag, True for predicted doublets.
    alpha
        FDR level passed to the correction.

    Returns
    -------
    pd.DataFrame
        Columns ``cluster``, ``percentage`` (doublet percentage in the
        cluster), ``pval`` and ``qval``, sorted by ``percentage`` descending.
    """
    from pegasus.cylib.cfisher import fisher_exact
    from statsmodels.stats.multitest import fdrcorrection as fdr

    # Make sure both outcome columns exist: crosstab drops a column when no
    # cell (or every cell) is a predicted doublet, which used to raise KeyError.
    df = pd.crosstab(cluster_labels, pred_dbl).reindex(columns=[False, True], fill_value=0)

    # len() works for plain lists as well as arrays (the annotation allows both).
    ncell = len(pred_dbl)
    ndbl = df[True].sum()
    a = df[True].values.astype(np.int32)   # doublets inside the cluster
    b = df[False].values.astype(np.int32)  # non-doublets inside the cluster
    c = ndbl - a                           # doublets outside the cluster
    d = (ncell - ndbl) - b                 # non-doublets outside the cluster

    avg_dblr = ndbl / ncell
    freqs = a / (a + b)

    _, pvals = fisher_exact(a, b, c, d)
    passed, qvals = fdr(pvals, alpha=alpha)

    # Keep significant clusters whose doublet rate is above the global average.
    posvec = np.where(passed)[0][freqs[passed] > avg_dblr]

    result = pd.DataFrame({
        'cluster': df.index[posvec],
        'percentage': freqs[posvec] * 100.0,
        'pval': pvals[posvec],
        'qval': qvals[posvec],
    })
    result.sort_values('percentage', ascending=False, inplace=True)
    result.reset_index(drop=True, inplace=True)
    return result
def singleTissue_eGene_stat(tissue, SNP_PCs, SNP_sampleList):
    """Run the per-tissue eGene association test and return the annotated table.

    Loads (or computes and caches) the residualized expression matrix for
    ``tissue``, regresses it against the tissue genotype matrix, applies FDR
    to the first statistics column and returns the result of
    ``annotateGeneTR1``.
    """
    # establish mapping between TRs and Genes
    tisGeneList, tisGene2ind = indexGeneList(tissue)
    locusi2tisGenei = getLocusi2tisGenei(tisGene2ind)

    tisGenMat = getTissueGenMat(tissue)  # genMat with samples missing in tpmMat removed
    print(f'\ttisGenMat {tisGenMat.shape}')

    cache_path = f'{args.resDir}/{tissue}.ResMat.pickle'
    if glob.glob(cache_path):
        # Context manager closes the handle even if unpickling fails.
        with open(cache_path, 'rb') as fin:
            tisResTpmMat = pickle.load(fin)
    else:
        tisResTpmMat = getTisSNPResTpmMat(tissue, SNP_PCs, SNP_sampleList)
        # NOTE(review): the cache is looked up in args.resDir but written to
        # args.outDir; kept as-is, confirm this is intentional.
        with open(f'{args.outDir}/{tissue}.ResMat.pickle', 'wb') as fout:
            pickle.dump(tisResTpmMat, fout)
    print(f'\ttisResTpmMat {tisResTpmMat.shape}')

    # count # of TRs mapped to each gene; used for Bonferroni correction
    genei2nloci = getGenei2nloci(locusi2tisGenei)

    # tiseGeneTR: [genei, locusi]; stats: [p, b, bse]
    tiseGeneTR, stats = runRegressionZ3(tisResTpmMat, tisGenMat, locusi2tisGenei, genei2nloci)
    print(f'\t{tiseGeneTR.shape[0]} genes tested')

    rejected, adjP = fdr(stats[:, 0])
    print(f'\t{np.sum(rejected)} tissue eGenes')

    eGeneStat = annotateGeneTR1(tissue, tisGeneList, genei2nloci, tiseGeneTR, stats, rejected, adjP)
    print(f'\t{eGeneStat.shape[0]} total eGenes')
    return eGeneStat
def _calc_qvals(
    nclust: int,
    pvals: np.ndarray,
    first_j: int,
    second_j: int,
) -> np.ndarray:
    """Return FDR-corrected q-values (float32) with the same shape as ``pvals``.

    When ``second_j`` is positive only column ``first_j`` is corrected and its
    q-values are copied into both ``first_j`` and ``second_j``; every other
    column stays zero. Otherwise each of the first ``nclust`` columns is
    corrected independently.
    """
    qvals = np.zeros(pvals.shape, dtype=np.float32)

    if second_j <= 0:
        # General case: one correction per cluster column.
        for col in range(nclust):
            qvals[:, col] = fdr(pvals[:, col])[1]
        return qvals

    # Shared case: a single correction feeds both columns.
    shared = fdr(pvals[:, first_j])[1]
    qvals[:, first_j] = shared
    qvals[:, second_j] = shared
    return qvals
def fit(self, inds=None, y=None, vcfFile=None, trait=0):
    """Carry out a single-marker GWAS by linear regression.

    Each row of ``self.X`` is treated as one marker: the row is cut into
    ``nind`` equal consecutive chunks and each chunk is summed (summing the
    genotype over haplotypes) before regressing the phenotype on it.

    Results are stored on the instance; nothing is returned:
    ``self.b`` (slope), ``self.se`` (standard error), ``self.pvalue`` and
    ``self.fdr`` (second element of ``fdr(self.pvalue)``).

    - inds: indexable collection of individuals exposing ``.y[trait]``;
      only used when ``y`` is not provided
    - y (nind array): phenotypes; takes precedence over ``inds``/``trait``
    - vcfFile: unused, kept for backward compatibility (reading genotypes
      from a VCF file is deprecated)
    - trait (int): trait index to be used, overridden if y provided [0]
    """
    # get phenotype if y not provided
    if y is None:
        nind = len(inds)
        y = np.array([inds[i].y[trait] for i in range(nind)])
    else:
        nind = len(y)

    # Preallocate the result table instead of growing it with np.append,
    # which copied the whole array on every marker.
    nmkr = self.X.shape[0]
    out = np.empty((nmkr, 3))
    for i in range(nmkr):
        # sum genotypes over haplotypes: one value per individual
        x = np.asarray(self.X[i, :]).reshape(nind, -1).sum(axis=1)
        b, _intercept, _r_value, p_value, std_err = stats.linregress(x, y)
        out[i] = (b, std_err, p_value)

    self.b, self.se, self.pvalue = out[:, 0], out[:, 1], out[:, 2]
    # FDR obtained from statsmodels package
    self.fdr = fdr(self.pvalue)[1]
def calc_fisher(
    clust_id: str,
    data: List[float],
    indices: List[int],
    indptr: List[int],
    shape: Tuple[int, int],
    cluster_labels: List[str],
    cond_labels: List[str],
    gene_names: List[str],
    cnt_vec: List[int],
    verbose: bool,
) -> pd.DataFrame:
    """Run Fisher's exact test per gene for one cluster.

    Without ``cond_labels`` the cluster is compared against all remaining
    cells (``cnt_vec`` supplies the per-gene count of expressing cells over
    the whole matrix). With ``cond_labels`` the comparison is between the
    first condition category and everything else, inside the cluster.

    Returns a DataFrame indexed by ``gene_names`` holding the float32 columns
    ``fisher_pval:<clust_id>`` and ``fisher_qval:<clust_id>``.
    """
    import fisher

    # Rebuild the CSR matrix from its raw components and keep cluster rows.
    in_cluster = cluster_labels == clust_id
    cluster_rows = csr_matrix((data, indices, indptr), shape=shape)[in_cluster]

    if cond_labels is not None:
        is_first_cond = cond_labels[in_cluster] == cond_labels.categories[0]
        group_a = cluster_rows[is_first_cond]
        group_b = cluster_rows[~is_first_cond]
        n1 = group_a.shape[0]
        n2 = group_b.shape[0]
        a_true = group_a.getnnz(axis=0).astype(np.uint)
        a_false = n1 - a_true
        b_true = group_b.getnnz(axis=0).astype(np.uint)
        b_false = n2 - b_true
    else:
        n1 = cluster_rows.shape[0]
        n2 = shape[0] - n1
        a_true = cluster_rows.getnnz(axis=0).astype(np.uint)
        a_false = n1 - a_true
        b_true = cnt_vec.astype(np.uint) - a_true
        b_false = n2 - b_true

    # Index 2 of the fisher result is the one used as the p-value here.
    pvals = fisher.pvalue_npy(a_true, a_false, b_true, b_false)[2]
    _, qvals = fdr(pvals)

    columns = {
        "fisher_pval:{0}".format(clust_id): pvals.astype(np.float32),
        "fisher_qval:{0}".format(clust_id): qvals.astype(np.float32),
    }
    df = pd.DataFrame(columns, index=gene_names)

    if verbose:
        logger.info("calc_fisher finished for cluster {0}.".format(clust_id))

    return df
def differential_analysis(test_norm, control_norm):
    '''
    Compute the fold change on log-transformed data plus two significance tests.
    ========
    Parameters:
    test_norm: pandas.DataFrame of normalized test data, rows as cells and
               columns as features.
    control_norm: pandas.DataFrame of normalized control data, rows as cells
                  and columns as features.

    Returns a DataFrame indexed by feature (taken from control_norm.columns)
    with columns logFC (difference of means, rounded to 2 decimals),
    T_pValue, KS_pValue and their FDR-adjusted counterparts.
    '''
    column_names = ['logFC', 'T_pValue', 'KS_pValue', 'adj_T_pVal', 'adj_KS_pVal']
    report = pd.DataFrame(columns=column_names)

    for feature in control_norm.columns:
        test_values = test_norm[feature]
        control_values = control_norm[feature]

        mean_difference = test_values.mean() - control_values.mean()
        t_result = ttest(control_values, test_values)
        ks_result = ks_2samp(control_values, test_values)

        report.loc[feature, 'logFC'] = np.round(mean_difference, 2)
        report.loc[feature, 'T_pValue'] = t_result[1]
        report.loc[feature, 'KS_pValue'] = ks_result[1]

    # Adjust both p-value columns across all features.
    report['adj_T_pVal'] = fdr(report.T_pValue)[1]
    report['adj_KS_pVal'] = fdr(report.KS_pValue)[1]
    return report
def multi_comp_correction(r, p):
    """Two-stage FDR-correct ``p`` and zero the entries of ``r`` that fail.

    Only strictly positive p-values take part in the correction; the others
    are passed through unchanged. Entries of ``r`` whose (corrected) p-value
    is above 0.05 are set to 0.

    :param r: sequence of values to threshold (presumably correlation
        coefficients - confirm with caller); not modified
    :param p: sequence of p-values, same length as ``r``; not modified
    :return: ``(r_th, p_corr)`` as lists - thresholded values and corrected
        p-values
    """
    # Work on copies so ndarray arguments are not modified in place.
    p_corr = np.array(p, dtype=float)
    r_th = np.array(r)

    # Plain boolean mask. The previous ``[mask]`` (a list wrapping the mask)
    # is read by current NumPy as a 2-D index and raises IndexError.
    testable = p_corr > 0
    if testable.any():
        # Imported lazily: only needed when there is something to correct.
        from statsmodels.stats.multitest import fdrcorrection_twostage as fdr
        p_corr[testable] = fdr(p_corr[testable], 0.05, 'bh')[1]

    r_th[p_corr > 0.05] = 0
    return list(r_th), list(p_corr)
def correct_fdr(data):
    """
    Replace each gene's p-value with its FDR-corrected value.

    The list is sorted in place by ascending p-value first, then the
    correction is run at level 0.05 and written back through ``setPValue``.
    :param data: The list of genes for correction (objects exposing
                 getPValue / setPValue)
    :return: the same list, sorted, with the updated p-values
    """
    data.sort(key=lambda gene: gene.getPValue())
    raw_pvalues = [gene.getPValue() for gene in data]

    corrected = fdr(raw_pvalues, 0.05)[1]
    for gene, adjusted in zip(data, corrected):
        gene.setPValue(adjusted)

    return data
def multi_comp_correction(r, p):
    """Benjamini-Hochberg-correct ``p`` and threshold ``r`` at corrected p <= 0.05.

    Only strictly positive p-values take part in the correction; the others
    are passed through unchanged.

    :param r: sequence of values to threshold (presumably correlation
        coefficients - confirm with caller); not modified
    :param p: sequence of p-values, same length as ``r``; not modified
    :return: ``(r, p_corrected, r_th)`` as lists - the input values, the
        corrected p-values, and the values with non-significant entries
        set to 0
    """
    print('Correction for multiple comparisons')
    r = np.asarray(r)
    # Copy so that an ndarray passed as ``p`` is not overwritten in place.
    p_corr = np.array(p, dtype=float)

    # Plain boolean mask. The previous ``[mask]`` (a list wrapping the mask)
    # is read by current NumPy as a 2-D index and raises IndexError.
    testable = p_corr > 0
    if testable.any():
        # Imported lazily: only needed when there is something to correct.
        from statsmodels.stats.multitest import multipletests as fdr
        p_corr[testable] = fdr(p_corr[testable], 0.05, 'fdr_bh')[1]

    r_th = r.copy()
    r_th[p_corr > 0.05] = 0
    return list(r), list(p_corr), list(r_th)
def calc_fisher(i, clust_label, gene_names, ct, total):
    """Fisher's exact test per gene for cluster index ``i``.

    ``ct[:, i, :]`` provides the two counts of the contingency table for the
    cluster; the complementary counts are ``total`` minus those. Returns a
    DataFrame indexed by ``gene_names`` with p-value and q-value columns
    suffixed by ``clust_label``.
    """
    in_cluster = ct[:, i, :]
    outside = total - in_cluster

    fisher_result = fisher.pvalue_npy(
        in_cluster[:, 0], in_cluster[:, 1], outside[:, 0], outside[:, 1])
    pvals = fisher_result[2]
    _, qvals = fdr(pvals)

    columns = {
        "fisher_pval_{0}".format(clust_label): pvals,
        "fisher_qval_{0}".format(clust_label): qvals
    }
    df = pd.DataFrame(columns, index=gene_names)

    print("Cluster {0} is processed.".format(clust_label))
    return df
def fit(self, y):
    """Carry out a single-marker GWAS by linear regression.

    Each row of ``self.X`` is treated as one marker: the row is cut into
    ``len(y)`` equal consecutive chunks and each chunk is summed (summing the
    genotype over haplotypes) before regressing ``y`` on it.

    Results are stored on the instance; nothing is returned:
    ``self.b`` (slope), ``self.se`` (standard error), ``self.pvalue`` and
    ``self.fdr`` (second element of ``fdr(self.pvalue)``).
    """
    nind = len(y)

    # Preallocate the result table instead of growing it with np.append,
    # which copied the whole array on every marker.
    nmkr = self.X.shape[0]
    out = np.empty((nmkr, 3))
    for i in range(nmkr):
        # sum genotypes over haplotypes: one value per individual
        x = np.asarray(self.X[i, :]).reshape(nind, -1).sum(axis=1)
        b, _intercept, _r_value, p_value, std_err = stats.linregress(x, y)
        out[i] = (b, std_err, p_value)

    self.b, self.se, self.pvalue = out[:, 0], out[:, 1], out[:, 2]
    # FDR obtained from statsmodels package
    self.fdr = fdr(self.pvalue)[1]
def corr_spec_net(r, p, net_name):
    """Restrict ``r``/``p`` to one network, FDR-correct and threshold.

    Entries whose 1-based position is not listed by
    ``network_id_list(network_type=net_name)`` are set to 0 in both arrays.
    The remaining strictly positive p-values are Benjamini-Hochberg
    corrected, and values whose corrected p-value exceeds 0.05 are zeroed in
    the thresholded copy.

    :param r: sequence of values (presumably correlation coefficients -
        confirm with caller); not modified
    :param p: sequence of p-values, same length as ``r``; not modified
    :param net_name: network type forwarded to ``network_id_list``
    :return: ``(r, p, r_th)`` - masked values (ndarray), masked and corrected
        p-values (ndarray), thresholded values (list)
    """
    from statsmodels.stats.multitest import multipletests as fdr

    id_net = network_id_list(network_type=net_name)
    outside_net = np.ones(len(r), bool)
    outside_net[np.asarray(id_net) - 1] = False  # ids are 1-based positions

    # Copies: the masking below must not overwrite the caller's arrays.
    r = np.array(r)
    p = np.array(p, dtype=float)
    r[outside_net] = 0
    p[outside_net] = 0

    # Plain boolean mask. The previous ``[mask]`` (a list wrapping the mask)
    # is read by current NumPy as a 2-D index and raises IndexError.
    testable = p > 0
    if testable.any():
        p[testable] = fdr(p[testable], 0.05, 'fdr_bh')[1]

    r_th = r.copy()
    r_th[p > 0.05] = 0
    return r, p, list(r_th)
def perform_oneway_anova(data, glist, restriction_vec, group_str, fdr_alpha = 0.05):
    """Run a one-way ANOVA per gene across groups of selected cells.

    Parameters
    ----------
    data
        Annotated expression object providing ``shape``, ``obs``,
        ``var_names`` and a sparse ``X``.
    glist
        Gene names to test; names absent from ``data.var_names`` are dropped.
    restriction_vec
        Cell filters, each formatted as ``attr:value,value,...``.
    group_str
        Grouping, formatted as ``attr:name~value,value;name~value,...``.
    fdr_alpha
        q-value cutoff for the filtered result.

    Returns
    -------
    (results, raw_results)
        ``raw_results`` holds ``fstat``, ``pval``, ``qval`` and, per group,
        ``<name>_mean`` and ``<name>_percent`` for every tested gene;
        ``results`` keeps the rows with ``qval <= fdr_alpha`` sorted by qval.
    """
    import warnings

    selected = np.ones(data.shape[0], dtype = bool)
    for rest_str in restriction_vec:
        attr, value_str = rest_str.split(':')
        values = value_str.split(',')
        selected = selected & np.isin(data.obs[attr], values)

    gene_list = np.array(glist)
    gene_list = gene_list[np.isin(gene_list, data.var_names)]
    newdat = data[selected, :][:, gene_list].copy()
    newdat.X = newdat.X.toarray()

    group_attr, tmp_str = group_str.split(':')
    groups_str = tmp_str.split(';')
    ngr = len(groups_str)
    group_names = []
    group_idx = np.zeros((ngr, newdat.shape[0]), dtype = bool)
    for i, gstr in enumerate(groups_str):
        name, values = gstr.split('~')
        group_names.extend([name + '_mean', name + '_percent'])
        group_idx[i] = np.isin(newdat.obs[group_attr], values.split(','))

    # Columns: fstat, pval, qval, then (mean, percent expressed) per group.
    stats = np.zeros((len(gene_list), 3 + ngr * 2))

    # ``np.warnings`` no longer exists in current NumPy; use the stdlib module
    # and keep the suppression local instead of silencing the whole process.
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        for i in range(len(gene_list)):
            arr_list = []
            for j in range(group_idx.shape[0]):
                arr = newdat.X[group_idx[j], i]
                stats[i, 3 + j * 2] = arr.mean()
                stats[i, 3 + j * 2 + 1] = (arr > 0).sum() * 100.0 / arr.size
                arr_list.append(arr)
            stats[i, 0], stats[i, 1] = f_oneway(*arr_list)
            # An undefined F statistic is reported as "no effect".
            if np.isnan(stats[i, 0]):
                stats[i, 0] = 0.0
                stats[i, 1] = 1.0

        _, stats[:, 2] = fdr(stats[:, 1])

    cols = ['fstat', 'pval', 'qval']
    cols.extend(group_names)
    raw_results = pd.DataFrame(stats, columns = cols, index = gene_list)

    results = raw_results[raw_results['qval'] <= fdr_alpha]
    results = results.sort_values('qval')
    return results, raw_results
def calc_mwu(clust_label, labels, conds, cond_order, gene_names, data, indices, indptr, shape):
    """Mann-Whitney U test per gene between two conditions inside one cluster.

    The CSC matrix is rebuilt from ``data``/``indices``/``indptr``; only rows
    with ``labels == clust_label`` are used and they are split by
    ``cond_order[0]`` versus ``cond_order[1]``. Genes with no stored values in
    the cluster get log_fc 0, U 0 and p-value 1.

    Returns a DataFrame indexed by ``gene_names`` with columns ``log_fc``
    (difference of means, first minus second condition), ``mwu_U``,
    ``mwu_pval`` and ``mwu_qval``.
    """
    ngene = shape[1]
    in_cluster = labels == clust_label
    cluster_mat = csc_matrix((data, indices, indptr), shape = shape)[in_cluster, :]

    cluster_conds = conds[in_cluster]
    is_first = cluster_conds == cond_order[0]
    is_second = cluster_conds == cond_order[1]

    log_fc = np.zeros(ngene)
    U_stats = np.zeros(ngene)
    pvals = np.ones(ngene)  # default for genes that are never tested

    # Dense scratch column, reused across genes.
    dense_col = np.zeros(in_cluster.sum())
    for gene in range(ngene):
        column = cluster_mat[:, gene]
        if column.size == 0:
            continue

        dense_col[column.indices] = column.data
        first_vals = dense_col[is_first]
        second_vals = dense_col[is_second]
        log_fc[gene] = np.mean(first_vals) - np.mean(second_vals)
        U_stats[gene], pvals[gene] = ss.mannwhitneyu(first_vals, second_vals, alternative = 'two-sided')
        dense_col[:] = 0.0  # reset the scratch buffer for the next gene

    _, qvals = fdr(pvals)

    df = pd.DataFrame(
        {"log_fc": log_fc, "mwu_U": U_stats, "mwu_pval": pvals, "mwu_qval": qvals},
        index = gene_names)

    print("Cluster {0} is processed.".format(clust_label))
    return df
def calc_mwu(clust_label, labels, gene_names, data, indices, indptr, shape):
    """Mann-Whitney U test per gene: one cluster against all other samples.

    The CSC matrix is rebuilt from ``data``/``indices``/``indptr``. Genes with
    no stored values get U 0 and p-value 1. Returns a DataFrame indexed by
    ``gene_names`` with U statistic, p-value and q-value columns suffixed by
    ``clust_label``.
    """
    nsample, ngene = shape[0], shape[1]
    expr_mat = csc_matrix((data, indices, indptr), shape=shape)

    in_cluster = labels == clust_label
    out_cluster = ~in_cluster

    U_stats = np.zeros(ngene)
    pvals = np.ones(ngene)  # default for genes that are never tested

    # Dense scratch column, reused across genes.
    dense_col = np.zeros(nsample)
    for gene in range(ngene):
        dense_col[:] = 0.0
        column = expr_mat[:, gene]
        if column.size == 0:
            continue
        dense_col[column.indices] = column.data
        U_stats[gene], pvals[gene] = ss.mannwhitneyu(
            dense_col[in_cluster], dense_col[out_cluster], alternative='two-sided')

    _, qvals = fdr(pvals)

    columns = {
        "mwu_U_{0}".format(clust_label): U_stats,
        "mwu_pval_{0}".format(clust_label): pvals,
        "mwu_qval_{0}".format(clust_label): qvals
    }
    df = pd.DataFrame(columns, index=gene_names)

    print("Cluster {0} is processed.".format(clust_label))
    return df
# Build the expected-connection table: one row per connection pair.
expectedFrm = pd.DataFrame(resDict)
expectedFrm = expectedFrm.T
expectedFrm.index.name = 'connection_pair'
expectedFrm['percent_rank_by_direction'] = np.nan

isPos = expectedFrm['expected_direc'] == 'Positively connected'
isNeg = expectedFrm['expected_direc'] == 'Negatively connected'

# `.ix` was removed from pandas; `.loc` does the same boolean-row /
# label-column assignment. Every row that is not positively connected
# gets the complemented rank (1 - rank).
expectedFrm.loc[isPos, 'percent_rank_by_direction'] = expectedFrm['perc_rank_within_pert_type']
expectedFrm.loc[~isPos, 'percent_rank_by_direction'] = 1 - expectedFrm['perc_rank_within_pert_type']

# check FDR using percent ranks (fdr is called with its default alpha)
boolFDR, valFDR = fdr(pvals=expectedFrm['perc_rank_within_pert_type'].values)
expectedFrm['pass_FDR_with_percent_rank'] = boolFDR

### write expected connection summary
outF = wkdir + '/expected_connection_summary.txt'
expectedFrm.to_csv(outF, sep='\t', header=True, index=True)

## plot percent ranks overall
plt.hist(expectedFrm['perc_rank_within_pert_type'], 30)
plt.ylabel('freq', fontweight='bold')
plt.xlabel('percent rank of expected connections', fontweight='bold')
plt.title('All expected GEO connections')
outF = path.join(wkdir, 'percent_rank_expected_connections.png')
plt.savefig(outF, bbox_inches='tight', dpi=200)
plt.close()
def perform_oneway_anova(
    data: AnnData,
    glist: List[str],
    restriction_vec: List[str],
    group_str: str,
    fdr_alpha: float = 0.05,
    res_key: str = None,
) -> pd.DataFrame:
    """Perform one way ANOVA on a subset of cells (restricted by restriction_vec) grouped by group_str and control FDR at fdr_alpha.

    Parameters
    ----------

    data : `anndata` object
        An `anndata` object containing the expression matrix.
    glist : `list[str]`
        A list of gene symbols. Symbols not present in `data.var_names` are ignored.
    restriction_vec : `list[str]`
        A vector of restrictions for selecting cells. Each restriction takes the format of attr:value,value,value
    group_str : `str`
        How to group selected cells for ANOVA analysis. Either 'attr:name~value,value;name~value,...', or, for pseudotime, one of two formats. 1) 'pseudotime:time:n', which divides cells by equal pseudotime interval; 2) 'pseudotime:size:n' divides cells by equal number of cells.
    fdr_alpha : `float`, optional (default: 0.05)
        False discovery rate.
    res_key : `str`, optional (default: None)
        Store results into data using res_key, the grouping information is stored in obs and the results is stored in uns.

    Returns
    -------

    `pandas.DataFrame`
        Results for genes that pass FDR control.

    Examples
    --------
    >>> results = misc.perform_oneway_anova(data, ['CD3E', 'CD4', 'CD8'], [], 'pseudotime:size:10')
    """
    import warnings

    from scipy.stats import f_oneway
    from statsmodels.stats.multitest import fdrcorrection as fdr

    selected = np.ones(data.shape[0], dtype=bool)
    for rest_str in restriction_vec:
        attr, value_str = rest_str.split(":")
        values = value_str.split(",")
        selected = selected & np.isin(data.obs[attr], values)

    gene_list = np.array(glist)
    gene_list = gene_list[np.isin(gene_list, data.var_names)]
    ngene = gene_list.size

    newdat = data[selected, :][:, gene_list].copy()
    newdat.X = newdat.X.toarray()

    group_values = group_str.split(":")
    group_names = []
    ngr = 0
    group_idx = None

    if group_values[0] == "pseudotime":
        assert len(group_values) == 3
        div_by = group_values[1]
        ngr = int(group_values[2])

        group_idx = np.zeros((ngr, newdat.shape[0]), dtype=bool)
        pseudotimes = newdat.obs["pseudotime"].values

        min_t = pseudotimes.min()
        max_t = pseudotimes.max()

        if div_by == "time":
            # Equal-width bins (left, right]; the first left edge is nudged
            # below the minimum so the smallest pseudotime is included.
            interval = (max_t - min_t) / ngr
            left = min_t - 1e-5
            for i in range(ngr):
                right = min_t + interval * (i + 1)
                name = "({:.2f}, {:.2f}]".format(left if left >= 0 else 0.0, right)
                group_names.append(name)
                group_idx[i] = (pseudotimes > left) & (pseudotimes <= right)
                left = right
        else:
            assert div_by == "size"
            # Equal-count bins over cells ordered by pseudotime; the first
            # `remainder` bins receive one extra cell.
            ords = np.argsort(pseudotimes)
            quotient = ords.size // ngr
            remainder = ords.size % ngr

            fr = 0
            for i in range(ngr):
                to = fr + quotient + (i < remainder)
                name = "[{:.2f}, {:.2f}]".format(pseudotimes[ords[fr]], pseudotimes[ords[to - 1]])
                group_names.append(name)
                group_idx[i][ords[fr:to]] = True
                fr = to
    else:
        assert len(group_values) == 2
        group_attr = group_values[0]
        tmp_str = group_values[1]
        groups_str = tmp_str.split(";")

        ngr = len(groups_str)
        group_idx = np.zeros((ngr, newdat.shape[0]), dtype=bool)

        for i, gstr in enumerate(groups_str):
            name, values = gstr.split("~")
            group_names.append(name)
            group_idx[i] = np.isin(newdat.obs[group_attr], values.split(","))

    for i in range(ngr):
        print("Group {} has {} cells.".format(group_names[i], group_idx[i].sum()))

    # Columns: fstat, pval, qval, then (mean, percent expressed) per group.
    stats = np.zeros((ngene, 3 + ngr * 2))

    # ``np.warnings`` no longer exists in current NumPy; use the stdlib module
    # and keep the suppression local instead of silencing the whole process.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        for i in range(ngene):
            arr_list = []
            for j in range(ngr):
                arr = newdat.X[group_idx[j], i]
                stats[i, 3 + j * 2] = arr.mean()
                stats[i, 3 + j * 2 + 1] = (arr > 0).sum() * 100.0 / arr.size
                arr_list.append(arr)
            stats[i, 0], stats[i, 1] = f_oneway(*arr_list)
            # An undefined F statistic is reported as "no effect".
            if np.isnan(stats[i, 0]):
                stats[i, 0] = 0.0
                stats[i, 1] = 1.0

        _, stats[:, 2] = fdr(stats[:, 1])

    cols = ["fstat", "pval", "qval"]
    for i in range(ngr):
        cols.extend([group_names[i] + "_mean", group_names[i] + "_percent"])
    raw_results = pd.DataFrame(stats, columns=cols, index=gene_list)

    results = raw_results[raw_results["qval"] <= fdr_alpha]
    results = results.sort_values("qval")

    if res_key is not None:
        data.uns[res_key] = raw_results
        # Cells outside the selection keep the "background" label.
        data.obs[res_key] = "background"
        for i in range(ngr):
            idx = np.zeros(data.shape[0], dtype=bool)
            idx[selected] = group_idx[i]
            data.obs.loc[idx, res_key] = group_names[i]

    return results
def calc_stat_and_t(i, clust_label, labels, gene_names, data, indices, indptr, shape, sm1, sm2, ct, total):
    """Per-gene summary statistics and Welch's t-test for one cluster vs the rest.

    ``sm1`` / ``sm2`` are used as the per-gene sum and sum of squares over all
    samples, from which the out-of-cluster moments are derived. ``ct[:, i, :]``
    and ``total`` supply the two count columns used for the percentage
    columns. Genes whose variance estimate is not positive keep the sentinel
    value 1.01 for both p-value and q-value.

    Returns a DataFrame indexed by ``gene_names`` whose column names are
    suffixed with ``clust_label``.
    """
    ngene = shape[1]
    expr = csr_matrix((data, indices, indptr), shape=shape)

    cluster_rows = expr[labels == clust_label]
    n1 = cluster_rows.shape[0]
    n2 = shape[0] - n1
    assert n1 > 1 and n2 > 1

    # First and second moments inside the cluster; the rest by subtraction.
    sum_in = cluster_rows.sum(axis=0).A1
    sumsq_in = cluster_rows.power(2).sum(axis=0).A1

    mean1 = sum_in / n1
    mean2 = (sm1 - sum_in) / n2

    s1sqr = (sumsq_in - n1 * (mean1**2)) / (n1 - 1)
    s2sqr = ((sm2 - sumsq_in) - n2 * (mean2**2)) / (n2 - 1)

    var_est = s1sqr / n1 + s2sqr / n2

    # 1.01 marks genes that could not be tested.
    pvals = np.full(ngene, 1.01)
    qvals = np.full(ngene, 1.01)

    testable = var_est > 0.0
    if testable.sum() > 0:
        tscore = (mean1[testable] - mean2[testable]) / np.sqrt(var_est[testable])
        # Degrees of freedom for the unequal-variance t-test.
        dof = (var_est[testable]**2) / (
            (s1sqr[testable] / n1)**2 / (n1 - 1) +
            (s2sqr[testable] / n2)**2 / (n2 - 1))
        pvals[testable] = ss.t.sf(np.fabs(tscore), dof) * 2.0  # two-sided
        _, qvals[testable] = fdr(pvals[testable])

    # calculate WAD, Weighted Average Difference, https://almob.biomedcentral.com/articles/10.1186/1748-7188-3-8
    log_fold_change = mean1 - mean2
    x_avg = (mean1 + mean2) / 2
    x_max = x_avg.max()
    x_min = x_avg.min() - 0.001  # to avoid divide by zero
    wads = log_fold_change * ((x_avg - x_min) / (x_max - x_min))

    # calculate percentage expressed and percent fold change
    in_counts = ct[:, i, :]
    percents = in_counts[:, 0] / (in_counts[:, 0] + in_counts[:, 1]) * 100.0
    out_counts = total - in_counts
    percents_other = out_counts[:, 0] / (out_counts[:, 0] + out_counts[:, 1]) * 100.0

    expressed = percents > 0.0
    expressed_other = percents_other > 0.0

    percent_fold_change = np.zeros(ngene)
    percent_fold_change[(~expressed) & (~expressed_other)] = 0.0
    percent_fold_change[expressed & (~expressed_other)] = np.inf
    percent_fold_change[expressed_other] = percents[expressed_other] / percents_other[expressed_other]

    columns = {
        "percentage_{0}".format(clust_label): percents,
        "percentage_other_{0}".format(clust_label): percents_other,
        "mean_log_expression_{0}".format(clust_label): mean1,
        "percentage_fold_change_{0}".format(clust_label): percent_fold_change,
        "log_fold_change_{0}".format(clust_label): log_fold_change,
        "WAD_score_{0}".format(clust_label): wads,
        "t_pval_{0}".format(clust_label): pvals,
        "t_qval_{0}".format(clust_label): qvals
    }
    df = pd.DataFrame(columns, index=gene_names)

    print("Cluster {0} is processed.".format(clust_label))
    return df
tarRanks.append(rnk) tarRnkPercs.append(RnkPerc) #write summary table sig = resCids[iq] f.write('\t'.join([sig,pDescDict[pert],target,str(cs),str(rnk),str(RnkPerc)[:7]]) + '\n') targetCS[cell1][pert][target] = tarCS targetRnkPercs[cell1][pert][target] = tarRnkPercs targetRanks[cell1][pert][target] = tarRanks ### put RnkPercs into a vector, test for FDR pVec = [] for pert in targetRnkPercs[cell1]: RnkPercs = targetRnkPercs[cell1][pert].values() for RnkPerc in RnkPercs: pVec.extend(RnkPerc) #perform FDR on pVec [pBoolean, pCorrected] = fdr(pVec, alpha=0.1, method='indep') nPassFDR = sum(pBoolean) if nPassFDR: iPassFDR = [i for i,x in enumerate(pBoolean) if x == True] pPass = [pVec[i] for i in iPassFDR] pMaxThresh = max(pPass) #what is the largest RnkPerc that passed FDR #flag connections which pass FDR outF = os.path.join(celldir,cell1 + '_drug-target_FDR_summary.txt') headers = ['query_sig','cp_pert_desc','target_KD_cgs','cs', 'query_rank', 'RnkPerc'] with open(outF,'w') as f: f.write('\t'.join(headers) + '\n') for pert in targetRnkPercs[cell1]: qInds = queryInd[pert] for target in targetRnkPercs[cell1][pert]: RnkPercs = targetRnkPercs[cell1][pert][target] for i,RnkPerc in enumerate(RnkPercs):
def calc_mwu(
    clust_id: str,
    data: List[float],
    indices: List[int],
    indptr: List[int],
    shape: Tuple[int, int],
    cluster_labels: List[str],
    cond_labels: List[str],
    gene_names: List[str],
    verbose: bool,
) -> pd.DataFrame:
    """Run the two-sided Mann-Whitney U test per gene for one cluster.

    Without ``cond_labels`` the cluster is compared against every other row.
    With ``cond_labels`` the first condition category is compared against the
    remaining rows inside the cluster. Genes with no stored values, or any
    gene when one of the two groups is empty, keep U = 0 and p-value = 1.

    Returns a DataFrame indexed by ``gene_names`` with float32 columns
    ``mwu_U:<clust_id>``, ``mwu_pval:<clust_id>`` and ``mwu_qval:<clust_id>``.
    """
    ngene = shape[1]
    csc_mat = csc_matrix((data, indices, indptr), shape=shape)

    U_stats = np.zeros(ngene, dtype=np.float32)
    pvals = np.full(ngene, 1.0)

    in_cluster = cluster_labels == clust_id
    cluster_vs_rest = cond_labels is None

    if cluster_vs_rest:
        exprs = np.zeros(shape[0])  # dense scratch column, reused per gene
        idx_x = in_cluster
    else:
        exprs = None
        idx_x = cond_labels[in_cluster] == cond_labels.categories[0]
    idx_y = ~idx_x

    if idx_x.sum() > 0 and idx_y.sum() > 0:
        import scipy.stats as ss

        for gene in range(ngene):
            if cluster_vs_rest:
                start, stop = csc_mat.indptr[gene], csc_mat.indptr[gene + 1]
                if stop - start > 0:
                    # Scatter the stored entries of this column into the buffer.
                    exprs[:] = 0.0
                    exprs[csc_mat.indices[start:stop]] = csc_mat.data[start:stop]
                    U_stats[gene], pvals[gene] = ss.mannwhitneyu(
                        exprs[idx_x], exprs[idx_y], alternative="two-sided")
            else:
                column = csc_mat[in_cluster, gene]
                if column.data.size > 0:
                    exprs = column.toarray()[:, 0]
                    U_stats[gene], pvals[gene] = ss.mannwhitneyu(
                        exprs[idx_x], exprs[idx_y], alternative="two-sided")

    _, qvals = fdr(pvals)

    columns = {
        "mwu_U:{0}".format(clust_id): U_stats.astype(np.float32),
        "mwu_pval:{0}".format(clust_id): pvals.astype(np.float32),
        "mwu_qval:{0}".format(clust_id): qvals.astype(np.float32),
    }
    df = pd.DataFrame(columns, index=gene_names)

    if verbose:
        logger.info("calc_mwu finished for cluster {0}.".format(clust_id))

    return df
continue else: count = count + 1 sKeysStr.append(cpRes.index[i].split('_')[1]) yVals = count plt.scatter(rnk,yVals) plt.xlim((0, 100)) plt.ylim((0,count+1)) plt.yticks(range(1, count + 2), sKeysStr, rotation = 0) plt.xlabel('percent rank') plt.ylabel('cell line') plt.title(pDescDict[brd] + ' - ' + ind + ' connection - ' + gp_type) plt.savefig(os.path.join(work_dir,'drug_target_graphs',brd +'_' + ind + '_percent_rank.png')) plt.close() ### perform FDR correction FDRtest = fdr(pVec) pArr= np.array(pVec) passed = pArr[FDRtest[0]] pThreshFDR = max(passed) cpPass = [] kdPass = [] print 'connections passing FDR correction:' for test in pDict: if pDict[test] < pThreshFDR: print test drug = test.split('-')[0] + '-' + test.split('-')[1] cpPass.append(drug) kdPass.append(test.split('-')[-1]) #make graphs of connections that pass FRD for ibrd,brd in enumerate(cpPass): #skip if not in query
def calc_t(
    clust_id: str,
    data: List[float],
    indices: List[int],
    indptr: List[int],
    shape: Tuple[int, int],
    cluster_labels: List[str],
    cond_labels: List[str],
    gene_names: List[str],
    sum_vec: List[float],
    sum2_vec: List[float],
    verbose: bool,
) -> pd.DataFrame:
    """Calculate Welch's t-test per gene for one cluster.

    Without ``cond_labels`` the cluster is compared against all remaining
    rows, using ``sum_vec`` / ``sum2_vec`` as the per-gene sum and sum of
    squares over the whole matrix. With ``cond_labels`` the first condition
    category is compared against the other rows inside the cluster.

    Genes are only tested when both groups have more than one row and the
    variance estimate is positive; otherwise p-value stays 1 and t-score 0.

    Returns a DataFrame indexed by ``gene_names`` with float32 columns
    ``t_pval:<clust_id>``, ``t_qval:<clust_id>`` and ``t_score:<clust_id>``.
    """
    # recover sparse matrix
    mat = csr_matrix((data, indices, indptr), shape=shape)
    pvals = np.full(shape[1], 1.0)
    # Float fill value on purpose: np.full(n, 0) yields an integer array and
    # silently truncated every t-score to an integer on assignment.
    tscores = np.full(shape[1], 0.0)
    mask = cluster_labels == clust_id
    mat_clust = mat[mask]

    if cond_labels is None:
        n1 = mat_clust.shape[0]
        n2 = shape[0] - n1

        if n1 > 1 and n2 > 1:
            sum_clu = mat_clust.sum(axis=0).A1
            mean1 = sum_clu / n1
            mean2 = (sum_vec - sum_clu) / n2
            # Clamp small negatives produced by the subtraction.
            mean2[mean2 < 0.0] = 0.0

            sum2_clu = mat_clust.power(2).sum(axis=0).A1
            s1sqr = (sum2_clu - n1 * (mean1**2)) / (n1 - 1)
            s2sqr = ((sum2_vec - sum2_clu) - n2 * (mean2**2)) / (n2 - 1)
            s2sqr[s2sqr < 0.0] = 0.0
    else:
        cond1 = cond_labels.categories[0]
        cond_labs = cond_labels[mask]
        mask2 = cond_labs == cond1
        mat_cond1 = mat_clust[mask2]
        mat_cond2 = mat_clust[~mask2]
        n1 = mat_cond1.shape[0]
        n2 = mat_cond2.shape[0]

        if n1 > 1 and n2 > 1:
            mean1 = mat_cond1.mean(axis=0).A1
            psum1 = mat_cond1.power(2).sum(axis=0).A1
            s1sqr = (psum1 - n1 * (mean1**2)) / (n1 - 1)

            mean2 = mat_cond2.mean(axis=0).A1
            psum2 = mat_cond2.power(2).sum(axis=0).A1
            s2sqr = (psum2 - n2 * (mean2**2)) / (n2 - 1)

    if n1 > 1 and n2 > 1:
        import scipy.stats as ss

        var_est = s1sqr / n1 + s2sqr / n2
        idx = var_est > 0.0
        if idx.sum() > 0:
            tscore = (mean1[idx] - mean2[idx]) / np.sqrt(var_est[idx])
            # Degrees of freedom for the unequal-variance t-test.
            v = (var_est[idx]**2) / ((s1sqr[idx] / n1)**2 / (n1 - 1) +
                                     (s2sqr[idx] / n2)**2 / (n2 - 1))
            pvals[idx] = ss.t.sf(np.fabs(tscore), v) * 2.0  # two-sided
            tscores[idx] = tscore

    _, qvals = fdr(pvals)

    df = pd.DataFrame(
        {
            "t_pval:{0}".format(clust_id): pvals.astype(np.float32),
            "t_qval:{0}".format(clust_id): qvals.astype(np.float32),
            "t_score:{0}".format(clust_id): tscores.astype(np.float32),
        },
        index=gene_names,
    )

    if verbose:
        logger.info("calc_t finished for cluster {0}.".format(clust_id))

    return df
'perc_rank_within_pert_type':ePerc, 'expected_direc':eDir, 'a_name':aName} expectedFrm = pd.DataFrame(resDict) expectedFrm = expectedFrm.T expectedFrm.index.name = 'connection_pair' expectedFrm['percent_rank_by_direction'] = np.nan isPos = expectedFrm['expected_direc'] == 'Positively connected' isNeg = expectedFrm['expected_direc'] == 'Negatively connected' expectedFrm.ix[isPos,'percent_rank_by_direction'] = expectedFrm['perc_rank_within_pert_type'] expectedFrm.ix[~isPos,'percent_rank_by_direction'] = 1-expectedFrm['perc_rank_within_pert_type'] # plt.hist(expectedFrm['perc_rank'],30) #check FDR using percent ranks boolFDR, valFDR = fdr(pvals=expectedFrm['perc_rank_within_pert_type'].values) #,alpha=.05 expectedFrm['pass_FDR_with_percent_rank'] = boolFDR ### write expected connection summary outF = wkdir+'/expected_connection_summary.txt' expectedFrm.to_csv(outF,sep='\t',header=True,index=True) ## plot percent ranks overall plt.hist(expectedFrm['perc_rank_within_pert_type'],30) plt.ylabel('freq',fontweight='bold') plt.xlabel('percent rank of expected connections',fontweight='bold') plt.title('All expected GEO connections') outF = path.join(wkdir, 'percent_rank_expected_connections.png') plt.savefig(outF, bbox_inches='tight',dpi=200) plt.close()
# Exploratory analysis of the p-value image `pvalIMG2`, restricted to the
# entries where `mask_data > 0`. Several lines are bare expressions whose
# result is discarded unless the code is run interactively.

# Distribution and minimum of the masked p-values.
sns.histplot(pvalIMG2[np.where(mask_data > 0)])
np.min(pvalIMG2[np.where(mask_data > 0)])

# Inverse p-value map: masked entries become 1/p, then log10 is taken over
# the whole array (entries outside the mask are not inverted first).
inv = copy.deepcopy(pvalIMG2)
inv[np.where(mask_data > 0)] = 1. / inv[np.where(mask_data > 0)]
inv10 = np.log10(inv)
utils.show_slices(np.log10(inv), isPath=False)
# Save the log10 map using the affine of Tpt_img.
inv10IMG = nib.Nifti1Image(inv10, Tpt_img.affine)
nib.save(inv10IMG, join(T1wDirTemplatePatients01, 'pval_diff_log10inv.nii.gz'))

# Uncorrected thresholding: zero every entry whose p-value exceeds 0.5 / len(x).
# NOTE(review): `x` is defined outside this fragment - confirm what it counts.
pvalIMG2_threshold = copy.deepcopy(pvalIMG2)
pvalIMG2_threshold[np.where(pvalIMG2_threshold > 0.5 / len(x))] = 0
utils.show_slices(pvalIMG2_threshold, isPath=False)

# FDR on the masked p-values: first the rejection flags at alpha=0.05 ...
trues = fdr(pvalIMG2[np.where(mask_data > 0)], alpha=0.05)[0]
any(trues)
# ... then the corrected p-values written back into a copy of the image.
pvalIMG2_fdr = copy.deepcopy(pvalIMG2)
pvalIMG2_fdr[np.where(mask_data > 0)] = fdr(pvalIMG2[np.where(mask_data > 0)])[1]

# Zero every entry whose corrected p-value exceeds 0.05.
pvalIMG2_fdr_threshold = copy.deepcopy(pvalIMG2_fdr)
pvalIMG2_fdr_threshold[np.where(pvalIMG2_fdr_threshold > 0.05)] = 0
utils.show_slices(pvalIMG2_fdr_threshold, isPath=False)
sns.histplot(pvalIMG2_fdr_threshold[np.where(mask_data > 0)])

# Inverse of the corrected p-values inside the mask.
# NOTE(review): invFDR is computed but the next call displays
# pvalIMG2_fdr_threshold again - confirm which image was meant.
invFDR = copy.deepcopy(pvalIMG2_fdr)
invFDR[np.where(mask_data > 0)] = 1. / invFDR[np.where(mask_data > 0)]
utils.show_slices(pvalIMG2_fdr_threshold, isPath=False)
# True when at least one entry survives the corrected threshold.
(pvalIMG2_fdr_threshold > 0).any()
sKeysStr.append(cpRes.index[i].split('_')[1]) yVals = count plt.scatter(rnk, yVals) plt.xlim((0, 100)) plt.ylim((0, count + 1)) plt.yticks(range(1, count + 2), sKeysStr, rotation=0) plt.xlabel('percent rank') plt.ylabel('cell line') plt.title(pDescDict[brd] + ' - ' + ind + ' connection - ' + gp_type) plt.savefig( os.path.join(work_dir, 'drug_target_graphs', brd + '_' + ind + '_percent_rank.png')) plt.close() ### perform FDR correction FDRtest = fdr(pVec) pArr = np.array(pVec) passed = pArr[FDRtest[0]] pThreshFDR = max(passed) cpPass = [] kdPass = [] print 'connections passing FDR correction:' for test in pDict: if pDict[test] < pThreshFDR: print test drug = test.split('-')[0] + '-' + test.split('-')[1] cpPass.append(drug) kdPass.append(test.split('-')[-1]) #make graphs of connections that pass FRD for ibrd, brd in enumerate(cpPass): #skip if not in query
####################################### # ------ (5) COMPARE y/X DISTs ------ # # (i) Individual Y's holder_prop = [] for cn in val_Y.columns.drop(['caseid','operyr']): tmp_df = pd.concat([pd.DataFrame({'y':val_Y[cn].copy(),'tt':'SK'}), pd.DataFrame({'y':df_Y[cn].copy(), 'tt':'NSQIP'})]).query('y>=0').reset_index(None,True) tmp_tbl = tmp_df.groupby(['y','tt']).size().reset_index().pivot('tt','y',0).fillna(0).astype(int) pval = stats.chi2_contingency(tmp_tbl.values)[1] tmp_prop = tmp_tbl.divide(tmp_tbl.sum(1),axis=0).reset_index().melt('tt').query('y==1').drop(columns='y') tmp_prop = tmp_prop.assign(outcome=cn,pval=pval) holder_prop.append(tmp_prop) dist_Y = pd.concat(holder_prop).reset_index(None,True) # Get the FDR values dist_Y = dist_Y.merge(dist_Y.groupby('outcome').pval.max().reset_index().assign(fdr=lambda x: fdr(x.pval, alpha=0.10)[1])) dist_Y = dist_Y.assign(outcome=lambda x: x.outcome.map(di_mapper)) # (ii) Aggregate Y's holder_Yagg = [] for ay in di_agg: tmp_cn = list(np.setdiff1d(di_agg[ay],'othseshock')) tmp_df = pd.concat([pd.DataFrame({'y':np.where(val_Y[tmp_cn].sum(1)==0, 0, 1),'tt':'SK'}), pd.DataFrame({'y':df_Y['agg_'+ay], 'tt':'NSQIP'})]).reset_index(None,True) tmp_tbl = tmp_df.groupby(['y','tt']).size().reset_index().pivot('tt','y',0).fillna(0).astype(int) pval = stats.chi2_contingency(tmp_tbl.values)[1] tmp_prop = tmp_tbl.divide(tmp_tbl.sum(1),axis=0).reset_index().melt('tt').query('y==1').drop(columns='y') tmp_prop = tmp_prop.assign(outcome=ay,pval=pval) holder_Yagg.append(tmp_prop) dist_Yagg = pd.concat(holder_Yagg).reset_index(None,True) dist_Yagg = dist_Yagg.assign(version=lambda x: x.outcome.str.replace('[^0-9]','',regex=True).replace('', '1').astype(int),