def _getChi2Sum(self, RIDs):
    """
    Sums the 1-df chi-squared values of a set of SNPs

    Every entry of RIDs is looked up in self._GWAS and the stored value is
    converted with tools.chiSquared1dfInverseCumulativeProbabilityUpperTail
    (i.e. the stored values are treated as upper-tail probabilities).

    Args:

        RIDs(list): Keys of the SNPs in self._GWAS

    Returns:
        float: Sum of the converted values (0.0 if RIDs is empty)
    """
    chi2_values = np.array(
        [
            tools.chiSquared1dfInverseCumulativeProbabilityUpperTail(
                self._GWAS[rid])
            for rid in RIDs
        ],
        dtype=float,
    )

    return np.sum(chi2_values)
def test_gene_assocdir(self, gene, epsilon=1e-8):
    """
    Tests for directional association of the gene (Requires betas of GWAS)

    Args:

        gene(str): Gene symbol to test
        epsilon(float): Regularization parameter (eigenvalues of the symmetrized SNP-SNP matrix below epsilon are raised to epsilon)

    Returns:
        tuple: (hpstats.onemin_norm_cdf_100d, hpstats.norm_cdf_100d) evaluated
        at the sum of the signed z-scores, i.e. the right-tail and the
        left-tail result. None if the gene is not in the annotation.
    """
    if gene not in self._GENESYMB:
        # The original printed an undefined name here, which raised a
        # NameError instead of reporting the missing gene.
        print(gene, "not in annotation!")
        return None

    # Get background
    gene_id = self._GENESYMB[gene]
    # First annotation field selects the reference data to load
    # (presumably the chromosome - verify against the annotation loader).
    ref_key = self._GENEID[gene_id][0]
    REF = {ref_key: self._ref.load_pos_reference(ref_key)}

    if len(self._GWAS_alleles) == 0:
        C, RID = self._calcGeneSNPcorr(ref_key, gene_id, REF)
    else:
        C, RID = self._calcGeneSNPcorr_wAlleles(ref_key, gene_id, REF)

    # Regularize C: symmetrize, then floor the eigenvalues at epsilon so
    # that the Cholesky factorization below is well defined.
    A = (C + C.T) / 2
    evalu, evec = np.linalg.eigh(A)
    evalu[evalu < epsilon] = epsilon

    # Decompose
    C = evec.dot(np.diag(evalu)).dot(evec.T)
    L = np.linalg.cholesky(C)

    # Squared Frobenius norm of the Cholesky factor (equals trace of the
    # regularized C); handed to the hpstats normal CDF helpers as their
    # third argument.
    F = np.linalg.norm(L)**2

    # Get and calc stats: signed z-score per SNP (sign of beta times the
    # square root of the 1-df chi-squared value of the GWAS entry).
    z = np.array([
        np.sign(self._GWAS_beta[x]) * np.sqrt(
            tools.chiSquared1dfInverseCumulativeProbabilityUpperTail(
                self._GWAS[x])) for x in RID
    ])
    z_sum = np.sum(z)

    return (hpstats.onemin_norm_cdf_100d(z_sum, 0, F),
            hpstats.norm_cdf_100d(z_sum, 0, F))
def _calc_pw_enrichment(self, pw, TISSUES):
    """
    Calculates a chi-squared based enrichment value of a gene set per tissue

    For every tissue, each gene symbol of pw is mapped via self._GENESYMB
    and searched in TISSUES[tissue][0]. For the first match, the value at
    the same position of TISSUES[tissue][2] is converted with
    tools.chiSquared1dfInverseCumulativeProbabilityUpperTail. The converted
    values are summed and evaluated with hpstats.onemin_chi2_cdf, using the
    number of matched genes as degrees of freedom.

    Args:

        pw(list): Gene symbols of the pathway
        TISSUES(dict): Per tissue an indexable whose element 0 holds gene ids and element 2 the corresponding values (presumably p-values)

    Returns:
        tuple: (RET, FAILS, RAW) with

        - RET(dict): tissue -> enrichment value (NaN if no gene matched)
        - FAILS(list): Gene symbols that could not be matched in at least one tissue (each listed once, in order of first failure)
        - RAW(dict): tissue -> list of the raw values per gene of pw (NaN for unmatched genes)
    """
    RET = {}
    FAILS = []
    failed = set()  # O(1) duplicate check for FAILS
    RAW = {}

    # Calc pathway enrichment
    for tissue in TISSUES:
        raw = RAW.setdefault(tissue, [])

        # Unmatched genes keep a zero entry and do not count towards df
        A = np.zeros(len(pw))
        df = 0

        for i, symbol in enumerate(pw):
            pos = ()
            if symbol in self._GENESYMB:
                eid = self._GENESYMB[symbol]
                pos = np.where(TISSUES[tissue][0] == eid)[0]

            if len(pos) > 0:
                value = TISSUES[tissue][2][pos[0]]
                A[i] = tools.chiSquared1dfInverseCumulativeProbabilityUpperTail(
                    value)
                df = df + 1
                raw.append(value)
            else:
                if symbol not in failed:
                    failed.add(symbol)
                    FAILS.append(symbol)
                # np.nan instead of np.NaN: the latter alias no longer
                # exists in NumPy >= 2.0.
                raw.append(np.nan)

        if df > 0:
            RET[tissue] = hpstats.onemin_chi2_cdf(np.sum(A), dof=df)
        else:
            RET[tissue] = np.nan

    return RET, FAILS, RAW
def score(self,
          modules,
          samples=100000,
          method='auto',
          mode='',
          reqacc=1e-100,
          threshold=False,
          parallel=1,
          nobar=False):
    """
    Scores a set of pathways/modules

    The module statistic is the sum of the 1-df chi-squared values of its
    gene scores. Its p-value is estimated by drawing random gene sets of the
    same size from all scored genes and counting how often the random sum
    exceeds the observed one.

    Args:

        modules(list): List of modules to score
        samples(int): # of random gene sets to draw
        method(string): Method to use to evaluate tail probability ('auto','davies','ruben','satterthwaite')
        mode(string): Precision mode to use ('','128b','100d')
        reqacc(float): requested accuracy
        threshold(bool): Threshold p-value to reqacc
        parallel(int): Passed on to the gene fusion step
        nobar(bool): Show progress bar

    Returns:
        list: [RESULT, FAILS, META_DIC] with

        - RESULT(list): Per fusion-set entry F a list [F[0], F[1], gene p-values (NaN where no gene score is available), module p-value]. The module p-value is (1 + #exceedances) / (1 + samples) for more than one scored gene, the gene score itself for exactly one scored gene, and NaN otherwise.
        - FAILS: Second element of the fusion result
        - META_DIC(dict): Meta-gene name -> value as returned by the fusion step
    """
    # Compute fusion sets
    if self._fuse:
        COMPUTE_SET, FUSION_SET, R = self._genefusion(modules,
                                                      method=method,
                                                      mode=mode,
                                                      reqacc=reqacc,
                                                      threshold=threshold,
                                                      parallel=parallel,
                                                      nobar=nobar)
    else:
        COMPUTE_SET, FUSION_SET, R = self._nogenefusion(modules)

    # Build dictionary
    META_DIC = {m[0]: m[1] for m in R[0] if m[0][:9] == 'METAGENE:'}

    RESULT = []
    FAILS = R[1]

    # Compute chi2 values for all genes
    scores = self._genescorer._SCORES
    GENES = {
        g: tools.chiSquared1dfInverseCumulativeProbabilityUpperTail(scores[g])
        for g in scores
    }
    background = list(GENES)

    for F in FUSION_SET:
        members = F[1]

        # Calc chi2
        chi = np.zeros(len(members))
        gpval = np.zeros(len(members))
        fail = 0
        for i, g in enumerate(members):
            if g in GENES:
                chi[i] = GENES[g]
                gpval[i] = scores[g]
            else:
                fail = fail + 1
                # np.nan instead of np.NaN: the latter alias no longer
                # exists in NumPy >= 2.0.
                gpval[i] = np.nan

        L = len(chi) - fail
        S = np.sum(chi)

        if L > 1:
            # Sample background: random gene sets of the same size
            counter = 0
            for _ in range(samples):
                drawn = random.sample(background, L)
                b_score = np.array([GENES[g] for g in drawn], dtype=float)
                if np.sum(b_score) > S:
                    counter += 1
            p = (1 + counter) / (1 + samples)
        elif L == 1:
            # A single scored gene: its own score is the module p-value
            p = next((scores[g] for g in members if g in scores), np.nan)
        else:
            p = np.nan

        # Always report the gene p-values in slot 2. The original put the
        # chi2 array there when fewer than two genes were scored, so the
        # meaning of the slot depended on the module size.
        RESULT.append([F[0], F[1], gpval, p])

    return [RESULT, FAILS, META_DIC]
def score(self,
          modules,
          method='auto',
          mode='',
          reqacc=1e-100,
          threshold=False,
          parallel=1,
          nobar=False):
    """
    Scores a set of pathways/modules

    Gene scores are replaced by their normalized rank among all scored
    genes, converted to 1-df chi-squared values and summed per module. The
    sum is evaluated with hpstats.onemin_chi2_cdf, using the number of
    ranked module genes as degrees of freedom.

    Args:

        modules(list): List of modules to score
        method(string): Method to use to evaluate tail probability ('auto','davies','ruben','satterthwaite')
        mode(string): Precision mode to use ('','128b','100d')
        reqacc(float): requested accuracy
        threshold(bool): Threshold p-value to reqacc
        parallel(int): Passed on to the gene fusion step
        nobar(bool): Show progress bar

    Returns:
        list: [RESULT, FAILS, META_DIC] with

        - RESULT(list): Per fusion-set entry F a list [F[0], F[1], normalized gene ranks (NaN where the gene is not ranked), module p-value (NaN if no gene is ranked)]
        - FAILS: Second element of the fusion result
        - META_DIC(dict): Meta-gene name -> value as returned by the fusion step
    """

    def _rank(background):
        # Map every gene to (rank / (#genes + 1)); +1: ranking starts at 1
        names = list(background)
        order = np.argsort(list(background.values()))
        denom = len(names) + 1.
        return {names[j]: (r + 1.) / denom for r, j in enumerate(order)}

    # Compute fusion sets
    if self._fuse:
        COMPUTE_SET, FUSION_SET, R = self._genefusion(modules,
                                                      method=method,
                                                      mode=mode,
                                                      reqacc=reqacc,
                                                      threshold=threshold,
                                                      parallel=parallel,
                                                      nobar=nobar)
    else:
        COMPUTE_SET, FUSION_SET, R = self._nogenefusion(modules)

    # Build dictionary
    META_DIC = {m[0]: m[1] for m in R[0] if m[0][:9] == 'METAGENE:'}

    scores = self._genescorer._SCORES

    # Remove from ._SCORES to have almost same baseline for all modules
    for C in COMPUTE_SET:
        if C[:9] == 'METAGENE:' and C in scores:
            del scores[C]

    RESULT = []
    FAILS = R[1]

    # Ranking of the plain background; shared by all modules that do not
    # contain a meta gene (scores is not modified inside the loop).
    base_ranks = None

    # Score modules
    for F in FUSION_SET:
        members = F[1]

        # Note: Ranking inside the F loop because META-GENES have to be
        # added on case by case basis
        metagenes = [
            g for g in members if g[:9] == 'METAGENE:' and g in META_DIC
        ]

        if metagenes:
            # dict keeps insertion order, so the background order equals
            # the order of the original list based construction.
            background = dict(scores)
            for mg in metagenes:
                # Add meta gene
                background[mg] = META_DIC[mg]

                # Remove metagene member genes from background gene list
                for g in mg[9:].split("_"):
                    if g in scores:
                        background.pop(g, None)

            RANKS = _rank(background)
        else:
            if base_ranks is None:
                base_ranks = _rank(scores)
            RANKS = base_ranks

        # Calc chi2
        chi = np.zeros(len(members))
        gpval = np.zeros(len(members))
        fail = 0
        for i, g in enumerate(members):
            if g in RANKS:
                chi[i] = tools.chiSquared1dfInverseCumulativeProbabilityUpperTail(
                    RANKS[g])
                gpval[i] = RANKS[g]
            else:
                fail = fail + 1
                # np.nan instead of np.NaN: the latter alias no longer
                # exists in NumPy >= 2.0.
                gpval[i] = np.nan

        # Calc p-value
        df = len(chi) - fail
        if df > 0:
            p = hpstats.onemin_chi2_cdf(np.sum(chi), dof=df)
        else:
            p = np.nan

        RESULT.append([F[0], F[1], gpval, p])

    # Cleanup
    for G in COMPUTE_SET:
        if G in scores:
            del scores[G]

    # Return
    return [RESULT, FAILS, META_DIC]