示例#1
0
 def _getChi2Sum(self, RIDs):
     """
     Sums the 1-df chi2 statistics of a set of GWAS entries

     Args:

         RIDs(list): Keys into self._GWAS (presumably SNP ids mapped to
             p-values -- confirm against the loader)

     Returns:
         Sum over all RIDs of the upper-tail inverse chi2 (1 df) of the
         value stored in self._GWAS. An empty RIDs gives 0.0.
     """
     chi2_values = np.zeros(len(RIDs))

     for idx, rid in enumerate(RIDs):
         chi2_values[
             idx] = tools.chiSquared1dfInverseCumulativeProbabilityUpperTail(
                 self._GWAS[rid])

     return np.sum(chi2_values)
示例#2
0
    def test_gene_assocdir(self, gene, epsilon=1e-8):
        """
        Tests for directional association of the gene
        (Requires betas of GWAS)
        
        Args:
        
            gene(str): Gene symbol to test
            epsilon(float): Regularization parameter 
        """
        if gene not in self._GENESYMB:
            print(G, "not in annotation!")
            return

        # Get background
        D = self._GENEID[self._GENESYMB[gene]]

        REF = {}
        REF[D[0]] = self._ref.load_pos_reference(D[0])

        if len(self._GWAS_alleles) == 0:
            C, RID = self._calcGeneSNPcorr(D[0], self._GENESYMB[gene], REF)
        else:
            C, RID = self._calcGeneSNPcorr_wAlleles(D[0], self._GENESYMB[gene],
                                                    REF)

        # Regularize C
        A = (C + C.T) / 2
        evalu, evec = np.linalg.eigh(A)
        evalu[evalu < epsilon] = epsilon

        # Decompose
        C = evec.dot(np.diag(evalu)).dot(evec.T)

        L = np.linalg.cholesky(C)

        F = np.linalg.norm(L)**2

        # Get and calc stats
        z = np.array([
            np.sign(self._GWAS_beta[x]) * np.sqrt(
                tools.chiSquared1dfInverseCumulativeProbabilityUpperTail(
                    self._GWAS[x])) for x in RID
        ])
        #print("Right tail:",hpstats.onemin_norm_cdf_100d(np.sum(z),0,F))
        #print("Left tail :",hpstats.norm_cdf_100d(np.sum(z),0,F))

        return hpstats.onemin_norm_cdf_100d(np.sum(z), 0,
                                            F), hpstats.norm_cdf_100d(
                                                np.sum(z), 0, F)
示例#3
0
    def _calc_pw_enrichment(self, pw, TISSUES):
        """
        Calculates the enrichment of a pathway/gene set for each tissue

        Args:

            pw(list): Gene symbols of the pathway
            TISSUES(dict): Per tissue data. For each tissue, element 0 is
                compared against the gene id (self._GENESYMB[symbol]) to
                locate the gene and element 2 is indexed at the matching
                position to obtain the value that is chi2 transformed
                (presumably a p-value -- confirm against the caller)

        Returns:
            tuple: (RET, FAILS, RAW)

            RET(dict): tissue -> hpstats.onemin_chi2_cdf of the summed chi2
                values with one degree of freedom per found gene, or NaN if
                no gene of the pathway was found for the tissue
            FAILS(list): Genes that could not be resolved in at least one
                tissue (each listed once)
            RAW(dict): tissue -> list aligned with pw holding the raw value
                of each gene, NaN for genes not found
        """
        RET = {}
        FAILS = []
        RAW = {}

        # Calc pathway enrichment
        for tissue in TISSUES:
            data = TISSUES[tissue]
            raw = RAW.setdefault(tissue, [])

            A = np.zeros(len(pw))
            df = 0

            for i, gene in enumerate(pw):
                hits = ()

                if gene in self._GENESYMB:
                    eid = self._GENESYMB[gene]
                    hits = np.where(data[0] == eid)[0]

                if len(hits) > 0:
                    value = data[2][hits[0]]
                    A[i] = tools.chiSquared1dfInverseCumulativeProbabilityUpperTail(
                        value)
                    df = df + 1
                    raw.append(value)
                else:
                    if gene not in FAILS:
                        FAILS.append(gene)

                    # Always keep a placeholder, so that RAW[tissue] stays
                    # aligned with pw for every tissue (not only for the
                    # first tissue in which the gene failed)
                    raw.append(np.nan)

            if df > 0:
                RET[tissue] = hpstats.onemin_chi2_cdf(np.sum(A), dof=df)
            else:
                RET[tissue] = np.nan

        return RET, FAILS, RAW
示例#4
0
    def score(self,
              modules,
              samples=100000,
              method='auto',
              mode='',
              reqacc=1e-100,
              threshold=False,
              parallel=1,
              nobar=False):
        """
        Scores a set of pathways/modules by comparing the summed gene chi2
        values of each module against randomly drawn gene sets of equal size
        
        Args:
        
            modules(list): List of modules to score
            samples(int): # of random gene sets to draw
            method(string): Method to use to evaluate tail probability ('auto','davies','ruben','satterthwaite')
            mode(string): Precision mode to use ('','128b','100d')
            reqacc(float): requested accuracy 
            threshold(bool): Threshold p-value to reqacc
            parallel(int): Passed on to the gene fusion step
            nobar(bool): Show progress bar

        Note:
            method, mode, reqacc, threshold, parallel and nobar are only
            forwarded to the gene fusion step and are not used if gene
            fusion is disabled.

        Returns:
            list: [RESULT, FAILS, META_DIC]

            RESULT(list): One entry [name, genes, values, p] per module.
                For modules with more than one scored gene, values holds
                the gene scores (NaN for genes without score) and p is
                (1 + #draws with larger sum) / (1 + samples). For modules
                with exactly one scored gene p is the score of that gene,
                and with no scored gene p is NaN; in both cases values
                holds the chi2 array.
            FAILS: Second element of the fusion step result
            META_DIC(dict): Meta-gene name -> value reported by the fusion step
        """
        # Compute fusion sets
        if self._fuse:
            COMPUTE_SET, FUSION_SET, R = self._genefusion(modules,
                                                          method=method,
                                                          mode=mode,
                                                          reqacc=reqacc,
                                                          threshold=threshold,
                                                          parallel=parallel,
                                                          nobar=nobar)
        else:
            COMPUTE_SET, FUSION_SET, R = self._nogenefusion(modules)

        # Build dictionary
        META_DIC = {m[0]: m[1] for m in R[0] if m[0][:9] == 'METAGENE:'}

        RESULT = []
        FAILS = R[1]

        SCORES = self._genescorer._SCORES

        # Compute chi2 values for all genes
        GENES = {
            g: tools.chiSquared1dfInverseCumulativeProbabilityUpperTail(
                SCORES[g])
            for g in SCORES
        }

        # Population the random gene sets are drawn from
        background = list(GENES)

        for F in FUSION_SET:
            members = F[1]

            # Calc chi2
            chi = np.zeros(len(members))
            gpval = np.zeros(len(members))

            fail = 0

            for i, gene in enumerate(members):
                if gene in GENES:
                    chi[i] = GENES[gene]
                    gpval[i] = SCORES[gene]
                else:
                    fail = fail + 1
                    gpval[i] = np.nan

            # Number of genes of the module with a score
            L = len(chi) - fail

            # Pathway score
            S = np.sum(chi)

            if L > 1:
                counter = 0.

                # Sample background
                for _ in range(samples):
                    # Draw a random gene set of size L
                    drawn = random.sample(background, L)

                    b_score = np.fromiter((GENES[g] for g in drawn),
                                          dtype=float,
                                          count=L)

                    if np.sum(b_score) > S:
                        counter += 1.

                # +1 in numerator and denominator: the empirical p-value
                # can never be exactly zero
                RESULT.append(
                    [F[0], members, gpval, (1 + counter) / (1 + samples)])

            elif L == 1:
                # NOTE(review): this and the next branch return the chi2
                # array in the third slot while the branch above returns
                # gpval -- kept as is for backward compatibility, confirm
                # whether this is intended
                for gene in members:
                    if gene in SCORES:
                        RESULT.append([F[0], members, chi, SCORES[gene]])
                        break
            else:
                RESULT.append([F[0], members, chi, np.nan])

        return [RESULT, FAILS, META_DIC]
示例#5
0
    def score(self,
              modules,
              method='auto',
              mode='',
              reqacc=1e-100,
              threshold=False,
              parallel=1,
              nobar=False):
        """
        Scores a set of pathways/modules based on the ranks of the gene
        scores (ranks are chi2 transformed and summed per module)
        
        Args:
        
            modules(list): List of modules to score
            method(string): Method to use to evaluate tail probability ('auto','davies','ruben','satterthwaite')
            mode(string): Precision mode to use ('','128b','100d')
            reqacc(float): requested accuracy 
            threshold(bool): Threshold p-value to reqacc
            parallel(int): Passed on to the gene fusion step
            nobar(bool): Show progress bar

        Note:
            method, mode, reqacc, threshold, parallel and nobar are only
            forwarded to the gene fusion step and are not used if gene
            fusion is disabled. All genes of the compute set returned by
            the fusion step are removed from the gene scorer's scores
            before returning.

        Returns:
            list: [RESULT, FAILS, META_DIC]

            RESULT(list): One entry [name, genes, ranks, p] per module,
                with ranks the normalized rank of each gene (NaN for genes
                without score) and p the value of hpstats.onemin_chi2_cdf
                for the summed chi2 values (NaN if no gene has a score)
            FAILS: Second element of the fusion step result
            META_DIC(dict): Meta-gene name -> value reported by the fusion step
        """

        def rank_genes(pvalues):
            # Map each gene to (rank / (N + 1)), ranks starting at 1, so
            # that all values lie strictly between 0 and 1
            names = list(pvalues)
            order = np.argsort([pvalues[g] for g in names])
            norm = len(names) + 1.
            return {names[j]: (r + 1.) / norm for r, j in enumerate(order)}

        # Compute fusion sets
        if self._fuse:
            COMPUTE_SET, FUSION_SET, R = self._genefusion(modules,
                                                          method=method,
                                                          mode=mode,
                                                          reqacc=reqacc,
                                                          threshold=threshold,
                                                          parallel=parallel,
                                                          nobar=nobar)
        else:
            COMPUTE_SET, FUSION_SET, R = self._nogenefusion(modules)

        # Build dictionary
        META_DIC = {m[0]: m[1] for m in R[0] if m[0][:9] == 'METAGENE:'}

        SCORES = self._genescorer._SCORES

        # Remove from ._SCORES to have almost same baseline for all modules
        for C in COMPUTE_SET:
            if C[:9] == 'METAGENE:' and C in SCORES:
                del SCORES[C]

        RESULT = []
        FAILS = R[1]

        # Ranking without meta-genes is the same for all modules: computed
        # on first use and shared
        base_ranks = None

        # Score modules
        for F in FUSION_SET:
            members = F[1]

            # Note: Meta-genes have to be added on a case by case basis,
            # hence modules containing meta-genes get their own ranking
            metagenes = [
                g for g in members if g[:9] == 'METAGENE:' and g in META_DIC
            ]

            if metagenes:
                background = dict(SCORES)

                for mg in metagenes:
                    # Add meta gene
                    background[mg] = META_DIC[mg]

                    # Remove metagene member genes from background gene list
                    # (pop with default: a member may already have been
                    # removed by an overlapping or repeated meta-gene)
                    for g in mg[9:].split("_"):
                        if g in SCORES:
                            background.pop(g, None)

                RANKS = rank_genes(background)
            else:
                if base_ranks is None:
                    base_ranks = rank_genes(SCORES)

                RANKS = base_ranks

            # Calc chi2
            chi = np.zeros(len(members))
            gpval = np.zeros(len(members))
            fail = 0

            for i, gene in enumerate(members):
                if gene in RANKS:
                    chi[i] = tools.chiSquared1dfInverseCumulativeProbabilityUpperTail(
                        RANKS[gene])
                    gpval[i] = RANKS[gene]
                else:
                    fail = fail + 1
                    gpval[i] = np.nan

            # Calc p-value (one degree of freedom per ranked gene)
            df = len(chi) - fail

            if df > 0:
                p = hpstats.onemin_chi2_cdf(np.sum(chi), dof=df)
            else:
                p = np.nan

            RESULT.append([F[0], members, gpval, p])

        # Cleanup
        for G in COMPUTE_SET:
            if G in SCORES:
                del SCORES[G]

        # Return
        return [RESULT, FAILS, META_DIC]