Exemplo n.º 1
0
def countWordsReport(seqs, WordWidth=8, PeakWidth=100, PeakMargin=100):
    """ Produce a report of enriched words of specified length.
        The report (at most the 100 words with the highest log-ratio) is printed
        to standard output; nothing is returned.
        seqs: DNA sequence data
        WordWidth: length of sought words
        PeakWidth: width of window around centre of sequence
        PeakMargin: the width of the margin on each side of the centre window
        (which delineates the positives around peak from negatives away from peak).
        A word is counted at most once per sequence in each of the two regions. """
    pos = RCDict()  # reverse complement-aware dictionary for DNA
    neg = RCDict()  # reverse complement-aware dictionary for DNA
    half_peak = PeakWidth // 2  # floor division: slice bounds must be integers
    for seq in seqs:
        centre = len(seq) // 2  # find peak
        # Clamp at 0: a negative bound would be read as "from the end" by the
        # slice and silently select the wrong region for short sequences.
        peak_start = max(0, centre - half_peak)
        peak_end = centre + half_peak
        # Construct all words around peak (positives) and count their presence
        for word in set(slidewin(seq[peak_start:peak_end], WordWidth)):
            # EAFP is kept on purpose: RCDict is only known to support item
            # access here, so dict helpers such as get() are not assumed.
            try:
                pos[word] += 1
            except KeyError:
                pos[word] = 1
        # Construct all words away from peak (negatives) and count
        left_end = max(0, centre - half_peak - PeakMargin)
        words = set(slidewin(seq[:left_end], WordWidth))
        # update() adds the right flank in place; set.union() returns a new set
        # and would leave 'words' unchanged.
        words.update(slidewin(seq[peak_end + PeakMargin:], WordWidth))
        for word in words:
            try:
                neg[word] += 1
            except KeyError:
                neg[word] = 1

    logratio = RCDict()  # DNA dictionary for storing the log-ratio between pos and neg
    for (word, cnt_pos) in pos.items():
        try:
            cnt_neg = neg[word]
        except KeyError:
            cnt_neg = 0.0001  # stand-in count so that the ratio stays finite
        logratio[word] = math.log(float(cnt_pos) / float(cnt_neg))

    allpos = list(logratio.items())  # extract all pairs of words:log-ratio
    sortpos = sorted(allpos, key=lambda v: v[1], reverse=True)  # sort them
    print("Enriched words (sorted by ln pos/neg)")
    print("Word    \tln pos/neg\tE-value")
    for (word, lgr) in sortpos[0:100]:  # Look at the top-entries according to log-ratio, compute e-values
        cnt_pos = int(pos[word])
        try:
            cnt_neg = int(neg[word])
        except KeyError:
            cnt_neg = 0
        # Compute p-value using Fisher's Exact test.
        # NOTE: len(seq) is the length of the last sequence iterated above, so
        # the negative total is only exact when all sequences share that length.
        pval = stats.getFETpval(
            cnt_pos,
            cnt_neg,
            len(seqs) * (PeakWidth - WordWidth + 1) - cnt_pos,
            len(seqs) * (len(seq) - (PeakMargin * 2 + PeakWidth) - (WordWidth - 1) * 2) - cnt_neg,
            False,
        )
        # Correct for multiple testing (very conservatively)
        evalue = pval * len(allpos)
        print("%s\t%6.3f  \t%e" % (word, lgr, evalue))
Exemplo n.º 2
0
def getGOReport(positives, background=None, database='UniProtKB'):
    """ Generate a complete GO term report for a set of genes (positives).
        Each GO term is also assigned an enrichment p-value (on basis of background, if provided).
        Returns a list of tuples (GO_Term_ID[str], Foreground_no[int], Term_description[str]) with no background
        (sorted by decreasing Foreground_no), OR
        (GO_Term_ID[str], E-value[float], Foreground_no[int], Background_no[int], Term_description[str])
        (sorted by increasing E-value).
        E-value is a Bonferroni-corrected p-value (the p-value multiplied by the number of distinct foreground terms).
        Background_no is the number of hits in foreground and background combined.
        positives: gene identifiers making up the foreground
        background: gene identifiers to compare against, or None; positives are removed from it before counting
        database: passed through to getGOTerms
        """
    from collections import Counter

    pos = set(positives)
    fg_map = getGOTerms(pos, database)
    # One counting pass instead of list.count() per term (which rescans the
    # whole list for every term).
    fg_counts = Counter(t for gene in fg_map for t in fg_map[gene])

    if background is None:
        ranked = sorted(fg_counts.items(), key=lambda v: v[1], reverse=True)
        return [(term, cnt, getGODef(term)['name']) for (term, cnt) in ranked]

    # a background is provided
    neg = set(background).difference(pos)
    bg_map = getGOTerms(neg, database)
    bg_counts = Counter(t for gene in bg_map for t in bg_map[gene])

    nPos = len(pos)
    nNeg = len(neg)
    nTests = len(fg_counts)  # only terms seen in the foreground are tested
    scored = []
    for (term, fg_hit) in fg_counts.items():
        bg_hit = bg_counts[term]  # Counter yields 0 for terms absent from the background
        pval = stats.getFETpval(fg_hit, bg_hit, nPos - fg_hit, nNeg - bg_hit, False)
        scored.append((term, fg_hit, fg_hit + bg_hit, pval))
    scored.sort(key=lambda rec: rec[3])

    ret = []
    for (term, fg_hit, total_hit, pval) in scored:
        defin = getGODef(term)
        # total_hit already includes the foreground hits; adding fg_hit to it
        # again would report 2*fg_hit + bg_hit.
        ret.append((term, pval * nTests, fg_hit, total_hit, defin['name']))
    return ret
Exemplo n.º 3
0
    def getEnrichmentReport(self, positives, background = None, evid = None, threshold = None, include_more_general = True):
        """ For a set of named gene products (positives) this method determines the enrichment of GO terms.
            Each GO term is also assigned an enrichment p-value (on basis of provided background, or on basis of all annotated genes, if not provided).
            Note that using the full set as background can be computationally expensive; this method itself does no caching.
            Returns a list of tuples (GO_Term_ID[str], E-value[float], Foreground_no[int], Background_no[int], defin[2], defin[0])
            sorted by increasing E-value, where defin is the value returned by self.getTermdef for the term
            (presumably defin[2] is the term description and defin[0] its ontology -- confirm against getTermdef).
            Background_no is the number of hits in foreground and background combined.
            E-value is a Bonferroni-corrected p-value (the p-value multiplied by the number of distinct foreground terms).
            Terms for which getTermdef returns None are reported on standard output and left out of the result.
            positives: names of gene products
            background: names of gene products (or None if all annotated gene products should be used; default)
            threshold: E-value that must be reached for term to be reported (default is 0.05)
            If evid(ence) is specified the method returns only entries with that specific evidence code (see header of file for codes).
            include_more_general: if True, include also more general GO terms annotated to gene products (default is True)
            """
        from collections import Counter

        if threshold is None:
            threshold = 0.05

        # Process foreground: find terms of genes
        fg_map = self.getTerms4Genes(positives, evid = evid, include_more_general = include_more_general)
        # One counting pass replaces list.count() per term, which rescans the full list every time
        fg_counts = Counter(t for fg_gene in fg_map for t in fg_map[fg_gene])
        # Hits are counted per key of fg_map, so the total must count each name once as well
        pos = set(positives)
        nPos = len(pos)

        # Process background: find terms of genes
        if background is None: # need to use the full set
            background = self.annots.keys()
        negatives = set(background).difference(pos) # remove the positives from the background to create genuine negatives
        nNeg = len(negatives)
        bg_map = self.getTerms4Genes(negatives, evid = evid, include_more_general = include_more_general)
        bg_counts = Counter(t for bg_gene in bg_map for t in bg_map[bg_gene])

        nTests = len(fg_counts) # only terms seen in the foreground are tested
        hits = []
        for (term, fg_hit) in fg_counts.items():
            bg_hit = bg_counts[term]  # Counter yields 0 for terms absent from the background
            fg_nohit = nPos - fg_hit  # total number of genes in foreground minus that number of hits
            bg_nohit = nNeg - bg_hit  # total number of genes in background minus that number of hits
            pvalue = stats.getFETpval(fg_hit, bg_hit, fg_nohit, bg_nohit, False) # one-tailed FET
            evalue = pvalue * nTests # Bonferroni correction
            if evalue <= threshold: # check if significance req is fulfilled
                hits.append((term, evalue, fg_hit, fg_hit + bg_hit))
        hits.sort(key=lambda rec: rec[1])

        ret = []
        for (term, evalue, fg_hit, total_hit) in hits:
            defin = self.getTermdef(term)
            if defin is None:
                print('Could not find definition of %s' % term)
            else:
                ret.append((term, evalue, fg_hit, total_hit, defin[2], defin[0]))
        return ret
Exemplo n.º 4
0
    def getEnrichmentReport(self, positives, background = None, evid = None, threshold = None, include_more_general = True):
        """ For a set of named gene products (positives) this method determines the enrichment of GO terms.
            Each GO term is also assigned an enrichment p-value (on basis of provided background, or on basis of all annotated genes, if not provided).
            Note that using the full set as background can be computationally expensive; this method itself does no caching.
            Returns a list of tuples (GO_Term_ID[str], E-value[float], Foreground_no[int], Background_no[int], defin[2], defin[0])
            sorted by increasing E-value, where defin is the value returned by self.getTermdef for the term
            (presumably defin[2] is the term description and defin[0] its ontology -- confirm against getTermdef).
            Background_no is the number of hits in foreground and background combined.
            E-value is a Bonferroni-corrected p-value (the p-value multiplied by the number of distinct foreground terms).
            Terms for which getTermdef returns None are reported on standard output and left out of the result.
            positives: names of gene products
            background: names of gene products (or None if all annotated gene products should be used; default)
            threshold: E-value that must be reached for term to be reported (default is 0.05)
            If evid(ence) is specified the method returns only entries with that specific evidence code (see header of file for codes).
            include_more_general: if True, include also more general GO terms annotated to gene products (default is True)
            """
        from collections import Counter

        if threshold is None:
            threshold = 0.05

        # Process foreground: find terms of genes
        fg_map = self.getTerms4Genes(positives, evid = evid, include_more_general = include_more_general)
        # One counting pass replaces list.count() per term, which rescans the full list every time
        fg_counts = Counter(t for fg_gene in fg_map for t in fg_map[fg_gene])
        # Hits are counted per key of fg_map, so the total must count each name once as well
        pos = set(positives)
        nPos = len(pos)

        # Process background: find terms of genes
        if background is None: # need to use the full set
            background = self.annots.keys()
        negatives = set(background).difference(pos) # remove the positives from the background to create genuine negatives
        nNeg = len(negatives)
        bg_map = self.getTerms4Genes(negatives, evid = evid, include_more_general = include_more_general)
        bg_counts = Counter(t for bg_gene in bg_map for t in bg_map[bg_gene])

        nTests = len(fg_counts) # only terms seen in the foreground are tested
        hits = []
        for (term, fg_hit) in fg_counts.items():
            bg_hit = bg_counts[term]  # Counter yields 0 for terms absent from the background
            fg_nohit = nPos - fg_hit  # total number of genes in foreground minus that number of hits
            bg_nohit = nNeg - bg_hit  # total number of genes in background minus that number of hits
            pvalue = stats.getFETpval(fg_hit, bg_hit, fg_nohit, bg_nohit, False) # one-tailed FET
            evalue = pvalue * nTests # Bonferroni correction
            if evalue <= threshold: # check if significance req is fulfilled
                hits.append((term, evalue, fg_hit, fg_hit + bg_hit))
        hits.sort(key=lambda rec: rec[1])

        ret = []
        for (term, evalue, fg_hit, total_hit) in hits:
            defin = self.getTermdef(term)
            if defin is None:
                # Single-argument print(...) behaves the same on Python 2 and 3
                print('Could not find definition of %s' % term)
            else:
                ret.append((term, evalue, fg_hit, total_hit, defin[2], defin[0]))
        return ret
Exemplo n.º 5
0
    def getGOReport(self, positives, background = None, taxa = None, include_more_general = True):
        """ Generate a complete GO term report for a set of genes (positives).
            Each GO term is also assigned an enrichment p-value (on basis of background, if provided).
            Returns a list of tuples (GO_Term_ID[str], Foreground_no[int], defin[2], defin[0]) with no background
            (sorted by decreasing Foreground_no), OR
            (GO_Term_ID[str], E-value[float], Foreground_no[int], Background_no[int], defin[2], defin[0])
            (sorted by increasing E-value), where defin is the value returned by self.getTermdef for the term
            (presumably defin[2] is the term description and defin[0] its ontology -- confirm against getTermdef).
            E-value is a Bonferroni-corrected p-value (the p-value multiplied by the number of distinct foreground terms).
            Background_no is the number of hits in foreground and background combined.
            Terms for which getTermdef returns None are reported on standard output and left out of the result.
            positives: gene identifiers making up the foreground
            background: gene identifiers to compare against, or None; positives are removed from it before counting
            taxa: not used by this method
            include_more_general: passed through to self.getTerms
            """
        from collections import Counter

        pos = set(positives)
        fg_map = self.getTerms(pos, include_more_general = include_more_general)
        # One counting pass replaces list.count() per term, which rescans the full list every time
        fg_counts = Counter(t for gene in fg_map for t in fg_map[gene])

        if background is None:
            ranked = sorted(fg_counts.items(), key=lambda v: v[1], reverse=True)
            ret = []
            for (term, cnt) in ranked:
                defin = self.getTermdef(term)
                if defin is None:
                    print('Could not find definition of %s' % term)
                else:
                    ret.append((term, cnt, defin[2], defin[0]))
            return ret

        # a background is provided
        neg = set(background).difference(pos)
        bg_map = self.getTerms(neg, include_more_general = include_more_general)
        bg_counts = Counter(t for gene in bg_map for t in bg_map[gene])

        nPos = len(pos)
        nNeg = len(neg)
        nTests = len(fg_counts) # only terms seen in the foreground are tested
        scored = []
        for (term, fg_hit) in fg_counts.items():
            bg_hit = bg_counts[term]  # Counter yields 0 for terms absent from the background
            pval = stats.getFETpval(fg_hit, bg_hit, nPos - fg_hit, nNeg - bg_hit, False)
            scored.append((term, fg_hit, fg_hit + bg_hit, pval))
        scored.sort(key=lambda rec: rec[3])

        ret = []
        for (term, fg_hit, total_hit, pval) in scored:
            defin = self.getTermdef(term)
            if defin is None:
                # Single-argument print(...) behaves the same on Python 2 and 3
                print('Could not find definition of %s' % term)
            else:
                # total_hit already includes the foreground hits; adding fg_hit
                # to it again would report 2*fg_hit + bg_hit.
                ret.append((term, pval * nTests, fg_hit, total_hit, defin[2], defin[0]))
        return ret
Exemplo n.º 6
0
def countWordsReport(seqs, WordWidth = 8, PeakWidth = 100, PeakMargin = 100):
    """ Produce a report of enriched words of specified length.
        The report (at most the 100 words with the highest log-ratio) is printed
        to standard output; nothing is returned.
        seqs: DNA sequence data
        WordWidth: length of sought words
        PeakWidth: width of window around centre of sequence
        PeakMargin: the width of the margin on each side of the centre window
        (which delineates the positives around peak from negatives away from peak).
        A word is counted at most once per sequence in each of the two regions. """
    pos = RCDict() # reverse complement-aware dictionary for DNA
    neg = RCDict() # reverse complement-aware dictionary for DNA
    # Floor division: under Python 3 '/' yields a float, which cannot be used
    # as a slice bound.
    half_peak = PeakWidth // 2
    for seq in seqs:
        centre = len(seq) // 2 # find peak
        # Clamp at 0: a negative bound would be read as "from the end" by the
        # slice and silently select the wrong region for short sequences.
        peak_start = max(0, centre - half_peak)
        peak_end = centre + half_peak
        # Construct all words around peak (positives) and count their presence
        for word in set(slidewin(seq[peak_start:peak_end], WordWidth)):
            # EAFP is kept on purpose: RCDict is only known to support item
            # access here, so dict helpers such as get() are not assumed.
            try:
                pos[word] += 1
            except KeyError:
                pos[word] = 1
        # Construct all words away from peak (negatives) and count
        left_end = max(0, centre - half_peak - PeakMargin)
        words = set(slidewin(seq[:left_end], WordWidth))
        # update() adds the right flank in place; set.union() returns a new set
        # and would leave 'words' unchanged.
        words.update(slidewin(seq[peak_end + PeakMargin:], WordWidth))
        for word in words:
            try:
                neg[word] += 1
            except KeyError:
                neg[word] = 1

    logratio = RCDict() # DNA dictionary for storing the log-ratio between pos and neg
    for (word, cnt_pos) in pos.items():
        try:
            cnt_neg = neg[word]
        except KeyError:
            cnt_neg = 0.0001 # stand-in count so that the ratio stays finite
        logratio[word] = math.log(float(cnt_pos) / float(cnt_neg))

    allpos = list(logratio.items()) # extract all pairs of words:log-ratio
    sortpos = sorted(allpos, key=lambda v: v[1], reverse=True) # sort them
    print("Enriched words (sorted by ln pos/neg)")
    print("Word    \tln pos/neg\tE-value")
    for (word, lgr) in sortpos[0:100]: # Look at the top-entries according to log-ratio, compute e-values
        cnt_pos = int(pos[word])
        try:
            cnt_neg = int(neg[word])
        except KeyError:
            cnt_neg = 0
        # Compute p-value using Fisher's Exact test.
        # NOTE: len(seq) is the length of the last sequence iterated above, so
        # the negative total is only exact when all sequences share that length.
        pval = stats.getFETpval(
            cnt_pos,
            cnt_neg,
            len(seqs) * (PeakWidth - WordWidth + 1) - cnt_pos,
            len(seqs) * (len(seq) - (PeakMargin * 2 + PeakWidth) - (WordWidth - 1) * 2) - cnt_neg,
            False,
        )
        # Correct for multiple testing (very conservatively)
        evalue = pval * len(allpos)
        print("%s\t%6.3f  \t%e" % (word, lgr, evalue))
Exemplo n.º 7
0
    def get_GO_term_overrepresentation(self, pos_entries, bg_entries = None, evalThreshold = None):
        """ Determine which GO terms occur more often among pos_entries than among the rest of the background.
            pos_entries: entries making up the positive set
            bg_entries: background entries; if None, the keys of self.index.gene2go are used.
                        Positives are removed from the background to form the negative set.
            evalThreshold: if given, only terms whose E-value does not exceed it are returned
            Returns a dict mapping each retained GO term (only terms found via the positives are tested)
            to its E-value: the p-value from stats.getFETpval multiplied by the number of distinct
            terms found via positives and negatives together.
            """
        from collections import Counter

        if bg_entries is None:
            bg_entries = self.index.gene2go.keys()
        bg_entries = set(bg_entries)
        pos_entries = set(pos_entries)
        neg_entries = bg_entries - pos_entries

        # Obtain GO terms for each element in our positive and negative sets
        pos_terms = [self.find_terms(e) for e in pos_entries]
        neg_terms = [self.find_terms(e) for e in neg_entries]

        # Count, per term, the number of entries annotated with it. A single pass
        # replaces rescanning every entry for every term; set() guarantees that a
        # term contributes at most once per entry.
        pos_counts = Counter(term for terms in pos_terms for term in set(terms))
        neg_counts = Counter(term for terms in neg_terms for term in set(terms))

        # The correction factor counts every distinct term, including those seen
        # only via the negatives, although only foreground terms are tested.
        nTerms = len(set(pos_counts).union(neg_counts))
        nPos = len(pos_terms)
        nNeg = len(neg_terms)

        # For each term, use Fisher's exact test to establish whether there is
        # a significant difference between occurrence of a term in each set
        allHits = {}
        for (seek, pos_in) in pos_counts.items():
            neg_in = neg_counts[seek]  # Counter yields 0 for terms absent from the negatives
            p_value = stats.getFETpval(pos_in, neg_in, nPos - pos_in, nNeg - neg_in, left=False)
            # Correct for multiple hypothesis testing
            e_value = p_value * nTerms
            if evalThreshold is None or evalThreshold >= e_value:
                allHits[seek] = e_value
        return allHits