def countWordsReport(seqs, WordWidth=8, PeakWidth=100, PeakMargin=100):
    """ Produce a report of enriched words of specified length.

    For every sequence, the words inside a window around the sequence centre
    are counted as positives, and the words in both flanks beyond the margin
    are counted as negatives. Each word is counted at most once per sequence
    and region. The (at most) 100 words with the highest ln(pos/neg) are
    printed to standard output together with an E-value.

    seqs: DNA sequence data
    WordWidth: length of sought words
    PeakWidth: width of window around centre of sequence
    PeakMargin: the width of the margin on each side of the centre window
        (which delineates the positives around peak from negatives away
        from peak).

    Returns nothing; the report is printed.
    """
    pos = RCDict()  # reverse complement-aware dictionary for DNA
    neg = RCDict()  # reverse complement-aware dictionary for DNA
    half = PeakWidth // 2  # floor division keeps slice indices integral
    seq_len = 0
    for seq in seqs:
        seq_len = len(seq)
        centre = seq_len // 2  # find peak
        # Clamp the left-hand bounds at 0: a negative slice index would wrap
        # around to the end of a sequence that is shorter than the window.
        pos_start = max(0, centre - half)
        left_end = max(0, centre - half - PeakMargin)
        right_start = centre + half + PeakMargin

        # Construct all words around peak (positives) and count their presence
        for word in set(slidewin(seq[pos_start:centre + half], WordWidth)):
            try:
                pos[word] += 1
            except KeyError:
                pos[word] = 1

        # Construct all words away from peak (negatives) and count.
        # update() adds the right flank in place; union() would return a new
        # set and leave the right flank uncounted.
        words = set(slidewin(seq[:left_end], WordWidth))
        words.update(slidewin(seq[right_start:], WordWidth))
        for word in words:
            try:
                neg[word] += 1
            except KeyError:
                neg[word] = 1

    logratio = RCDict()  # DNA dictionary for storing the log-ratio between pos and neg
    for (word, cnt_pos) in pos.items():
        try:
            cnt_neg = neg[word]
        except KeyError:
            cnt_neg = 0.0001  # pseudo-count for words never seen among negatives
        logratio[word] = math.log(float(cnt_pos) / float(cnt_neg))

    allpos = list(logratio.items())  # extract all pairs of words:log-ratio
    sortpos = sorted(allpos, key=lambda v: v[1], reverse=True)  # sort them
    print("Enriched words (sorted by ln pos/neg)")
    print("Word \tln pos/neg\tE-value")
    # Look at the top-entries according to log-ratio, compute e-values
    for (word, lgr) in sortpos[0:100]:
        cnt_pos = int(pos[word])
        try:
            cnt_neg = int(neg[word])
        except KeyError:
            cnt_neg = 0
        # Compute p-value using Fisher's Exact test.
        # NOTE(review): the negative total uses the length of the last
        # sequence seen, i.e. it assumes all sequences are equally long.
        pval = stats.getFETpval(
            cnt_pos,
            cnt_neg,
            len(seqs) * (PeakWidth - WordWidth + 1) - cnt_pos,
            len(seqs) * (seq_len - (PeakMargin * 2 + PeakWidth) - (WordWidth - 1) * 2) - cnt_neg,
            False,
        )
        # Correct for multiple testing (very conservatively)
        evalue = pval * len(allpos)
        print("%s\t%6.3f \t%e" % (word, lgr, evalue))
def getGOReport(positives, background=None, database='UniProtKB'):
    """ Generate a complete GO term report for a set of genes (positives).

    Each GO term is also assigned an enrichment p-value (on basis of
    background, if provided), obtained from stats.getFETpval.

    positives: names of the genes of interest (duplicates are ignored)
    background: names of background genes, or None for plain counting;
        the positives are removed from it to form the negatives
    database: passed through to getGOTerms

    Returns a list of tuples
        (GO_Term_ID[str], Foreground_no[int], Term_description[str])
    sorted by decreasing Foreground_no when no background is given, OR
        (GO_Term_ID[str], E-value[float], Foreground_no[int],
         Background_no[int], Term_description[str])
    sorted by increasing p-value otherwise.
    E-value is a Bonferroni-corrected p-value (p-value times the number of
    distinct foreground terms). Background_no is the number of term hits in
    foreground and negatives together.
    """
    from collections import Counter  # local import keeps this block self-contained

    pos = set(positives)
    fg_map = getGOTerms(pos, database)
    # One pass tallies every term; avoids a list.count() scan per term
    fg_count = Counter(t for gene in fg_map for t in fg_map[gene])

    if background is None:
        by_count = sorted(fg_count.items(), key=lambda v: v[1], reverse=True)
        return [(term, cnt, getGODef(term)['name']) for (term, cnt) in by_count]

    # a background is provided
    neg = set(background).difference(pos)
    bg_map = getGOTerms(neg, database)
    bg_count = Counter(t for gene in bg_map for t in bg_map[gene])
    nPos = len(pos)
    nNeg = len(neg)
    nTerms = len(fg_count)

    scored = []
    for (term, fg_hit) in fg_count.items():
        bg_hit = bg_count[term]  # a Counter yields 0 for unseen terms
        pval = stats.getFETpval(fg_hit, bg_hit, nPos - fg_hit, nNeg - bg_hit, False)
        scored.append((term, fg_hit, fg_hit + bg_hit, pval))
    scored.sort(key=lambda rec: rec[3])

    ret = []
    for (term, fg_hit, total_hit, pval) in scored:
        defin = getGODef(term)
        # total_hit already includes fg_hit; adding fg_hit again (as the
        # previous code did) counted the foreground twice.
        ret.append((term, pval * nTerms, fg_hit, total_hit, defin['name']))
    return ret
def getEnrichmentReport(self, positives, background=None, evid=None, threshold=None, include_more_general=True):
    """ For a set of named gene products (positives) this method determines
    the enrichment of GO terms.

    Each GO term is assigned an enrichment p-value (on basis of provided
    background, or on basis of all annotated genes, if not provided), and
    only terms whose E-value is at most the threshold are reported.
    Note that to use the full set as background can be computationally
    expensive.
    NOTE(review): earlier documentation stated that results are cached; no
    caching is visible in this method - confirm where (or whether) it happens.

    positives: names of gene products
    background: names of gene products (or None if all annotated gene
        products, i.e. the keys of self.annots, should be used; default)
    evid: evidence code handed on to self.getTerms4Genes (None for no
        restriction)
    threshold: largest E-value for a term to be reported (default is 0.05)
    include_more_general: handed on to self.getTerms4Genes; if True, include
        also more general GO terms annotated to gene products (default is True)

    Returns a list of tuples, sorted by increasing E-value:
        (GO_Term_ID[str], E-value[float], Foreground_no[int],
         Background_no[int], defin[2], defin[0])
    where defin is the value of self.getTermdef for the term (presumably
    defin[2] is the term description - confirm against getTermdef).
    E-value is a Bonferroni-corrected p-value. Terms without a definition are
    reported on standard output and left out of the result.
    """
    from collections import Counter  # local import keeps this block self-contained

    # Process foreground: find terms of genes
    fg_map = self.getTerms4Genes(positives, evid=evid, include_more_general=include_more_general)
    fg_count = Counter(t for fg_gene in fg_map for t in fg_map[fg_gene])
    pos_set = set(positives)
    # Count unique names: hits are tallied per gene, so duplicated names
    # must not inflate the foreground size.
    nPos = len(pos_set)

    # Process background: find terms of genes
    if background is None:  # need to use the full set
        background = list(self.annots.keys())
    # remove the positives from the background to create genuine negatives
    negatives = set(background).difference(pos_set)
    nNeg = len(negatives)
    bg_map = self.getTerms4Genes(negatives, evid=evid, include_more_general=include_more_general)
    bg_count = Counter(t for bg_gene in bg_map for t in bg_map[bg_gene])

    if threshold is None:
        threshold = 0.05
    nTerms = len(fg_count)

    significant = []
    for (term, fg_hit) in fg_count.items():
        bg_hit = bg_count[term]    # background genes WITH the term (0 if unseen)
        fg_nohit = nPos - fg_hit   # foreground genes without the term
        bg_nohit = nNeg - bg_hit   # background genes without the term
        pvalue = stats.getFETpval(fg_hit, bg_hit, fg_nohit, bg_nohit, False)  # one-tailed FET
        evalue = pvalue * nTerms   # Bonferroni correction
        if evalue <= threshold:    # check if significance req is fulfilled
            significant.append((term, fg_hit, fg_hit + bg_hit, evalue))
    significant.sort(key=lambda rec: rec[3])

    ret = []
    for (term, fg_hit, total_hit, evalue) in significant:
        defin = self.getTermdef(term)
        if defin is None:
            print('Could not find definition of %s' % term)
        else:
            ret.append((term, evalue, fg_hit, total_hit, defin[2], defin[0]))
    return ret
def getEnrichmentReport(self, positives, background=None, evid=None, threshold=None, include_more_general=True):
    """ For a set of named gene products (positives) this method determines
    the enrichment of GO terms.

    Each GO term is assigned an enrichment p-value (on basis of provided
    background, or on basis of all annotated genes, if not provided), and
    only terms whose E-value is at most the threshold are reported.
    Note that to use the full set as background can be computationally
    expensive.
    NOTE(review): earlier documentation stated that results are cached; no
    caching is visible in this method - confirm where (or whether) it happens.

    positives: names of gene products
    background: names of gene products (or None if all annotated gene
        products, i.e. the keys of self.annots, should be used; default)
    evid: evidence code handed on to self.getTerms4Genes (None for no
        restriction)
    threshold: largest E-value for a term to be reported (default is 0.05)
    include_more_general: handed on to self.getTerms4Genes; if True, include
        also more general GO terms annotated to gene products (default is True)

    Returns a list of tuples, sorted by increasing E-value:
        (GO_Term_ID[str], E-value[float], Foreground_no[int],
         Background_no[int], defin[2], defin[0])
    where defin is the value of self.getTermdef for the term (presumably
    defin[2] is the term description - confirm against getTermdef).
    E-value is a Bonferroni-corrected p-value. Terms without a definition are
    reported on standard output and left out of the result.
    """
    from collections import Counter  # local import keeps this block self-contained

    unique_pos = set(positives)
    # Foreground size is the number of distinct names: term hits are tallied
    # per gene, so a repeated name must not be counted twice.
    nPos = len(unique_pos)

    # Process foreground: tally, per term, the genes annotated with it
    fg_map = self.getTerms4Genes(positives, evid=evid, include_more_general=include_more_general)
    fg_count = Counter()
    for fg_gene in fg_map:
        fg_count.update(fg_map[fg_gene])

    # Process background: default to every annotated gene product
    if background is None:
        background = self.annots.keys()
    # remove the positives from the background to create genuine negatives
    negatives = set(background).difference(unique_pos)
    nNeg = len(negatives)
    bg_map = self.getTerms4Genes(negatives, evid=evid, include_more_general=include_more_general)
    bg_count = Counter()
    for bg_gene in bg_map:
        bg_count.update(bg_map[bg_gene])

    if threshold is None:
        threshold = 0.05
    nTerms = len(fg_count)  # number of tests, for the Bonferroni correction

    term_cnt = {}
    for term in fg_count:
        fg_hit = fg_count[term]
        bg_hit = bg_count[term]  # 0 when no negative carries the term
        # one-tailed FET on hits versus non-hits in foreground and background
        pvalue = stats.getFETpval(fg_hit, bg_hit, nPos - fg_hit, nNeg - bg_hit, False)
        evalue = pvalue * nTerms
        if evalue <= threshold:  # check if significance req is fulfilled
            term_cnt[term] = (fg_hit, fg_hit + bg_hit, evalue)

    ret = []
    for (term, (fg_hit, total_hit, evalue)) in sorted(term_cnt.items(), key=lambda v: v[1][2]):
        defin = self.getTermdef(term)
        if defin is None:
            # single parenthesised argument: valid on Python 2 and Python 3
            print('Could not find definition of %s' % term)
        else:
            ret.append((term, evalue, fg_hit, total_hit, defin[2], defin[0]))
    return ret
def getGOReport(self, positives, background=None, taxa=None, include_more_general=True):
    """ Generate a complete GO term report for a set of genes (positives).

    Each GO term is also assigned an enrichment p-value (on basis of
    background, if provided), obtained from stats.getFETpval.

    positives: names of the genes of interest (duplicates are ignored)
    background: names of background genes, or None for plain counting;
        the positives are removed from it to form the negatives
    taxa: accepted for interface compatibility; not used by this method
    include_more_general: handed on to self.getTerms

    Returns a list of tuples
        (GO_Term_ID[str], Foreground_no[int], defin[2], defin[0])
    sorted by decreasing Foreground_no when no background is given, OR
        (GO_Term_ID[str], E-value[float], Foreground_no[int],
         Background_no[int], defin[2], defin[0])
    sorted by increasing p-value otherwise, where defin is the value of
    self.getTermdef for the term (presumably defin[2] is the term
    description - confirm against getTermdef).
    E-value is a Bonferroni-corrected p-value (p-value times the number of
    distinct foreground terms). Background_no is the number of term hits in
    foreground and negatives together. Terms without a definition are
    reported on standard output and left out of the result.
    """
    from collections import Counter  # local import keeps this block self-contained

    pos = set(positives)
    fg_map = self.getTerms(pos, include_more_general=include_more_general)
    # One pass tallies every term; avoids a list.count() scan per term
    fg_count = Counter(t for gene in fg_map for t in fg_map[gene])
    with_background = background is not None

    if with_background:
        neg = set(background).difference(pos)
        bg_map = self.getTerms(neg, include_more_general=include_more_general)
        bg_count = Counter(t for gene in bg_map for t in bg_map[gene])
        nPos = len(pos)
        nNeg = len(neg)
        nTerms = len(fg_count)
        rows = []
        for (term, fg_hit) in fg_count.items():
            bg_hit = bg_count[term]  # a Counter yields 0 for unseen terms
            pval = stats.getFETpval(fg_hit, bg_hit, nPos - fg_hit, nNeg - bg_hit, False)
            # fg_hit + bg_hit is the complete background tally; the previous
            # code added fg_hit once more when reporting it.
            rows.append((pval, (term, pval * nTerms, fg_hit, fg_hit + bg_hit)))
        rows.sort(key=lambda row: row[0])
    else:
        rows = [(cnt, (term, cnt)) for (term, cnt) in fg_count.items()]
        rows.sort(key=lambda row: row[0], reverse=True)

    ret = []
    for (_, fields) in rows:
        defin = self.getTermdef(fields[0])
        if defin is None:
            # single parenthesised argument: valid on Python 2 and Python 3
            print('Could not find definition of %s' % fields[0])
        else:
            ret.append(fields + (defin[2], defin[0]))
    return ret
def countWordsReport(seqs, WordWidth=8, PeakWidth=100, PeakMargin=100):
    """ Produce a report of enriched words of specified length.

    Words in the window around the centre of each sequence are the
    positives; words in the two flanks that lie beyond the margin are the
    negatives. A word contributes at most one count per sequence and region.
    The (at most) 100 words with the largest ln(pos/neg) are printed to
    standard output, each with an E-value.

    seqs: DNA sequence data
    WordWidth: length of sought words
    PeakWidth: width of window around centre of sequence
    PeakMargin: the width of the margin on each side of the centre window
        (which delineates the positives around peak from negatives away
        from peak).

    Returns nothing; the report is printed.
    """
    def _tally(counts, words):
        # Add one to the count of every word; counts is an RCDict, so only
        # the item access it is already used with elsewhere is relied upon.
        for word in words:
            try:
                counts[word] += 1
            except KeyError:
                counts[word] = 1

    pos = RCDict()  # reverse complement-aware dictionary for DNA
    neg = RCDict()  # reverse complement-aware dictionary for DNA
    # Floor division: "/" yields a float on Python 3, which cannot be used
    # as a slice index.
    half_window = PeakWidth // 2
    last_len = 0
    for seq in seqs:
        last_len = len(seq)
        centre = last_len // 2  # find peak
        # Left-hand bounds are clamped at 0 so that short sequences do not
        # produce negative indices that wrap around to the sequence end.
        window_start = max(0, centre - half_window)
        left_flank_end = max(0, centre - half_window - PeakMargin)
        right_flank_start = centre + half_window + PeakMargin

        # Construct all words around peak (positives) and count their presence
        _tally(pos, set(slidewin(seq[window_start:centre + half_window], WordWidth)))

        # Construct all words away from peak (negatives) and count; the
        # right flank is merged in place (set.union would discard it).
        flank_words = set(slidewin(seq[:left_flank_end], WordWidth))
        flank_words.update(slidewin(seq[right_flank_start:], WordWidth))
        _tally(neg, flank_words)

    logratio = RCDict()  # DNA dictionary for storing the log-ratio between pos and neg
    for (word, cnt_pos) in list(pos.items()):
        try:
            cnt_neg = neg[word]
        except KeyError:
            cnt_neg = 0.0001  # pseudo-count for words never seen among negatives
        logratio[word] = math.log(float(cnt_pos) / float(cnt_neg))

    allpos = list(logratio.items())  # extract all pairs of words:log-ratio
    sortpos = sorted(allpos, key=lambda v: v[1], reverse=True)  # sort them
    print("Enriched words (sorted by ln pos/neg)")
    print("Word \tln pos/neg\tE-value")
    # Look at the top-entries according to log-ratio, compute e-values
    for (word, lgr) in sortpos[0:100]:
        cnt_pos = int(pos[word])
        try:
            cnt_neg = int(neg[word])
        except KeyError:
            cnt_neg = 0
        # Compute p-value using Fisher's Exact test.
        # NOTE(review): the negative total is based on the length of the
        # last sequence, i.e. all sequences are assumed to be equally long.
        pval = stats.getFETpval(
            cnt_pos,
            cnt_neg,
            len(seqs) * (PeakWidth - WordWidth + 1) - cnt_pos,
            len(seqs) * (last_len - (PeakMargin * 2 + PeakWidth) - (WordWidth - 1) * 2) - cnt_neg,
            False,
        )
        # Correct for multiple testing (very conservatively)
        evalue = pval * len(allpos)
        print("%s\t%6.3f \t%e" % (word, lgr, evalue))
def get_GO_term_overrepresentation(self, pos_entries, bg_entries=None, evalThreshold=None):
    """ Find GO terms that are over-represented among the positive entries.

    Every term found via a positive entry is tested with stats.getFETpval on
    the 2x2 table (positives with/without the term, negatives with/without
    the term). The p-value is multiplied by the number of distinct terms seen
    in positives and negatives together to correct for multiple testing.

    pos_entries: the entries of interest (duplicates are ignored)
    bg_entries: background entries, or None to use the keys of
        self.index.gene2go; the positives are removed to form the negatives
    evalThreshold: largest E-value for a term to be reported, or None to
        report every term found via the positives

    Returns a dictionary mapping each reported term to its E-value.
    """
    from collections import Counter  # local import keeps this block self-contained

    if bg_entries is None:
        bg_entries = self.index.gene2go.keys()
    bg_entries = set(bg_entries)
    pos_entries = set(pos_entries)
    neg_entries = bg_entries - pos_entries

    # Obtain GO terms for each element in our positive and negative sets
    pos_terms = [self.find_terms(e) for e in pos_entries]
    neg_terms = [self.find_terms(e) for e in neg_entries]

    # Count, per term, how many entries carry it. set() makes sure an entry
    # contributes at most once per term, matching a membership test.
    pos_hits = Counter()
    for terms in pos_terms:
        pos_hits.update(set(terms))
    neg_hits = Counter()
    for terms in neg_terms:
        neg_hits.update(set(terms))

    # All distinct terms (positives and negatives) define the correction factor
    nTerms = len(set(pos_hits) | set(neg_hits))
    nPos = len(pos_terms)
    nNeg = len(neg_terms)

    # For each term found via the positives, use Fisher's exact test to
    # establish whether there is a significant difference between occurrence
    # of the term in each set
    allHits = {}
    for (seek, pos_in) in pos_hits.items():
        neg_in = neg_hits[seek]  # a Counter yields 0 for unseen terms
        p_value = stats.getFETpval(pos_in, neg_in, nPos - pos_in, nNeg - neg_in, left=False)
        # Correct for multiple hypothesis testing
        e_value = p_value * nTerms
        if evalThreshold is None or evalThreshold >= e_value:
            allHits[seek] = e_value
    return allHits