def query_single_gset(query, all_genes=[], gene_sets=[]):
    """Test ``query`` against ``gene_sets`` and report the best-scoring set.

    ``calc_pvalues`` is run with ``all_genes`` as the background, the
    resulting p-values are adjusted with ``multiple_testing_correction``
    and the gene set with the smallest adjusted value is selected.

    Returns:
        ``(0, None)`` when ``calc_pvalues`` returns an empty result,
        otherwise ``(-log10(q), name)`` for the set with the smallest
        adjusted p-value ``q``.
    """
    stats = calc_pvalues(query, gene_sets, background=all_genes)
    if not len(stats):
        return 0, None

    # Only the names and raw p-values are needed; the remaining three
    # fields are unpacked so that a malformed result still fails loudly.
    names, raw_pvalues, _n_overlap, _set_sizes, _shared_genes = stats

    # Correction is invoked with its defaults (Benjamini-Hochberg according
    # to the original author's note).
    adjusted, _rejected = multiple_testing_correction(raw_pvalues)

    winner = np.argmin(adjusted)
    score = -np.log10(adjusted[winner])
    return score, names[winner]
def enrich(self, gmt):
    """Run the enrichment test locally and tabulate the result.

    p = p-value computed using the Fisher exact test (Hypergeometric test).
    Not implemented here: combine score = log(p)·z
    see here: http://amp.pharm.mssm.edu/Enrichr/help#background&q=4

    The background handed to ``calc_pvalues`` is derived from
    ``self.background`` and stored on ``self._bg``:

    * an integer, or a str/bytes made only of digits -> ``int(...)``
    * any other string -> whatever ``self.get_background()`` returns
    * a non-scalar iterable (ndarray, list, tuple, set, Series) -> ``set(...)``

    Args:
        gmt: gene sets, forwarded to ``calc_pvalues`` as ``gene_sets``.

    Returns:
        A ``pandas.DataFrame`` with the columns ``Term``, ``Overlap``,
        ``P-value``, ``Adjusted P-value`` and ``Genes``, or ``None`` when
        ``calc_pvalues`` produces nothing.

    Raises:
        Exception: if ``self.background`` is a scalar that is neither an
            integer nor a string.
    """
    import numbers  # function-scope import: only needed for the type check

    background = self.background
    if isscalar(background):
        # Only text can be asked whether it is made of digits. Calling
        # .isdigit() on a float or other non-text scalar raised
        # AttributeError instead of the "unsupported" error below.
        is_digit_text = (isinstance(background, (str, bytes))
                         and background.isdigit())
        # numbers.Integral also covers NumPy integer scalars, which are not
        # instances of the builtin int.
        if isinstance(background, numbers.Integral) or is_digit_text:
            self._bg = int(background)
        elif isinstance(background, str):
            self._bg = self.get_background()
            self._logger.info("Background: found %s genes" % (len(self._bg)))
        else:
            raise Exception("Unsupported background data type")
    else:
        # handle array object: nd.array, list, tuple, set, Series
        try:
            self._bg = set(background)
        except TypeError:
            # NOTE(review): as in the original, the problem is only logged
            # and execution continues with whatever self._bg already holds.
            self._logger.error("Unsupported background data type")

    # statistical testing
    hgtest = list(calc_pvalues(query=self._gls, gene_sets=gmt,
                               background=self._bg))
    if not hgtest:
        return None

    terms, pvals, olsz, gsetsz, genes = hgtest
    fdrs, _rej = multiple_testing_correction(ps=pvals,
                                             alpha=self.cutoff,
                                             method='benjamini-hochberg')
    # save to a dataframe; insertion order defines the column order
    odict = OrderedDict()
    odict['Term'] = terms
    odict['Overlap'] = ["%s/%s" % (h, g) for h, g in zip(olsz, gsetsz)]
    odict['P-value'] = pvals
    odict['Adjusted P-value'] = fdrs
    odict['Genes'] = [";".join(g) for g in genes]
    return pd.DataFrame(odict)
def enrich(self, gmt):
    """Run the enrichment test locally and tabulate the result.

    p = p-value computed using the Fisher exact test (Hypergeometric test).
    Not implemented here: combine score = log(p)·z
    see here: http://amp.pharm.mssm.edu/Enrichr/help#background&q=4

    When ``self.background`` is a string, the background gene set is built
    from the table returned by ``self.get_background()`` and stored on
    ``self._bg``; otherwise ``self._bg`` is used as it already is.

    Args:
        gmt: gene sets, forwarded to ``calc_pvalues`` as ``gene_sets``.

    Returns:
        A ``pandas.DataFrame`` with the columns ``Term``, ``Overlap``,
        ``P-value``, ``Adjusted P-value`` and ``Genes``, or ``None`` when
        ``calc_pvalues`` produces nothing.
    """
    if isinstance(self.background, str):
        df = self.get_background()
        # input id type: entrez or gene_name
        if self._isezid:
            bg = df['entrezgene'].astype(int)
        else:
            bg = df['external_gene_name']

        self._bg = set(bg.unique())
        # Report the number of background genes. The original interpolated
        # the entire set, writing every gene identifier into the log.
        self._logger.warning("Background: %s %s genes with GO_IDs. "
                             % (len(self._bg), self.background))
        self._logger.warning(
            "If this is not you wanted, please give a number to background argument"
        )

    hgtest = list(calc_pvalues(query=self._gls, gene_sets=gmt,
                               background=self._bg))
    if not hgtest:
        return None

    terms, pvals, olsz, gsetsz, genes = hgtest
    fdrs, _rej = multiple_testing_correction(ps=pvals,
                                             alpha=self.cutoff,
                                             method='benjamini-hochberg')
    # save to a dataframe; insertion order defines the column order
    odict = OrderedDict()
    odict['Term'] = terms
    odict['Overlap'] = ["%s/%s" % (h, g) for h, g in zip(olsz, gsetsz)]
    odict['P-value'] = pvals
    odict['Adjusted P-value'] = fdrs
    odict['Genes'] = [";".join(g) for g in genes]
    return pd.DataFrame(odict)
def query_single_gset(query, all_genes=[], gene_sets=[], alpha=0.05):
    """Find the significant gene set with the highest fold enrichment.

    ``calc_pvalues`` is run with ``all_genes`` as the background and the
    p-values are adjusted with ``multiple_testing_correction``. Among the
    sets whose adjusted p-value is below ``alpha``, the one with the largest
    fold change (observed overlap / expected overlap) is selected, where the
    expected overlap is ``gset_size * len(query) / len(all_genes)``.

    Args:
        query: query genes; its length enters the expected overlap.
        all_genes: background genes; passed to ``calc_pvalues`` and its
            length enters the expected overlap.
        gene_sets: gene sets passed to ``calc_pvalues``.
        alpha: adjusted p-value threshold a set must fall below to be
            considered (defaults to the previously hard-coded 0.05).

    Returns:
        ``(-log10(q), name, log10(fold_change))`` for the selected set, or
        ``(0, None, 0)`` when ``calc_pvalues`` returns nothing or no
        significant set has a positive fold change.
    """
    res = calc_pvalues(query, gene_sets, background=all_genes)
    if len(res) == 0:
        return 0, None, 0

    set_names, p_vals, overlap_size, gset_size, _overlapped_genes = res
    # by default Benjamini-Hochberg
    q_vals, _rejs = multiple_testing_correction(p_vals)

    best_fc = 0
    best_hit_ndx = -1
    for i, q_val in enumerate(q_vals):
        if q_val < alpha:
            n_background = len(all_genes)
            if n_background == 0:
                # No background (e.g. the default empty list): the expected
                # overlap is undefined, so the set cannot be ranked.
                continue
            expected = float(gset_size[i]) * len(query) / n_background
            if expected <= 0:
                # Empty query or empty gene set: avoid dividing by zero.
                continue
            fold_change = float(overlap_size[i]) / expected
            if fold_change > best_fc:
                best_fc = fold_change
                best_hit_ndx = i

    if best_hit_ndx == -1:
        return 0, None, 0
    return (-np.log10(q_vals[best_hit_ndx]),
            set_names[best_hit_ndx],
            np.log10(best_fc))