def _download_libraries(self, libname):
    """Download an Enrichr gene-set library and cache it as a GMT file.

    :param libname: str, name of the Enrichr library to fetch.
    :return: dict, {term: [gene symbols]} parsed from the downloaded library.
    :raises Exception: when the HTTP response is not OK.
    """
    self._logger.info("Downloading and generating Enrichr library gene sets......")
    s = retry(5)
    # query string
    ENRICHR_URL = 'http://amp.pharm.mssm.edu/Enrichr/geneSetLibrary'
    query_string = '?mode=text&libraryName=%s'
    response = s.get(ENRICHR_URL + query_string % libname, timeout=None)
    if not response.ok:
        raise Exception('Error fetching enrichment results, check internet connection first.')
    # reformat to dict and save to disk
    mkdirs(DEFAULT_CACHE_PATH)
    genesets_dict = {}
    outname = "enrichr.%s.gmt" % libname
    # context manager guarantees the file is closed even if parsing fails
    # (the original left the handle open on error)
    with open(os.path.join(DEFAULT_CACHE_PATH, outname), "w") as gmtout:
        # decode_unicode expects a bool; the original passed a truthy string
        for line in response.iter_lines(chunk_size=1024, decode_unicode=True):
            line = line.strip()
            fields = line.split("\t")
            k = fields[0]
            # each gene entry may carry a ",weight" suffix; keep the symbol only
            v = [x.split(",")[0] for x in fields[2:]]
            genesets_dict[k] = v
            gmtout.write("%s\t\t%s\n" % (k, "\t".join(v)))
    return genesets_dict
def run(self):
    """Main replot workflow: reproduce GSEA desktop plots from an edb folder.

    BUGFIX: the logger was previously created *after* its first use in the
    IndexError path below, which raised UnboundLocalError instead of logging
    the real problem; it is now initialized up front.
    """
    assert self.min_size <= self.max_size
    import glob
    from bs4 import BeautifulSoup

    # init output dir and logger before anything that may need to log
    mkdirs(self.outdir)
    logger = self._log_init(
        module=self.module,
        log_level=logging.INFO if self.verbose else logging.WARNING)
    # locate the GSEA desktop output files
    try:
        results_path = glob.glob(self.indir + '*/edb/results.edb')[0]
        rank_path = glob.glob(self.indir + '*/edb/*.rnk')[0]
        gene_set_path = glob.glob(self.indir + '*/edb/gene_sets.gmt')[0]
    except IndexError as e:
        logger.debug(e)
        logger.error("Could not locate GSEA files in the given directory!")
        sys.exit(1)
    # extract sample names from .cls file
    cls_path = glob.glob(self.indir + '*/edb/*.cls')
    if cls_path:
        phenoPos, phenoNeg, classes = gsea_cls_parser(cls_path[0])
    else:
        # logic for prerank results: no phenotype labels available
        phenoPos, phenoNeg = '', ''
    # start reploting
    self.gene_sets = gene_set_path
    # obtain gene sets
    gene_set_dict = gsea_gmt_parser(gene_set_path, min_size=self.min_size,
                                    max_size=self.max_size)
    # obtain rank_metrics
    rank_metric = self._rank_metric(rank_path)
    correl_vector = rank_metric['rank'].values
    gene_list = rank_metric['gene_name']
    # extract each enrichment term in the results.edb file and plot
    database = BeautifulSoup(open(results_path), features='xml')
    length = len(database.findAll('DTG'))
    for idx in range(length):
        # extract statistical results from results.edb file
        enrich_term, hit_ind, nes, pval, fdr = gsea_edb_parser(
            results_path, index=idx)
        gene_set = gene_set_dict.get(enrich_term)
        # calculate enrichment score (running ES vector only)
        RES = enrichment_score(
            gene_list=gene_list, gene_set=gene_set,
            weighted_score_type=self.weighted_score_type,
            correl_vector=correl_vector)[2]
        # plotting
        gsea_plot(rank_metric, enrich_term, hit_ind, nes, pval, fdr, RES,
                  phenoPos, phenoNeg, self.figsize, self.format,
                  self.outdir, self.module)
    logger.info(
        "Congratulations! Your plots have been reproduced successfully!")
def __init__(self, data, gene_sets, outdir="GSEA_SingleSample",
             sample_norm_method='rank', min_size=15, max_size=2000,
             permutation_num=0, weighted_score_type=0.25, scale=True,
             ascending=False, processes=1, figsize=(7,6), format='pdf',
             graph_num=20, no_plot=False, seed=None, verbose=False):
    """Store ssGSEA settings and initialize the module logger."""
    self.data = data
    self.gene_sets = gene_sets
    self.outdir = outdir
    self.sample_norm_method = sample_norm_method
    self.weighted_score_type = weighted_score_type
    self.scale = scale
    self.min_size = min_size
    self.max_size = max_size
    # negative permutation counts are clamped to 0 (no permutations)
    self.permutation_num = int(permutation_num) if int(permutation_num) > 0 else 0
    self.ascending = ascending
    self.figsize = figsize
    self.format = format
    self.graph_num = int(graph_num)
    self.seed = seed
    self.verbose = bool(verbose)
    self.ranking = None
    self.module = 'ssgsea'
    self._processes = processes
    self._noplot = no_plot
    # init logger
    mkdirs(self.outdir)
    # BUGFIX: rstrip(".gmt") strips any trailing '.', 'g', 'm', 't'
    # characters (e.g. "kegg.gmt" -> "ke"); remove the literal suffix instead.
    _gset = os.path.split(self.gene_sets)[-1].lower()
    if _gset.endswith(".gmt"):
        _gset = _gset[:-4]
    outlog = os.path.join(self.outdir, "gseapy.%s.%s.log" % (self.module, _gset))
    self._logger = log_init(
        outlog=outlog,
        log_level=logging.INFO if self.verbose else logging.WARNING)
def __init__(self, data, gene_sets, classes, outdir='GSEA_ouput',
             min_size=15, max_size=500, permutation_num=1000,
             weighted_score_type=1, permutation_type='gene_set',
             method='log2_ratio_of_classes', ascending=False, processes=1,
             figsize=(6.5,6), format='pdf', graph_num=20, no_plot=False,
             seed=None, verbose=False):
    """Store standard-GSEA settings and initialize the module logger."""
    self.data = data
    self.gene_sets = gene_sets
    self.classes = classes
    self.outdir = outdir
    self.permutation_type = permutation_type
    self.method = method
    self.min_size = min_size
    self.max_size = max_size
    # negative permutation counts are clamped to 0 (no permutations)
    self.permutation_num = int(permutation_num) if int(permutation_num) > 0 else 0
    self.weighted_score_type = weighted_score_type
    self.ascending = ascending
    self._processes = processes
    self.figsize = figsize
    self.format = format
    self.graph_num = int(graph_num)
    self.seed = seed
    self.verbose = bool(verbose)
    self.module = 'gsea'
    self.ranking = None
    self._noplot = no_plot
    # init logger
    mkdirs(self.outdir)
    # BUGFIX: rstrip(".gmt") strips any trailing '.', 'g', 'm', 't'
    # characters (e.g. "kegg.gmt" -> "ke"); remove the literal suffix instead.
    _gset = os.path.split(self.gene_sets)[-1].lower()
    if _gset.endswith(".gmt"):
        _gset = _gset[:-4]
    outlog = os.path.join(self.outdir, "gseapy.%s.%s.log" % (self.module, _gset))
    self._logger = log_init(
        outlog=outlog,
        log_level=logging.INFO if self.verbose else logging.WARNING)
def __init__(self, rnk, gene_sets, outdir='GSEA_prerank',
             pheno_pos='Pos', pheno_neg='Neg', min_size=15, max_size=500,
             permutation_num=1000, weighted_score_type=1, ascending=False,
             processes=1, figsize=(6.5,6), format='pdf', graph_num=20,
             no_plot=False, seed=None, verbose=False):
    """Store prerank-GSEA settings and initialize the module logger."""
    self.rnk = rnk
    self.gene_sets = gene_sets
    self.outdir = outdir
    self.pheno_pos = pheno_pos
    self.pheno_neg = pheno_neg
    self.min_size = min_size
    self.max_size = max_size
    # negative permutation counts are clamped to 0 (no permutations)
    self.permutation_num = int(permutation_num) if int(permutation_num) > 0 else 0
    self.weighted_score_type = weighted_score_type
    self.ascending = ascending
    self.figsize = figsize
    self.format = format
    self.graph_num = int(graph_num)
    self.seed = seed
    self.verbose = bool(verbose)
    self.ranking = None
    self.module = 'prerank'
    self._processes = processes
    self._noplot = no_plot
    # init logger
    mkdirs(self.outdir)
    # BUGFIX: rstrip(".gmt") strips any trailing '.', 'g', 'm', 't'
    # characters (e.g. "kegg.gmt" -> "ke"); remove the literal suffix instead.
    _gset = os.path.split(self.gene_sets)[-1].lower()
    if _gset.endswith(".gmt"):
        _gset = _gset[:-4]
    outlog = os.path.join(self.outdir, "gseapy.%s.%s.log" % (self.module, _gset))
    self._logger = log_init(
        outlog=outlog,
        log_level=logging.INFO if self.verbose else logging.WARNING)
def __init__(self, indir, outdir='GSEApy_Replot', weighted_score_type=1,
             min_size=3, max_size=1000, figsize=(6.5, 6), graph_num=20,
             format='pdf', verbose=False):
    """Store replot settings and open the module log file."""
    # input / output locations
    self.indir = indir
    self.outdir = outdir
    # enrichment-score parameters
    self.weighted_score_type = weighted_score_type
    self.min_size = min_size
    self.max_size = max_size
    self.ascending = False
    # plotting options
    self.figsize = figsize
    self.fignum = int(graph_num)
    self.format = format
    # bookkeeping
    self.verbose = bool(verbose)
    self.module = 'replot'
    self.gene_sets = None
    # init logger
    mkdirs(self.outdir)
    outlog = os.path.join(self.outdir,
                          "gseapy.%s.%s.log" % (self.module, "run"))
    self._logger = log_init(
        outlog=outlog,
        log_level=logging.INFO if self.verbose else logging.WARNING)
def query(self, dataset='hsapiens_gene_ensembl', attributes=None,
          filters=None, filename=None):
    """Mapping ids using BioMart.

    :param dataset: str, default: 'hsapiens_gene_ensembl'
    :param attributes: str, list, tuple of attribute names. Defaults to
                       ['ensembl_gene_id', 'external_gene_name',
                        'entrezgene', 'go_id'] when empty/None.
    :param filters: dict, {'filter name': list(filter value)}
    :param filename: str, output path; defaults to a file in the cache dir.
    :return: a dataframe contains all attributes you selected.

    **Note**: it will take a couple of minutes to get the results.
    A xml template for querying biomart:
    see https://gist.github.com/keithshep/7776579

    BUGFIX: the original used mutable default arguments (``attributes=[]``,
    ``filters={}``), which are shared across calls; ``None`` sentinels are
    used instead with identical default behavior.
    """
    if attributes is None:
        attributes = []
    if filters is None:
        filters = {}
    if not attributes:
        attributes = ['ensembl_gene_id', 'external_gene_name',
                      'entrezgene', 'go_id']
    # build the XML query
    self.new_query()
    # e.g. 'mmusculus_gene_ensembl'
    self.add_dataset_to_xml(dataset)
    for at in attributes:
        self.add_attribute_to_xml(at)
    # add filters; list values are joined into a comma-separated string
    if filters:
        for k, v in filters.items():
            if isinstance(v, list):
                v = ",".join(v)
            self.add_filter_to_xml(k, v)
    xml_query = self.get_xml()
    results = super(Biomart, self).query(xml_query)
    df = pd.read_csv(StringIO(results), header=None, sep="\t",
                     names=attributes, index_col=None)
    # save file to cache path
    if filename is None:
        mkdirs(DEFAULT_CACHE_PATH)
        filename = os.path.join(
            DEFAULT_CACHE_PATH, "{}.background.genes.txt".format(dataset))
    df.to_csv(filename, sep="\t", index=False)
    return df
def runSamplesPermu(self, df, gmt=None):
    """Single Sample GSEA workflow with permutation procedure.

    :param df: pd.DataFrame, one column per sample (genes in the index).
    :param gmt: dict, {term: [genes]} gene-set dictionary.

    Fixes: ``DataFrame.items()`` replaces ``iteritems()`` (removed in
    pandas 2.0), and ``self.outdir`` is restored after the per-sample loop
    so it no longer points at the last sample's subdirectory.
    """
    assert self.min_size <= self.max_size
    mkdirs(self.outdir)
    self.resultsOnSamples = OrderedDict()
    outdir = self.outdir
    # iterate through each sample (one column per sample)
    for name, ser in df.items():
        self.outdir = os.path.join(outdir, str(name))
        self._logger.info("Run Sample: %s " % name)
        mkdirs(self.outdir)
        # sort ranking values from high to low or reverse
        dat2 = ser.sort_values(ascending=self.ascending)
        # compute ES, NES, pval, FDR, RES
        gsea_results, hit_ind, rank_ES, subsets = gsea_compute(
            data=dat2, n=self.permutation_num, gmt=gmt,
            weighted_score_type=self.weighted_score_type,
            permutation_type='gene_set', method=None,
            pheno_pos='', pheno_neg='', classes=None,
            ascending=self.ascending, processes=self._processes,
            seed=self.seed, single=True, scale=self.scale)
        # write per-sample report files
        res_zip = zip(subsets, list(gsea_results), hit_ind, rank_ES)
        self._save_results(zipdata=res_zip, outdir=self.outdir,
                           module=self.module, gmt=gmt, rank_metric=dat2,
                           permutation_type="gene_sets")
        self.resultsOnSamples[name] = self.res2d.es
        # plotting
        if self._noplot:
            continue
        self._logger.info("Plotting Sample: %s \n" % name)
        self._plotting(rank_metric=dat2, results=self.results,
                       res2d=self.res2d, graph_num=self.graph_num,
                       outdir=self.outdir, figsize=self.figsize,
                       format=self.format, module=self.module)
    # restore the top-level output directory before saving the summary
    self.outdir = outdir
    # save es, nes to file
    self._save(outdir)
    return
def runSamples(self, df, gmt=None):
    """Single Sample GSEA workflow; multiprocessing over samples.

    :param df: pd.DataFrame, one column per sample (gene names in the index).
    :param gmt: dict, {term: [genes]} gene-set dictionary.

    Fixes: ``DataFrame.items()`` replaces ``iteritems()`` (removed in
    pandas 2.0); the plotting loop no longer shadows the outer enumeration
    index ``i``; and ``self.outdir`` is restored after the loop.
    """
    # save each sample's results, keyed by sample name, in insertion order
    self.resultsOnSamples = OrderedDict()
    outdir = self.outdir
    # run ssgsea for gct expression matrix, multi-processing
    subsets = sorted(gmt.keys())
    tempes = []
    names = []
    rankings = []
    pool = Pool(processes=self._processes)
    for name, ser in df.items():
        # prepare input: sort ranking values
        dat = ser.sort_values(ascending=self.ascending)
        rankings.append(dat)
        names.append(name)
        genes_sorted, cor_vec = dat.index.values, dat.values
        rs = np.random.RandomState(self.seed)
        # apply_async
        tempes.append(pool.apply_async(
            enrichment_score_tensor,
            args=(genes_sorted, cor_vec, gmt, self.weighted_score_type,
                  self.permutation_num, rs, True, self.scale)))
    pool.close()
    pool.join()
    # save results and plotting
    for i, temp in enumerate(tempes):
        name, rnk = names[i], rankings[i]
        self._logger.info("Calculate Enrichment Score for Sample: %s " % name)
        es, esnull, hit_ind, RES = temp.get()
        # create results subdir
        self.outdir = os.path.join(outdir, str(name))
        mkdirs(self.outdir)
        # save results
        self.resultsOnSamples[name] = pd.Series(data=es, index=subsets,
                                                name=name)
        # plotting
        if self._noplot:
            continue
        self._logger.info("Plotting Sample: %s \n" % name)
        for j, term in enumerate(subsets):
            # sanitize term for use in a file name
            term = term.replace('/', '_').replace(":", "_")
            outfile = '{0}/{1}.{2}.{3}'.format(self.outdir, term,
                                               self.module, self.format)
            gseaplot(rank_metric=rnk, term=term, hits_indices=hit_ind[j],
                     nes=es[j], pval=1, fdr=1, RES=RES[j],
                     pheno_pos='', pheno_neg='', figsize=self.figsize,
                     ofname=outfile)
    # restore the top-level output directory before saving the summary
    self.outdir = outdir
    # save es, nes to file
    self._save(outdir)
    return
def run(self):
    """Rank the input data, then dispatch single- or multi-sample runs."""
    mkdirs(self.outdir)
    logger = self._log_init(
        module=self.module,
        log_level=logging.INFO if self.verbose else logging.WARNING)
    # load and rank the input data
    ranked = self._rank_metric(self.data)
    # dispatch: a gct expression matrix (ranking is None) runs per sample,
    # otherwise a single-sample run is performed
    if self.ranking is None:
        self.runOnSamples(df=ranked)
    else:
        self.runSample(df=ranked)
def run(self):
    """GSEA prerank workflow."""
    assert self.min_size <= self.max_size
    mkdirs(self.outdir)
    logger = self._log_init(
        module=self.module,
        log_level=logging.INFO if self.verbose else logging.WARNING)
    # load the pre-ranked gene list
    rank_metric = self._rank_metric(self.rnk)
    assert len(rank_metric) > 1
    # decide how many worker processes to use
    self._set_cores()
    # start analysis
    logger.info("Parsing data files for GSEA.............................")
    # filter gene sets against the ranked gene list and build the dictionary
    gmt = gsea_gmt_parser(self.gene_sets, min_size=self.min_size,
                          max_size=self.max_size,
                          gene_list=rank_metric['gene_name'].values)
    logger.info("%04d gene_sets used for further statistical testing....."
                % len(gmt))
    logger.info("Start to run GSEA...Might take a while..................")
    # compute ES, NES, pval, FDR, RES
    gsea_results, hit_ind, rank_ES, subsets = gsea_compute(
        data=rank_metric, n=self.permutation_num, gmt=gmt,
        weighted_score_type=self.weighted_score_type,
        permutation_type='gene_set', method=None,
        phenoPos=self.pheno_pos, phenoNeg=self.pheno_neg, classes=None,
        ascending=self.ascending, seed=self.seed,
        processes=self._processes, prerank=True)
    logger.info("Start to generate gseapy reports, and produce figures...")
    report_rows = zip(subsets, list(gsea_results), hit_ind, rank_ES)
    self._save_results(zipdata=report_rows, outdir=self.outdir,
                       module=self.module, gmt=gmt,
                       rank_metric=rank_metric,
                       permutation_type="gene_sets")
    # plotting
    self._plotting(rank_metric=rank_metric, results=self.results,
                   res2d=self.res2d, graph_num=self.graph_num,
                   outdir=self.outdir, figsize=self.figsize,
                   format=self.format, module=self.module)
    logger.info("Congratulations. GSEApy run successfully................")
    return
def run(self):
    """Load and normalize samples, then dispatch per-sample ssGSEA runs."""
    mkdirs(self.outdir)
    logger = self._log_init(
        module=self.module,
        log_level=logging.INFO if self.verbose else logging.WARNING)
    # load the raw expression data
    raw = self.load_data()
    # normalize samples, and rank
    normdat = self.norm_samples(raw)
    # dispatch: a gct expression matrix (ranking is None) runs per sample,
    # otherwise a single-sample run is performed
    if self.ranking is None:
        self.runOnSamples(df=normdat)
    else:
        self.runSample(df=normdat)
def prepare_outdir(self):
    """Create the output directory (or a temp dir) and return the log-file path.

    :return: str, path of the log file inside the output directory.
    :raises Exception: when ``outdir`` or ``gene_sets`` has an unsupported type.
    """
    self._outdir = self.outdir
    if self._outdir is None:
        # keep a reference on self so the temp dir lives as long as we do
        self._tmpdir = TemporaryDirectory()
        self.outdir = self._tmpdir.name
    elif isinstance(self.outdir, str):
        mkdirs(self.outdir)
    else:
        raise Exception("Error parsing outdir: %s" % type(self.outdir))
    # handle gmt type
    if isinstance(self.gene_sets, str):
        # BUGFIX: rstrip(".gmt") strips any trailing '.', 'g', 'm', 't'
        # characters (e.g. "kegg.gmt" -> "ke"); remove the suffix instead.
        _gset = os.path.split(self.gene_sets)[-1].lower()
        if _gset.endswith(".gmt"):
            _gset = _gset[:-4]
    elif isinstance(self.gene_sets, dict):
        _gset = "blank_name"
    else:
        raise Exception("Error parsing gene_sets parameter for gene sets")
    logfile = os.path.join(
        self.outdir, "gseapy.%s.%s.log" % (self.module, _gset))
    return logfile
def runSample(self, df, gmt=None):
    """Single Sample GSEA workflow for one ranking/sample.

    :param df: pd.Series, or pd.DataFrame with one value column
               (gene ids in the index), holding the ranking values.
    :param gmt: dict, {term: [genes]}; parsed from self.gene_sets when None.

    BUGFIX: the sort and column-rename normalization previously ran only on
    the pd.Series branch, so a DataFrame input was never renamed to
    ['gene_name', 'rank'] and the later set_index('gene_name') failed; the
    normalization now applies to every accepted input type.
    """
    assert self.min_size <= self.max_size
    mkdirs(self.outdir)
    # start analysis
    self._logger.info(
        "Parsing data files for GSEA.............................")
    # coerce input into a two-column frame: gene id, ranking value
    if isinstance(df, pd.DataFrame):
        if df.shape[1] == 1:
            df = df.reset_index()
    elif isinstance(df, pd.Series):
        df = df.reset_index()
    else:
        raise Exception('Error parsing gene ranking values!')
    # sort ranking values from high to low or reverse
    df.sort_values(by=df.columns[1], ascending=self.ascending, inplace=True)
    df.columns = ['gene_name', 'rank']
    df['rank2'] = df['rank']
    # remove rank2 from the computation input
    dat2 = df.set_index('gene_name')
    del dat2['rank2']
    # cpu numbers
    self._set_cores()
    # filtering out gene sets and build gene sets dictionary
    if gmt is None:
        gmt = gsea_gmt_parser(self.gene_sets, min_size=self.min_size,
                              max_size=self.max_size,
                              gene_list=dat2.index.values)
    self._logger.info(
        "%04d gene_sets used for further statistical testing....." % len(gmt))
    self._logger.info(
        "Start to run GSEA...Might take a while..................")
    # compute ES, NES, pval, FDR, RES
    gsea_results, hit_ind, rank_ES, subsets = gsea_compute_ss(
        data=dat2, n=self.permutation_num, gmt=gmt,
        weighted_score_type=self.weighted_score_type,
        seed=self.seed, processes=self._processes)
    self._logger.info(
        "Start to generate gseapy reports, and produce figures...")
    res_zip = zip(subsets, list(gsea_results), hit_ind, rank_ES)
    self._save_results(zipdata=res_zip, outdir=self.outdir,
                       module=self.module, gmt=gmt, rank_metric=df,
                       permutation_type="gene_sets")
    # plotting
    self._plotting(rank_metric=df, results=self.results, res2d=self.res2d,
                   graph_num=self.graph_num, outdir=self.outdir,
                   figsize=self.figsize, format=self.format,
                   module=self.module)
    self._logger.info(
        "Congratulations. GSEApy run successfully................")
    return
def run(self):
    """GSEA main procedure.

    Loads the expression data, parses phenotype labels, ranks genes,
    filters gene sets, computes enrichment statistics, then writes
    reports and figures.

    BUGFIX: the unreachable ``sys.exit(1)`` that followed the ``raise``
    in the input-validation branch was removed (dead code).
    """
    assert self.permutation_type in ["phenotype", "gene_set"]
    assert self.min_size <= self.max_size
    # accept either an in-memory dataframe or a path to a table on disk
    if isinstance(self.data, pd.DataFrame):
        df = self.data.copy()
    elif os.path.isfile(self.data):
        df = pd.read_table(self.data, comment='#')
    else:
        raise Exception('Error parsing gene expression dataframe!')
    # data frame must have length > 1
    assert len(df) > 1
    # create output dirs
    mkdirs(self.outdir)
    logger = self._log_init(
        module=self.module,
        log_level=logging.INFO if self.verbose else logging.WARNING)
    # start analysis
    logger.info("Parsing data files for GSEA.............................")
    # phenotype labels parsing
    phenoPos, phenoNeg, cls_vector = gsea_cls_parser(self.classes)
    # select correct expression genes and values.
    dat = self.__drop_dat(df, cls_vector)
    # ranking metrics calculation.
    dat2 = ranking_metric(df=dat, method=self.method, phenoPos=phenoPos,
                          phenoNeg=phenoNeg, classes=cls_vector,
                          ascending=self.ascending)
    # filtering out gene sets and build gene sets dictionary
    gmt = gsea_gmt_parser(self.gene_sets, min_size=self.min_size,
                          max_size=self.max_size,
                          gene_list=dat2['gene_name'].values)
    logger.info("%04d gene_sets used for further statistical testing....."
                % len(gmt))
    logger.info("Start to run GSEA...Might take a while..................")
    # cpu numbers
    self._set_cores()
    # compute ES, NES, pval, FDR, RES
    gsea_results, hit_ind, rank_ES, subsets = gsea_compute(
        data=dat, n=self.permutation_num, gmt=gmt,
        weighted_score_type=self.weighted_score_type,
        permutation_type=self.permutation_type, method=self.method,
        phenoPos=phenoPos, phenoNeg=phenoNeg, classes=cls_vector,
        ascending=self.ascending, seed=self.seed,
        processes=self._processes)
    logger.info("Start to generate gseapy reports, and produce figures...")
    res_zip = zip(subsets, list(gsea_results), hit_ind, rank_ES)
    self._save_results(zipdata=res_zip, outdir=self.outdir,
                       module=self.module, gmt=gmt, rank_metric=dat2,
                       permutation_type="gene_sets")
    # plotting: subset the expression matrix to the ranked genes
    heat_dat = dat.loc[dat2.gene_name]
    self._plotting(rank_metric=dat2, results=self.results, res2d=self.res2d,
                   graph_num=self.graph_num, outdir=self.outdir,
                   figsize=self.figsize, format=self.format,
                   module=self.module, data=heat_dat, classes=cls_vector,
                   phenoPos=phenoPos, phenoNeg=phenoNeg)
    logger.info("Congratulations. GSEApy run successfully................")
    return