def run(self):
    """Main replot function.

    Locates the files of a previous GSEA run below ``self.indir``
    (``*/edb/results.edb``, ``*/edb/*.rnk``, ``*/edb/gene_sets.gmt`` and,
    optionally, ``*/edb/*.cls``), recomputes the enrichment score for every
    ``DTG`` entry found in ``results.edb`` and redraws one plot per entry
    via ``gsea_plot``.

    Side effects: creates ``self.outdir``, sets ``self.gene_sets`` to the
    path of the located ``.gmt`` file and hands ``self.outdir`` to
    ``gsea_plot`` for every entry.

    Raises:
        AssertionError: if ``self.min_size`` is greater than ``self.max_size``.
        SystemExit: with status 1 when one of the required files is missing.
    """
    assert self.min_size <= self.max_size

    import glob
    from bs4 import BeautifulSoup

    # The logger must exist before the file lookup below. ``logger`` is
    # assigned in this function and is therefore a local name; using it in
    # the ``except`` branch before the assignment raised UnboundLocalError
    # instead of reporting the missing files.
    mkdirs(self.outdir)
    logger = self._log_init(
        module=self.module,
        log_level=logging.INFO if self.verbose else logging.WARNING)

    # Locate the input files; the first match of each pattern is used.
    try:
        results_path = glob.glob(self.indir + '*/edb/results.edb')[0]
        rank_path = glob.glob(self.indir + '*/edb/*.rnk')[0]
        gene_set_path = glob.glob(self.indir + '*/edb/gene_sets.gmt')[0]
    except IndexError as e:
        logger.debug(e)
        logger.error("Could not locate GSEA files in the given directory!")
        sys.exit(1)

    # Extract sample names from the .cls file when there is one.
    cls_path = glob.glob(self.indir + '*/edb/*.cls')
    if cls_path:
        # The third value returned by the parser is not needed here.
        phenoPos, phenoNeg, _ = gsea_cls_parser(cls_path[0])
    else:
        # Logic for prerank results: no .cls file, so no phenotype labels.
        phenoPos, phenoNeg = '', ''

    # Start replotting.
    self.gene_sets = gene_set_path

    # Obtain gene sets.
    gene_set_dict = gsea_gmt_parser(
        gene_set_path, min_size=self.min_size, max_size=self.max_size)

    # Obtain rank metrics.
    rank_metric = self._rank_metric(rank_path)
    correl_vector = rank_metric['rank'].values
    gene_list = rank_metric['gene_name']

    # Count the enrichment terms stored in results.edb. The file is closed
    # as soon as the document has been parsed.
    with open(results_path) as edb_handle:
        database = BeautifulSoup(edb_handle, features='xml')
    length = len(database.findAll('DTG'))

    for idx in range(length):
        # Extract the statistical results of entry ``idx`` from results.edb.
        enrich_term, hit_ind, nes, pval, fdr = gsea_edb_parser(
            results_path, index=idx)
        # NOTE(review): ``gene_set`` is None when the term is absent from
        # the parsed gene sets (e.g. dropped by the size filter) -- confirm
        # that ``enrichment_score`` copes with that.
        gene_set = gene_set_dict.get(enrich_term)

        # Calculate the enrichment score; element 2 of the result is what
        # gets plotted (presumably the running enrichment score curve).
        RES = enrichment_score(
            gene_list=gene_list,
            gene_set=gene_set,
            weighted_score_type=self.weighted_score_type,
            correl_vector=correl_vector)[2]

        # Plotting.
        gsea_plot(rank_metric, enrich_term, hit_ind, nes, pval, fdr, RES,
                  phenoPos, phenoNeg, self.figsize, self.format,
                  self.outdir, self.module)

    logger.info(
        "Congratulations! Your plots have been reproduced successfully!")
def run(self):
    """Main replot function.

    Locates the files of a previous GSEA run below ``self.indir``
    (``*/edb/results.edb``, ``*/edb/*.rnk``, ``*/edb/gene_sets.gmt`` and,
    optionally, ``*/edb/*.cls``), recomputes the enrichment score for the
    first ``self.fignum`` ``DTG`` entries of ``results.edb`` (or all of
    them when there are fewer) and redraws one plot per entry via
    ``gsea_plot``.

    Side effects: sets ``self.gene_sets`` to the path of the located
    ``.gmt`` file and hands ``self.outdir`` to ``gsea_plot`` for every entry.

    Raises:
        AssertionError: if ``self.min_size`` is greater than
            ``self.max_size`` or ``self.fignum`` is not positive.
        SystemExit: with status 1 when one of the required files is missing.
    """
    assert self.min_size <= self.max_size
    assert self.fignum > 0

    import glob
    from bs4 import BeautifulSoup

    # Locate the input files; the first match of each pattern is used.
    try:
        results_path = glob.glob(self.indir + '*/edb/results.edb')[0]
        rank_path = glob.glob(self.indir + '*/edb/*.rnk')[0]
        gene_set_path = glob.glob(self.indir + '*/edb/gene_sets.gmt')[0]
    except IndexError:
        sys.stderr.write(
            "Could not locate GSEA files in the given directory!")
        sys.exit(1)

    # Extract sample names from the .cls file when there is one.
    cls_path = glob.glob(self.indir + '*/edb/*.cls')
    if cls_path:
        # The third value returned by the parser is not needed here.
        phenoPos, phenoNeg, _ = gsea_cls_parser(cls_path[0])
    else:
        # Logic for prerank results: no .cls file, so no phenotype labels.
        phenoPos, phenoNeg = '', ''

    # Start replotting.
    self.gene_sets = gene_set_path

    # Obtain gene sets.
    gene_set_dict = self.parse_gmt(gmt=gene_set_path)

    # Obtain rank metrics; the index supplies the gene names and the
    # values supply the correlation vector.
    rank_metric = self._load_ranking(rank_path)
    correl_vector = rank_metric.values
    gene_list = rank_metric.index.values

    # Count the enrichment terms stored in results.edb. The file is closed
    # as soon as the document has been parsed.
    with open(results_path) as edb_handle:
        database = BeautifulSoup(edb_handle, features='xml')
    length = len(database.findAll('DTG'))

    # Never try to plot more entries than the file contains.
    fig_num = min(self.fignum, length)

    for idx in range(fig_num):
        # Extract the statistical results of entry ``idx`` from results.edb.
        enrich_term, hit_ind, nes, pval, fdr = gsea_edb_parser(
            results_path, index=idx)
        # NOTE(review): ``gene_set`` is None when the term is absent from
        # the parsed gene sets -- confirm ``enrichment_score`` copes with it.
        gene_set = gene_set_dict.get(enrich_term)

        # Calculate the enrichment score without permutations; the last
        # element of the result is what gets plotted (presumably the
        # running enrichment score curve).
        RES = enrichment_score(
            gene_list=gene_list,
            correl_vector=correl_vector,
            gene_set=gene_set,
            weighted_score_type=self.weighted_score_type,
            nperm=0)[-1]

        # Plotting.
        gsea_plot(rank_metric, enrich_term, hit_ind, nes, pval, fdr, RES,
                  phenoPos, phenoNeg, self.figsize, self.format,
                  self.outdir, self.module)

    self._logger.info(
        "Congratulations! Your plots have been reproduced successfully!\n")
def runSamples(self, df, gmt=None):
    """Single Sample GSEA workflow.

    Multiprocessing utility on samples: every column of ``df`` is treated
    as one sample, sorted into a ranking and scored against the gene sets
    in ``gmt`` by ``enrichment_score_tensor`` in a worker pool of
    ``self._processes`` processes.

    Args:
        df: expression table whose index holds the gene names and whose
            columns are the samples.
        gmt: mapping of gene-set name to genes. Despite the ``None``
            default a mapping is required, because its keys are read
            unconditionally.

    Returns:
        None. The per-sample scores are stored in ``self.resultsOnSamples``
        (an ``OrderedDict`` of ``pd.Series`` indexed by the sorted gene-set
        names) and ``self._save`` is called with the original output
        directory.

    Note:
        ``self.outdir`` is re-pointed to ``<outdir>/<sample name>`` for each
        processed sample and is not restored afterwards, so after a call
        with at least one sample it refers to the last sample's directory.
    """
    # df.index.values are gene names.
    # Save each sample's results to an ordered dict.
    self.resultsOnSamples = OrderedDict()
    outdir = self.outdir

    # Sorted once so that scores, plots and the Series index share an order.
    subsets = sorted(gmt.keys())

    tempes = []
    names = []
    rankings = []
    pool = Pool(processes=self._processes)
    try:
        # ``items()`` replaces ``iteritems()``, which pandas 2.0 removed.
        for name, ser in df.items():
            # Prepare input: rank the genes of this sample.
            dat = ser.sort_values(ascending=self.ascending)
            rankings.append(dat)
            names.append(name)
            genes_sorted, cor_vec = dat.index.values, dat.values
            # Every sample gets its own generator built from the same seed.
            rs = np.random.RandomState(self.seed)
            tempes.append(pool.apply_async(
                enrichment_score_tensor,
                args=(genes_sorted, cor_vec, gmt,
                      self.weighted_score_type,
                      self.permutation_num, rs, True, self.scale)))
    finally:
        # Always release the workers, even if submitting a job failed.
        pool.close()
        pool.join()

    # Save results and plot, in the order the samples were submitted.
    for name, rnk, temp in zip(names, rankings, tempes):
        self._logger.info(
            "Calculate Enrichment Score for Sample: %s ", name)
        # The second element of the result is not used here.
        es, _, hit_ind, RES = temp.get()

        # Create the results sub-directory of this sample.
        self.outdir = os.path.join(outdir, str(name))
        mkdirs(self.outdir)

        # Save results.
        self.resultsOnSamples[name] = pd.Series(
            data=es, index=subsets, name=name)

        # Plotting.
        if self._noplot:
            continue
        self._logger.info("Plotting Sample: %s \n", name)
        for term_idx, term in enumerate(subsets):
            # The two constant 1s fill the slots right after the score
            # (presumably p-value and FDR, which are not computed here).
            gsea_plot(rnk, term, hit_ind[term_idx], es[term_idx], 1, 1,
                      RES[term_idx], '', '', self.figsize, self.format,
                      self.outdir, module=self.module)

    # Save es, nes to file.
    self._save(outdir)
    return