def run(self):
    """Single Sample GSEA workflow.

    Steps, in order: create the output directory, set up a logger,
    build the ranking table from ``self.data``, select the gene sets to
    test, compute the enrichment statistics, write the reports and draw
    the figures.

    Raises:
        AssertionError: if ``self.min_size`` exceeds ``self.max_size``,
            or if the ranking table has fewer than two entries.
    """
    assert self.min_size <= self.max_size

    mkdirs(self.outdir)

    # Verbose runs log at INFO, quiet ones only from WARNING upwards.
    if self.verbose:
        level = logging.INFO
    else:
        level = logging.WARNING
    logger = self._log_init(module=self.module, log_level=level)

    ranking = self._rank_metric(self.data)
    assert len(ranking) > 1

    # cpu numbers
    self._set_cores()

    # Start analysis
    logger.info("Parsing data files for GSEA.............................")
    # Index the ranking by gene name and discard the 'rank2' column so
    # only the remaining ranking column(s) reach the scoring step.
    scores = ranking.set_index('gene_name').drop('rank2', axis=1)

    # Filter the gene sets and build the gene set dictionary.
    gmt = gsea_gmt_parser(
        self.gene_sets,
        min_size=self.min_size,
        max_size=self.max_size,
        gene_list=scores.index.values,
    )
    n_sets = len(gmt)
    logger.info("%04d gene_sets used for further statistical testing....." % n_sets)
    logger.info("Start to run GSEA...Might take a while..................")

    # compute ES, NES, pval, FDR, RES
    computed = gsea_compute_ss(
        data=scores,
        n=self.permutation_num,
        gmt=gmt,
        weighted_score_type=self.weighted_score_type,
        seed=self.seed,
        processes=self._processes,
    )
    gsea_results, hit_ind, rank_ES, subsets = computed

    logger.info("Start to generate gseapy reports, and produce figures...")
    # One tuple per gene set: (name, statistics, hit indices, running ES).
    per_set = zip(subsets, list(gsea_results), hit_ind, rank_ES)
    self._save_results(
        zipdata=per_set,
        outdir=self.outdir,
        module=self.module,
        gmt=gmt,
        rank_metric=ranking,
        permutation_type="gene_sets",
    )

    # Plotting; self.results / self.res2d are presumably populated by
    # _save_results above -- confirm against that method.
    self._plotting(
        rank_metric=ranking,
        results=self.results,
        res2d=self.res2d,
        graph_num=self.graph_num,
        outdir=self.outdir,
        figsize=self.figsize,
        format=self.format,
        module=self.module,
    )
    logger.info("Congratulations. GSEApy run successfully................")
def runSample(self, df, gmt=None):
    """Single Sample GSEA workflow for one gene ranking.

    Args:
        df: gene ranking values. Accepted forms:

            * a ``pandas.Series`` (index = genes, values = ranking metric);
            * a ``pandas.DataFrame`` with a single column (genes in the
              index) or with exactly two columns (gene, ranking value);
            * a ``pandas.DataFrame`` that already has ``gene_name`` and
              ``rank2`` columns, which is used as-is.

            The caller's object is not modified.
        gmt: optional, already-built gene set dictionary. When ``None``
            it is built from ``self.gene_sets`` with ``gsea_gmt_parser``.

    Raises:
        AssertionError: if ``self.min_size`` exceeds ``self.max_size``.
        Exception: if ``df`` is neither a Series nor a DataFrame, or is a
            DataFrame whose columns cannot be read as (gene, rank).
    """
    assert self.min_size <= self.max_size

    # Start Analysis
    self._logger.info(
        "Parsing data files for GSEA.............................")

    # Select the gene names and ranking values. The input is validated
    # before the output directory is created, so a rejected input does
    # not leave an empty directory behind.
    if isinstance(df, pd.Series):
        df = df.reset_index()
        needs_normalising = True
    elif isinstance(df, pd.DataFrame):
        if df.shape[1] == 1:
            df = df.reset_index()
        # A frame that already has the expected columns is passed
        # through untouched.
        already_ranked = 'gene_name' in df.columns and 'rank2' in df.columns
        if not already_ranked and df.shape[1] != 2:
            raise Exception('Error parsing gene ranking values!')
        needs_normalising = not already_ranked
    else:
        raise Exception('Error parsing gene ranking values!')

    if needs_normalising:
        # Sort ranking values from high to low or reverse. Previously
        # this only happened for Series input, so a plain DataFrame
        # failed further down with a KeyError. The non-inplace sort
        # returns a new frame and keeps the caller's data intact.
        df = df.sort_values(by=df.columns[1], ascending=self.ascending)
        df.columns = ['gene_name', 'rank']
        df['rank2'] = df['rank']

    # Remove rank2: the scoring step gets the frame indexed by gene
    # name without that column.
    dat2 = df.set_index('gene_name')
    del dat2['rank2']

    mkdirs(self.outdir)

    # cpu numbers
    self._set_cores()

    # Filter the gene sets and build the gene set dictionary.
    if gmt is None:
        gmt = gsea_gmt_parser(self.gene_sets,
                              min_size=self.min_size,
                              max_size=self.max_size,
                              gene_list=dat2.index.values)

    self._logger.info(
        "%04d gene_sets used for further statistical testing.....",
        len(gmt))
    self._logger.info(
        "Start to run GSEA...Might take a while..................")

    # compute ES, NES, pval, FDR, RES
    gsea_results, hit_ind, rank_ES, subsets = gsea_compute_ss(
        data=dat2,
        n=self.permutation_num,
        gmt=gmt,
        weighted_score_type=self.weighted_score_type,
        seed=self.seed,
        processes=self._processes)

    self._logger.info(
        "Start to generate gseapy reports, and produce figures...")
    res_zip = zip(subsets, list(gsea_results), hit_ind, rank_ES)
    self._save_results(zipdata=res_zip,
                       outdir=self.outdir,
                       module=self.module,
                       gmt=gmt,
                       rank_metric=df,
                       permutation_type="gene_sets")

    # Plotting; self.results / self.res2d are presumably populated by
    # _save_results above -- confirm against that method.
    self._plotting(rank_metric=df,
                   results=self.results,
                   res2d=self.res2d,
                   graph_num=self.graph_num,
                   outdir=self.outdir,
                   figsize=self.figsize,
                   format=self.format,
                   module=self.module)
    self._logger.info(
        "Congratulations. GSEApy run successfully................")
    return