示例#1
0
    def run(self):
        """Re-draw GSEA enrichment plots from an existing GSEA output folder.

        Looks under ``self.indir + '*/edb/'`` for ``results.edb``, a ``.rnk``
        ranking file and ``gene_sets.gmt`` (first glob match of each), then
        for every ``DTG`` entry in ``results.edb`` recomputes the enrichment
        score with ``enrichment_score`` and passes it to ``gsea_plot``.

        Side effects: creates ``self.outdir``, sets ``self.gene_sets`` to the
        path of the located ``.gmt`` file, and writes plots via ``gsea_plot``.

        Raises:
            AssertionError: if ``self.min_size > self.max_size``.
            SystemExit: with status 1 when any required GSEA file is missing.
        """
        assert self.min_size <= self.max_size

        import glob
        from bs4 import BeautifulSoup

        # The logger must exist before the file lookup below: ``logger`` is a
        # local name, so using it in the ``except`` branch before assignment
        # would raise UnboundLocalError instead of reporting the real problem.
        # The output directory is created first, keeping the original
        # mkdirs -> _log_init ordering.
        mkdirs(self.outdir)
        logger = self._log_init(
            module=self.module,
            log_level=logging.INFO if self.verbose else logging.WARNING)

        # locate the GSEA result files
        edb_glob = self.indir + '*/edb/'
        try:
            results_path = glob.glob(edb_glob + 'results.edb')[0]
            rank_path = glob.glob(edb_glob + '*.rnk')[0]
            gene_set_path = glob.glob(edb_glob + 'gene_sets.gmt')[0]
        except IndexError as e:
            logger.debug(e)
            logger.error("Could not locate GSEA files in the given directory!")
            sys.exit(1)

        # extract sample names from the .cls file, if there is one
        cls_path = glob.glob(edb_glob + '*.cls')
        if cls_path:
            phenoPos, phenoNeg, _ = gsea_cls_parser(cls_path[0])
        else:
            # prerank results: no .cls file was found, use empty labels
            phenoPos, phenoNeg = '', ''

        # start replotting
        self.gene_sets = gene_set_path

        # obtain gene sets
        gene_set_dict = gsea_gmt_parser(gene_set_path,
                                        min_size=self.min_size,
                                        max_size=self.max_size)
        # obtain rank metrics
        rank_metric = self._rank_metric(rank_path)
        correl_vector = rank_metric['rank'].values
        gene_list = rank_metric['gene_name']

        # Count the enrichment terms stored in results.edb. The handle is
        # closed as soon as the document has been parsed.
        with open(results_path) as edb_file:
            database = BeautifulSoup(edb_file, features='xml')
        length = len(database.find_all('DTG'))

        for idx in range(length):
            # extract statistical results from the results.edb file
            enrich_term, hit_ind, nes, pval, fdr = gsea_edb_parser(
                results_path, index=idx)
            # NOTE(review): .get() yields None when the term is missing from
            # gene_set_dict; it is passed through unchanged -- confirm that
            # enrichment_score tolerates that.
            gene_set = gene_set_dict.get(enrich_term)
            # calculate enrichment score; element 2 of the result is plotted
            RES = enrichment_score(
                gene_list=gene_list,
                gene_set=gene_set,
                weighted_score_type=self.weighted_score_type,
                correl_vector=correl_vector)[2]
            # plotting
            gsea_plot(rank_metric, enrich_term, hit_ind, nes, pval, fdr, RES,
                      phenoPos, phenoNeg, self.figsize, self.format,
                      self.outdir, self.module)

        logger.info(
            "Congratulations! Your plots have been reproduced successfully!")
示例#2
0
    def run(self):
        """Re-draw GSEA enrichment plots from an existing GSEA output folder.

        Looks under ``self.indir + '*/edb/'`` for ``results.edb``, a ``.rnk``
        ranking file and ``gene_sets.gmt`` (first glob match of each), then
        for the first ``min(self.fignum, <number of DTG entries>)`` entries of
        ``results.edb`` recomputes the enrichment score with
        ``enrichment_score`` and passes it to ``gsea_plot``.

        Side effects: sets ``self.gene_sets`` to the path of the located
        ``.gmt`` file and writes plots via ``gsea_plot``.

        Raises:
            AssertionError: if ``self.min_size > self.max_size`` or
                ``self.fignum <= 0``.
            SystemExit: with status 1 when any required GSEA file is missing.
        """
        assert self.min_size <= self.max_size
        assert self.fignum > 0
        import glob
        from bs4 import BeautifulSoup

        # locate the GSEA result files
        edb_glob = self.indir + '*/edb/'
        try:
            results_path = glob.glob(edb_glob + 'results.edb')[0]
            rank_path = glob.glob(edb_glob + '*.rnk')[0]
            gene_set_path = glob.glob(edb_glob + 'gene_sets.gmt')[0]
        except IndexError:
            sys.stderr.write(
                "Could not locate GSEA files in the given directory!")
            sys.exit(1)

        # extract sample names from the .cls file, if there is one
        cls_path = glob.glob(edb_glob + '*.cls')
        if cls_path:
            phenoPos, phenoNeg, _ = gsea_cls_parser(cls_path[0])
        else:
            # prerank results: no .cls file was found, use empty labels
            phenoPos, phenoNeg = '', ''

        # start replotting
        self.gene_sets = gene_set_path
        # obtain gene sets
        gene_set_dict = self.parse_gmt(gmt=gene_set_path)
        # obtain rank metrics
        rank_metric = self._load_ranking(rank_path)
        correl_vector = rank_metric.values
        gene_list = rank_metric.index.values

        # Count the enrichment terms stored in results.edb. The handle is
        # closed as soon as the document has been parsed.
        with open(results_path) as edb_file:
            database = BeautifulSoup(edb_file, features='xml')
        length = len(database.find_all('DTG'))

        # never try to plot more terms than the file contains
        fig_num = min(self.fignum, length)
        for idx in range(fig_num):
            # extract statistical results from the results.edb file
            enrich_term, hit_ind, nes, pval, fdr = gsea_edb_parser(
                results_path, index=idx)
            # NOTE(review): .get() yields None when the term is missing from
            # gene_set_dict; it is passed through unchanged -- confirm that
            # enrichment_score tolerates that.
            gene_set = gene_set_dict.get(enrich_term)
            # calculate enrichment score; the last element of the result is
            # the one that gets plotted
            RES = enrichment_score(
                gene_list=gene_list,
                correl_vector=correl_vector,
                gene_set=gene_set,
                weighted_score_type=self.weighted_score_type,
                nperm=0)[-1]
            # plotting
            gsea_plot(rank_metric, enrich_term, hit_ind, nes, pval, fdr, RES,
                      phenoPos, phenoNeg, self.figsize, self.format,
                      self.outdir, self.module)

        self._logger.info(
            "Congratulations! Your plots have been reproduced successfully!\n")
示例#3
0
    def runSamples(self, df, gmt=None):
        """Single Sample GSEA workflow.

        Each column of ``df`` is treated as one sample: it is sorted with
        ``ascending=self.ascending`` and submitted to a worker pool running
        ``enrichment_score_tensor``. Once all workers have finished, the
        scores of every sample are stored in ``self.resultsOnSamples[name]``
        as a ``pd.Series`` indexed by the sorted gene-set names, plots are
        drawn per gene set unless ``self._noplot`` is set, and
        ``self._save`` is called with the original output directory.

        Args:
            df: expression table; ``df.index.values`` are gene names and each
                column is a sample.
            gmt: mapping of gene-set name to its genes. Required, despite
                the ``None`` default kept for signature compatibility.

        Raises:
            ValueError: if ``gmt`` is ``None``.

        Note:
            ``self.outdir`` is re-pointed at each sample's sub-directory
            while results are written and is left at the last one, as in the
            original implementation.
        """
        if gmt is None:
            # fail early with a clear message instead of an AttributeError
            # from ``None.keys()`` further down
            raise ValueError("runSamples() requires a gene set mapping: "
                             "gmt must not be None")

        # df.index.values are gene_names
        # Save each sample's results to an ordered dict
        self.resultsOnSamples = OrderedDict()
        outdir = self.outdir
        # run ssgsea for gct expression matrix
        subsets = sorted(gmt.keys())
        tempes = []
        names = []
        rankings = []
        pool = Pool(processes=self._processes)
        try:
            # DataFrame.items() replaces iteritems(), which pandas 2.0 removed
            for name, ser in df.items():
                # prepare input
                dat = ser.sort_values(ascending=self.ascending)
                rankings.append(dat)
                names.append(name)
                genes_sorted, cor_vec = dat.index.values, dat.values
                # every sample gets a fresh generator built from the same seed
                rs = np.random.RandomState(self.seed)
                # apply_async
                tempes.append(pool.apply_async(enrichment_score_tensor,
                                               args=(genes_sorted, cor_vec, gmt,
                                                     self.weighted_score_type,
                                                     self.permutation_num, rs,
                                                     True, self.scale)))
        finally:
            # always release the workers, even if submitting a task failed
            pool.close()
            pool.join()

        # save results and plotting
        for name, rnk, temp in zip(names, rankings, tempes):
            self._logger.info("Calculate Enrichment Score for Sample: %s ", name)
            es, _esnull, hit_ind, RES = temp.get()
            # create results subdir
            self.outdir = os.path.join(outdir, str(name))
            mkdirs(self.outdir)
            # save results
            self.resultsOnSamples[name] = pd.Series(data=es, index=subsets,
                                                    name=name)
            # plotting
            if self._noplot:
                continue
            self._logger.info("Plotting Sample: %s \n", name)
            # separate index name so the outer loop's variables are not shadowed
            for j, term in enumerate(subsets):
                gsea_plot(rnk, term, hit_ind[j], es[j], 1, 1, RES[j],
                          '', '', self.figsize, self.format,
                          self.outdir, module=self.module)
        # save es, nes to file
        self._save(outdir)