示例#1
0
    def run(self):
        """Run the GSEA workflow from input parsing to report generation.

        Steps, in order: validate settings, parse phenotype labels, load the
        expression data, compute the ranking metric (stored on
        ``self.ranking``), build the gene-set dictionary, compute the
        enrichment statistics, save the results, build the heatmap data and,
        unless ``self._noplot`` is set, produce the plots.  When no output
        directory was given (``self._outdir is None``) the temporary
        directory is cleaned up at the end.
        """
        supported_methods = [
            'signal_to_noise', 's2n', 'abs_signal_to_noise', 'abs_s2n',
            't_test', 'ratio_of_classes', 'diff_of_classes',
            'log2_ratio_of_classes',
        ]
        assert self.method in supported_methods
        assert self.permutation_type in ["phenotype", "gene_set"]
        assert self.min_size <= self.max_size

        self._logger.info("Parsing data files for GSEA.............................")

        # Parse the phenotype labels.
        pos_label, neg_label, class_vector = gsea_cls_parser(self.classes)

        # Keep only the expression rows/values usable for the analysis.
        expr, class_map = self.load_data(class_vector)
        # More than one row is required to continue.
        assert len(expr) > 1

        # Rank the genes with the configured metric.
        ranked = ranking_metric(
            df=expr,
            method=self.method,
            pos=pos_label,
            neg=neg_label,
            classes=class_map,
            ascending=self.ascending,
        )
        self.ranking = ranked

        # Filter the gene sets and build the gene-set dictionary.
        gene_sets = self.load_gmt(
            gene_list=ranked.index.values,
            gmt=self.gene_sets,
        )

        count_msg = "%04d gene_sets used for further statistical testing....." % len(gene_sets)
        self._logger.info(count_msg)
        self._logger.info("Start to run GSEA...Might take a while..................")

        # Decide how many CPU cores to use.
        self._set_cores()

        # Phenotype permutation works on the expression table; otherwise the
        # ranked metric is passed.
        if self.permutation_type == 'phenotype':
            compute_input = expr
        else:
            compute_input = ranked

        # Compute ES, NES, pval, FDR, RES.
        stats, hit_indices, running_es, set_names = gsea_compute_tensor(
            data=compute_input,
            gmt=gene_sets,
            n=self.permutation_num,
            weighted_score_type=self.weighted_score_type,
            permutation_type=self.permutation_type,
            method=self.method,
            pheno_pos=pos_label,
            pheno_neg=neg_label,
            classes=class_vector,
            ascending=self.ascending,
            processes=self._processes,
            seed=self.seed,
        )

        self._logger.info("Start to generate GSEApy reports and figures............")
        zipped_results = zip(set_names, list(stats), hit_indices, running_es)
        self._save_results(
            zipdata=zipped_results,
            outdir=self.outdir,
            module=self.module,
            gmt=gene_sets,
            rank_metric=ranked,
            permutation_type=self.permutation_type,
        )

        # Reorder the expression table to follow the ranking for the heatmap.
        self._heatmat(
            df=expr.loc[ranked.index],
            classes=class_vector,
            pheno_pos=pos_label,
            pheno_neg=neg_label,
        )

        # Plotting is optional.
        if not self._noplot:
            self._plotting(
                rank_metric=ranked,
                results=self.results,
                graph_num=self.graph_num,
                outdir=self.outdir,
                figsize=self.figsize,
                format=self.format,
                pheno_pos=pos_label,
                pheno_neg=neg_label,
            )

        self._logger.info("Congratulations. GSEApy ran successfully.................\n")

        # No user-supplied output directory: remove the temporary one.
        if self._outdir is None:
            self._tmpdir.cleanup()
示例#2
0
    def run(self):
        """GSEA main procedure.

        Loads the expression table from ``self.data`` (a DataFrame, or a path
        to a file read with ``pd.read_table``), parses the phenotype labels,
        computes the ranking metric, filters the gene sets, computes the
        enrichment statistics, then saves the results and produces the plots.

        Raises:
            AssertionError: if ``self.permutation_type`` is not "phenotype"
                or "gene_set", if ``self.min_size > self.max_size``, or if
                the loaded table does not have more than one row.
            Exception: if ``self.data`` is neither a DataFrame nor the path
                of an existing file.
        """

        assert self.permutation_type in ["phenotype", "gene_set"]
        assert self.min_size <= self.max_size

        if isinstance(self.data, pd.DataFrame):
            # Work on a copy so the caller's DataFrame is left untouched.
            df = self.data.copy()
        elif os.path.isfile(self.data):
            df = pd.read_table(self.data, comment='#')
        else:
            raise Exception('Error parsing gene expression dataframe!')
        # data frame must have length > 1
        assert len(df) > 1
        # create output dirs
        mkdirs(self.outdir)
        logger = self._log_init(
            module=self.module,
            log_level=logging.INFO if self.verbose else logging.WARNING)
        # Start Analysis
        logger.info("Parsing data files for GSEA.............................")

        # phenotype labels parsing
        phenoPos, phenoNeg, cls_vector = gsea_cls_parser(self.classes)
        # select correct expression genes and values.
        dat = self.__drop_dat(df, cls_vector)
        # ranking metrics calculation.
        dat2 = ranking_metric(df=dat,
                              method=self.method,
                              phenoPos=phenoPos,
                              phenoNeg=phenoNeg,
                              classes=cls_vector,
                              ascending=self.ascending)

        # filtering out gene sets and build gene sets dictionary
        gmt = gsea_gmt_parser(self.gene_sets,
                              min_size=self.min_size,
                              max_size=self.max_size,
                              gene_list=dat2['gene_name'].values)

        # Lazy logger arguments: the message is only formatted if it is emitted.
        logger.info(
            "%04d gene_sets used for further statistical testing.....",
            len(gmt))
        logger.info("Start to run GSEA...Might take a while..................")
        # cpu numbers
        self._set_cores()
        # compute ES, NES, pval, FDR, RES
        gsea_results, hit_ind, rank_ES, subsets = gsea_compute(
            data=dat,
            n=self.permutation_num,
            gmt=gmt,
            weighted_score_type=self.weighted_score_type,
            permutation_type=self.permutation_type,
            method=self.method,
            phenoPos=phenoPos,
            phenoNeg=phenoNeg,
            classes=cls_vector,
            ascending=self.ascending,
            seed=self.seed,
            processes=self._processes)

        logger.info("Start to generate gseapy reports, and produce figures...")
        res_zip = zip(subsets, list(gsea_results), hit_ind, rank_ES)
        # NOTE(review): the literal "gene_sets" is passed here rather than
        # self.permutation_type, and it is not one of the values accepted by
        # the assertion above ("phenotype" / "gene_set"). Confirm against
        # _save_results before changing it.
        self._save_results(zipdata=res_zip,
                           outdir=self.outdir,
                           module=self.module,
                           gmt=gmt,
                           rank_metric=dat2,
                           permutation_type="gene_sets")

        # Plotting: reorder the expression rows to follow the ranking.
        heat_dat = dat.loc[dat2.gene_name]
        self._plotting(rank_metric=dat2,
                       results=self.results,
                       res2d=self.res2d,
                       graph_num=self.graph_num,
                       outdir=self.outdir,
                       figsize=self.figsize,
                       format=self.format,
                       module=self.module,
                       data=heat_dat,
                       classes=cls_vector,
                       phenoPos=phenoPos,
                       phenoNeg=phenoNeg)

        logger.info("Congratulations. GSEApy run successfully................")

        return