示例#1
0
    def _download_libraries(self, libname):
        """Download an Enrichr gene-set library and cache it as a GMT file.

        The library is fetched in text mode from the Enrichr
        ``geneSetLibrary`` endpoint, parsed line by line, and written to
        ``enrichr.<libname>.gmt`` inside ``DEFAULT_CACHE_PATH``.

        :param libname: library name substituted into the ``libraryName``
                        query parameter.
        :return: dict mapping each term (first tab-separated field) to its
                 list of genes.
        :raises Exception: if the HTTP response is not OK.
        """
        self._logger.info("Downloading and generating Enrichr library gene sets......")
        s = retry(5)
        # query string
        ENRICHR_URL = 'http://amp.pharm.mssm.edu/Enrichr/geneSetLibrary'
        query_string = '?mode=text&libraryName=%s'
        # get
        response = s.get(ENRICHR_URL + query_string % libname, timeout=None)
        if not response.ok:
            raise Exception('Error fetching enrichment results, check internet connection first.')
        # reformat to dict and save to disk
        mkdirs(DEFAULT_CACHE_PATH)
        genesets_dict = {}
        outname = "enrichr.%s.gmt" % libname
        # The context manager guarantees the cache file is closed even when
        # parsing a line raises.
        with open(os.path.join(DEFAULT_CACHE_PATH, outname), "w") as gmtout:
            for line in response.iter_lines(chunk_size=1024, decode_unicode='utf-8'):
                line = line.strip()
                # Skip blank lines: they would otherwise add an empty-named
                # gene set and write a malformed GMT row.
                if not line:
                    continue
                fields = line.split("\t")
                term = fields[0]
                # Field 1 is skipped; each remaining field keeps only the
                # text before its first comma.
                genes = [field.split(",")[0] for field in fields[2:]]
                genesets_dict[term] = genes
                gmtout.write("%s\t\t%s\n" % (term, "\t".join(genes)))

        return genesets_dict
示例#2
0
    def run(self):
        """Re-draw enrichment plots from an existing GSEA output directory.

        Locates ``results.edb``, the ``.rnk`` file and ``gene_sets.gmt`` under
        ``<indir>*/edb/``, recomputes the running enrichment score for every
        ``DTG`` entry of ``results.edb`` and plots it into ``self.outdir``.
        Exits the process with status 1 if the GSEA files cannot be found.
        """
        assert self.min_size <= self.max_size

        import glob
        from bs4 import BeautifulSoup

        # The logger must exist before the file lookup below: it is a local
        # name, so using it in the except branch before assignment would
        # raise UnboundLocalError instead of reporting the real problem.
        mkdirs(self.outdir)
        logger = self._log_init(
            module=self.module,
            log_level=logging.INFO if self.verbose else logging.WARNING)

        # parsing files
        try:
            results_path = glob.glob(self.indir + '*/edb/results.edb')[0]
            rank_path = glob.glob(self.indir + '*/edb/*.rnk')[0]
            gene_set_path = glob.glob(self.indir + '*/edb/gene_sets.gmt')[0]
        except IndexError as e:
            logger.debug(e)
            logger.error("Could not locate GSEA files in the given directory!")
            sys.exit(1)
        # extract sample names from .cls file
        cls_path = glob.glob(self.indir + '*/edb/*.cls')
        if cls_path:
            phenoPos, phenoNeg, _classes = gsea_cls_parser(cls_path[0])
        else:
            # no .cls file found (prerank results): leave phenotype labels blank
            phenoPos, phenoNeg = '', ''
        # start replotting
        self.gene_sets = gene_set_path

        # obtain gene sets
        gene_set_dict = gsea_gmt_parser(gene_set_path,
                                        min_size=self.min_size,
                                        max_size=self.max_size)
        # obtain rank metrics
        rank_metric = self._rank_metric(rank_path)
        correl_vector = rank_metric['rank'].values
        gene_list = rank_metric['gene_name']
        # count the enrichment terms stored in results.edb; the handle is
        # closed as soon as the document has been parsed
        with open(results_path) as edb:
            database = BeautifulSoup(edb, features='xml')
        length = len(database.find_all('DTG'))

        for idx in range(length):
            # extract statistical results from the results.edb file
            enrich_term, hit_ind, nes, pval, fdr = gsea_edb_parser(
                results_path, index=idx)
            gene_set = gene_set_dict.get(enrich_term)
            # calculate enrichment score; element [2] of the result is used
            # as the running enrichment score for plotting
            RES = enrichment_score(
                gene_list=gene_list,
                gene_set=gene_set,
                weighted_score_type=self.weighted_score_type,
                correl_vector=correl_vector)[2]
            # plotting
            gsea_plot(rank_metric, enrich_term, hit_ind, nes, pval, fdr, RES,
                      phenoPos, phenoNeg, self.figsize, self.format,
                      self.outdir, self.module)

        logger.info(
            "Congratulations! Your plots have been reproduced successfully!")
示例#3
0
文件: gsea.py 项目: cthoyt/GSEApy
 def __init__(self, data, gene_sets, outdir="GSEA_SingleSample", sample_norm_method='rank',
              min_size=15, max_size=2000, permutation_num=0, weighted_score_type=0.25,
              scale=True, ascending=False, processes=1, figsize=(7,6), format='pdf',
              graph_num=20, no_plot=False, seed=None, verbose=False):
     """Store the run settings, create ``outdir`` and set up the log file.

     All arguments are stored on the instance. ``permutation_num`` and
     ``graph_num`` are coerced to ``int`` (negative permutation counts
     become 0) and ``verbose`` to ``bool``. The log file is written to
     ``<outdir>/gseapy.ssgsea.<gene set file name without .gmt>.log``.
     """
     self.data = data
     self.gene_sets = gene_sets
     self.outdir = outdir
     self.sample_norm_method = sample_norm_method
     self.weighted_score_type = weighted_score_type
     self.scale = scale
     self.min_size = min_size
     self.max_size = max_size
     # negative permutation counts are clamped to zero
     self.permutation_num = max(int(permutation_num), 0)
     self.ascending = ascending
     self.figsize = figsize
     self.format = format
     self.graph_num = int(graph_num)
     self.seed = seed
     self.verbose = bool(verbose)
     self.ranking = None
     self.module = 'ssgsea'
     self._processes = processes
     self._noplot = no_plot
     # init logger
     mkdirs(self.outdir)
     _gset = os.path.split(self.gene_sets)[-1].lower()
     # Remove the ".gmt" suffix explicitly: str.rstrip(".gmt") strips any
     # trailing '.', 'g', 'm' or 't' characters ("kegg.gmt" -> "ke").
     if _gset.endswith(".gmt"):
         _gset = _gset[:-len(".gmt")]
     outlog = os.path.join(self.outdir, "gseapy.%s.%s.log" % (self.module, _gset))
     self._logger = log_init(outlog=outlog,
                             log_level=logging.INFO if self.verbose else logging.WARNING)
示例#4
0
文件: gsea.py 项目: cthoyt/GSEApy
    def __init__(self, data, gene_sets, classes, outdir='GSEA_ouput',
                 min_size=15, max_size=500, permutation_num=1000,
                 weighted_score_type=1, permutation_type='gene_set',
                 method='log2_ratio_of_classes', ascending=False,
                 processes=1, figsize=(6.5,6), format='pdf', graph_num=20,
                 no_plot=False, seed=None, verbose=False):
        """Store the run settings, create ``outdir`` and set up the log file.

        All arguments are stored on the instance. ``permutation_num`` and
        ``graph_num`` are coerced to ``int`` (negative permutation counts
        become 0) and ``verbose`` to ``bool``. The log file is written to
        ``<outdir>/gseapy.gsea.<gene set file name without .gmt>.log``.
        """
        self.data = data
        self.gene_sets = gene_sets
        self.classes = classes
        self.outdir = outdir
        self.permutation_type = permutation_type
        self.method = method
        self.min_size = min_size
        self.max_size = max_size
        # negative permutation counts are clamped to zero
        self.permutation_num = max(int(permutation_num), 0)
        self.weighted_score_type = weighted_score_type
        self.ascending = ascending
        self._processes = processes
        self.figsize = figsize
        self.format = format
        self.graph_num = int(graph_num)
        self.seed = seed
        self.verbose = bool(verbose)
        self.module = 'gsea'
        self.ranking = None
        self._noplot = no_plot
        # init logger
        mkdirs(self.outdir)
        _gset = os.path.split(self.gene_sets)[-1].lower()
        # Remove the ".gmt" suffix explicitly: str.rstrip(".gmt") strips any
        # trailing '.', 'g', 'm' or 't' characters ("kegg.gmt" -> "ke").
        if _gset.endswith(".gmt"):
            _gset = _gset[:-len(".gmt")]
        outlog = os.path.join(self.outdir, "gseapy.%s.%s.log" % (self.module, _gset))
        self._logger = log_init(outlog=outlog,
                                log_level=logging.INFO if self.verbose else logging.WARNING)
示例#5
0
文件: gsea.py 项目: cthoyt/GSEApy
    def __init__(self, rnk, gene_sets, outdir='GSEA_prerank',
                 pheno_pos='Pos', pheno_neg='Neg', min_size=15, max_size=500,
                 permutation_num=1000, weighted_score_type=1,
                 ascending=False, processes=1, figsize=(6.5,6), format='pdf',
                 graph_num=20, no_plot=False, seed=None, verbose=False):
        """Store the run settings, create ``outdir`` and set up the log file.

        All arguments are stored on the instance. ``permutation_num`` and
        ``graph_num`` are coerced to ``int`` (negative permutation counts
        become 0) and ``verbose`` to ``bool``. The log file is written to
        ``<outdir>/gseapy.prerank.<gene set file name without .gmt>.log``.
        """
        self.rnk = rnk
        self.gene_sets = gene_sets
        self.outdir = outdir
        self.pheno_pos = pheno_pos
        self.pheno_neg = pheno_neg
        self.min_size = min_size
        self.max_size = max_size
        # negative permutation counts are clamped to zero
        self.permutation_num = max(int(permutation_num), 0)
        self.weighted_score_type = weighted_score_type
        self.ascending = ascending
        self.figsize = figsize
        self.format = format
        self.graph_num = int(graph_num)
        self.seed = seed
        self.verbose = bool(verbose)
        self.ranking = None
        self.module = 'prerank'
        self._processes = processes
        self._noplot = no_plot
        # init logger
        mkdirs(self.outdir)
        _gset = os.path.split(self.gene_sets)[-1].lower()
        # Remove the ".gmt" suffix explicitly: str.rstrip(".gmt") strips any
        # trailing '.', 'g', 'm' or 't' characters ("kegg.gmt" -> "ke").
        if _gset.endswith(".gmt"):
            _gset = _gset[:-len(".gmt")]
        outlog = os.path.join(self.outdir, "gseapy.%s.%s.log" % (self.module, _gset))
        self._logger = log_init(outlog=outlog,
                                log_level=logging.INFO if self.verbose else logging.WARNING)
示例#6
0
 def __init__(self,
              indir,
              outdir='GSEApy_Replot',
              weighted_score_type=1,
              min_size=3,
              max_size=1000,
              figsize=(6.5, 6),
              graph_num=20,
              format='pdf',
              verbose=False):
     """Record the replot settings, create ``outdir`` and open the log file.

     ``graph_num`` is stored as ``self.fignum`` (an ``int``) and ``verbose``
     is coerced to ``bool``. The log file is
     ``<outdir>/gseapy.replot.run.log``.
     """
     # input / output locations
     self.indir, self.outdir = indir, outdir
     # scoring and gene-set size limits
     self.weighted_score_type = weighted_score_type
     self.min_size, self.max_size = min_size, max_size
     # figure options
     self.figsize = figsize
     self.fignum = int(graph_num)
     self.format = format
     self.verbose = bool(verbose)
     # fixed settings for this module
     self.module = 'replot'
     self.gene_sets = None
     self.ascending = False
     # init logger
     mkdirs(self.outdir)
     log_name = "gseapy.%s.%s.log" % (self.module, "run")
     if self.verbose:
         level = logging.INFO
     else:
         level = logging.WARNING
     self._logger = log_init(outlog=os.path.join(self.outdir, log_name),
                             log_level=level)
示例#7
0
    def query(self, dataset='hsapiens_gene_ensembl', attributes=[],
              filters={}, filename=None):
        """Map ids using BioMart and return the result as a dataframe.

        :param dataset: str, default: 'hsapiens_gene_ensembl'
        :param attributes: str, list or tuple of attribute names. When empty,
                           ``['ensembl_gene_id', 'external_gene_name',
                           'entrezgene', 'go_id']`` is used.
        :param filters: dict, {'filter name': filter value}. List or tuple
                        values are joined with commas.
        :param filename: path of the tab-separated output file. When None,
                         ``<DEFAULT_CACHE_PATH>/<dataset>.background.genes.txt``
                         is used.
        :return: a dataframe whose columns are the selected attributes.

        **Note**: it will take a couple of minutes to get the results.
        For a raw XML template for querying BioMart, see
        https://gist.github.com/keithshep/7776579
        """
        # The mutable defaults are kept for interface compatibility; they are
        # only read or rebound here, never mutated.
        if not attributes:
            attributes = ['ensembl_gene_id', 'external_gene_name', 'entrezgene', 'go_id']
        elif isinstance(attributes, str):
            # A bare string would otherwise be iterated character by character.
            attributes = [attributes]
        self.new_query()
        # e.g. 'mmusculus_gene_ensembl'
        self.add_dataset_to_xml(dataset)
        for at in attributes:
            self.add_attribute_to_xml(at)
        # add filters
        if filters:
            for k, v in filters.items():
                # multiple filter values are sent as one comma-separated string
                if isinstance(v, (list, tuple)):
                    v = ",".join(v)
                self.add_filter_to_xml(k, v)

        xml_query = self.get_xml()
        results = super(Biomart, self).query(xml_query)
        df = pd.read_csv(StringIO(results), header=None, sep="\t",
                         names=attributes, index_col=None)
        # save file to cache path.
        if filename is None:
            mkdirs(DEFAULT_CACHE_PATH)
            filename = os.path.join(DEFAULT_CACHE_PATH, "{}.background.genes.txt".format(dataset))
        df.to_csv(filename, sep="\t", index=False)

        return df
示例#8
0
    def runSamplesPermu(self, df, gmt=None):
        """Single Sample GSEA workflow with permutation procedure.

        Each column of ``df`` is treated as one sample: it is sorted, passed
        to ``gsea_compute``, and its results are saved (and optionally
        plotted) in a sub-directory of the original ``self.outdir`` named
        after the column.

        :param df: dataframe iterated column by column.
        :param gmt: gene sets passed through to ``gsea_compute`` and
                    ``_save_results``.
        :return: None. Per-sample enrichment scores are collected in
                 ``self.resultsOnSamples``.
        """

        assert self.min_size <= self.max_size
        mkdirs(self.outdir)
        self.resultsOnSamples = OrderedDict()
        outdir = self.outdir
        # Iterate through each sample. DataFrame.items() replaces
        # iteritems(), which was removed in pandas 2.0.
        for name, ser in df.items():
            # NOTE(review): self.outdir is re-pointed at the per-sample
            # directory and is not restored after the loop.
            self.outdir = os.path.join(outdir, str(name))
            self._logger.info("Run Sample: %s " % name)
            mkdirs(self.outdir)
            # sort ranking values from high to low or reverse
            dat2 = ser.sort_values(ascending=self.ascending)

            # compute ES, NES, pval, FDR, RES
            gsea_results, hit_ind, rank_ES, subsets = gsea_compute(
                data=dat2,
                n=self.permutation_num,
                gmt=gmt,
                weighted_score_type=self.weighted_score_type,
                permutation_type='gene_set',
                method=None,
                pheno_pos='',
                pheno_neg='',
                classes=None,
                ascending=self.ascending,
                processes=self._processes,
                seed=self.seed,
                single=True,
                scale=self.scale)

            # write file
            res_zip = zip(subsets, list(gsea_results), hit_ind, rank_ES)
            self._save_results(zipdata=res_zip,
                               outdir=self.outdir,
                               module=self.module,
                               gmt=gmt,
                               rank_metric=dat2,
                               permutation_type="gene_sets")
            self.resultsOnSamples[name] = self.res2d.es
            # plotting
            if self._noplot:
                continue
            self._logger.info("Plotting Sample: %s \n" % name)
            self._plotting(rank_metric=dat2,
                           results=self.results,
                           res2d=self.res2d,
                           graph_num=self.graph_num,
                           outdir=self.outdir,
                           figsize=self.figsize,
                           format=self.format,
                           module=self.module)

        # save es, nes to file (in the original, top-level output directory)
        self._save(outdir)

        return
示例#9
0
    def runSamples(self, df, gmt=None):
        """Single Sample GSEA workflow, with samples scored in a process pool.

        Each column of ``df`` is sorted and submitted to
        ``enrichment_score_tensor`` through ``Pool.apply_async``. Once all
        jobs have finished, the scores are stored per sample and, unless
        plotting is disabled, one figure per gene set is written to a
        sub-directory of the original ``self.outdir`` named after the sample.

        :param df: dataframe iterated column by column; the index holds the
                   gene names.
        :param gmt: mapping of gene-set names to gene sets (its keys are
                    sorted and used as the result index).
        :return: None. Per-sample scores are collected in
                 ``self.resultsOnSamples``.
        """

        # Save each sample's results to an ordered dict
        self.resultsOnSamples = OrderedDict()
        outdir = self.outdir
        subsets = sorted(gmt)
        # one (sample name, sorted ranking, async result) entry per sample
        jobs = []
        pool = Pool(processes=self._processes)
        try:
            # DataFrame.items() replaces iteritems(), removed in pandas 2.0.
            for name, ser in df.items():
                # prepare input
                dat = ser.sort_values(ascending=self.ascending)
                genes_sorted, cor_vec = dat.index.values, dat.values
                rs = np.random.RandomState(self.seed)
                handle = pool.apply_async(enrichment_score_tensor,
                                          args=(genes_sorted, cor_vec, gmt,
                                                self.weighted_score_type,
                                                self.permutation_num, rs, True,
                                                self.scale))
                jobs.append((name, dat, handle))
        finally:
            # always release the worker processes, even if submission failed
            pool.close()
            pool.join()
        # save results and plotting
        for name, rnk, handle in jobs:
            self._logger.info("Calculate Enrichment Score for Sample: %s "%name)
            es, esnull, hit_ind, RES = handle.get()
            # create results subdir
            self.outdir = os.path.join(outdir, str(name))
            mkdirs(self.outdir)
            # save results
            self.resultsOnSamples[name] = pd.Series(data=es, index=subsets, name=name)
            # plotting
            if self._noplot:
                continue
            self._logger.info("Plotting Sample: %s \n" % name)
            # a separate index name avoids shadowing any outer loop variable
            for j, term in enumerate(subsets):
                # make the term safe to use inside a file name
                term = term.replace('/','_').replace(":","_")
                outfile = '{0}/{1}.{2}.{3}'.format(self.outdir, term, self.module, self.format)
                # pval and fdr are passed as the constant 1
                gseaplot(rank_metric=rnk, term=term,
                         hits_indices=hit_ind[j], nes=es[j], pval=1, fdr=1,
                         RES=RES[j], pheno_pos='', pheno_neg='',
                         figsize=self.figsize, ofname=outfile)
        # save es, nes to file (in the original, top-level output directory)
        self._save(outdir)

        return
示例#10
0
 def run(self):
     """Create the output directory, set up logging and run the analysis.

     The input is loaded with ``_rank_metric``. When ``self.ranking`` is
     None the data goes to ``runOnSamples``; otherwise it goes to
     ``runSample``.
     """
     mkdirs(self.outdir)
     level = logging.INFO if self.verbose else logging.WARNING
     self._log_init(module=self.module, log_level=level)
     # load data
     ranked = self._rank_metric(self.data)
     if self.ranking is not None:
         # only for one sample
         self.runSample(df=ranked)
         return
     # gct expression matrix support for ssGSEA
     self.runOnSamples(df=ranked)
示例#11
0
文件: gsea.py 项目: olgabot/GSEApy
    def run(self):
        """Run the prerank workflow: parse, score, save results and plot.

        Loads the ranking with ``_rank_metric``, filters the gene sets by
        size against the ranked gene names, runs ``gsea_compute`` with
        gene-set permutation, then saves the results and draws the figures
        into ``self.outdir``.
        """
        assert self.min_size <= self.max_size
        mkdirs(self.outdir)
        level = logging.INFO if self.verbose else logging.WARNING
        logger = self._log_init(module=self.module, log_level=level)

        ranking = self._rank_metric(self.rnk)
        assert len(ranking) > 1

        # cpu numbers
        self._set_cores()

        # start analysis
        logger.info("Parsing data files for GSEA.............................")

        # keep only gene sets within the size limits and build the dictionary
        ranked_genes = ranking['gene_name'].values
        gmt = gsea_gmt_parser(
            self.gene_sets,
            min_size=self.min_size,
            max_size=self.max_size,
            gene_list=ranked_genes)
        logger.info("%04d gene_sets used for further statistical testing....."% len(gmt))

        logger.info("Start to run GSEA...Might take a while..................")
        # compute ES, NES, pval, FDR, RES
        computed = gsea_compute(
            data=ranking,
            n=self.permutation_num,
            gmt=gmt,
            weighted_score_type=self.weighted_score_type,
            permutation_type='gene_set',
            method=None,
            phenoPos=self.pheno_pos,
            phenoNeg=self.pheno_neg,
            classes=None,
            ascending=self.ascending,
            seed=self.seed,
            processes=self._processes,
            prerank=True)
        gsea_results, hit_ind, rank_ES, subsets = computed

        logger.info("Start to generate gseapy reports, and produce figures...")
        self._save_results(
            zipdata=zip(subsets, list(gsea_results), hit_ind, rank_ES),
            outdir=self.outdir,
            module=self.module,
            gmt=gmt,
            rank_metric=ranking,
            permutation_type="gene_sets")

        # plotting
        self._plotting(
            rank_metric=ranking,
            results=self.results,
            res2d=self.res2d,
            graph_num=self.graph_num,
            outdir=self.outdir,
            figsize=self.figsize,
            format=self.format,
            module=self.module)

        logger.info("Congratulations. GSEApy run successfully................")
示例#12
0
文件: gsea.py 项目: ranikay/GSEApy
    def run(self):
        """Create the output directory, set up logging and run the analysis.

        The data from ``load_data`` is passed through ``norm_samples``. When
        ``self.ranking`` is None the result goes to ``runOnSamples``;
        otherwise it goes to ``runSample``.
        """
        mkdirs(self.outdir)
        level = logging.INFO if self.verbose else logging.WARNING
        self._log_init(module=self.module, log_level=level)

        # load data, then normalize samples and rank
        normalized = self.norm_samples(self.load_data())

        if self.ranking is not None:
            # only for one sample
            self.runSample(df=normalized)
            return
        # gct expression matrix support for ssGSEA
        self.runOnSamples(df=normalized)
示例#13
0
    def prepare_outdir(self):
        """Prepare the output directory and return the log file path.

        The original ``self.outdir`` is remembered in ``self._outdir``. When
        it is None, a ``TemporaryDirectory`` is created (kept alive on
        ``self._tmpdir``) and ``self.outdir`` is pointed at it; when it is a
        string, the directory is created with ``mkdirs``.

        :return: ``<outdir>/gseapy.<module>.<gene set name>.log``, where the
                 gene set name is the lower-cased file name without its
                 ``.gmt`` suffix, or ``"blank_name"`` for a dict.
        :raises Exception: if ``outdir`` is neither None nor a str, or if
                           ``gene_sets`` is neither a str nor a dict.
        """
        self._outdir = self.outdir
        if self._outdir is None:
            self._tmpdir = TemporaryDirectory()
            self.outdir = self._tmpdir.name
        elif isinstance(self.outdir, str):
            mkdirs(self.outdir)
        else:
            raise Exception("Error parsing outdir: %s"%type(self.outdir))

        # handle gmt type
        if isinstance(self.gene_sets, str):
            _gset = os.path.split(self.gene_sets)[-1].lower()
            # Remove the ".gmt" suffix explicitly: str.rstrip(".gmt") strips
            # any trailing '.', 'g', 'm' or 't' characters
            # ("kegg.gmt" -> "ke").
            if _gset.endswith(".gmt"):
                _gset = _gset[:-len(".gmt")]
        elif isinstance(self.gene_sets, dict):
            _gset = "blank_name"
        else:
            raise Exception("Error parsing gene_sets parameter for gene sets")

        logfile = os.path.join(self.outdir, "gseapy.%s.%s.log" % (self.module, _gset))
        return logfile
示例#14
0
    def runSample(self, df, gmt=None):
        """Single Sample GSEA workflow.

        Builds a ranking table from ``df``, filters the gene sets (unless
        ``gmt`` is supplied), runs ``gsea_compute_ss``, then saves the
        results and draws the figures into ``self.outdir``.

        :param df: ``pd.DataFrame`` or ``pd.Series`` holding the ranking.
        :param gmt: pre-built gene sets; when None they are parsed from
                    ``self.gene_sets`` with the instance's size limits.
        :raises Exception: if ``df`` is neither a DataFrame nor a Series.
        """

        assert self.min_size <= self.max_size

        mkdirs(self.outdir)
        #dat = self._rank_metric(df)
        #assert len(dat) > 1
        # Start analysis
        self._logger.info(
            "Parsing data files for GSEA.............................")
        # select correct expression genes and values.
        # NOTE(review): a DataFrame is used as-is, apart from reset_index()
        # on single-column frames; the sort / rename / 'rank2' steps below
        # run only for a Series. The later set_index('gene_name') and
        # del dat2['rank2'] therefore rely on a DataFrame input already
        # having those columns -- confirm against the callers.
        if isinstance(df, pd.DataFrame):
            if df.shape[1] == 1:
                df = df.reset_index()
        elif isinstance(df, pd.Series):
            df = df.reset_index()
            # sort ranking values from high to low or reverse
            df.sort_values(by=df.columns[1],
                           ascending=self.ascending,
                           inplace=True)
            df.columns = ['gene_name', 'rank']
            df['rank2'] = df['rank']
        else:
            raise Exception('Error parsing gene ranking values!')
        # index by gene name and remove rank2; df itself keeps the column
        dat2 = df.set_index('gene_name')
        del dat2['rank2']
        # cpu numbers
        self._set_cores()
        # filtering out gene sets and build gene sets dictionary
        if gmt is None:
            gmt = gsea_gmt_parser(self.gene_sets,
                                  min_size=self.min_size,
                                  max_size=self.max_size,
                                  gene_list=dat2.index.values)
        self._logger.info(
            "%04d gene_sets used for further statistical testing....." %
            len(gmt))
        self._logger.info(
            "Start to run GSEA...Might take a while..................")
        # compute ES, NES, pval, FDR, RES
        gsea_results, hit_ind, rank_ES, subsets = gsea_compute_ss(
            data=dat2,
            n=self.permutation_num,
            gmt=gmt,
            weighted_score_type=self.weighted_score_type,
            seed=self.seed,
            processes=self._processes)

        self._logger.info(
            "Start to generate gseapy reports, and produce figures...")
        res_zip = zip(subsets, list(gsea_results), hit_ind, rank_ES)

        # note: the un-indexed df (not dat2) is passed as the rank metric
        self._save_results(zipdata=res_zip,
                           outdir=self.outdir,
                           module=self.module,
                           gmt=gmt,
                           rank_metric=df,
                           permutation_type="gene_sets")

        # Plotting
        self._plotting(rank_metric=df,
                       results=self.results,
                       res2d=self.res2d,
                       graph_num=self.graph_num,
                       outdir=self.outdir,
                       figsize=self.figsize,
                       format=self.format,
                       module=self.module)

        self._logger.info(
            "Congratulations. GSEApy run successfully................")

        return
示例#15
0
    def run(self):
        """GSEA main procedure.

        Loads the expression data (a DataFrame, or a file read with
        ``pd.read_table``), parses the phenotype labels, computes the ranking
        metric, filters the gene sets by size, runs ``gsea_compute``, then
        saves the results and draws the figures into ``self.outdir``.

        :raises Exception: if ``self.data`` is neither a DataFrame nor an
                           existing file path.
        """

        assert self.permutation_type in ["phenotype", "gene_set"]
        assert self.min_size <= self.max_size

        if isinstance(self.data, pd.DataFrame):
            df = self.data.copy()
        elif os.path.isfile(self.data):
            df = pd.read_table(self.data, comment='#')
        else:
            # The sys.exit(1) that used to follow this raise was unreachable
            # and has been removed.
            raise Exception('Error parsing gene expression dataframe!')
        # data frame must have length > 1
        assert len(df) > 1
        # create output dirs
        mkdirs(self.outdir)
        logger = self._log_init(
            module=self.module,
            log_level=logging.INFO if self.verbose else logging.WARNING)
        # Start analysis
        logger.info("Parsing data files for GSEA.............................")

        # phenotype labels parsing
        phenoPos, phenoNeg, cls_vector = gsea_cls_parser(self.classes)
        # select correct expression genes and values.
        dat = self.__drop_dat(df, cls_vector)
        # ranking metrics calculation.
        dat2 = ranking_metric(df=dat,
                              method=self.method,
                              phenoPos=phenoPos,
                              phenoNeg=phenoNeg,
                              classes=cls_vector,
                              ascending=self.ascending)

        # filtering out gene sets and build gene sets dictionary
        gmt = gsea_gmt_parser(self.gene_sets,
                              min_size=self.min_size,
                              max_size=self.max_size,
                              gene_list=dat2['gene_name'].values)

        logger.info(
            "%04d gene_sets used for further statistical testing....." %
            len(gmt))
        logger.info("Start to run GSEA...Might take a while..................")
        # cpu numbers
        self._set_cores()
        # compute ES, NES, pval, FDR, RES
        gsea_results, hit_ind, rank_ES, subsets = gsea_compute(
            data=dat,
            n=self.permutation_num,
            gmt=gmt,
            weighted_score_type=self.weighted_score_type,
            permutation_type=self.permutation_type,
            method=self.method,
            phenoPos=phenoPos,
            phenoNeg=phenoNeg,
            classes=cls_vector,
            ascending=self.ascending,
            seed=self.seed,
            processes=self._processes)

        logger.info("Start to generate gseapy reports, and produce figures...")
        res_zip = zip(subsets, list(gsea_results), hit_ind, rank_ES)
        self._save_results(zipdata=res_zip,
                           outdir=self.outdir,
                           module=self.module,
                           gmt=gmt,
                           rank_metric=dat2,
                           permutation_type="gene_sets")

        # Plotting: reorder the expression rows to follow the ranked gene names
        heat_dat = dat.loc[dat2.gene_name]
        self._plotting(rank_metric=dat2,
                       results=self.results,
                       res2d=self.res2d,
                       graph_num=self.graph_num,
                       outdir=self.outdir,
                       figsize=self.figsize,
                       format=self.format,
                       module=self.module,
                       data=heat_dat,
                       classes=cls_vector,
                       phenoPos=phenoPos,
                       phenoNeg=phenoNeg)

        logger.info("Congratulations. GSEApy run successfully................")

        return