Пример #1
0
def gsea(homepath):
    """Run an Enrichr DisGeNET enrichment on the selected genes and plot it.

    Parameters
    ----------
    homepath : str
        Directory under which all generated files and folders are saved.
        The gene list is read from
        ``<homepath>/std_npy/unique_genes_with_frequency.csv`` (no header
        row; the first column holds the genes selected by panclassif).

    Returns
    -------
    None

    Outputs
    -------
    Creates a directory named ``enrichr`` inside ``homepath``, passes it to
    Enrichr as the output directory, and saves two gene enrichment plots
    there: ``DisGeNET_barplot.png`` and ``DisGeNET_dotplot.png``.
    """
    # NOTE: this changes the warning filter for the whole process.
    warnings.filterwarnings("ignore")

    # Build the output directory once and reuse it for every artefact.
    outdir = os.path.join(homepath, "enrichr")
    # exist_ok avoids the check-then-create race; missing parents are created.
    os.makedirs(outdir, exist_ok=True)

    genes = pd.read_csv(
        os.path.join(homepath, "std_npy", "unique_genes_with_frequency.csv"),
        header=None,
    )
    # Column 0 holds the gene identifiers.
    gene_list = genes[0].tolist()

    enr = gs.enrichr(gene_list=gene_list,
                     description='Disease',
                     gene_sets='DisGeNET',
                     outdir=outdir)

    # simple plotting functions
    from gseapy.plot import barplot, dotplot

    # A figure is only written to disk when ``ofname`` is not None.
    barplot(enr.res2d,
            title='DisGeNET',
            cutoff=0.2,
            ofname=os.path.join(outdir, 'DisGeNET_barplot.png'))
    dotplot(enr.res2d,
            title='DisGeNET',
            cmap='viridis_r',
            cutoff=0.2,
            ofname=os.path.join(outdir, 'DisGeNET_dotplot.png'))
Пример #2
0
    def run(self):
        """Run Enrichr for one sample gene list against multiple libraries.

        Each entry returned by ``parse_genesets()`` is processed in turn:
        ``dict`` entries are enriched locally via ``self.enrich``; anything
        else is treated as an Enrichr library name and queried through
        ``self.get_results``. Per-library results are stacked into
        ``self.results``, and ``self.res2d`` holds the result of the last
        library processed. When ``self._outdir`` is set, a tab-separated
        report (and a bar plot unless plotting is disabled) is written per
        library.

        Raises
        ------
        LookupError
            If no gene-set library is returned by ``parse_genesets()``.
        """
        # read input file
        genes_list = self.parse_genelists()
        gss = self.parse_genesets()
        self._logger.info(
            "Connecting to Enrichr Server to get latest library names")
        if len(gss) < 1:
            self._logger.error("Hint: Current organism = %s, is this correct?\n"%self.organism +\
                            "Hint: use get_library_name() to view full list of supported names.")
            raise LookupError(
                "Not validated Enrichr library! Please provide correct organism and library name!"
            )
        self.results = pd.DataFrame()
        # DataFrame.append was removed in pandas 2.0; collect the per-library
        # frames and rebuild the master table with pd.concat instead.
        frames = []

        for g in gss:
            if isinstance(g, dict):
                ## local mode
                res = self.enrich(g)
                shortID, self._gs = str(id(g)), "CUSTOM%s" % id(g)
                if res is None:
                    self._logger.info(
                        "No hits return, for gene set: Custom%s" % shortID)
                    continue
            else:
                ## online mode
                self._gs = str(g)
                self._logger.debug("Start Enrichr using library: %s" %
                                   (self._gs))
                self._logger.info('Analysis name: %s, Enrichr Library: %s' %
                                  (self.descriptions, self._gs))
                shortID, res = self.get_results(genes_list)
            # Remember gene set library used
            res.insert(0, "Gene_set", self._gs)
            # Append to master dataframe (refreshed every iteration so that
            # partial results survive a failure on a later library).
            frames.append(res)
            self.results = pd.concat(frames, ignore_index=True)
            self.res2d = res
            if self._outdir is None: continue
            self._logger.info('Save file of enrichment results: Job Id:' +
                              str(shortID))
            # Build the extension-less stem once. Deriving the figure name
            # with str.replace("txt", ...) would also rewrite any "txt"
            # inside the output directory or library name.
            report_stem = "%s/%s.%s.%s.reports" % (self.outdir, self._gs,
                                                   self.organism, self.module)
            outfile = report_stem + ".txt"
            self.res2d.to_csv(outfile, index=False, encoding='utf-8', sep="\t")
            # plotting
            if not self.__no_plot:
                msg = barplot(df=res,
                              cutoff=self.cutoff,
                              figsize=self.figsize,
                              top_term=self.__top_term,
                              color='salmon',
                              title=self._gs,
                              ofname="%s.%s" % (report_stem, self.format))
                if msg is not None: self._logger.warning(msg)
            self._logger.info('Done.\n')
        # clean up tmpdir
        if self._outdir is None: self._tmpdir.cleanup()

        return
Пример #3
0
    def run(self):
        """Run Enrichr for one sample gene list against multiple libraries.

        The requested gene sets are de-duplicated and filtered down to the
        names returned by ``get_library_name()``. Each remaining library is
        queried through ``self.get_results``; per-library results are
        stacked into ``self.results`` and ``self.res2d`` holds the result of
        the last library processed. When ``self._outdir`` is set, a
        tab-separated report (and a bar plot unless plotting is disabled) is
        written per library.

        Raises
        ------
        SystemExit
            With exit status 1 when none of the requested names is a known
            Enrichr library.
        """
        # read input file
        genes_list = self.parse_genelists()
        gss = unique(self.parse_genesets())
        self._logger.info(
            "Connecting to Enrichr Server to get latest library names")
        enrichr_library = get_library_name()
        gss = [g for g in gss if g in enrichr_library]
        # The separator used to be "'," which rendered as "A',B" in the log.
        self._logger.info("Libraries are used: %s" % (", ".join(gss)))
        if len(gss) < 1:
            sys.stderr.write("Not validated Enrichr library name provided\n")
            sys.stdout.write(
                "Hint: use get_library_name() to view full list of supported names"
            )
            sys.exit(1)
        self.results = pd.DataFrame()
        # DataFrame.append was removed in pandas 2.0; collect the per-library
        # frames and rebuild the master table with pd.concat instead.
        frames = []
        for g in gss:
            self._gs = str(g)
            self._logger.debug("Start Enrichr using library: %s" % (self._gs))
            self._logger.info('Analysis name: %s, Enrichr Library: %s' %
                              (self.descriptions, self._gs))

            shortID, res = self.get_results(genes_list)
            # Remember gene set library used
            res.insert(0, "Gene_set", self._gs)
            # Append to master dataframe (refreshed every iteration so that
            # partial results survive a failure on a later library).
            frames.append(res)
            self.results = pd.concat(frames, ignore_index=True)
            self.res2d = res
            if self._outdir is None: continue
            self._logger.info('Save file of enrichment results: Job Id:' +
                              str(shortID))
            # Build the extension-less stem once. The path embeds the
            # analysis description, so str.replace("txt", ...) could rewrite
            # more than the extension.
            report_stem = "%s/%s.%s.%s.reports" % (
                self.outdir, self._gs, self.descriptions, self.module)
            outfile = report_stem + ".txt"
            self.res2d.to_csv(outfile, index=False, encoding='utf-8', sep="\t")
            # plotting
            if not self.__no_plot:
                msg = barplot(df=res,
                              cutoff=self.cutoff,
                              figsize=self.figsize,
                              top_term=self.__top_term,
                              color='salmon',
                              title=self._gs,
                              ofname="%s.%s" % (report_stem, self.format))
                if msg is not None: self._logger.warning(msg)
            self._logger.info('Done.\n')
        # clean up tmpdir
        if self._outdir is None: self._tmpdir.cleanup()

        return
Пример #4
0
# Example script: run an Enrichr KEGG enrichment on a gene list and plot it.

# ``import sleep`` is not a valid module; ``sleep`` lives in ``time``.
from time import sleep

import gseapy as gp
import matplotlib.pyplot as plt
import pandas as pd
import requests
from gseapy.parser import Biomart
from gseapy.plot import barplot, dotplot

# Input locations are machine-specific; edit these before running elsewhere.
GENE_LIST_TXT = "/Users/sunxueyan/Downloads/GSEApy-master/tests/data/gene_list.txt"
NON_GENE_ID_CSV = "/Users/sunxueyan/Downloads/non_geneID.csv"

gene_list = pd.read_csv(GENE_LIST_TXT, header=None, sep="\t")
gene_list1 = pd.read_csv(NON_GENE_ID_CSV)
# Preview only; the return value is discarded when run as a plain script.
gene_list1.head()

# Flatten the CSV to a plain list of whitespace-stripped strings.
glist = gene_list1.squeeze().str.strip().tolist()
names = gp.get_library_name()  # default: Human

# NOTE(review): this session is not used by anything below — confirm whether
# it is still needed.
s = requests.session()
s.keep_alive = False

enr = gp.enrichr(
    gene_list=GENE_LIST_TXT,  # or gene_list=glist
    description='',
    gene_sets=['KEGG_2019_Human'],
    outdir='test/enrichr_kegg',
    cutoff=0.5,  # test dataset, use lower value from range(0,1)
)

barplot(enr.res2d, title='KEGG_2019_Human')
dotplot(enr.res2d, title='KEGG_2019_Human')
Пример #5
0
    def run_single(self):
        """Run Enrichr for one sample against the library in ``self._gs``.

        Steps: validate the library name, upload the gene list, confirm the
        upload, request the enrichment, download the exported report to
        ``<outdir>/<library>.<description>.<module>.reports.txt``, load it
        into ``self.res2d`` and, unless plotting is disabled or
        ``self._outdir`` is None, save a bar plot next to the report.

        Raises
        ------
        SystemExit
            With exit status 1 when the library name is not a known Enrichr
            library.
        Exception
            When any of the Enrichr HTTP requests returns a non-OK status.
        """
        # read input file
        genes_str = self.parse_input()

        # name of analysis or list
        description = str(self.descriptions)
        gene_set = str(self._gs)

        self._logger.info("Connecting to Enrichr Server to get latest library names")
        # Only query the server for the library list when the name is not
        # one of the bundled defaults.
        if gene_set not in DEFAULT_LIBRARY and gene_set not in get_library_name():
            sys.stderr.write("%s is not a Enrichr library name\n"%gene_set)
            sys.stdout.write("Hint: use get_library_name() to view full list of supported names")
            sys.exit(1)

        self._logger.info('Analysis name: %s, Enrichr Library: %s'%(description, gene_set))

        # Upload the gene list.
        ENRICHR_URL = 'http://amp.pharm.mssm.edu/Enrichr/addList'
        payload = {
          'list': (None, genes_str),
          'description': (None, description)
           }
        response = requests.post(ENRICHR_URL, files=payload)
        if not response.ok:
            raise Exception('Error analyzing gene list')

        job_id = json.loads(response.text)
        # The same list id is used by every follow-up request.
        user_list_id = str(job_id['userListId'])

        self._logger.debug('Job ID:'+ str(job_id))
        ENRICHR_URL_A = 'http://amp.pharm.mssm.edu/Enrichr/view?userListId=%s'
        response_gene_list = requests.get(ENRICHR_URL_A % user_list_id, timeout=None)
        # wait for 1s
        sleep(1)
        if not response_gene_list.ok:
            raise Exception('Error getting gene list')

        self._logger.info('Submitted gene list:' + str(job_id))
        # Get enrichment results
        ENRICHR_URL = 'http://amp.pharm.mssm.edu/Enrichr/enrich'
        query_string = '?userListId=%s&backgroundType=%s'
        response = requests.get(ENRICHR_URL + query_string % (user_list_id, gene_set))
        if not response.ok:
            raise Exception('Error fetching enrichment results')

        self._logger.debug('Get enrichment results: Job Id:'+ str(job_id))
        # Download file of enrichment results
        ENRICHR_URL = 'http://amp.pharm.mssm.edu/Enrichr/export'
        query_string = '?userListId=%s&filename=%s&backgroundType=%s'
        filename = "%s.%s.%s.reports"%(gene_set, description, self.module)
        url = ENRICHR_URL + query_string % (user_list_id, filename, gene_set)

        # set max retries num =5
        s = retry(num=5)
        response = s.get(url, stream=True, timeout=None)
        # Without this check an HTTP error page would be saved as the report.
        if not response.ok:
            raise Exception('Error downloading enrichment results')

        self._logger.info('Downloading file of enrichment results: Job Id:'+ str(job_id))
        # Build the extension-less stem once. The path embeds the analysis
        # description, so str.replace("txt", ...) could rewrite more than
        # the extension.
        report_stem = "%s/%s.%s.%s.reports"%(self.outdir, gene_set, description, self.module)
        outfile = report_stem + ".txt"

        with open(outfile, 'wb') as f:
            for chunk in response.iter_content(chunk_size=1024):
                if chunk:
                    f.write(chunk)

        self._logger.debug('Results written to: ' + outfile)

        # save results
        df = read_table(outfile)
        self.res2d = df

        if self._outdir is None: return
        # plotting
        if not self.__no_plot:
            fig = barplot(df=df, cutoff=self.cutoff,
                          figsize=self.figsize,
                          top_term=self.__top_term,
                          color='salmon',
                          title='')
            if fig is None:
                self._logger.warning("Warning: No enrich terms using library %s when cuttoff = %s"%(gene_set, self.cutoff))
            else:
                fig.savefig("%s.%s" % (report_stem, self.format),
                            bbox_inches='tight', dpi=300)
        self._logger.info('Done.\n')
        return
Пример #6
0
    def run(self):
        """Run Enrichr for the gene list against the library in ``self.gene_sets``.

        Steps: create the output directory, validate the library name,
        upload the gene list, confirm the upload, request the enrichment,
        download the exported report to
        ``<outdir>/<library>.<description>.<module>.reports.txt``, load it
        into ``self.res2d`` and, unless plotting is disabled, save a bar
        plot next to the report.

        Raises
        ------
        SystemExit
            With exit status 1 when the library name is not a known Enrichr
            library.
        Exception
            When any of the Enrichr HTTP requests returns a non-OK status.
        """
        mkdirs(self.outdir)

        # read input file
        genes_str = self.parse_input()

        # name of analysis or list
        description = str(self.descriptions)

        # library to validate below
        gene_set = str(self.gene_sets)
        # logging start
        logger = self._log_init(
            module=self.module,
            log_level=logging.INFO if self.verbose else logging.WARNING)

        logger.info("Connecting to Enrichr Server to get latest library names")
        # Only fetch the library list when the name is not one of the
        # bundled defaults.
        if gene_set not in DEFAULT_LIBRARY and gene_set not in self.get_libraries():
            sys.stderr.write("%s is not a enrichr library name\n" %
                             gene_set)
            sys.stdout.write(
                "Hint: use get_library_name() to veiw full list of supported names"
            )
            sys.exit(1)

        logger.info('Analysis name: %s, Enrichr Library: %s' %
                    (description, gene_set))

        # Upload the gene list.
        ENRICHR_URL = 'http://amp.pharm.mssm.edu/Enrichr/addList'
        payload = {
            'list': (None, genes_str),
            'description': (None, description)
        }
        response = requests.post(ENRICHR_URL, files=payload)
        if not response.ok:
            raise Exception('Error analyzing gene list')

        # Brief pause before the follow-up requests.
        sleep(1)
        job_id = json.loads(response.text)
        # The same list id is used by every follow-up request.
        user_list_id = str(job_id['userListId'])

        logger.debug('Job ID:' + str(job_id))
        ENRICHR_URL_A = 'http://amp.pharm.mssm.edu/Enrichr/view?userListId=%s'
        response_gene_list = requests.get(ENRICHR_URL_A % user_list_id,
                                          timeout=None)

        if not response_gene_list.ok:
            raise Exception('Error getting gene list')

        logger.info('Submitted gene list:' + str(job_id))
        # Get enrichment results
        ENRICHR_URL = 'http://amp.pharm.mssm.edu/Enrichr/enrich'
        query_string = '?userListId=%s&backgroundType=%s'
        response = requests.get(ENRICHR_URL + query_string %
                                (user_list_id, gene_set))
        if not response.ok:
            raise Exception('Error fetching enrichment results')

        logger.debug('Get enrichment results: Job Id:' + str(job_id))
        # Download file of enrichment results
        ENRICHR_URL = 'http://amp.pharm.mssm.edu/Enrichr/export'
        query_string = '?userListId=%s&filename=%s&backgroundType=%s'
        filename = "%s.%s.%s.reports" % (gene_set, description, self.module)
        url = ENRICHR_URL + query_string % (user_list_id, filename, gene_set)

        # Retry the download up to 5 times on the listed 5xx statuses.
        s = requests.Session()
        retries = Retry(total=5,
                        backoff_factor=0.1,
                        status_forcelist=[500, 502, 503, 504])
        s.mount('http://', HTTPAdapter(max_retries=retries))
        response = s.get(url, stream=True, timeout=None)
        # Without this check an HTTP error page would be saved as the report.
        if not response.ok:
            raise Exception('Error downloading enrichment results')

        logger.info('Downloading file of enrichment results: Job Id:' +
                    str(job_id))
        # Build the extension-less stem once. The path embeds the analysis
        # description, so str.replace("txt", ...) could rewrite more than
        # the extension.
        report_stem = "%s/%s.%s.%s.reports" % (self.outdir, gene_set,
                                               description, self.module)
        outfile = report_stem + ".txt"

        with open(outfile, 'wb') as f:
            for chunk in response.iter_content(chunk_size=1024):
                if chunk:
                    f.write(chunk)

        logger.debug('Results written to: ' + outfile)
        # save results
        df = read_table(outfile)
        self.res2d = df

        # plotting
        if not self.__no_plot:
            fig = barplot(
                df=df,
                cutoff=self.cutoff,
                figsize=self.figsize,
                top_term=self.__top_term,
            )
            if fig is None:
                logger.warning(
                    "Warning: No enrich terms using library %s when cuttoff = %s"
                    % (gene_set, self.cutoff))
            else:
                fig.savefig("%s.%s" % (report_stem, self.format),
                            bbox_inches='tight',
                            dpi=300)
        logger.info('Done.\n')
        return