def __init__(self, verbose=True):
    """Create the KEGG service, list the HSA identifiers and build the dataframe.

    :param verbose: forwarded to ``KeggParser(verbose=...)``.

    After construction the instance exposes ``_names`` (first tab-separated
    field of every line returned by ``list("hsa")``), ``alldata`` (filled by
    :meth:`load_all_kegg_entries`) and ``df``.  ``df`` is ``None`` when
    :meth:`build_dataframe` fails; the failure is reported on stdout rather
    than raised, as before.
    """
    self._kegg_service = KeggParser(verbose=verbose)

    print("Loading all gene identifiers for HSA")
    listing = self._kegg_service.list("hsa")
    # One identifier per line; keep only the first tab-separated column.
    self._names = [line.split("\t")[0] for line in listing.strip().split("\n")]

    print("Fetching all data")
    self.alldata = {}
    self.load_all_kegg_entries()

    print("Building the dataframe")
    # Define the attribute up front so a failed build leaves a well-defined
    # ``None`` instead of a missing attribute.
    self.df = None
    try:
        self.df = self.build_dataframe()
    except Exception as err:
        # Still best-effort, but no longer swallows KeyboardInterrupt /
        # SystemExit, and the cause is shown to the user.
        print("error in build_dataframe: %s" % err)
def get_reaction_ids(keggid):
    """Return the KEGG reaction IDs listed under the ``reaction`` field of an entry.

    The entry for ``keggid`` is fetched and parsed with ``KeggParser``.  When
    the ``reaction`` field is a dict, every key is collected followed by the
    whitespace-separated tokens of its value; when it is a string, the
    string is split on whitespace.  An empty list is returned when the field
    is absent or has any other type.
    """
    keggparser = KeggParser()
    parsed = keggparser.parse(keggparser.get(keggid))

    if 'reaction' not in parsed:
        return []

    reaction_field = parsed['reaction']
    if isinstance(reaction_field, dict):
        collected = []
        for reaction_id, extra_ids in reaction_field.items():
            collected.append(reaction_id)
            collected.extend(extra_ids.split())
        return collected
    if isinstance(reaction_field, str):
        return reaction_field.split()
    return []
def get_enzyme_equation(rxnid, equation=True):
    """Return the enzyme, name and (optionally) equation of a KEGG reaction.

    :param rxnid: identifier passed to ``KeggParser.get``.
    :param equation: when true (default) the reaction equation is required
        and returned as a third element; when false it is neither required
        nor returned.
    :returns: ``(enzyme, name, equation)`` if ``equation`` is true, otherwise
        ``(enzyme, name)``.  Values are taken as-is from the parsed entry's
        ``enzyme``, ``name`` and ``equation`` fields.
    :raises RuntimeError: if a required field is missing from the parsed
        entry.
    """
    keggparser = KeggParser()
    rxninfo = keggparser.parse(keggparser.get(rxnid))

    # Only insist on the equation when the caller asked for it.  Previously
    # ``equation=False`` always fell into the "unavailable" branch, making
    # the two-element return unreachable.
    if equation and 'equation' not in rxninfo:
        raise RuntimeError("Equation unavailable for given reaction")
    if 'name' not in rxninfo:
        raise RuntimeError("Name unavailable for given reaction enzyme")
    if 'enzyme' not in rxninfo:
        raise RuntimeError("Enzyme id unavailable for given reaction")

    enzyme = rxninfo['enzyme']
    name = rxninfo['name']
    if equation:
        # Read from the parsed entry instead of overwriting the flag argument.
        return enzyme, name, rxninfo['equation']
    return enzyme, name
def __init__(self, verbosity="INFO"):
    """Initialise the parent with the logging level and create the service clients.

    :param verbosity: passed to the parent initialiser as ``level``.

    Each client is announced through ``self.logging.info`` just before it
    is instantiated.
    """
    super(Mapper, self).__init__(level=verbosity)

    log = self.logging.info
    log("Initialising the services")

    log("... uniprots")
    self._uniprot_service = UniProt()

    log("... KEGG")
    self._kegg_service = KeggParser(verbose=False)

    log("... HGNC")
    self._hgnc_service = HGNC()

    log("... UniChem")
    self._unichem_service = UniChem()

    log("...BioDBNet")
    self._biodbnet = BioDBNet()
class KEGGMapper(object):
    """Collect KEGG entries for the HSA identifiers and tabulate them.

    On construction the identifiers are listed through ``KeggParser``, the
    entries are loaded into :attr:`alldata` (a dict keyed by entry
    identifier) and a pandas dataframe is built in :attr:`df` with one row
    per entry.
    """

    # Database names accepted in an entry's ``dblinks`` field; each one gets
    # a ``<name>_kegg`` column in the dataframe.
    kegg_dblinks = ["IMGT", "Ensembl", "HGNC", "HPRD", "NCBI-GI", "OMIM",
                    "NCBI-GeneID", "UniProt", "Vega"]

    def __init__(self, verbose=True):
        """Create the KEGG service, list the HSA identifiers and build the dataframe.

        :param verbose: forwarded to ``KeggParser(verbose=...)``.

        ``df`` is ``None`` when :meth:`build_dataframe` fails; the failure
        is reported on stdout rather than raised.
        """
        self._kegg_service = KeggParser(verbose=verbose)

        print("Loading all gene identifiers for HSA")
        listing = self._kegg_service.list("hsa")
        # One identifier per line; keep only the first tab-separated column.
        self._names = [line.split("\t")[0]
                       for line in listing.strip().split("\n")]

        print("Fetching all data")
        self.alldata = {}
        self.load_all_kegg_entries()

        print("Building the dataframe")
        # Define the attribute up front so a failed build leaves ``None``
        # instead of a missing attribute.
        self.df = None
        try:
            self.df = self.build_dataframe()
        except Exception as err:
            # Best-effort as before, but KeyboardInterrupt/SystemExit are no
            # longer swallowed and the cause is shown.
            print("error in build_dataframe: %s" % err)

    def _get_names(self):
        """Return the identifiers collected at construction time."""
        return self._names
    names = property(_get_names)

    def _get_entries(self):
        """Return the keys of :attr:`alldata`, sorted."""
        return sorted(self.alldata.keys())
    entries = property(_get_entries)

    def build_dataframe(self):
        """Build a dataframe with one row per entry of :attr:`alldata`.

        Columns are the selected entry fields listed below plus one
        ``<db>_kegg`` column per name in :attr:`kegg_dblinks`.  Missing
        fields and missing links are left as ``None``.  An entry without a
        ``dblinks`` field is treated as having no links.

        :returns: the ``pandas.DataFrame`` indexed by the sorted entries.
        :raises NotImplementedError: if an entry's ``dblinks`` contains a
            database name that is not in :attr:`kegg_dblinks`.
        """
        names = ['class', 'definition', 'disease', 'drug_target', 'module',
                 'motif', 'name', 'orthology', 'pathway', 'position',
                 'structure']

        # ``entries`` sorts on every access; compute it once.
        entries = self.entries

        # Empty columns for the dblinks, filled in the second pass below.
        data = {}
        for this in self.kegg_dblinks:
            data["%s_kegg" % this] = [None] * len(entries)

        # Other fields of interest, e.g. name == position.
        for name in names:
            data[name] = [self.alldata[entry].get(name) for entry in entries]

        df = pd.DataFrame(data, index=entries)

        # Second pass: fill df[entry, <db>_kegg] from each entry's dblinks.
        for entry in entries:
            dblinks = self.alldata[entry].get('dblinks') or {}
            for key, value in dblinks.items():
                if key not in self.kegg_dblinks:
                    raise NotImplementedError(
                        "Found an unknown key in KEGG dblink:%s" % key)
                # ``.at`` writes the cell in place; the former
                # ``df.ix[entry][col] = ...`` relied on the removed ``.ix``
                # indexer and on chained assignment.
                df.at[entry, key + "_kegg"] = value
        return df

    def load_all_kegg_entries(self, filename="kegg_gene.dat"):
        """Load the entries from a local pickle if present, else fetch them.

        :param filename: path of the pickle file to look for.
        :returns: the unpickled object when ``filename`` exists, otherwise
            ``None`` after the fetched entries have been stored in
            :attr:`alldata` (keyed ``"hsa:<first token of 'entry'>"``) and
            the raw results in ``mcresults``.
        """
        if os.path.isfile(filename):
            import pickle
            # SECURITY: pickle can execute arbitrary code on load; only
            # point ``filename`` at a trusted file.
            # Pickle data is binary, so open in "rb", and close the handle.
            with open(filename, "rb") as handle:
                results = pickle.load(handle)
            # NOTE(review): the cached object is returned but not stored in
            # ``alldata``, and ``__init__`` ignores the return value --
            # confirm whether it should populate ``alldata``.
            return results

        # TODO: download from a URL
        print("could not find kegg data. "
              "fetching data from website if possible")

        # Fetch only the identifiers that are not already loaded.
        names = [x for x in self.names if x not in self.alldata]
        print("Fetching %s enties" % len(names))
        mc = test_func(names)
        self.mcresults = mc.results.copy()

        try:
            # Derive each key from its own result so that skipped (falsy)
            # results cannot shift keys onto the wrong record, which the
            # former zip(filtered keys, unfiltered results) did.
            fetched = [("hsa:" + result['entry'].split()[0], result)
                       for result in self.mcresults if result]
        except Exception as err:
            print("something wrng happened while scaning mcresults: %s" % err)
        else:
            self.alldata.update(fetched)
def test_KeggParser():
    """Smoke test: fetch and parse one identifier for each prefix listed below."""
    s = KeggParser()
    identifiers = (
        "cpd:C00001",
        "ds:H00001",
        "dr:D00001",
        "ev:E00001",
        "ec:1.1.1.1",
        "hsa:1525",
        "genome:T00001",
        "gl:G00001",
        "md:hsa_M00554",
        "ko:K00001",
        "path:hsa04914",
        "rc:RC00001",
        "rn:R00001",
        "rp:RP00001",
    )
    # Same order as before; the first failing identifier aborts the test.
    for identifier in identifiers:
        s.parse(s.get(identifier))
class KEGGMapper(object):
    """Collect KEGG entries for the HSA identifiers and tabulate them.

    On construction the identifiers are listed through ``KeggParser``, the
    entries are loaded into :attr:`alldata` (a dict keyed by entry
    identifier) and a pandas dataframe is built in :attr:`df` with one row
    per entry.
    """

    # Database names accepted in an entry's ``dblinks`` field; each one gets
    # a ``<name>_kegg`` column in the dataframe.
    kegg_dblinks = [
        "IMGT", "Ensembl", "HGNC", "HPRD", "NCBI-GI", "OMIM", "NCBI-GeneID",
        "UniProt", "Vega"
    ]

    def __init__(self, verbose=True):
        """Create the KEGG service, list the HSA identifiers and build the dataframe.

        :param verbose: forwarded to ``KeggParser(verbose=...)``.

        ``df`` is ``None`` when :meth:`build_dataframe` fails; the failure
        is reported on stdout rather than raised.
        """
        self._kegg_service = KeggParser(verbose=verbose)

        print("Loading all gene identifiers for HSA")
        listing = self._kegg_service.list("hsa")
        # One identifier per line; keep only the first tab-separated column.
        self._names = [line.split("\t")[0]
                       for line in listing.strip().split("\n")]

        print("Fetching all data")
        self.alldata = {}
        self.load_all_kegg_entries()

        print("Building the dataframe")
        # Define the attribute up front so a failed build leaves ``None``
        # instead of a missing attribute.
        self.df = None
        try:
            self.df = self.build_dataframe()
        except Exception as err:
            # Best-effort as before, but KeyboardInterrupt/SystemExit are no
            # longer swallowed and the cause is shown.
            print("error in build_dataframe: %s" % err)

    def _get_names(self):
        """Return the identifiers collected at construction time."""
        return self._names

    names = property(_get_names)

    def _get_entries(self):
        """Return the keys of :attr:`alldata`, sorted."""
        return sorted(self.alldata.keys())

    entries = property(_get_entries)

    def build_dataframe(self):
        """Build a dataframe with one row per entry of :attr:`alldata`.

        Columns are the selected entry fields listed below plus one
        ``<db>_kegg`` column per name in :attr:`kegg_dblinks`.  Missing
        fields and missing links are left as ``None``.  An entry without a
        ``dblinks`` field is treated as having no links.

        :returns: the ``pandas.DataFrame`` indexed by the sorted entries.
        :raises NotImplementedError: if an entry's ``dblinks`` contains a
            database name that is not in :attr:`kegg_dblinks`.
        """
        names = [
            'class', 'definition', 'disease', 'drug_target', 'module',
            'motif', 'name', 'orthology', 'pathway', 'position', 'structure'
        ]

        # ``entries`` sorts on every access; compute it once.
        entries = self.entries

        # Empty columns for the dblinks, filled in the second pass below.
        data = {}
        for this in self.kegg_dblinks:
            data["%s_kegg" % this] = [None] * len(entries)

        # Other fields of interest, e.g. name == position.
        for name in names:
            data[name] = [self.alldata[entry].get(name) for entry in entries]

        df = pd.DataFrame(data, index=entries)

        # Second pass: fill df[entry, <db>_kegg] from each entry's dblinks.
        for entry in entries:
            dblinks = self.alldata[entry].get('dblinks') or {}
            for key, value in dblinks.items():
                if key not in self.kegg_dblinks:
                    raise NotImplementedError(
                        "Found an unknown key in KEGG dblink:%s" % key)
                # ``.at`` writes the cell in place; the former
                # ``df.ix[entry][col] = ...`` relied on the removed ``.ix``
                # indexer and on chained assignment.
                df.at[entry, key + "_kegg"] = value
        return df

    def load_all_kegg_entries(self, filename="kegg_gene.dat"):
        """Load the entries from a local pickle if present, else fetch them.

        :param filename: path of the pickle file to look for.
        :returns: the unpickled object when ``filename`` exists, otherwise
            ``None`` after the fetched entries have been stored in
            :attr:`alldata` (keyed ``"hsa:<first token of 'entry'>"``) and
            the raw results in ``mcresults``.
        """
        if os.path.isfile(filename):
            import pickle
            # SECURITY: pickle can execute arbitrary code on load; only
            # point ``filename`` at a trusted file.
            # Pickle data is binary, so open in "rb", and close the handle.
            with open(filename, "rb") as handle:
                results = pickle.load(handle)
            # NOTE(review): the cached object is returned but not stored in
            # ``alldata``, and ``__init__`` ignores the return value --
            # confirm whether it should populate ``alldata``.
            return results

        # TODO: download from a URL
        print("could not find kegg data. "
              "fetching data from website if possible")

        # Fetch only the identifiers that are not already loaded.
        names = [x for x in self.names if x not in self.alldata]
        print("Fetching %s enties" % len(names))
        mc = test_func(names)
        self.mcresults = mc.results.copy()

        try:
            # Derive each key from its own result so that skipped (falsy)
            # results cannot shift keys onto the wrong record, which the
            # former zip(filtered keys, unfiltered results) did.
            fetched = [("hsa:" + result['entry'].split()[0], result)
                       for result in self.mcresults if result]
        except Exception as err:
            print("something wrng happened while scaning mcresults: %s" % err)
        else:
            self.alldata.update(fetched)
try: entries = [ "hsa:" + x['entry'].split()[0] for x in self.mcresults if x ] for entry, result in zip(entries, self.mcresults): self.alldata[entry] = result except: print("something wrng happened while scaning mcresults") #import pickle #pickle.dump(results, open("kegg_gene.dat","w")) from bioservices import KeggParser kegg = KeggParser(verbose=False) def test_func(names): from easydev import MultiProcessing t = MultiProcessing(verbose=False, maxcpu=8) for name in names: t.add_job(keggfunc, name) t.run() return t def keggfunc(name): global kegg try: id_ = kegg.get(name)