示例#1
0
    def __init__(self, verbose=True):
        """Load the HSA gene identifiers, fetch the entries, build the dataframe.

        :param verbose: forwarded to the ``KeggParser`` constructor.

        Sets ``_kegg_service``, ``_names`` and ``alldata``, calls
        ``load_all_kegg_entries()`` and, when ``build_dataframe()`` succeeds,
        stores its result in ``df``.  A failure while building the dataframe
        is reported on stdout and leaves ``df`` unset.
        """
        self._kegg_service = KeggParser(verbose=verbose)

        print("Loading all gene identifiers for HSA")
        listing = self._kegg_service.list("hsa")
        # One record per line; the identifier is the first tab-separated field.
        self._names = [
            record.split("\t")[0] for record in listing.strip().split("\n")
        ]

        print("Fetching all data")
        self.alldata = {}
        self.load_all_kegg_entries()

        print("Building the dataframe")
        try:
            self.df = self.build_dataframe()
        except Exception as err:
            # Still best effort, but catch Exception rather than everything so
            # that KeyboardInterrupt/SystemExit propagate, and say what failed.
            print("error in build_dataframe: %s" % err)
示例#2
0
def get_reaction_ids(keggid):
    """Return the reaction IDs found in the ``reaction`` field of a KEGG entry.

    The entry for ``keggid`` is fetched and parsed with ``KeggParser``.  An
    empty list is returned when the parsed entry has no ``reaction`` field,
    or when that field is neither a dict nor a string.
    """
    parser = KeggParser()
    record = parser.parse(parser.get(keggid))

    if 'reaction' not in record:
        return []

    field = record['reaction']
    collected = []
    if isinstance(field, dict):
        # Each key is itself an ID; its value holds further
        # whitespace-separated IDs.
        for leading_id, remainder in field.items():
            collected.append(leading_id)
            collected += remainder.split()
    elif isinstance(field, str):
        collected = field.split()
    return collected
示例#3
0
def get_enzyme_equation(rxnid, equation=True):
    """Return the enzyme ID and name (and optionally equation) of a reaction.

    The entry for ``rxnid`` is fetched and parsed with ``KeggParser``.

    :param rxnid: identifier passed to ``KeggParser.get``.
    :param equation: when true (default) the reaction equation is required
        and returned as a third element; when false it is neither required
        nor returned.
    :returns: ``(enzyme, name, equation)`` if ``equation`` is true,
        otherwise ``(enzyme, name)``.
    :raises RuntimeError: if a required field (``equation``, ``name`` or
        ``enzyme``) is missing from the parsed entry.
    """
    keggparser = KeggParser()
    rxninfo = keggparser.parse(keggparser.get(rxnid))

    # Only insist on the equation when the caller asked for it; previously
    # equation=False always ended in the "Equation unavailable" error.
    if equation and 'equation' not in rxninfo:
        raise RuntimeError("Equation unavailable for given reaction")
    if 'name' not in rxninfo:
        raise RuntimeError("Name unavailable for given reaction enzyme")
    if 'enzyme' not in rxninfo:
        raise RuntimeError("Enzyme id unavailable for given reaction")

    enzyme = rxninfo['enzyme']
    name = rxninfo['name']
    # The flag is no longer overwritten with the equation text, so the shape
    # of the return value depends on the caller's request alone.
    if equation:
        return enzyme, name, rxninfo['equation']
    return enzyme, name
示例#4
0
    def __init__(self, verbosity="INFO"):
        """Initialise the parent class and instantiate every service client.

        :param verbosity: passed to the parent constructor as ``level``.

        A progress message is logged before each client is created.
        """
        super(Mapper, self).__init__(level=verbosity)
        self.logging.info("Initialising the services")

        # (progress message, attribute to set, zero-argument constructor).
        # The constructors are wrapped so each one is only looked up and
        # called after its own message has been logged.
        services = (
            ("... uniprots", "_uniprot_service", lambda: UniProt()),
            ("... KEGG", "_kegg_service", lambda: KeggParser(verbose=False)),
            ("... HGNC", "_hgnc_service", lambda: HGNC()),
            ("... UniChem", "_unichem_service", lambda: UniChem()),
            ("...BioDBNet", "_biodbnet", lambda: BioDBNet()),
        )
        for message, attribute, build in services:
            self.logging.info(message)
            setattr(self, attribute, build())
示例#5
0
    def __init__(self, verbose=True):
        """Load the HSA gene identifiers, fetch the entries, build the dataframe.

        :param verbose: forwarded to the ``KeggParser`` constructor.

        Sets ``_kegg_service``, ``_names`` and ``alldata``, calls
        ``load_all_kegg_entries()`` and, when ``build_dataframe()`` succeeds,
        stores its result in ``df``.  A failure while building the dataframe
        is reported on stdout and leaves ``df`` unset.
        """
        self._kegg_service = KeggParser(verbose=verbose)

        print("Loading all gene identifiers for HSA")
        listing = self._kegg_service.list("hsa")
        # One record per line; the identifier is the first tab-separated field.
        self._names = [
            record.split("\t")[0] for record in listing.strip().split("\n")
        ]

        print("Fetching all data")
        self.alldata = {}
        self.load_all_kegg_entries()

        print("Building the dataframe")
        try:
            self.df = self.build_dataframe()
        except Exception as err:
            # Still best effort, but catch Exception rather than everything so
            # that KeyboardInterrupt/SystemExit propagate, and say what failed.
            print("error in build_dataframe: %s" % err)
示例#6
0
class KEGGMapper(object):
    """Collect the KEGG entries of ``hsa`` genes into a pandas DataFrame.

    On construction the ``hsa`` identifiers are listed through ``KeggParser``,
    the entries are loaded with :meth:`load_all_kegg_entries` into
    ``alldata`` and :meth:`build_dataframe` is called to populate ``df``.
    """
    # Database names accepted in an entry's ``dblinks`` field; each one gets
    # a ``<name>_kegg`` column in the dataframe.
    kegg_dblinks = ["IMGT", "Ensembl", "HGNC", "HPRD", "NCBI-GI", "OMIM", "NCBI-GeneID", "UniProt", "Vega"]

    def __init__(self, verbose=True):
        """Load identifiers, fetch the entries and build the dataframe.

        :param verbose: forwarded to the ``KeggParser`` constructor.

        A failure in :meth:`build_dataframe` is reported on stdout and leaves
        ``df`` unset.
        """
        self._kegg_service = KeggParser(verbose=verbose)

        print("Loading all gene identifiers for HSA")
        listing = self._kegg_service.list("hsa")
        # One record per line; the identifier is the first tab-separated field.
        self._names = [
            record.split("\t")[0] for record in listing.strip().split("\n")
        ]

        print("Fetching all data")
        self.alldata = {}
        self.load_all_kegg_entries()

        print("Building the dataframe")
        try:
            self.df = self.build_dataframe()
        except Exception as err:
            # Still best effort, but catch Exception rather than everything so
            # that KeyboardInterrupt/SystemExit propagate, and say what failed.
            print("error in build_dataframe: %s" % err)

    def _get_names(self):
        """Return the identifiers collected by ``__init__``."""
        return self._names
    names = property(_get_names)

    def _get_entries(self):
        """Return the keys of ``alldata`` in sorted order."""
        return sorted(self.alldata.keys())
    entries = property(_get_entries)

    def build_dataframe(self):
        """Return a DataFrame with one row per entry of ``alldata``.

        The columns are a fixed list of entry fields plus one ``<db>_kegg``
        column per name in ``kegg_dblinks``.  A field that is absent from an
        entry is stored as ``None``.

        :raises NotImplementedError: if an entry's ``dblinks`` contains a
            database name that is not in ``kegg_dblinks``.
        """
        names = ['class', 'definition', 'disease', 'drug_target',
                 'module', 'motif', 'name', 'orthology', 'pathway', 'position', 'structure']

        # ``entries`` sorts the keys on every access, so read it only once.
        entries = self.entries
        N = len(entries)

        # Start with empty dblink columns ...
        data = {}
        for this in self.kegg_dblinks:
            data["%s_kegg" % this] = [None] * N

        # ... and add the other fields of interest (e.g. name == 'position').
        for name in names:
            data[name] = [self.alldata[entry].get(name) for entry in entries]

        df = pd.DataFrame(data, index=entries)

        # Second pass: fill the dblink cells.
        for entry in entries:
            res = self.alldata[entry]['dblinks']
            for key in res:
                if key not in self.kegg_dblinks:
                    raise NotImplementedError("Found an unknown key in KEGG dblink:%s" % key)
                # Single-step label assignment.  The former
                # ``df.ix[entry][col] = ...`` was a chained assignment on an
                # indexer that newer pandas releases no longer provide.
                df.at[entry, key + "_kegg"] = res[key]
        return df

    def load_all_kegg_entries(self, filename="kegg_gene.dat"):
        """Load the entries from a pickle file, or fetch them via ``test_func``.

        :param filename: pickle file to read when it exists.
        :returns: the unpickled object when ``filename`` exists, else ``None``
            (fetched results are stored in ``mcresults`` and ``alldata``).
        """
        if os.path.isfile(filename):
            import pickle
            # NOTE(review): unpickling can execute arbitrary code; only point
            # ``filename`` at trusted files.
            # Binary mode is required for pickles on Python 3, and the
            # ``with`` block closes the handle.
            with open(filename, "rb") as handle:
                results = pickle.load(handle)
            # NOTE(review): the loaded object is only returned; ``alldata``
            # is not filled from it and ``__init__`` ignores the return value.
            return results
        # TODO:
        # download from a URL
        print("could not find kegg data. fetching data from website if possible")
        # Only request the identifiers that are not stored yet.
        names = [x for x in self.names if x not in self.alldata]
        print("Fetching %s enties" % len(names))

        mc = test_func(names)
        self.mcresults = mc.results.copy()
        try:
            # Derive each key from its own result.  Zipping a filtered key
            # list with the unfiltered results shifted every entry that
            # followed an empty result onto the wrong key.
            fetched = [("hsa:" + result['entry'].split()[0], result)
                       for result in self.mcresults if result]
        except Exception as err:
            print("something wrng happened while scaning mcresults: %s" % err)
        else:
            self.alldata.update(fetched)
示例#7
0
def test_KeggParser():
    """Fetch one entry per KEGG database prefix and run it through ``parse``.

    Nothing is asserted about the parsed results; the check is only that
    ``get`` followed by ``parse`` completes for every identifier.
    """
    parser = KeggParser()
    identifiers = (
        "cpd:C00001",
        "ds:H00001",
        "dr:D00001",
        "ev:E00001",
        "ec:1.1.1.1",
        "hsa:1525",
        "genome:T00001",
        "gl:G00001",
        "md:hsa_M00554",
        "ko:K00001",
        "path:hsa04914",
        "rc:RC00001",
        "rn:R00001",
        "rp:RP00001",
    )
    for identifier in identifiers:
        parser.parse(parser.get(identifier))
示例#8
0
class KEGGMapper(object):
    """Collect the KEGG entries of ``hsa`` genes into a pandas DataFrame.

    On construction the ``hsa`` identifiers are listed through ``KeggParser``,
    the entries are loaded with :meth:`load_all_kegg_entries` into
    ``alldata`` and :meth:`build_dataframe` is called to populate ``df``.
    """
    # Database names accepted in an entry's ``dblinks`` field; each one gets
    # a ``<name>_kegg`` column in the dataframe.
    kegg_dblinks = [
        "IMGT", "Ensembl", "HGNC", "HPRD", "NCBI-GI", "OMIM", "NCBI-GeneID",
        "UniProt", "Vega"
    ]

    def __init__(self, verbose=True):
        """Load identifiers, fetch the entries and build the dataframe.

        :param verbose: forwarded to the ``KeggParser`` constructor.

        A failure in :meth:`build_dataframe` is reported on stdout and leaves
        ``df`` unset.
        """
        self._kegg_service = KeggParser(verbose=verbose)

        print("Loading all gene identifiers for HSA")
        listing = self._kegg_service.list("hsa")
        # One record per line; the identifier is the first tab-separated field.
        self._names = [
            record.split("\t")[0] for record in listing.strip().split("\n")
        ]

        print("Fetching all data")
        self.alldata = {}
        self.load_all_kegg_entries()

        print("Building the dataframe")
        try:
            self.df = self.build_dataframe()
        except Exception as err:
            # Still best effort, but catch Exception rather than everything so
            # that KeyboardInterrupt/SystemExit propagate, and say what failed.
            print("error in build_dataframe: %s" % err)

    def _get_names(self):
        """Return the identifiers collected by ``__init__``."""
        return self._names

    names = property(_get_names)

    def _get_entries(self):
        """Return the keys of ``alldata`` in sorted order."""
        return sorted(self.alldata.keys())

    entries = property(_get_entries)

    def build_dataframe(self):
        """Return a DataFrame with one row per entry of ``alldata``.

        The columns are a fixed list of entry fields plus one ``<db>_kegg``
        column per name in ``kegg_dblinks``.  A field that is absent from an
        entry is stored as ``None``.

        :raises NotImplementedError: if an entry's ``dblinks`` contains a
            database name that is not in ``kegg_dblinks``.
        """
        names = [
            'class', 'definition', 'disease', 'drug_target', 'module', 'motif',
            'name', 'orthology', 'pathway', 'position', 'structure'
        ]

        # ``entries`` sorts the keys on every access, so read it only once.
        entries = self.entries
        N = len(entries)

        # Start with empty dblink columns ...
        data = {}
        for this in self.kegg_dblinks:
            data["%s_kegg" % this] = [None] * N

        # ... and add the other fields of interest (e.g. name == 'position').
        for name in names:
            data[name] = [self.alldata[entry].get(name) for entry in entries]

        df = pd.DataFrame(data, index=entries)

        # Second pass: fill the dblink cells.
        for entry in entries:
            res = self.alldata[entry]['dblinks']
            for key in res:
                if key not in self.kegg_dblinks:
                    raise NotImplementedError(
                        "Found an unknown key in KEGG dblink:%s" % key)
                # Single-step label assignment.  The former
                # ``df.ix[entry][col] = ...`` was a chained assignment on an
                # indexer that newer pandas releases no longer provide.
                df.at[entry, key + "_kegg"] = res[key]
        return df

    def load_all_kegg_entries(self, filename="kegg_gene.dat"):
        """Load the entries from a pickle file, or fetch them via ``test_func``.

        :param filename: pickle file to read when it exists.
        :returns: the unpickled object when ``filename`` exists, else ``None``
            (fetched results are stored in ``mcresults`` and ``alldata``).
        """
        if os.path.isfile(filename):
            import pickle
            # NOTE(review): unpickling can execute arbitrary code; only point
            # ``filename`` at trusted files.
            # Binary mode is required for pickles on Python 3, and the
            # ``with`` block closes the handle.
            with open(filename, "rb") as handle:
                results = pickle.load(handle)
            # NOTE(review): the loaded object is only returned; ``alldata``
            # is not filled from it and ``__init__`` ignores the return value.
            return results
        # TODO:
        # download from a URL
        print(
            "could not find kegg data. fetching data from website if possible")
        # Only request the identifiers that are not stored yet.
        names = [x for x in self.names if x not in self.alldata]
        print("Fetching %s enties" % len(names))

        mc = test_func(names)
        self.mcresults = mc.results.copy()
        try:
            # Derive each key from its own result.  Zipping a filtered key
            # list with the unfiltered results shifted every entry that
            # followed an empty result onto the wrong key.
            fetched = [
                ("hsa:" + result['entry'].split()[0], result)
                for result in self.mcresults if result
            ]
        except Exception as err:
            print("something wrng happened while scaning mcresults: %s" % err)
        else:
            self.alldata.update(fetched)
示例#9
0
        try:
            entries = [
                "hsa:" + x['entry'].split()[0] for x in self.mcresults if x
            ]

            for entry, result in zip(entries, self.mcresults):
                self.alldata[entry] = result
        except:
            print("something wrng happened while scaning mcresults")

        #import pickle
        #pickle.dump(results, open("kegg_gene.dat","w"))


from bioservices import KeggParser
# Module-level parser instance, created at import time with verbose output
# disabled; keggfunc() below accesses it through ``global kegg``.
kegg = KeggParser(verbose=False)


def test_func(names):
    """Queue one ``keggfunc`` job per name, run them all, return the runner.

    :param names: iterable of values, each passed to ``keggfunc`` as its
        single argument.
    :returns: the ``easydev.MultiProcessing`` instance after ``run()`` has
        been called on it.
    """
    from easydev import MultiProcessing

    runner = MultiProcessing(verbose=False, maxcpu=8)
    for identifier in names:
        runner.add_job(keggfunc, identifier)
    runner.run()
    return runner


def keggfunc(name):
    global kegg
    try:
        id_ = kegg.get(name)
示例#10
0
def test_KeggParser():
    """Fetch one entry per KEGG database prefix and run it through ``parse``.

    Nothing is asserted about the parsed results; the check is only that
    ``get`` followed by ``parse`` completes for every identifier.
    """
    parser = KeggParser()
    identifiers = (
        "cpd:C00001",
        "ds:H00001",
        "dr:D00001",
        "ev:E00001",
        "ec:1.1.1.1",
        "hsa:1525",
        "genome:T00001",
        "gl:G00001",
        "md:hsa_M00554",
        "ko:K00001",
        "path:hsa04914",
        "rc:RC00001",
        "rn:R00001",
        "rp:RP00001",
    )
    for identifier in identifiers:
        parser.parse(parser.get(identifier))