示例#1
0
class KEGGMapper(object):
    """Collect KEGG gene entries for ``hsa`` and tabulate them in a DataFrame.

    On construction the instance lists the ``hsa`` identifiers through
    ``KeggParser``, calls :meth:`load_all_kegg_entries` to populate
    :attr:`alldata`, and then tries to build :attr:`df` with
    :meth:`build_dataframe`.
    """

    # Database names accepted in an entry's 'dblinks' mapping; each one
    # becomes a "<name>_kegg" column of the dataframe.
    kegg_dblinks = ["IMGT", "Ensembl", "HGNC", "HPRD", "NCBI-GI", "OMIM",
                    "NCBI-GeneID", "UniProt", "Vega"]

    def __init__(self, verbose=True):
        """Load identifiers, fetch the entries and build the dataframe.

        :param verbose: forwarded unchanged to ``KeggParser``.
        """
        self._kegg_service = KeggParser(verbose=verbose)

        print("Loading all gene identifiers for HSA")
        listing = self._kegg_service.list("hsa")
        # One record per line; the identifier is the first tab-separated field.
        self._names = [line.split("\t")[0]
                       for line in listing.strip().split("\n")]

        print("Fetching all data")
        self.alldata = {}
        self.load_all_kegg_entries()

        print("Building the dataframe")
        # Defined up front so the attribute exists even when the build fails.
        self.df = None
        try:
            self.df = self.build_dataframe()
        except Exception:
            # Best effort, as before, but no longer swallowing
            # KeyboardInterrupt / SystemExit.
            print("error in build_dataframe")

    def _get_names(self):
        """Return the identifiers collected in ``__init__``."""
        return self._names
    names = property(_get_names)

    def _get_entries(self):
        """Return the keys of :attr:`alldata` in sorted order."""
        return sorted(self.alldata.keys())
    entries = property(_get_entries)

    def build_dataframe(self):
        """Build a DataFrame with one row per entry of :attr:`alldata`.

        Columns are ``<db>_kegg`` for every name in :attr:`kegg_dblinks`
        followed by a fixed list of annotation fields. A value missing from
        a record is stored as ``None``.

        :return: ``pandas.DataFrame`` indexed by the sorted entry keys.
        :raises NotImplementedError: if a record's ``dblinks`` holds a key
            that is not listed in :attr:`kegg_dblinks`.
        """
        fields = ['class', 'definition', 'disease', 'drug_target',
                  'module', 'motif', 'name', 'orthology', 'pathway',
                  'position', 'structure']

        # The property sorts on every access, so evaluate it once.
        entries = self.entries
        records = [self.alldata[entry] for entry in entries]

        # Same column order as before: dblink columns first, then the fields.
        data = {}
        for db in self.kegg_dblinks:
            data["%s_kegg" % db] = [None] * len(entries)
        for field in fields:
            data[field] = [record.get(field) for record in records]

        # Fill the dblink cells in the plain lists before the frame exists.
        # The former ``df.ix[entry][col] = value`` was chained indexing (it
        # could assign into a temporary copy) and ``.ix`` is gone from
        # current pandas.
        for row, record in enumerate(records):
            # A record without 'dblinks' simply leaves its row at None.
            links = record.get('dblinks') or {}
            for key, value in links.items():
                if key not in self.kegg_dblinks:
                    raise NotImplementedError("Found an unknown key in KEGG dblink:%s" % key)
                data[key + "_kegg"][row] = value

        return pd.DataFrame(data, index=entries)

    def load_all_kegg_entries(self, filename="kegg_gene.dat"):
        """Load entries from a pickle cache or fetch the missing ones.

        If ``filename`` exists it is unpickled and the object is returned
        as is. Otherwise every name not yet in :attr:`alldata` is handed to
        ``test_func`` and the non-empty results are stored in
        :attr:`alldata` under ``"hsa:<first token of result['entry']>"``.

        :param filename: path of the pickle cache.
        :return: the unpickled object when the cache exists, else ``None``.
        """
        if os.path.isfile(filename):
            import pickle
            # Pickle data is binary: open in "rb" and close the handle.
            # SECURITY: unpickling can execute arbitrary code; only point
            # this at cache files you created yourself.
            with open(filename, "rb") as handle:
                results = pickle.load(handle)
            # NOTE(review): the cached object is returned but never merged
            # into self.alldata, and __init__ ignores the return value --
            # confirm the pickle layout before wiring it in.
            return results
        # TODO:
        # download from a URL
        print("could not find kegg data. fetching data from website if possible")
        # Fetches the KEGG results using multicore to send several requests at the same time
        names = [x for x in self.names if x not in self.alldata]
        print("Fetching %s enties" % len(names))

        mc = test_func(names)
        self.mcresults = mc.results.copy()
        try:
            # Derive each key from the very result it will label. Zipping a
            # filtered key list against the unfiltered results shifted every
            # key after the first empty result onto the wrong record.
            fetched = [("hsa:" + result['entry'].split()[0], result)
                       for result in self.mcresults if result]
            self.alldata.update(fetched)
        except Exception:
            print("something wrng happened while scaning mcresults")
示例#2
0
class KEGGMapper(object):
    """Collect KEGG gene entries for ``hsa`` and tabulate them in a DataFrame.

    On construction the instance lists the ``hsa`` identifiers through
    ``KeggParser``, calls :meth:`load_all_kegg_entries` to populate
    :attr:`alldata`, and then tries to build :attr:`df` with
    :meth:`build_dataframe`.
    """

    # Database names accepted in an entry's 'dblinks' mapping; each one
    # becomes a "<name>_kegg" column of the dataframe.
    kegg_dblinks = [
        "IMGT", "Ensembl", "HGNC", "HPRD", "NCBI-GI", "OMIM", "NCBI-GeneID",
        "UniProt", "Vega"
    ]

    def __init__(self, verbose=True):
        """Load identifiers, fetch the entries and build the dataframe.

        :param verbose: forwarded unchanged to ``KeggParser``.
        """
        self._kegg_service = KeggParser(verbose=verbose)

        print("Loading all gene identifiers for HSA")
        listing = self._kegg_service.list("hsa")
        # One record per line; the identifier is the first tab-separated field.
        self._names = [
            line.split("\t")[0] for line in listing.strip().split("\n")
        ]

        print("Fetching all data")
        self.alldata = {}
        self.load_all_kegg_entries()

        print("Building the dataframe")
        # Defined up front so the attribute exists even when the build fails.
        self.df = None
        try:
            self.df = self.build_dataframe()
        except Exception:
            # Best effort, as before, but no longer swallowing
            # KeyboardInterrupt / SystemExit.
            print("error in build_dataframe")

    def _get_names(self):
        """Return the identifiers collected in ``__init__``."""
        return self._names

    names = property(_get_names)

    def _get_entries(self):
        """Return the keys of :attr:`alldata` in sorted order."""
        return sorted(self.alldata.keys())

    entries = property(_get_entries)

    def build_dataframe(self):
        """Build a DataFrame with one row per entry of :attr:`alldata`.

        Columns are ``<db>_kegg`` for every name in :attr:`kegg_dblinks`
        followed by a fixed list of annotation fields. A value missing from
        a record is stored as ``None``.

        :return: ``pandas.DataFrame`` indexed by the sorted entry keys.
        :raises NotImplementedError: if a record's ``dblinks`` holds a key
            that is not listed in :attr:`kegg_dblinks`.
        """
        fields = [
            'class', 'definition', 'disease', 'drug_target', 'module', 'motif',
            'name', 'orthology', 'pathway', 'position', 'structure'
        ]

        # The property sorts on every access, so evaluate it once.
        entries = self.entries
        records = [self.alldata[entry] for entry in entries]

        # Same column order as before: dblink columns first, then the fields.
        data = {}
        for db in self.kegg_dblinks:
            data["%s_kegg" % db] = [None] * len(entries)
        for field in fields:
            data[field] = [record.get(field) for record in records]

        # Fill the dblink cells in the plain lists before the frame exists.
        # The former ``df.ix[entry][col] = value`` was chained indexing (it
        # could assign into a temporary copy) and ``.ix`` is gone from
        # current pandas.
        for row, record in enumerate(records):
            # A record without 'dblinks' simply leaves its row at None.
            links = record.get('dblinks') or {}
            for key, value in links.items():
                if key not in self.kegg_dblinks:
                    raise NotImplementedError(
                        "Found an unknown key in KEGG dblink:%s" % key)
                data[key + "_kegg"][row] = value

        return pd.DataFrame(data, index=entries)

    def load_all_kegg_entries(self, filename="kegg_gene.dat"):
        """Load entries from a pickle cache or fetch the missing ones.

        If ``filename`` exists it is unpickled and the object is returned
        as is. Otherwise every name not yet in :attr:`alldata` is handed to
        ``test_func`` and the non-empty results are stored in
        :attr:`alldata` under ``"hsa:<first token of result['entry']>"``.

        :param filename: path of the pickle cache.
        :return: the unpickled object when the cache exists, else ``None``.
        """
        if os.path.isfile(filename):
            import pickle
            # Pickle data is binary: open in "rb" and close the handle.
            # SECURITY: unpickling can execute arbitrary code; only point
            # this at cache files you created yourself.
            with open(filename, "rb") as handle:
                results = pickle.load(handle)
            # NOTE(review): the cached object is returned but never merged
            # into self.alldata, and __init__ ignores the return value --
            # confirm the pickle layout before wiring it in.
            return results
        # TODO:
        # download from a URL
        print(
            "could not find kegg data. fetching data from website if possible")
        # Fetches the KEGG results using multicore to send several requests at the same time
        names = [x for x in self.names if x not in self.alldata]
        print("Fetching %s enties" % len(names))

        mc = test_func(names)
        self.mcresults = mc.results.copy()
        try:
            # Derive each key from the very result it will label. Zipping a
            # filtered key list against the unfiltered results shifted every
            # key after the first empty result onto the wrong record.
            fetched = [
                ("hsa:" + result['entry'].split()[0], result)
                for result in self.mcresults if result
            ]
            self.alldata.update(fetched)
        except Exception:
            print("something wrng happened while scaning mcresults")