Exemplo n.º 1
0
 def test_get_hsa_10458_list_ece_Z5100_as_ntseq(self):
     """Request two gene entries, given as a list, in ntseq format."""
     entry_ids = ["hsa:10458", "ece:Z5100"]
     expected_url = "http://rest.kegg.jp/get/hsa:10458+ece:Z5100/ntseq"
     with kegg_get(entry_ids, "ntseq") as handle:
         records = SeqIO.parse(handle, "fasta")
         self.assertEqual(handle.url, expected_url)
         # Two identifiers were requested, so two FASTA records are expected.
         self.assertEqual(len(list(records)), 2)
Exemplo n.º 2
0
 def test_get_hsa_10458_list_ece_Z5100_as_ntseq(self):
     """Request two gene entries, given as a list, in ntseq format."""
     h = kegg_get(["hsa:10458", "ece:Z5100"], "ntseq")
     # try/finally releases the handle even when an assertion fails.
     try:
         self.assertEqual(
             h.url, "http://rest.kegg.jp/get/hsa:10458+ece:Z5100/ntseq")
         data = SeqIO.parse(h, 'fasta')
         self.assertEqual(len(list(data)), 2)
     finally:
         h.close()
Exemplo n.º 3
0
def get_kegg_ontology(koids):
    r"""
    Get the BRITE section of the KEGG flat-file entry of each identifier

    Arguments
    ----------
        koids: sequence of str
            KEGG identifiers to look up; they are passed to ``kegg_get`` in
            slices of 10

    Returns
    -------
        sections: list
            One item per element of ``koids``, in the same order: the text of
            the entry from its ``BRITE`` field up to (not including) its
            ``DBLINKS`` field, or its ``GENES`` field when the entry has no
            ``DBLINKS``; the text runs to the end of the entry when neither
            is present. ``None`` is returned for identifiers whose batch
            raised ``HTTPError``, whose entry has no ``BRITE`` field, or for
            which no entry was paired.
    """
    step = 10
    all_res = {}
    for i in trange(0, len(koids), step):
        batch = koids[i:i + step]
        try:
            handle = kegg_get(batch)
            try:
                res = handle.read()
            finally:
                # Release the connection as soon as the text has been read.
                handle.close()
        except HTTPError:
            continue
        # NOTE(review): entries are paired with identifiers purely by
        # position; if the response omits an entry the pairing shifts.
        for koid, entry in zip(batch, res.split('ENTRY')[1:]):
            start = entry.find('BRITE')
            if start == -1:
                # No BRITE field: leave the id unset so the caller gets None
                # instead of a slice computed from index -1.
                continue
            stop_text = 'DBLINKS' if 'DBLINKS' in entry else 'GENES'
            stop = entry.find(stop_text)
            if stop == -1:
                # No terminating field: keep everything after BRITE rather
                # than silently dropping the last character.
                all_res[koid] = entry[start:]
            else:
                all_res[koid] = entry[start:stop]

    return [all_res.get(code, None) for code in koids]
Exemplo n.º 4
0
def get_kegg_modules(expand_nested_modules=True):
    """
    Fetch and parse every entry listed by ``kegg_list("module")``.

    Each listed module is downloaded with ``kegg_get`` and parsed with
    ``parse_kegg_module``; the parsed results become the rows of a DataFrame
    whose index name records the access time and local timezone name.

    When ``expand_nested_modules`` is true, each row's "ORTHOLOGY_SET" is
    replaced by a set holding its members that start with "K" plus, for every
    member that starts with "M", the members of the "ORTHOLOGY_SET" stored
    under that label in the same DataFrame.
    """
    parsed_modules = []
    for raw_line in pv(list(kegg_list("module")), "Parsing module files"):
        # Both unpackings double as format checks: a line must hold exactly
        # two tab-separated fields and the first must contain one ":".
        module, _description = raw_line.strip().split("\t")
        _prefix, _module_id = module.split(":")
        parsed_modules.append(parse_kegg_module(kegg_get(module)))

    df = pd.DataFrame(parsed_modules)
    stamp_format = "Accessed: %Y-%m-%d @ %H:%M [{}]".format(time.tzname[0])
    df.index.name = datetime.datetime.now().strftime(stamp_format)

    if not expand_nested_modules:
        return df

    # Expand nested modules
    for id_module, row in df.iterrows():
        expanded = set()
        for member in row["ORTHOLOGY_SET"]:
            if member.startswith("K"):
                expanded.add(member)
            elif member.startswith("M"):
                expanded.update(df.loc[member, "ORTHOLOGY_SET"])
        df.loc[id_module, "ORTHOLOGY_SET"] = expanded
    return df
Exemplo n.º 5
0
 def test_get_hsa_10458_plus_ece_Z5100_as_aaseq(self):
     """Request two gene entries, given as one '+'-joined string, as aaseq."""
     h = kegg_get("hsa:10458+ece:Z5100", "aaseq")
     # try/finally releases the handle even when an assertion fails.
     try:
         self.assertEqual(
             h.url, "http://rest.kegg.jp/get/hsa:10458+ece:Z5100/aaseq")
         data = SeqIO.parse(h, "fasta")
         self.assertEqual(len(list(data)), 2)
     finally:
         h.close()
Exemplo n.º 6
0
    def test_parser_roundtrip(self):
        """Parse a downloaded pathway, serialise it to KGML and parse that again."""
        with kegg_get("ko00680", "kgml") as remote_handle:
            downloaded = KGML_parser.read(remote_handle)

        kgml_text = downloaded.get_KGML()
        with io.StringIO(kgml_text) as local_handle:
            reparsed = KGML_parser.read(local_handle)

        # The name and the number of relations must survive the round trip.
        self.assertEqual(downloaded.name, reparsed.name)
        self.assertEqual(len(downloaded.relations), len(reparsed.relations))
Exemplo n.º 7
0
 def parse_enzyme(line):
     """Build a KeggEnzyme for every whitespace-separated id on ``line``.

     Each id is fetched with ``kegg_get`` and handed to ``parse_enzyme_file``
     together with a fresh ``KeggEnzyme``. Enzymes that load successfully are
     appended to ``ret`` (a name from the enclosing scope); an ``HTTPError``
     is reported on stdout and that id is skipped.
     """
     failure_report = (
         "cannot get information for enzyme: {}\nsource line:\n{}\nenzyme list:\n{}")
     enzymes = line.strip().split()
     for e in enzymes:
         enzyme = KeggEnzyme(e)
         try:
             parse_enzyme_file(kegg_get(e), enzyme)
         except HTTPError:
             print(failure_report.format(e, line, str(enzymes)))
             continue
         ret.append(enzyme)
Exemplo n.º 8
0
def get_kegg_name(id_name):
    '''
    take a KEGG id, e.g. C00014 or K10534 or R00796
    return its name

    The entry is fetched with kegg_get and the text after the "NAME" field
    label is returned with leading whitespace and every ";" removed. Only the
    NAME line itself is used; names on continuation lines are not returned.
    If several lines start with "NAME" the last one wins, and if none does
    the last line of the response is used (index -1).
    '''
    resp = kegg_get(id_name)
    try:
        lines = resp.read().split("\n")
    finally:
        # Always release the connection, even if reading fails.
        resp.close()
    nameidx = -1
    for idx, line in enumerate(lines):
        if line.startswith("NAME"):
            nameidx = idx
    # [5:] skips the field label; the value starts after the padding.
    return lines[nameidx][5:].lstrip().replace(";", "")
Exemplo n.º 9
0
    def parse_reaction(string, record):
        """Parse one reaction line and append a KeggReaction to ``record``.

        ``string`` is split at its first run of whitespace into a reaction
        field and an equation. The equation is split on ' -> ' into a
        substrate side and a product side, each a ' + '-separated list;
        substrates are stored with value -1 and products with value 1.

        The reaction field is a ','-separated list whose items may join
        several reaction ids with '+'. Every id is fetched with ``kegg_get``
        and parsed with ``parse_reaction_file``; the results for one
        ','-separated item are added to the ``Catalyst`` as one list.

        The new reaction is appended to ``record.reactions`` with
        ``reversible=1`` and the raw reaction field as ``data``.
        """
        rxns, eq = string.strip().split(None, 1)
        substrates, products = eq.split(' -> ')

        reactants = {}
        for substrate in substrates.split(' + '):
            reactants[substrate] = -1
        for product in products.split(' + '):
            reactants[product] = 1

        catalyst = Catalyst()
        for rxn in rxns.split(','):
            enzymes = []
            for r in rxn.split('+'):
                # Fetch each '+'-joined id on its own. Passing the whole
                # ``rxn`` here repeated the same combined request once per id.
                enzymes.extend(parse_reaction_file(kegg_get(r)))
            catalyst.add(enzymes)

        record.reactions.append(
            KeggReaction(reactants,
                         catalysts=(catalyst, ),
                         reversible=1,
                         data=rxns))
Exemplo n.º 10
0
 def test_get_C01290_list_G00092(self):
     """Request a compound and a glycan, given as a list without prefixes."""
     h = kegg_get(["C01290", "G00092"])
     # try/finally releases the handle even when the assertion fails.
     try:
         h.read()
         self.assertEqual(h.url, "http://rest.kegg.jp/get/C01290+G00092")
     finally:
         h.close()
Exemplo n.º 11
0
def get_kgml(pathway_id):
    """Return what ``kegg_get(pathway_id, 'kgml')`` yields when read."""
    handle = kegg_get(pathway_id, 'kgml')
    try:
        return handle.read()
    finally:
        # Close the connection instead of leaving it to garbage collection.
        handle.close()
    importer.delete_all()

# Make sure the data directory exists before it is listed.
if not os.path.exists(args.data_dir):
    os.makedirs(args.data_dir)

files = [f for f in listdir(args.data_dir) if isfile(join(args.data_dir, f))]

if len(files) == 0:
    # The directory holds no files yet: download the KGML of each listed
    # 'hsa' pathway, except hsa01100.
    res = kegg_list('pathway', 'hsa').read()
    items = res.split('\n')
    # The last split element is skipped (presumably the empty string left by
    # a trailing newline).
    for item in items[:-1]:
        # NOTE(review): characters 5-12 are taken as the pathway id, which
        # assumes each line starts with a 5-character prefix such as "path:".
        pathway_id = item[5:13]
        if pathway_id != 'hsa01100':
            print('fetching ' + pathway_id)
            kgml = kegg_get(pathway_id, 'kgml').read()
            # join() keeps the file inside data_dir even when the directory
            # was given without a trailing separator, matching the listing.
            with open(join(args.data_dir, pathway_id + '.kgml'), 'w') as text_file:
                text_file.write(kgml)

    files = [
        f for f in listdir(args.data_dir) if isfile(join(args.data_dir, f))
    ]


def get_node_names(type):
    # Reads the kegg_list(type) response and walks every line except the
    # last, splitting each one on tabs.
    # NOTE(review): the definition appears to be cut off after the split
    # below (nothing is stored in `names` or returned) -- confirm against
    # the full file.
    # NOTE(review): the parameter `type` and the local `list` shadow builtins.
    list = kegg_list(type).read()
    names = dict()
    entries = list.split('\n')

    for entry in entries[:len(entries) - 1]:
        e = entry.split('\t')
Exemplo n.º 13
0
# KGML files are cached in a "data" directory next to this script.
datapath = "{}/data/".format(os.path.dirname(os.path.realpath(__file__)))

if not os.path.exists(datapath):
  os.makedirs(datapath)

files = [f for f in listdir(datapath) if isfile(join(datapath, f))]

if not files:
  # Nothing cached yet: fetch the KGML of every listed 'hsa' pathway.
  res = kegg_list('pathway', 'hsa').read()
  items = res.split("\n")
  # Every line but the last is processed.
  for item in items[:-1]:
    pathway_id = item[5:13]
    print("fetching %s" % pathway_id)
    kgml = kegg_get(pathway_id, "kgml").read()
    with open("{}{}.kgml".format(datapath, pathway_id), "w") as text_file:
      text_file.write(kgml)

  # Refresh the listing now that the downloads are on disk.
  files = [f for f in listdir(datapath) if isfile(join(datapath, f))]


def get_names(names):
  """Remove every '...' marker from ``names`` and split the rest on ', '."""
  # Omitting the count replaces all occurrences, as a negative count does.
  return names.replace("...", "").split(", ")


# def create_pathway_edge(edgeDict, sourceNode, targetNode):
#     edgeID = sourceNode.properties["id"] + "_" + targetNode.properties["id"]
#     # edgeID = sourceNode.body["id"] + "_" + targetNode.body["id"]
#     if edgeID not in edgeDict:
    importer.delete_all()

# Make sure the data directory exists before it is listed.
if not os.path.exists(args.data_dir):
    os.makedirs(args.data_dir)

files = [f for f in listdir(args.data_dir) if isfile(join(args.data_dir, f))]

if len(files) == 0:
    # The directory holds no files yet: download the KGML of each listed
    # 'hsa' pathway, except hsa01100.
    res = kegg_list('pathway', 'hsa').read()
    items = res.split('\n')
    # The last split element is skipped (presumably the empty string left by
    # a trailing newline).
    for item in items[:-1]:
        # NOTE(review): characters 5-12 are taken as the pathway id, which
        # assumes each line starts with a 5-character prefix such as "path:".
        pathway_id = item[5:13]
        if pathway_id != 'hsa01100':
            print('fetching ' + pathway_id)
            kgml = kegg_get(pathway_id, 'kgml').read()
            # join() keeps the file inside data_dir even when the directory
            # was given without a trailing separator, matching the listing.
            with open(join(args.data_dir, pathway_id + '.kgml'), 'w') as text_file:
                text_file.write(kgml)

    files = [f for f in listdir(args.data_dir) if isfile(join(args.data_dir, f))]


def getNodeNames(type):
    # Reads the kegg_list(type) response and walks every line except the
    # last, splitting each one on tabs.
    # NOTE(review): the definition appears to be cut off after the find()
    # below (nothing is stored in `names` or returned) -- confirm against
    # the full file.
    # NOTE(review): the parameter `type` and the local `list` shadow builtins.
    list = kegg_list(type).read()
    names = dict()
    entries = list.split('\n')

    for entry in entries[:len(entries) - 1]:
        e = entry.split('\t')
        # First column with its first four characters dropped.
        nodeId = e[0][4:]
        # Position of the first ';' in the second column (-1 when absent).
        lastIndex = e[1].find(';')
Exemplo n.º 15
0
 def test_get_hsa05130_image(self):
     """Request a pathway image and check that the payload is a PNG."""
     h = kegg_get("hsa05130", "image")
     # try/finally releases the handle even when an assertion fails.
     try:
         data = h.read()
         self.assertEqual(data[:4], b"\x89PNG")
         self.assertEqual(h.url, "http://rest.kegg.jp/get/hsa05130/image")
     finally:
         h.close()
Exemplo n.º 16
0
 def test_parse_remote_pathway(self):
     """Download a KEGG pathway from the KEGG server and parse KGML."""
     h = kegg_get("ko03070", "kgml")
     # try/finally releases the handle even when parsing or the assertion
     # fails.
     try:
         pathway = KGML_parser.read(h)
         self.assertEqual(pathway.name, "path:ko03070")
     finally:
         h.close()
Exemplo n.º 17
0

# The pathway ID and the output location are read interactively.
pathway = input('Iput pathway ID: ')
save_path = input('Input path to save: ')

# Use Biopython's kegg_link to get the list of all genes on the pathway
# (as KEGG IDs).
print('Retreving gene list . . .')
gene_list_with_pathway = kegg_link('genes', pathway)
# Convert the response to a string.
gene_list_with_pathway_b2string = http_byte2string(gene_list_with_pathway)
# Split the string on the tab and newline separators.
gene_list_split = re.split('\n|\t', gene_list_with_pathway_b2string)
# Drop the even-indexed elements, leaving the ID of every gene on the pathway.
gene_list = gene_list_split[1::2]
# Every sequence is saved under the same directory, so it is built and
# created once here instead of on every loop iteration.
dir2save = save_path + pathway
os.makedirs(dir2save, exist_ok=True)
# Walk through the genes, fetch each sequence with kegg_get and save it
# under that directory.
for i in gene_list:
    # Pause between requests.
    time.sleep(0.4)
    gene_Seq = kegg_get(i, 'ntseq')
    print('Retreving Sequence . . .')
    gene_Seq_byte2string = http_byte2string(gene_Seq)
    # The with-block closes the file even if writing fails.
    with open(dir2save + '/' + i + '.fa', 'w') as gene_Seq_fasta:
        print('Writing Files . . .')
        gene_Seq_fasta.writelines(gene_Seq_byte2string)
print('All done!')
# Use the cat command in bash to merge all the FASTA files.
Exemplo n.º 18
0
 def test_get_hsa_10458_list_ece_Z5100(self):
     """Request two gene entries, given as a list, with no format option."""
     entry_ids = ["hsa:10458", "ece:Z5100"]
     expected_url = "http://rest.kegg.jp/get/hsa:10458+ece:Z5100"
     with kegg_get(entry_ids) as handle:
         handle.read()
     # The URL is checked after the handle has been closed.
     self.assertEqual(handle.url, expected_url)
Exemplo n.º 19
0
 def test_get_cpd_C01290_list_gl_G00092(self):
     """Request a compound and a glycan, given as a list of prefixed ids."""
     entry_ids = ["cpd:C01290", "gl:G00092"]
     expected_url = "http://rest.kegg.jp/get/cpd:C01290+gl:G00092"
     with kegg_get(entry_ids) as handle:
         handle.read()
     # The URL is checked after the handle has been closed.
     self.assertEqual(handle.url, expected_url)
Exemplo n.º 20
0
 def test_get_cpd_C01290_list_gl_G00092(self):
     """Request a compound and a glycan, given as a list of prefixed ids."""
     h = kegg_get(["cpd:C01290", "gl:G00092"])
     # try/finally releases the handle even when the assertion fails.
     try:
         h.read()
         self.assertEqual(
             h.url, "http://rest.kegg.jp/get/cpd:C01290+gl:G00092")
     finally:
         h.close()
Exemplo n.º 21
0
#!/usr/bin/env python
import argparse
import pickle
from Bio.KEGG.REST import kegg_get

if __name__ == '__main__':
    # Imported here rather than at the top of the file, as in the original.
    from kegg_module import KeggModule, KeggReaction, Catalyst, KeggEnzyme, parse

    parser = argparse.ArgumentParser()
    parser.add_argument("pathway", help="An accession to a kegg module")
    parser.add_argument(
        "outfile", help="output file for the pickled KeggModule object")
    args = parser.parse_args()

    module_handle = kegg_get(args.pathway)
    for module in parse(module_handle):
        # The file is reopened in 'wb' mode for every parsed module, so it
        # ends up holding the module from the last iteration.
        with open(args.outfile, 'wb') as pickled:
            pickle.dump(module, pickled, protocol=-1)
Exemplo n.º 22
0
 def test_get_C01290_list_G00092(self):
     """Request a compound and a glycan, given as a list without prefixes."""
     entry_ids = ["C01290", "G00092"]
     expected_url = "http://rest.kegg.jp/get/C01290+G00092"
     with kegg_get(entry_ids) as handle:
         handle.read()
     # The URL is checked after the handle has been closed.
     self.assertEqual(handle.url, expected_url)
Exemplo n.º 23
0
 def test_get_hsa05130_image(self):
     """Request a pathway image and check that the payload is a PNG."""
     png_signature = b"\x89PNG"
     expected_url = "http://rest.kegg.jp/get/hsa05130/image"
     with kegg_get("hsa05130", "image") as handle:
         payload = handle.read()
     # Only the first four bytes are compared against the signature.
     self.assertEqual(payload[:4], png_signature)
     self.assertEqual(handle.url, expected_url)
Exemplo n.º 24
0
 def test_get_hsa_10458_list_ece_Z5100(self):
     """Request two gene entries, given as a list, with no format option."""
     h = kegg_get(["hsa:10458", "ece:Z5100"])
     # try/finally releases the handle even when the assertion fails.
     try:
         h.read()
         self.assertEqual(
             h.url, "http://rest.kegg.jp/get/hsa:10458+ece:Z5100")
     finally:
         h.close()
Exemplo n.º 25
0
def parse_kegg(org_id, strain, sourcedb, session):
    """Load the KEGG pathway relations of one organism into the database.

    Lists the pathways of ``org_id`` with ``kegg_list``, fetches each one as
    KGML with ``kegg_get`` and parses it with ``read``. For every relation
    whose type is not 'maplink', both entries are resolved to existing
    ``Interactor`` / ``Metabolite`` rows or, when ``org_id`` is 'eco', to the
    proteins of matching ``OrthologEcoli`` rows for ``strain``. Metabolites
    that are not stored yet are created. For each resulting pair (pairs whose
    members both have type 'm' are not built) an ``Interaction`` and an
    ``InteractionReference`` are created or reused, and the
    ``InteractionSource`` whose ``data_source`` equals ``sourcedb`` is
    attached to both.

    Arguments:
        org_id: organism code passed to ``kegg_list``; the value 'eco'
            switches protein lookup to ``OrthologEcoli`` and marks new
            interactions as ortholog-derived.
        strain: stored on new ``Interaction`` rows and used to filter
            ``OrthologEcoli.strain_protein``.
        sourcedb: ``data_source`` value used to look up the
            ``InteractionSource``; also printed at the end.
        session: database session providing ``query``, ``add`` and
            ``commit`` (presumably SQLAlchemy -- TODO confirm).

    Returns:
        None. Commits the session and prints ``sourcedb`` followed by the
        total number of ``Interaction`` rows.

    NOTE(review): ``kegg_compounds`` is read as a module-level mapping keyed
    by KEGG compound id whose values have 'name', 'pubchem' and 'chebi'
    entries; it is not defined in this function.
    """
    # get pathways for organism specified by org_id
    pathways = kegg_list(database='pathway', org=org_id).read().split('path:')
    path_ids = []

    # make list of path ids to iterate through
    for path in pathways:
        if path != '':
            # the first 8 characters after 'path:' are taken as the pathway id
            path_ids.append(path[:8])

    # iterate through each path and obtain interactions
    for path in path_ids:
        # get kgml representation of path
        kgml_path = read(kegg_get(path, option='kgml'))
        path_name = kgml_path._getname()
        # dictionary of compounds in current path (node_id: kegg_id)
        #   compound._getid() returns node id (only relevant in context of current path)
        #   compound._getname() returns kegg id (relevant in overall KEGG DB)
        compound_ids = {}
        for compound in kgml_path.compounds:
            # keep only the last 6 characters of the compound name as the kegg id
            compound_ids[compound._getid()] = compound._getname()[-6:]
        # go through each relation in path
        for relation in kgml_path.relations:
            relation_type = relation.element.attrib['type']

            # ignore maplink relations
            if relation_type == 'maplink': continue
            # relation._getentry1/2() returns  protein id (locus) or compound id (KEGG id)
            entries = [relation._getentry1()._getname(), relation._getentry2()._getname()]
            # if one or both interactors are listed as undefined, move on to next interaction
            # NOTE(review): `|` and `&` are used on booleans throughout this function; unlike
            # `or` / `and` they always evaluate both operands (no short-circuit).
            if (entries[0] == 'undefined') | (entries[1] == 'undefined'): continue
            # list to hold existing interactors
            interactors = [[], []]
            # list to hold new metabolite ids for interactions with metabolites not yet in the database
            new_metabolites = [[], []]
            # go through each entry in the relation
            for num in range(0, 2):
                # each entry may contain >1 id; go through all of them
                for id in entries[num].split(' '):
                    if id == '': continue
                    # if interactor is not protein or compound, continue
                    if (id.split(':')[0] != org_id) & (id.split(':')[1] not in kegg_compounds): continue

                    # check if the id is a kegg id by searching in kegg_compounds
                    kegg_id= None
                    if id.split(':')[1] in kegg_compounds:
                        kegg_id = id.split(':')[1]

                    # check if interactor (protein) already exists
                    if (kegg_id is None) & (org_id != 'eco'):
                        interactor = session.query(Interactor).get(id.split(':')[1])
                        if interactor is not None:
                            # make sure to add None value; this will be needed to create interaction reference later
                            # None is appended rather than the interactor id because the interactor is not an ortholog
                            interactors[num].append([interactor, None])
                    # if it doesnt exist, it's not a valid protein, so check if it is a valid compound
                    elif kegg_id is not None:
                        interactor = session.query(Metabolite).filter_by(kegg = kegg_id).first()
                        # if metabolite with id was not found, append the kegg_id to new_metabolites to create
                        if interactor is None:
                            new_metabolites[num].append(kegg_id)
                        else:
                            # if the metabolite was found, add it to the existing interactor list
                            interactors[num].append([interactor, interactor.id])
                    # if parsing E. coli path, add all orthologs to interactor list
                    elif org_id == 'eco':
                        for ortholog in session.query(OrthologEcoli).filter_by(ortholog_id = id.split(':')[1],
                                                                                strain_protein = strain).all():
                            if ortholog is not None:
                                # add the id of the ecoli protein for the interaction reference later
                                interactors[num].append([ortholog.protein, id.split(':')[1]])

            # create list of interactor pairs from two separate lists
            interactor_pairs = []
            # create interactor pairs from interactors which already exist in db
            for interactor1 in interactors[0]:
                for interactor2 in interactors[1]:
                    # keep the pair unless both members have type 'm'
                    if (interactor1[0].type != 'm') | (interactor2[0].type != 'm'):
                        interactor_pairs.append([interactor1, interactor2])
            # create interactor pair from interactors and new metabolites
            for interactor1 in interactors[0]:
                for id in new_metabolites[1]:
                    # ignore interactor pairs which would result in m-m interactions
                    if interactor1[0].type == 'm': continue
                    # Note: can query metabolite with kegg only because we updated the metabolite info first
                    metabolite = session.query(Metabolite).filter_by(kegg = id).first()
                    if metabolite is None:
                        metabolite = Metabolite(id = id, kegg = id, pubchem = kegg_compounds[id]['pubchem'],
                                                chebi = kegg_compounds[id]['chebi'])
                        session.add(metabolite)
                    interactor_pairs.append([interactor1, [metabolite, metabolite.id]])
            # same as above with the sides swapped: second-entry interactors x first-entry new metabolites
            for interactor1 in interactors[1]:
                for id in new_metabolites[0]:
                    if interactor1[0].type == 'm': continue
                    metabolite = session.query(Metabolite).filter_by(kegg = id).first()
                    if metabolite is None:
                        metabolite = Metabolite(id = id, kegg = id, pubchem = kegg_compounds[id]['pubchem'],
                                                chebi = kegg_compounds[id]['chebi'])
                        session.add(metabolite)
                    interactor_pairs.append([interactor1, [metabolite, metabolite.id]])

            # if no interactor pairs were found, move on to the next interaction
            if len(interactor_pairs) == 0: continue

            # get all intermediates in reaction of type compound
            intermeds = []
            for subtype in relation.element.iter(tag='subtype'):
                # if the subtype element is a compound, get its node id
                if 'compound' in subtype.attrib:
                    compound_node_id = subtype.attrib['compound']
                    if compound_node_id is None: continue
                    # if the node id was not stored in the compound ids for this path, move on to the next subtype
                    if int(compound_node_id) not in compound_ids: continue
                    # if compound id is valid, either add existing matching metabolite or create new one and add
                    kegg_id = compound_ids[int(compound_node_id)]
                    metabolite = session.query(Metabolite).filter_by(kegg = kegg_id).first()
                    if metabolite is None:
                        metabolite = Metabolite(id=kegg_id, name=kegg_compounds[kegg_id]['name'],
                                                pubchem=kegg_compounds[kegg_id]['pubchem'],
                                                chebi=kegg_compounds[kegg_id]['chebi'], kegg=kegg_id)
                        session.add(metabolite)
                    intermeds.append([metabolite, metabolite.id])

            # add protein - intermediate interactor pairs
            for interactor_list in interactors:
                for interactor in interactor_list:
                    if interactor[0].type != 'm':
                        for intermed in intermeds:
                            interactor_pairs.append([interactor, intermed])

            # go through each interaction pair and add interaction if it doesnt exist yet
            for interactor_pair in interactor_pairs:
                # True when both members of the pair are the same interactor
                homogenous = (interactor_pair[0][0] == interactor_pair[1][0])
                interaction = session.query(Interaction).filter(Interaction.interactors.contains(interactor_pair[0][0]),
                                                                Interaction.interactors.contains(interactor_pair[1][0]),
                                                                Interaction.homogenous == homogenous).first()

                source = session.query(InteractionSource).filter_by(data_source=sourcedb).first()
                #create interaction if it doesnt exist yet, add source to its sources if it isn't already
                if interaction is None:
                    interaction = Interaction(type=interactor_pair[0][0].type + '-' + interactor_pair[1][0].type,
                                              strain=strain, homogenous=homogenous,
                                              interactors=[interactor_pair[0][0], interactor_pair[1][0]])
                    interaction.sources.append(source)
                    if org_id == 'eco':
                        interaction.ortholog_derived = 'Ecoli'
                    # the tuple expression runs add() and then commit(), left to right
                    session.add(interaction), session.commit()
                elif source not in interaction.sources:
                    interaction.sources.append(source)

                # in case the interaction already existed, make sure interactor_a and interactor_b variables for
                # new interaction reference match up with the first and second interactors of the existing
                # interaction (so it's easy to see which Pseudomonas interactor matches up with which Ecoli
                # ortholog if the org id is eco)
                interactor_a, interactor_b = None, None
                if org_id == 'eco':
                    if interaction.interactors[0] == interactor_pair[0][0]:
                        interactor_a = interactor_pair[0][1]
                        interactor_b = interactor_pair[1][1]
                    else:
                        interactor_b = interactor_pair[0][1]
                        interactor_a = interactor_pair[1][1]

                # search for reference
                reference = session.query(InteractionReference).filter_by(source_db='kegg',
                                                                          comment='in ' + path_name + ' path',
                                                                          interactor_a=interactor_a,
                                                                          interactor_b=interactor_b).first()
                # if the reference doesnt exist, create it, add it to the interaction's references and add the source
                # to the reference's sources
                if reference is None:
                    reference = InteractionReference(source_db='kegg', comment='in ' + path_name + ' path',
                                                     interactor_a=interactor_a, interactor_b=interactor_b)
                    interaction.references.append(reference)
                    reference.sources.append(source)
                # if the reference does exist, add it to the interaction's reference list and add the source to the
                # reference's source list if it isn't there already
                else:
                    if interaction not in reference.interactions:
                        reference.interactions.append(interaction)
                    if source not in reference.sources:
                        reference.sources.append(source)

    session.commit()
    print(sourcedb, session.query(Interaction).count())
Exemplo n.º 26
0
 def test_parse_remote_pathway(self):
     """Fetch a pathway as KGML from the KEGG server and parse it."""
     expected_name = "path:ko03070"
     with kegg_get("ko03070", "kgml") as handle:
         parsed = KGML_parser.read(handle)
     # The name is checked after the handle has been closed.
     self.assertEqual(parsed.name, expected_name)
Exemplo n.º 27
0
 def test_get_br_ko00002(self):
     """Request the br:ko00002 entry with the json option."""
     expected_url = "http://rest.kegg.jp/get/br:ko00002/json"
     with kegg_get("br:ko00002", "json") as handle:
         handle.read()
     # The URL is checked after the handle has been closed.
     self.assertEqual(handle.url, expected_url)