示例#1
0
文件: db.py 项目: jolespin/soothsayer
def get_kegg_modules(expand_nested_modules=True):
    """Download and parse every KEGG module into a single DataFrame.

    Each line returned by ``kegg_list("module")`` is expected to hold exactly
    two tab-separated fields whose first field has the form
    ``<prefix>:<module id>``; the module is fetched with ``kegg_get`` and
    parsed with ``parse_kegg_module``.  The name of the resulting frame's
    index records the access time and the local timezone name.

    Parameters
    ----------
    expand_nested_modules : bool
        When true, every member of a row's ``ORTHOLOGY_SET`` that starts with
        ``"M"`` is replaced by the ``ORTHOLOGY_SET`` of the row labelled with
        that member; members starting with ``"K"`` are kept as they are and
        anything else is dropped.

    Returns
    -------
    pandas.DataFrame
        One row per parsed module.
    """
    parsed_modules = []
    for raw_line in pv(list(kegg_list("module")), "Parsing module files"):
        # Both unpackings double as a format check: a malformed line raises.
        module, _name = raw_line.strip().split("\t")
        _prefix, _module_id = module.split(":")
        parsed_modules.append(parse_kegg_module(kegg_get(module)))

    df = pd.DataFrame(parsed_modules)
    stamp_format = "Accessed: %Y-%m-%d @ %H:%M [{}]".format(time.tzname[0])
    df.index.name = datetime.datetime.now().strftime(stamp_format)

    if not expand_nested_modules:
        return df

    # Expand nested modules
    for id_module, row in df.iterrows():
        flattened = set()
        for member in row["ORTHOLOGY_SET"]:
            if member.startswith("K"):
                flattened.add(member)
            elif member.startswith("M"):
                # Looked up in the live frame, so rows processed earlier are
                # seen in their already-expanded form.
                flattened.update(df.loc[member, "ORTHOLOGY_SET"])
        df.loc[id_module, "ORTHOLOGY_SET"] = flattened
    return df
示例#2
0
def get_kegg_compounds():
    """Fill the module-level ``kegg_compounds`` mapping from the KEGG REST API.

    After the call every compound id maps to a dict with the keys ``name``,
    ``chebi`` and ``pubchem``.  ``name`` is the first ``;``-separated name on
    the compound's listing line; the two cross-references stay ``None``
    unless the matching ``kegg_conv`` result provides a value.

    Returns nothing; the mapping is updated in place.
    """
    # fill in compounds dictionary (kegg_id: {name: '', pubchem: '', chebi: ''})
    for listing_line in kegg_list(database='compound'):
        compound_id = listing_line[4:10]
        record = {}
        kegg_compounds[compound_id] = record
        record['name'] = listing_line.split('\t')[1].split(';')[0].rstrip()
        record['chebi'] = None
        record['pubchem'] = None

    # had to change kegg_conv source code to accept 'chebi' as target db for kegg_conv
    for target_db, marker in (('chebi', 'chebi:'), ('pubchem', 'pubchem:')):
        for chunk in kegg_conv(target_db, 'compound').read().split('cpd:'):
            if chunk == '':
                continue
            # Each chunk starts with the 6-character compound id; the external
            # id is whatever follows the "<db>:" marker.
            kegg_compounds[chunk[:6]][target_db] = chunk.split(marker)[1].rstrip()
def getNodeNames(type):
    """Return a ``{node id: display name}`` mapping for one KEGG list query.

    ``type`` is passed unchanged to ``kegg_list``.  Every non-blank line of
    the response must contain a tab: the first field minus its first four
    characters becomes the node id, and the second field up to its first
    ``;`` becomes the name.  For ``type == "hsa"`` the name is additionally
    cut at its first ``,``.

    Blank lines (including the usual trailing one) are skipped, so the last
    entry is kept even when the response does not end with a newline.

    Raises IndexError for a non-blank line without a tab.
    """
    listing = kegg_list(type).read()
    names = dict()

    for entry in listing.split('\n'):
        if not entry.strip():
            continue
        e = entry.split('\t')
        nodeId = e[0][4:]
        # Keep only the first of the ';'-separated names.
        name = e[1].split(';', 1)[0]
        if type == "hsa":
            # For hsa entries, keep only the text before the first ','.
            name = name.split(',', 1)[0]

        names[nodeId] = name

    return names
def get_node_names(type):
    """Return a ``{node id: display name}`` mapping for one KEGG list query.

    ``type`` is passed unchanged to ``kegg_list``.  Every non-blank line of
    the response must contain a tab: the first field minus its first four
    characters becomes the node id, and the second field up to its first
    ``;`` becomes the name.  For ``type == "hsa"`` the name is additionally
    cut at its first ``,``.

    Blank lines (including the usual trailing one) are skipped, so the last
    entry is kept even when the response does not end with a newline.

    Raises IndexError for a non-blank line without a tab.
    """
    listing = kegg_list(type).read()
    names = dict()

    for entry in listing.split('\n'):
        if not entry.strip():
            continue
        fields = entry.split('\t')
        node_id = fields[0][4:]
        # Keep only the first of the ';'-separated names.
        name = fields[1].split(';', 1)[0]
        if type == "hsa":
            # For hsa entries, keep only the text before the first ','.
            name = name.split(',', 1)[0]

        names[node_id] = name

    return names
示例#5
0
 def test_list_pathway(self):
     """Listing the "pathway" database requests /list/pathway."""
     expected_url = "http://rest.kegg.jp/list/pathway"
     with kegg_list("pathway") as response:
         response.read()
     self.assertEqual(response.url, expected_url)
示例#6
0
def list_pathways():
    """Return the raw text of the KEGG pathway listing for organism ``hsa``."""
    return kegg_list('pathway', 'hsa').read()
# Remaining command-line options, then the import bootstrap: clear the graph,
# make sure the data directory exists, and download the hsa KGML files when
# the directory holds no files yet.
parser.add_argument('--clear', action='store_true', help='clear the graph')
parser.add_argument('--commitEvery', type=int, default=100, help='commit every x steps')
args = parser.parse_args()

importer = GraphImporter(args.db, args.commitEvery)
# NOTE(review): `or True` makes this condition always true, so the graph is
# cleared on every run and --clear has no effect. Left unchanged because
# removing it alters what the script does; confirm the intent.
if args.clear or True:
    importer.delete_all()

if not os.path.exists(args.data_dir):
    os.makedirs(args.data_dir)

files = [f for f in listdir(args.data_dir) if isfile(join(args.data_dir, f))]

if len(files) == 0:
    # download kgml files
    res = kegg_list('pathway', 'hsa').read()
    items = res.split('\n')
    # The slice drops the last element of the split (the empty string left
    # after a trailing newline).
    for item in items[:len(items) - 1]:
        pathway_id = item[5:13]
        if pathway_id != 'hsa01100':
            print('fetching ' + pathway_id)
            kgml = kegg_get(pathway_id, 'kgml').read()
            # Build the path with join() so the file lands inside data_dir
            # even when --data_dir is given without a trailing separator;
            # this matches how the directory is listed above and below.
            with open(join(args.data_dir, pathway_id + '.kgml'), 'w') as text_file:
                text_file.write(kgml)

    files = [f for f in listdir(args.data_dir) if isfile(join(args.data_dir, f))]


def getNodeNames(type):
    # Fetch the raw listing text for `type` from KEGG.
    # NOTE(review): the local name `list` shadows the builtin inside this function.
    list = kegg_list(type).read()
    # Result mapping; the visible part of this snippet ends before it is filled.
    names = dict()
示例#8
0
def parse_kegg(org_id, strain, sourcedb, session):
    """Import the relations of every KEGG pathway of one organism as interactions.

    The pathway list for ``org_id`` is fetched with ``kegg_list``; each pathway
    is downloaded as KGML with ``kegg_get`` and parsed with ``read``.  For
    every relation that is not a ``maplink`` and has no ``undefined`` entry,
    the entries are resolved to interactors, paired up, and stored as
    ``Interaction`` / ``InteractionReference`` rows.  Pairs in which both
    members have type ``'m'`` are never created, and ``Metabolite`` rows are
    added on demand using the module-level ``kegg_compounds`` mapping.

    Parameters:
        org_id: organism code passed to ``kegg_list``.  It is also the prefix
            an entry id must carry to be treated as a protein, and the value
            ``'eco'`` switches protein lookup to ``OrthologEcoli`` rows.
        strain: stored on newly created interactions and used as the
            ``strain_protein`` filter for ortholog lookups.
        sourcedb: ``data_source`` value used to find the ``InteractionSource``
            attached to interactions and references.
        session: database session used for all queries, adds and commits
            (presumably a SQLAlchemy session -- confirm against the caller).

    Side effects:
        Commits after every newly created interaction and once more at the
        end, then prints ``sourcedb`` and the total ``Interaction`` count.
    """
    # get pathways for organism specified by org_id
    pathways = kegg_list(database='pathway', org=org_id).read().split('path:')
    path_ids = []

    # make list of path ids to iterate through
    for path in pathways:
        if path != '':
            path_ids.append(path[:8])

    # iterate through each path and obtain interactions
    for path in path_ids:
        # get kgml representation of path
        kgml_path = read(kegg_get(path, option='kgml'))
        path_name = kgml_path._getname()
        # dictionary of compounds in current path (node_id: kegg_id)
        #   compound._getid() returns node id (only relevant in context of current path)
        #   compound._getname() returns kegg id (relevant in overall KEGG DB)
        compound_ids = {}
        for compound in kgml_path.compounds:
            compound_ids[compound._getid()] = compound._getname()[-6:]
        # go through each relation in path
        for relation in kgml_path.relations:
            relation_type = relation.element.attrib['type']

            # ignore maplink relations
            if relation_type == 'maplink': continue
            # relation._getentry1/2() returns  protein id (locus) or compound id (KEGG id)
            entries = [relation._getentry1()._getname(), relation._getentry2()._getname()]
            # if one or both interactors are listed as undefined, move on to next interaction
            if (entries[0] == 'undefined') | (entries[1] == 'undefined'): continue
            # list to hold existing interactors
            interactors = [[], []]
            # list to hold new metabolite ids for interactions with metabolites not yet in the database
            new_metabolites = [[], []]
            # go through each entry in the relation
            for num in range(0, 2):
                # each entry may contain >1 id; go through all of them
                for id in entries[num].split(' '):
                    if id == '': continue
                    # if interactor is not protein or compound, continue
                    if (id.split(':')[0] != org_id) & (id.split(':')[1] not in kegg_compounds): continue

                    # check if the id is a kegg id by searching in kegg_compounds
                    kegg_id= None
                    if id.split(':')[1] in kegg_compounds:
                        kegg_id = id.split(':')[1]

                    # check if interactor (protein) already exists
                    if (kegg_id is None) & (org_id != 'eco'):
                        interactor = session.query(Interactor).get(id.split(':')[1])
                        if interactor is not None:
                            # make sure to add None value; this will be needed to create interaction reference later
                            # None is appended rather than the interactor id because the interactor is not an ortholog
                            interactors[num].append([interactor, None])
                    # if it doesnt exist, it's not a valid protein, so check if it is a valid compound
                    elif kegg_id is not None:
                        interactor = session.query(Metabolite).filter_by(kegg = kegg_id).first()
                        # if metabolite with id was not found, append the kegg_id to new_metabolites to create
                        if interactor is None:
                            new_metabolites[num].append(kegg_id)
                        else:
                            # if the metabolite was found, add it to the existing interactor list
                            interactors[num].append([interactor, interactor.id])
                    # if parsing E. coli path, add all orthologs to interactor list
                    elif org_id == 'eco':
                        for ortholog in session.query(OrthologEcoli).filter_by(ortholog_id = id.split(':')[1],
                                                                                strain_protein = strain).all():
                            if ortholog is not None:
                                # add the id of the ecoli protein for the interaction reference later
                                interactors[num].append([ortholog.protein, id.split(':')[1]])

            # create list of interactor pairs from two separate lists
            interactor_pairs = []
            # create interactor pairs from interactors which already exist in db
            for interactor1 in interactors[0]:
                for interactor2 in interactors[1]:
                    if (interactor1[0].type != 'm') | (interactor2[0].type != 'm'):
                        interactor_pairs.append([interactor1, interactor2])
            # create interactor pair from interactors and new metabolites
            for interactor1 in interactors[0]:
                for id in new_metabolites[1]:
                    # ignore interactor pairs which would result in m-m interactions
                    if interactor1[0].type == 'm': continue
                    # Note: can query metabolite with kegg only because we updated the metabolite info first
                    metabolite = session.query(Metabolite).filter_by(kegg = id).first()
                    if metabolite is None:
                        metabolite = Metabolite(id = id, kegg = id, pubchem = kegg_compounds[id]['pubchem'],
                                                chebi = kegg_compounds[id]['chebi'])
                        session.add(metabolite)
                    interactor_pairs.append([interactor1, [metabolite, metabolite.id]])
            # same as above, for existing second-entry interactors and new first-entry metabolites
            for interactor1 in interactors[1]:
                for id in new_metabolites[0]:
                    if interactor1[0].type == 'm': continue
                    metabolite = session.query(Metabolite).filter_by(kegg = id).first()
                    if metabolite is None:
                        metabolite = Metabolite(id = id, kegg = id, pubchem = kegg_compounds[id]['pubchem'],
                                                chebi = kegg_compounds[id]['chebi'])
                        session.add(metabolite)
                    interactor_pairs.append([interactor1, [metabolite, metabolite.id]])

            # if no interactor pairs were found, move on the the next interaction
            if len(interactor_pairs) == 0: continue

            # get all intermediates in reaction of type compound
            intermeds = []
            for subtype in relation.element.iter(tag='subtype'):
                # if the subtype element is a compound, get its node id
                if 'compound' in subtype.attrib:
                    compound_node_id = subtype.attrib['compound']
                    if compound_node_id is None: continue
                    # if the node id was not stored in the compound ids for this path, move on to the next subtype
                    if int(compound_node_id) not in compound_ids: continue
                    # if compound id is valid, either add existing matching metabolite or create new one and add
                    kegg_id = compound_ids[int(compound_node_id)]
                    metabolite = session.query(Metabolite).filter_by(kegg = kegg_id).first()
                    if metabolite is None:
                        metabolite = Metabolite(id=kegg_id, name=kegg_compounds[kegg_id]['name'],
                                                pubchem=kegg_compounds[kegg_id]['pubchem'],
                                                chebi=kegg_compounds[kegg_id]['chebi'], kegg=kegg_id)
                        session.add(metabolite)
                    intermeds.append([metabolite, metabolite.id])

            # add protein - intermediate interactor pairs
            for interactor_list in interactors:
                for interactor in interactor_list:
                    if interactor[0].type != 'm':
                        for intermed in intermeds:
                            interactor_pairs.append([interactor, intermed])

            # go through each interaction pair and add interaction if it doesnt exist yet
            for interactor_pair in interactor_pairs:
                homogenous = (interactor_pair[0][0] == interactor_pair[1][0])
                interaction = session.query(Interaction).filter(Interaction.interactors.contains(interactor_pair[0][0]),
                                                                Interaction.interactors.contains(interactor_pair[1][0]),
                                                                Interaction.homogenous == homogenous).first()

                source = session.query(InteractionSource).filter_by(data_source=sourcedb).first()
                #create interaction if it doesnt exist yet, add source to its sources if it isn't already
                if interaction is None:
                    interaction = Interaction(type=interactor_pair[0][0].type + '-' + interactor_pair[1][0].type,
                                              strain=strain, homogenous=homogenous,
                                              interactors=[interactor_pair[0][0], interactor_pair[1][0]])
                    interaction.sources.append(source)
                    if org_id == 'eco':
                        interaction.ortholog_derived = 'Ecoli'
                    # the tuple expression simply runs add() and then commit()
                    session.add(interaction), session.commit()
                elif source not in interaction.sources:
                    interaction.sources.append(source)

                # in case the interaction already existed, make sure interactor_a and interactor_b variables for
                # new interaction reference match up with the first and second interactors of the existing
                # interaction (so it's easy to see which Pseudomonas interactor matches up with which Ecoli
                # ortholog if the org id is eco)
                interactor_a, interactor_b = None, None
                if org_id == 'eco':
                    if interaction.interactors[0] == interactor_pair[0][0]:
                        interactor_a = interactor_pair[0][1]
                        interactor_b = interactor_pair[1][1]
                    else:
                        interactor_b = interactor_pair[0][1]
                        interactor_a = interactor_pair[1][1]

                # search for reference
                reference = session.query(InteractionReference).filter_by(source_db='kegg',
                                                                          comment='in ' + path_name + ' path',
                                                                          interactor_a=interactor_a,
                                                                          interactor_b=interactor_b).first()
                # if the reference doesnt exist, create it, add it to the interaction's references and add the source
                # to the reference's sources
                if reference is None:
                    reference = InteractionReference(source_db='kegg', comment='in ' + path_name + ' path',
                                                     interactor_a=interactor_a, interactor_b=interactor_b)
                    interaction.references.append(reference)
                    reference.sources.append(source)
                # if the reference does exist, add it to the interaction's reference list and add the source to the
                # reference's source list if it isn't there already
                else:
                    if interaction not in reference.interactions:
                        reference.interactions.append(interaction)
                    if source not in reference.sources:
                        reference.sources.append(source)

    session.commit()
    print(sourcedb, session.query(Interaction).count())
示例#9
0
 def test_list_cpd_C01290_list_gl_G0009(self):
     """A list of prefixed compound/glycan ids is joined with '+' in the URL."""
     query = ["cpd:C01290", "gl:G00092"]
     with kegg_list(query) as response:
         response.read()
     self.assertEqual(response.url,
                      "http://rest.kegg.jp/list/cpd:C01290+gl:G00092")
示例#10
0
 def test_list_T01001(self):
     """Listing "T01001" requests /list/T01001."""
     expected_url = "http://rest.kegg.jp/list/T01001"
     with kegg_list("T01001") as response:
         response.read()
     self.assertEqual(response.url, expected_url)
示例#11
0
 def test_list_T01001(self):
     """Listing "T01001" requests /list/T01001."""
     h = kegg_list("T01001")
     # try/finally so the handle is closed even when read() or the
     # assertion raises.
     try:
         h.read()
         self.assertEqual(h.url, "http://rest.kegg.jp/list/T01001")
     finally:
         h.close()
示例#12
0
 def test_list_hsa(self):
     """Listing "hsa" requests /list/hsa."""
     h = kegg_list("hsa")
     # try/finally so the handle is closed even when read() or the
     # assertion raises.
     try:
         h.read()
         self.assertEqual(h.url, "http://rest.kegg.jp/list/hsa")
     finally:
         h.close()
示例#13
0
 def test_list_organism(self):
     """Listing "organism" requests /list/organism."""
     h = kegg_list("organism")
     # try/finally so the handle is closed even when read() or the
     # assertion raises.
     try:
         h.read()
         self.assertEqual(h.url, "http://rest.kegg.jp/list/organism")
     finally:
         h.close()
示例#14
0
 def test_list_pathway(self):
     """Listing "pathway" requests /list/pathway."""
     h = kegg_list("pathway")
     # try/finally so the handle is closed even when read() or the
     # assertion raises.
     try:
         h.read()
         self.assertEqual(h.url, "http://rest.kegg.jp/list/pathway")
     finally:
         h.close()
示例#15
0
from Bio.KEGG.REST import kegg_list, kegg_get
from Bio.KEGG.KGML import KGML_parser

from import_utils import add_edge, add_node

# KGML files are cached in a "data" directory next to this script.
datapath = os.path.dirname(os.path.realpath(__file__)) + "/data/"

if not os.path.exists(datapath):
  os.makedirs(datapath)

files = [f for f in listdir(datapath) if isfile(join(datapath, f))]

if not files:
  # Nothing cached yet: download one KGML file per hsa pathway.
  listing = kegg_list('pathway', 'hsa').read()
  # The last element of the split is dropped (the empty string left after a
  # trailing newline).
  for listing_line in listing.split("\n")[:-1]:
    pathway_id = listing_line[5:13]
    print("fetching " + pathway_id)
    kgml_text = kegg_get(pathway_id, "kgml").read()
    target_path = datapath + pathway_id + ".kgml"
    with open(target_path, "w") as kgml_file:
      kgml_file.write(kgml_text)

  # Re-scan so `files` reflects what was just downloaded.
  files = [f for f in listdir(datapath) if isfile(join(datapath, f))]


def get_names(names):
  """Split a ", "-separated name string into a list, dropping every "..."."""
  # The original passed a negative count to replace(), which replaces all
  # occurrences, so omitting the count is equivalent.
  without_ellipsis = names.replace("...", "")
  return without_ellipsis.split(", ")
示例#16
0
 def test_list_organism(self):
     """Listing "organism" requests /list/organism."""
     expected_url = "http://rest.kegg.jp/list/organism"
     with kegg_list("organism") as response:
         response.read()
     self.assertEqual(response.url, expected_url)
示例#17
0
 def test_list_hsa(self):
     """Listing "hsa" requests /list/hsa."""
     expected_url = "http://rest.kegg.jp/list/hsa"
     with kegg_list("hsa") as response:
         response.read()
     self.assertEqual(response.url, expected_url)
示例#18
0
 def test_list_hsa_10458_list_ece_Z5100(self):
     """A list of gene ids is joined with '+' in the request URL."""
     h = kegg_list(["hsa:10458", "ece:Z5100"])
     # try/finally so the handle is closed even when read() or the
     # assertion raises.
     try:
         h.read()
         self.assertEqual(h.url, "http://rest.kegg.jp/list/hsa:10458+ece:Z5100")
     finally:
         h.close()
示例#19
0
 def test_list_hsa_10458_list_ece_Z5100(self):
     """A list of gene ids is joined with '+' in the request URL."""
     query = ["hsa:10458", "ece:Z5100"]
     with kegg_list(query) as response:
         response.read()
     self.assertEqual(response.url,
                      "http://rest.kegg.jp/list/hsa:10458+ece:Z5100")
示例#20
0
 def test_list_cpd_C01290_list_gl_G0009(self):
     """A list of prefixed compound/glycan ids is joined with '+' in the URL."""
     h = kegg_list(["cpd:C01290", "gl:G00092"])
     # try/finally so the handle is closed even when read() or the
     # assertion raises.
     try:
         h.read()
         self.assertEqual(h.url,
                          "http://rest.kegg.jp/list/cpd:C01290+gl:G00092")
     finally:
         h.close()
示例#21
0
 def test_list_C01290_list_G00092(self):
     """A list of unprefixed ids is joined with '+' in the request URL."""
     query = ["C01290", "G00092"]
     expected_url = "http://rest.kegg.jp/list/C01290+G00092"
     with kegg_list(query) as response:
         response.read()
     self.assertEqual(response.url, expected_url)
示例#22
0
 def test_list_C01290_plus_G00092(self):
     """A pre-joined "A+B" query string is used as-is in the request URL."""
     h = kegg_list("C01290+G00092")
     # try/finally so the handle is closed even when read() or the
     # assertion raises.
     try:
         h.read()
         self.assertEqual(h.url, "http://rest.kegg.jp/list/C01290+G00092")
     finally:
         h.close()