def equiv(d, verbose):
    """Write the ``.beleq`` equivalence file(s) for one parsed dataset.

    The dataset is selected by ``str(d)``. Every output line has the form
    ``<value><delim><uuid>`` (``delim`` is the module-level delimiter), and
    most branches also record the uuid in a module-level ``*_eq`` dictionary.
    Branches that reuse Entrez, ChEBI, GO or DO uuids read the dictionaries
    filled by those baseline branches, so the baselines have to be processed
    earlier in the same run.

    Args:
        d: parsed dataset object; ``str(d)`` names the dataset and the object
            supplies the accessor the branch needs (``get_eq_values``,
            ``get_primary_ids``, ``has_equivalence``, ...).
        verbose: when true, print resolution counts and failed GO lookups.
    """
    dataset = str(d)

    def emit(handle, value, uid):
        # One equivalence record per line.
        handle.write(delim.join((value, str(uid))) + '\n')

    def map_symbols(path, prefix, unmapped, eq_map, skip_withdrawn=False):
        # Shared by HGNC/MGI/RGD: reuse the Entrez uuid when the symbol maps
        # to an Entrez gene, otherwise mint a uuid and remember the symbol.
        with open(path, 'w') as fp:
            for symbol in d.get_eq_values():
                if skip_withdrawn and '~withdrawn' in symbol:
                    continue
                new_id = to_entrez(prefix + symbol)
                if new_id is None:
                    unmapped.append(symbol)
                    uid = uuid.uuid4()
                else:
                    # NOTE(review): an id missing from entrez_eq records None.
                    uid = entrez_eq.get(new_id)
                emit(fp, symbol, uid)
                eq_map[symbol] = uid

    if dataset == 'entrez_info':
        with open('entrez-gene-ids.beleq', 'w') as fp:
            for gene_id in d.get_eq_values():
                uid = uuid.uuid4()
                emit(fp, gene_id, uid)
                # Store the uuid that was written; generating a second one
                # here made every Entrez-based lookup disagree with the file.
                entrez_eq[gene_id] = uid
        make_eq_dict(d)

    elif dataset == 'hgnc':
        map_symbols('hgnc-approved-symbols.beleq', 'HGNC:', hgnc_list,
                    hgnc_eq, skip_withdrawn=True)

    elif dataset == 'mgi':
        map_symbols('mgi-approved-symbols.beleq', 'MGI:', mgi_list, mgi_eq)

    elif dataset == 'rgd':
        map_symbols('rgd-approved-symbols.beleq', 'RGD:', rgd_list, rgd_eq)

    elif dataset == 'swiss':
        with open('swissprot-entry-names.beleq', 'w') as fp:
            # dbrefs is a dict, i.e. {reference_type: ids_of_that_gene}
            for name, dbrefs, accessions in d.get_eq_values():
                target_pool = ['HGNC', 'MGI', 'RGD']
                gene_ids = []
                alt_ids = []
                if dbrefs is not None:
                    for k, v in dbrefs.items():
                        if k == 'GeneId':
                            gene_ids.extend(v)
                        if k in target_pool:
                            # could be MGI or RGD or HGNC ids
                            alt_ids.extend(v)
                if len(gene_ids) == 1:
                    uid = entrez_eq.get(gene_ids[0])
                    if uid is None:
                        uid = uuid.uuid4()
                elif len(gene_ids) == 0:
                    # are there hgnc, mgi or rgd refs?
                    if len(alt_ids) == 0:
                        uid = uuid.uuid4()
                        sp_list.append(name)
                    elif len(alt_ids) == 1:
                        a_id = alt_ids[0]
                        if 'HGNC' in a_id:
                            uid = hgnc_eq.get(namespaces.hgnc_map.get(a_id))
                        elif 'MGI' in a_id:
                            uid = mgi_eq.get(namespaces.mgi_map.get(a_id))
                        else:
                            uid = rgd_eq.get(namespaces.rgd_map.get(a_id))
                        # SwissProt may be referring to a since-removed gene.
                        if uid is None:
                            uid = uuid.uuid4()
                    else:
                        # > 1 alt id: generate a new uuid
                        uid = uuid.uuid4()
                else:
                    # > 1 entrez id: generate a new uuid
                    uid = uuid.uuid4()
                # Written for every entry; resolved HGNC/MGI references used
                # to be stored in sp_eq without ever reaching the file.
                emit(fp, name, uid)
                sp_eq[name] = uid
                # finally, collect accession data for its own .beleq
                build_acc_data(accessions, name)
        swiss_accessions_eq()

    elif dataset == 'affy':
        with open('affy-probeset-ids.beleq', 'w') as fp:
            for probe_id, gene_id in d.get_eq_values():
                if gene_id is None or '---' in gene_id:
                    # no entrez mapping, create a new uuid
                    uid = uuid.uuid4()
                else:
                    # the separator is parsed with a space on both sides
                    entrez_ids = gene_id.split(' /// ')
                    if len(entrez_ids) == 1:
                        # single mapping: use the entrez uuid when known
                        uid = entrez_eq.get(entrez_ids[0])
                        if uid is None:
                            uid = uuid.uuid4()
                    else:
                        # pair each id with its refseq status rank, e.g.
                        # [('5307', 0), ('104', 2)]; ids without a rank
                        # are dropped
                        ranked = [
                            (eid, ref_status.get(refseq.get(eid)))
                            for eid in entrez_ids
                        ]
                        ranked = [tup for tup in ranked if tup[1] is not None]
                        if len(ranked) == 0:
                            uid = uuid.uuid4()
                        else:
                            # lowest rank is the best refseq status; ties are
                            # broken by the smallest tuple, i.e. by comparing
                            # the gene ids themselves
                            best = min(ranked, key=lambda x: x[1])[1]
                            target = min(t for t in ranked if t[1] == best)
                            uid = entrez_eq.get(target[0])
                emit(fp, probe_id, uid)
                affy_eq[probe_id] = uid

    # equiv for alt ids and names relies on the equivalence for
    # primary ids being completely generated.
    elif dataset == 'chebi':
        with open('chebi-ids.beleq', 'w') as fp, \
                open('chebi-names_eq.beleq', 'w') as f:
            # like Entrez, new uuid for primary ids only the FIRST time.
            for primary_id in d.get_primary_ids():
                uid = uuid.uuid4()
                emit(fp, primary_id, uid)
                chebi_id_eq[primary_id] = uid
            for alt_id in d.get_alt_ids():
                if alt_id not in chebi_id_eq:
                    # get its primary equivalent and use its uuid
                    uid = chebi_id_eq[d.alt_to_primary(alt_id)]
                    emit(fp, alt_id, uid)
                    chebi_id_eq[alt_id] = uid
            for name in d.get_names():
                uid = chebi_id_eq.get(d.name_to_primary(name))
                emit(f, name, uid)
                chebi_name_eq[name] = uid

    elif dataset == 'pubchem_equiv':
        with open('pubchem_eq.beleq', 'w') as fp:
            for sid, source, cid in d.get_eq_values():
                # NOTE: verify that NO PubChem CID == 'None'
                if 'ChEBI' in source and cid is not None:
                    # use the CHEBI uuid
                    uid = chebi_id_eq.get(source.split(':')[1])
                    emit(fp, sid, uid)
                    pub_eq_dict[sid] = uid
                else:
                    # generate a new uuid (not recorded in pub_eq_dict)
                    emit(fp, sid, uuid.uuid4())

    elif dataset == 'gobp':
        # GO is the baseline for processes, so new uuids the first time.
        with open('go-biological-processes-names.beleq', 'w') as gobp, \
                open('go-biological-processes-ids.beleq', 'w') as gobp_id:
            for termid, termname in d.get_eq_values():
                uid = uuid.uuid4()
                emit(gobp_id, termid, uid)
                emit(gobp, termname, uid)
                gobp_eq_dict[termname] = uid

    elif dataset == 'gocc':
        with open('go-cellular-component-terms.beleq', 'w') as gocc, \
                open('go-cellular-component-ids.beleq', 'w') as gocc_id:
            for termid, termname in d.get_eq_values():
                uid = uuid.uuid4()
                emit(gocc_id, termid, uid)
                emit(gocc, termname, uid)
                # keyed by id (gobp is keyed by name); the mesh branch
                # looks GO components up by id
                gocc_eq_dict[termid] = uid

    elif dataset == 'do':
        # assign DO a new uuid and use as the primary for diseases
        with open('disease-ontology-names.beleq', 'w') as dn, \
                open('disease-ontology-ids.beleq', 'w') as di:
            for name, term_id in d.get_eq_values():
                uid = uuid.uuid4()
                emit(dn, name, uid)
                emit(di, term_id, uid)
                do_eq_dict[name] = uid

    elif dataset == 'sdis_to_do':
        # try to resolve sdis terms to DO. If there is not one,
        # assign a new uuid.
        count = 0
        sdis = parsed.load_data('sdis')
        with open('selventa-legacy-diseases.beleq', 'w') as dof:
            for sdis_term in sdis.get_eq_values():
                if d.has_equivalence(sdis_term):
                    count = count + 1
                    do_term = d.get_equivalence(sdis_term)
                    if do_term in do_eq_dict:
                        uid = do_eq_dict[do_term]
                    else:
                        # NOTE(review): raises KeyError when the lowercased
                        # term is unknown too.
                        uid = do_eq_dict[do_term.lower()]
                else:
                    uid = uuid.uuid4()
                emit(dof, sdis_term, uid)
        if verbose:
            print('Able to resolve ' + str(count) +
                  ' legacy disease terms to DO.')

    elif dataset == 'schem_to_chebi':
        # try to resolve schem terms to CHEBI. If there is not one,
        # assign a new uuid.
        count = 0
        schem = parsed.load_data('schem')
        with open('selventa-legacy-chemical-names.beleq', 'w') as schemf:
            for schem_term in schem.get_eq_values():
                # NOTE(review): stays None (written as 'None') when an
                # equivalence exists but its name is not in chebi_name_eq.
                uid = None
                if d.has_equivalence(schem_term):
                    count = count + 1
                    chebi_term = d.get_equivalence(schem_term)
                    if chebi_term in chebi_name_eq:
                        uid = chebi_name_eq[chebi_term]
                    elif chebi_term.lower() in chebi_name_eq:
                        uid = chebi_name_eq[chebi_term.lower()]
                else:
                    uid = uuid.uuid4()
                emit(schemf, schem_term, uid)
        if verbose:
            print('Able to resolve ' + str(count) +
                  ' legacy chemical terms to CHEBI.')

    elif dataset == 'mesh':
        with open('mesh-cellular-locations.beleq', 'w') as mesha, \
                open('mesh-diseases.beleq', 'w') as meshc, \
                open('mesh-biological-processes.beleq', 'w') as meshg:
            do_data = parsed.load_data('do')
            # Case-insensitive GOBP lookup built once instead of lowercasing
            # every GOBP name for every synonym; later names win on a clash,
            # as in a full scan.
            gobp_by_lower = {
                name.lower(): uid for name, uid in gobp_eq_dict.items()
            }
            for ui, mh, mns, synonyms in d.get_eq_values():
                if any('A11.284' in branch for branch in mns):
                    # get GO equiv if there is one
                    go_id = mg_eq.get(mh)
                    if go_id is not None:
                        # meshcs_to_gocc may reference OBSOLETE GO terms, so
                        # this lookup can fail; fall back to a new uuid.
                        uid = gocc_eq_dict.get(go_id)
                        if uid is None:
                            if verbose:
                                print('Lookup failed for: ' + str(go_id))
                            uid = uuid.uuid4()
                    else:
                        uid = uuid.uuid4()
                    emit(mesha, mh, uid)
                elif any('C' in branch for branch in mns):
                    # does UI exist as a Xref in DO?
                    xref = do_data.get_xrefs('MSH:' + ui)
                    if xref:
                        uid = do_eq_dict[xref]
                    else:
                        uid = uuid.uuid4()
                    emit(meshc, mh, uid)
                elif any('G' in branch for branch in mns):
                    # match MeSH synonyms against GOBP names; the last
                    # matching synonym wins
                    uid = None
                    for syn in synonyms:
                        key = syn.lower()
                        if key in gobp_by_lower:
                            uid = gobp_by_lower[key]
                    if uid is None:
                        uid = uuid.uuid4()
                    emit(meshg, mh, uid)
# Phase II: run the registered parser over every baseline file found under
# working_dir and hand each parsed record to the `parsed` store.
for root, dirs, filenames in os.walk(working_dir):
    for f in filenames:
        if f not in baseline_data:
            continue
        data_tuple = baseline_data.get(f)
        parser = data_tuple[PARSER_TYPE]("datasets/" + f)
        if verbose:
            parser.is_verbose()
            print("Running {}".format(str(parser)))
        for x in parser.parse():
            parsed.build_data(x, str(parser))

print("Phase II ran in {} minutes".format((time.time() - interval_time) / 60))

print("\n======= Phase III, building namespaces =======")
interval_time = time.time()

# Load each parsed dataset that the namespace builders consume.
ei = parsed.load_data("entrez_info")
eh = parsed.load_data("entrez_history")
hg = parsed.load_data("hgnc")
mg = parsed.load_data("mgi")
rg = parsed.load_data("rgd")
sp = parsed.load_data("swiss")
af = parsed.load_data("affy")
g2 = parsed.load_data("gene2acc")
chebi = parsed.load_data("chebi")
schem = parsed.load_data("schem")
schem_to_chebi = parsed.load_data("schem_to_chebi")
sdis = parsed.load_data("sdis")
sdis_to_do = parsed.load_data("sdis_to_do")
gobp = parsed.load_data("gobp")
gocc = parsed.load_data("gocc")
# pub_eq = parsed.load_data('pubchem_equiv')
def equiv(d, verbose):
    """Write the ``.beleq`` equivalence file(s) for one parsed dataset.

    The dataset is selected by ``str(d)``. Every output line has the form
    ``<value><delim><uuid>`` (``delim`` is the module-level delimiter), and
    most branches also record the uuid in a module-level ``*_eq`` dictionary.
    Branches that reuse Entrez, ChEBI, GO or DO uuids read the dictionaries
    filled by those baseline branches, so the baselines have to be processed
    earlier in the same run.

    Args:
        d: parsed dataset object; ``str(d)`` names the dataset and the object
            supplies the accessor the branch needs (``get_eq_values``,
            ``get_primary_ids``, ``has_equivalence``, ...).
        verbose: when true, print resolution counts and failed GO lookups.
    """
    dataset = str(d)

    def emit(handle, value, uid):
        # One equivalence record per line.
        handle.write(delim.join((value, str(uid))) + "\n")

    def map_symbols(path, prefix, unmapped, eq_map, skip_withdrawn=False):
        # Shared by HGNC/MGI/RGD: reuse the Entrez uuid when the symbol maps
        # to an Entrez gene, otherwise mint a uuid and remember the symbol.
        with open(path, "w") as fp:
            for symbol in d.get_eq_values():
                if skip_withdrawn and "~withdrawn" in symbol:
                    continue
                new_id = to_entrez(prefix + symbol)
                if new_id is None:
                    unmapped.append(symbol)
                    uid = uuid.uuid4()
                else:
                    # NOTE(review): an id missing from entrez_eq records None.
                    uid = entrez_eq.get(new_id)
                emit(fp, symbol, uid)
                eq_map[symbol] = uid

    if dataset == "entrez_info":
        with open("entrez-gene-ids.beleq", "w") as fp:
            for gene_id in d.get_eq_values():
                uid = uuid.uuid4()
                emit(fp, gene_id, uid)
                # Store the uuid that was written; generating a second one
                # here made every Entrez-based lookup disagree with the file.
                entrez_eq[gene_id] = uid
        make_eq_dict(d)

    elif dataset == "hgnc":
        map_symbols("hgnc-approved-symbols.beleq", "HGNC:", hgnc_list, hgnc_eq, skip_withdrawn=True)

    elif dataset == "mgi":
        map_symbols("mgi-approved-symbols.beleq", "MGI:", mgi_list, mgi_eq)

    elif dataset == "rgd":
        map_symbols("rgd-approved-symbols.beleq", "RGD:", rgd_list, rgd_eq)

    elif dataset == "swiss":
        with open("swissprot-entry-names.beleq", "w") as fp:
            # dbrefs is a dict, i.e. {reference_type: ids_of_that_gene}
            for name, dbrefs, accessions in d.get_eq_values():
                target_pool = ["HGNC", "MGI", "RGD"]
                gene_ids = []
                alt_ids = []
                if dbrefs is not None:
                    for k, v in dbrefs.items():
                        if k == "GeneId":
                            gene_ids.extend(v)
                        if k in target_pool:
                            # could be MGI or RGD or HGNC ids
                            alt_ids.extend(v)
                if len(gene_ids) == 1:
                    uid = entrez_eq.get(gene_ids[0])
                    if uid is None:
                        uid = uuid.uuid4()
                elif len(gene_ids) == 0:
                    # are there hgnc, mgi or rgd refs?
                    if len(alt_ids) == 0:
                        uid = uuid.uuid4()
                        sp_list.append(name)
                    elif len(alt_ids) == 1:
                        a_id = alt_ids[0]
                        if "HGNC" in a_id:
                            uid = hgnc_eq.get(namespaces.hgnc_map.get(a_id))
                        elif "MGI" in a_id:
                            uid = mgi_eq.get(namespaces.mgi_map.get(a_id))
                        else:
                            uid = rgd_eq.get(namespaces.rgd_map.get(a_id))
                        # SwissProt may be referring to a since-removed gene.
                        if uid is None:
                            uid = uuid.uuid4()
                    else:
                        # > 1 alt id: generate a new uuid
                        uid = uuid.uuid4()
                else:
                    # > 1 entrez id: generate a new uuid
                    uid = uuid.uuid4()
                # Written for every entry; resolved HGNC/MGI references used
                # to be stored in sp_eq without ever reaching the file.
                emit(fp, name, uid)
                sp_eq[name] = uid
                # finally, collect accession data for its own .beleq
                build_acc_data(accessions, name)
        swiss_accessions_eq()

    elif dataset == "affy":
        with open("affy-probeset-ids.beleq", "w") as fp:
            for probe_id, gene_id in d.get_eq_values():
                if gene_id is None or "---" in gene_id:
                    # no entrez mapping, create a new uuid
                    uid = uuid.uuid4()
                else:
                    # the separator is parsed with a space on both sides
                    entrez_ids = gene_id.split(" /// ")
                    if len(entrez_ids) == 1:
                        # single mapping: use the entrez uuid when known
                        uid = entrez_eq.get(entrez_ids[0])
                        if uid is None:
                            uid = uuid.uuid4()
                    else:
                        # pair each id with its refseq status rank, e.g.
                        # [('5307', 0), ('104', 2)]; ids without a rank
                        # are dropped
                        ranked = [(eid, ref_status.get(refseq.get(eid))) for eid in entrez_ids]
                        ranked = [tup for tup in ranked if tup[1] is not None]
                        if len(ranked) == 0:
                            uid = uuid.uuid4()
                        else:
                            # lowest rank is the best refseq status; ties are
                            # broken by the smallest tuple, i.e. by comparing
                            # the gene ids themselves
                            best = min(ranked, key=lambda x: x[1])[1]
                            target = min(t for t in ranked if t[1] == best)
                            uid = entrez_eq.get(target[0])
                emit(fp, probe_id, uid)
                affy_eq[probe_id] = uid

    # equiv for alt ids and names relies on the equivalence for
    # primary ids being completely generated.
    elif dataset == "chebi":
        with open("chebi-ids.beleq", "w") as fp, open("chebi-names_eq.beleq", "w") as f:
            # like Entrez, new uuid for primary ids only the FIRST time.
            for primary_id in d.get_primary_ids():
                uid = uuid.uuid4()
                emit(fp, primary_id, uid)
                chebi_id_eq[primary_id] = uid
            for alt_id in d.get_alt_ids():
                if alt_id not in chebi_id_eq:
                    # get its primary equivalent and use its uuid
                    uid = chebi_id_eq[d.alt_to_primary(alt_id)]
                    emit(fp, alt_id, uid)
                    chebi_id_eq[alt_id] = uid
            for name in d.get_names():
                uid = chebi_id_eq.get(d.name_to_primary(name))
                emit(f, name, uid)
                chebi_name_eq[name] = uid

    elif dataset == "pubchem_equiv":
        with open("pubchem_eq.beleq", "w") as fp:
            for sid, source, cid in d.get_eq_values():
                # NOTE: verify that NO PubChem CID == 'None'
                if "ChEBI" in source and cid is not None:
                    # use the CHEBI uuid
                    uid = chebi_id_eq.get(source.split(":")[1])
                    emit(fp, sid, uid)
                    pub_eq_dict[sid] = uid
                else:
                    # generate a new uuid (not recorded in pub_eq_dict)
                    emit(fp, sid, uuid.uuid4())

    elif dataset == "gobp":
        # GO is the baseline for processes, so new uuids the first time.
        with open("go-biological-processes-names.beleq", "w") as gobp, open(
            "go-biological-processes-ids.beleq", "w"
        ) as gobp_id:
            for termid, termname in d.get_eq_values():
                uid = uuid.uuid4()
                emit(gobp_id, termid, uid)
                emit(gobp, termname, uid)
                gobp_eq_dict[termname] = uid

    elif dataset == "gocc":
        with open("go-cellular-component-terms.beleq", "w") as gocc, open(
            "go-cellular-component-ids.beleq", "w"
        ) as gocc_id:
            for termid, termname in d.get_eq_values():
                uid = uuid.uuid4()
                emit(gocc_id, termid, uid)
                emit(gocc, termname, uid)
                # keyed by id (gobp is keyed by name); the mesh branch
                # looks GO components up by id
                gocc_eq_dict[termid] = uid

    elif dataset == "do":
        # assign DO a new uuid and use as the primary for diseases
        with open("disease-ontology-names.beleq", "w") as dn, open("disease-ontology-ids.beleq", "w") as di:
            for name, term_id in d.get_eq_values():
                uid = uuid.uuid4()
                emit(dn, name, uid)
                emit(di, term_id, uid)
                do_eq_dict[name] = uid

    elif dataset == "sdis_to_do":
        # try to resolve sdis terms to DO. If there is not one,
        # assign a new uuid.
        count = 0
        sdis = parsed.load_data("sdis")
        with open("selventa-legacy-diseases.beleq", "w") as dof:
            for sdis_term in sdis.get_eq_values():
                if d.has_equivalence(sdis_term):
                    count = count + 1
                    do_term = d.get_equivalence(sdis_term)
                    if do_term in do_eq_dict:
                        uid = do_eq_dict[do_term]
                    else:
                        # NOTE(review): raises KeyError when the lowercased
                        # term is unknown too.
                        uid = do_eq_dict[do_term.lower()]
                else:
                    uid = uuid.uuid4()
                emit(dof, sdis_term, uid)
        if verbose:
            print("Able to resolve " + str(count) + " legacy disease terms to DO.")

    elif dataset == "schem_to_chebi":
        # try to resolve schem terms to CHEBI. If there is not one,
        # assign a new uuid.
        count = 0
        schem = parsed.load_data("schem")
        with open("selventa-legacy-chemical-names.beleq", "w") as schemf:
            for schem_term in schem.get_eq_values():
                # NOTE(review): stays None (written as 'None') when an
                # equivalence exists but its name is not in chebi_name_eq.
                uid = None
                if d.has_equivalence(schem_term):
                    count = count + 1
                    chebi_term = d.get_equivalence(schem_term)
                    if chebi_term in chebi_name_eq:
                        uid = chebi_name_eq[chebi_term]
                    elif chebi_term.lower() in chebi_name_eq:
                        uid = chebi_name_eq[chebi_term.lower()]
                else:
                    uid = uuid.uuid4()
                emit(schemf, schem_term, uid)
        if verbose:
            print("Able to resolve " + str(count) + " legacy chemical terms to CHEBI.")

    elif dataset == "mesh":
        with open("mesh-cellular-locations.beleq", "w") as mesha, open("mesh-diseases.beleq", "w") as meshc, open(
            "mesh-biological-processes.beleq", "w"
        ) as meshg:
            do_data = parsed.load_data("do")
            # Case-insensitive GOBP lookup built once instead of lowercasing
            # every GOBP name for every synonym; later names win on a clash,
            # as in a full scan.
            gobp_by_lower = {name.lower(): uid for name, uid in gobp_eq_dict.items()}
            for ui, mh, mns, synonyms in d.get_eq_values():
                if any("A11.284" in branch for branch in mns):
                    # get GO equiv if there is one
                    go_id = mg_eq.get(mh)
                    if go_id is not None:
                        # meshcs_to_gocc may reference OBSOLETE GO terms, so
                        # this lookup can fail; fall back to a new uuid.
                        uid = gocc_eq_dict.get(go_id)
                        if uid is None:
                            if verbose:
                                print("Lookup failed for: " + str(go_id))
                            uid = uuid.uuid4()
                    else:
                        uid = uuid.uuid4()
                    emit(mesha, mh, uid)
                elif any("C" in branch for branch in mns):
                    # does UI exist as a Xref in DO?
                    xref = do_data.get_xrefs("MSH:" + ui)
                    if xref:
                        uid = do_eq_dict[xref]
                    else:
                        uid = uuid.uuid4()
                    emit(meshc, mh, uid)
                elif any("G" in branch for branch in mns):
                    # match MeSH synonyms against GOBP names; the last
                    # matching synonym wins
                    uid = None
                    for syn in synonyms:
                        key = syn.lower()
                        if key in gobp_by_lower:
                            uid = gobp_by_lower[key]
                    if uid is None:
                        uid = uuid.uuid4()
                    emit(meshg, mh, uid)
def make_namespace(d, verbose):
    """Build and write the ``.belns`` namespace file(s) for one dataset.

    The dataset is selected by ``str(d)``. Each output line is
    ``<value>|<encoding>``. Several branches also record the written values
    in module-level sets (``*_ns``) and fill the id-to-symbol maps
    (``hgnc_map``, ``mgi_map``, ``rgd_map``).

    Args:
        d: parsed dataset object; ``str(d)`` names the dataset and
            ``d.get_ns_values()`` yields the records to write.
        verbose: when true, print how many legacy SCHEM/SDIS names were
            skipped because an equivalent exists.
    """
    delim = '|'
    dataset = str(d)

    def emit(handle, value, encoding):
        # One namespace record per line.
        handle.write(delim.join((value, encoding)) + '\n')

    if dataset == 'entrez_info':
        with open('entrez-gene-ids.belns', 'w') as fp:
            # tuple of (gene_id, gene_type, description)
            for gene_id, gene_type, description in d.get_ns_values():
                if gene_type == 'miscRNA':
                    encoding = 'GRM' if 'microRNA' in description else 'GR'
                else:
                    encoding = entrez_encoding[gene_type]
                emit(fp, gene_id, encoding)
                entrez_ns.add(gene_id)

    elif dataset == 'hgnc':
        with open('hgnc-approved-symbols.belns', 'w') as fp:
            for approved_symb, locus_type, hgnc_id in d.get_ns_values():
                # withdrawn genes NOT included in this namespace; compared
                # with != because `is not` on a str literal tests identity
                if locus_type != 'withdrawn' \
                        and 'withdrawn' not in approved_symb:
                    emit(fp, approved_symb, hgnc_encoding[locus_type])
                    hgnc_ns.add(approved_symb)
                    hgnc_map[hgnc_id] = approved_symb

    elif dataset == 'mgi':
        with open('mgi-approved-symbols.belns', 'w') as fp:
            for marker_symbol, feature_type, acc_id, marker_type \
                    in d.get_ns_values():
                # withdrawn genes are NOT included in this namespace
                if marker_type in ('Gene', 'Pseudogene'):
                    emit(fp, marker_symbol, mgi_encoding[feature_type])
                    mgi_ns.add(marker_symbol)
                    mgi_map[acc_id] = marker_symbol

    elif dataset == 'rgd':
        with open('rgd-approved-symbols.belns', 'w') as fp:
            for symbol, gene_type, name, rgd_id in d.get_ns_values():
                if gene_type == 'miscrna':
                    emit(fp, symbol, 'GRM' if 'microRNA' in name else 'GR')
                    rgd_ns.add(symbol)
                elif gene_type != '':
                    # entries without a gene type are not written
                    emit(fp, symbol, rgd_encoding[gene_type])
                    rgd_ns.add(symbol)
                # NOTE(review): mapping kept for every record; confirm the
                # intended nesting against the formatted original.
                rgd_map[rgd_id] = symbol

    elif dataset == 'swiss':
        with open('swissprot-entry-names.belns', 'w') as fp, \
                open('swissprot-accession-numbers.belns', 'w') as f:
            for gene_name, accessions in d.get_ns_values():
                emit(fp, gene_name, 'GRP')
                sp_ns.add(gene_name)
                for acc in accessions:
                    emit(f, acc, 'GRP')
                    sp_acc_ns.add(acc)

    elif dataset == 'affy':
        with open('affy-probeset-ids.belns', 'w') as fp:
            for pid in d.get_ns_values():
                emit(fp, pid, 'R')

    elif dataset == 'chebi':
        with open('chebi-names.belns', 'w') as fp, \
                open('chebi-ids.belns', 'w') as f:
            for name, primary_id, altIds in d.get_ns_values():
                emit(fp, name, 'A')
                chebi_name_ns.add(name)
                emit(f, primary_id, 'A')
                # Record the id that was written (the name used to be added
                # here), so the alt-id duplicate check below sees it.
                chebi_id_ns.add(primary_id)
                if altIds:
                    for i in altIds:
                        if i not in chebi_id_ns:
                            emit(f, i, 'A')
                            chebi_id_ns.add(i)

    elif dataset == 'pubchem_namespace':
        with open('pubchem-ids.belns', 'w') as fp:
            for pid in d.get_ns_values():
                emit(fp, pid, 'A')
                pub_ns.add(pid)

    elif dataset == 'gobp':
        with open('go-biological-processes-names.belns', 'w') as gobp, \
                open('go-biological-processes-accession-numbers.belns', 'w') \
                as gobp_id:
            for termid, termname, altids in d.get_ns_values():
                emit(gobp, termname, 'B')
                emit(gobp_id, termid, 'B')
                if altids is not None:
                    for alt in altids:
                        emit(gobp_id, alt, 'B')

    elif dataset == 'gocc':
        with open('go-cellular-component-terms.belns', 'w') as gocc, \
                open('go-cellular-component-accession-numbers.belns', 'w') \
                as gocc_id:
            for termid, termname, altids, is_complex in d.get_ns_values():
                encoding = 'C' if is_complex else 'A'
                emit(gocc, termname, encoding)
                emit(gocc_id, termid, encoding)
                # altids may be None; the complex branch used to iterate it
                # unguarded
                if altids is not None:
                    for alt in altids:
                        emit(gocc_id, alt, encoding)

    elif dataset == 'schem':
        schem_to_chebi = parsed.load_data('schem_to_chebi')
        count = 0
        with open('selventa-legacy-chemical-names.belns', 'w') as f:
            for entry in d.get_ns_values():
                # names with a chebi equivalent are not written to the
                # new namespace
                if schem_to_chebi.has_equivalence(entry):
                    count = count + 1
                    continue
                emit(f, entry, 'A')
        if verbose:
            print('Able to resolve ' + str(count) + ' SCHEM names to CHEBI.')

    elif dataset == 'sdis':
        sdis_to_do = parsed.load_data('sdis_to_do')
        count = 0
        with open('selventa-legacy-diseases.belns', 'w') as f:
            for entry in d.get_ns_values():
                # names with a do equivalent are not written to the
                # new namespace
                if sdis_to_do.has_equivalence(entry):
                    count = count + 1
                    continue
                emit(f, entry, 'O')
        if verbose:
            print('Able to resolve ' + str(count) + ' SDIS names to DO.')

    elif dataset == 'mesh':
        excluded_roots = ('MN = G01', 'MN = G02', 'MN = G15', 'MN = G17')
        with open('mesh-cellular-locations.belns', 'w') as meshf, \
                open('mesh-diseases.belns', 'w') as meshd, \
                open('mesh-biological-processes.belns', 'w') as meshb:
            for ui, mh, mns, sts in d.get_ns_values():
                # all entries from A11.284 branch (abundances)
                if any('A11.284' in branch for branch in mns):
                    emit(meshf, mh, 'A')
                # all entries from the C branch (pathology)
                elif any('C' in branch for branch in mns):
                    emit(meshd, mh, 'O')
                # G branch (bio process) - exclude G01 G02 G15 G17 branches
                elif any('G' in branch for branch in mns):
                    if not any(branch.startswith(excluded_roots)
                               for branch in mns):
                        emit(meshb, mh, 'B')

    elif dataset == 'do':
        with open('disease-ontology-names.belns', 'w') as dn, \
                open('disease-ontology-ids.belns', 'w') as di:
            for name, term_id in d.get_ns_values():
                emit(dn, name, 'O')
                emit(di, term_id, 'O')
def make_namespace(d, verbose):
    """Build and write the ``.belns`` namespace file(s) for one dataset.

    The dataset is selected by ``str(d)``. Each output line is
    ``<value>|<encoding>``. Several branches also record the written values
    in module-level sets (``*_ns``) and fill the id-to-symbol maps
    (``hgnc_map``, ``mgi_map``, ``rgd_map``).

    Args:
        d: parsed dataset object; ``str(d)`` names the dataset and
            ``d.get_ns_values()`` yields the records to write.
        verbose: when true, print how many legacy SCHEM/SDIS names were
            skipped because an equivalent exists.
    """
    delim = '|'
    dataset = str(d)

    def emit(handle, value, encoding):
        # One namespace record per line.
        handle.write(delim.join((value, encoding))+'\n')

    if dataset == 'entrez_info':
        with open('entrez-gene-ids.belns', 'w') as fp:
            # tuple of (gene_id, gene_type, description)
            for gene_id, gene_type, description in d.get_ns_values():
                if gene_type == 'miscRNA':
                    encoding = 'GRM' if 'microRNA' in description else 'GR'
                else:
                    encoding = entrez_encoding[gene_type]
                emit(fp, gene_id, encoding)
                entrez_ns.add(gene_id)

    elif dataset == 'hgnc':
        with open('hgnc-approved-symbols.belns', 'w') as fp:
            for approved_symb, locus_type, hgnc_id in d.get_ns_values():
                # withdrawn genes NOT included in this namespace; compared
                # with != because `is not` on a str literal tests identity
                if locus_type != 'withdrawn' \
                        and 'withdrawn' not in approved_symb:
                    emit(fp, approved_symb, hgnc_encoding[locus_type])
                    hgnc_ns.add(approved_symb)
                    hgnc_map[hgnc_id] = approved_symb

    elif dataset == 'mgi':
        with open('mgi-approved-symbols.belns', 'w') as fp:
            for marker_symbol, feature_type, acc_id, marker_type \
                    in d.get_ns_values():
                # withdrawn genes are NOT included in this namespace
                if marker_type in ('Gene', 'Pseudogene'):
                    emit(fp, marker_symbol, mgi_encoding[feature_type])
                    mgi_ns.add(marker_symbol)
                    mgi_map[acc_id] = marker_symbol

    elif dataset == 'rgd':
        with open('rgd-approved-symbols.belns', 'w') as fp:
            for symbol, gene_type, name, rgd_id in d.get_ns_values():
                if gene_type == 'miscrna':
                    emit(fp, symbol, 'GRM' if 'microRNA' in name else 'GR')
                    rgd_ns.add(symbol)
                elif gene_type != '':
                    # entries without a gene type are not written
                    emit(fp, symbol, rgd_encoding[gene_type])
                    rgd_ns.add(symbol)
                # NOTE(review): mapping kept for every record; confirm the
                # intended nesting against the formatted original.
                rgd_map[rgd_id] = symbol

    elif dataset == 'swiss':
        with open('swissprot-entry-names.belns', 'w') as fp, \
                open('swissprot-accession-numbers.belns', 'w') as f:
            for gene_name, accessions in d.get_ns_values():
                emit(fp, gene_name, 'GRP')
                sp_ns.add(gene_name)
                for acc in accessions:
                    emit(f, acc, 'GRP')
                    sp_acc_ns.add(acc)

    elif dataset == 'affy':
        with open('affy-probeset-ids.belns', 'w') as fp:
            for pid in d.get_ns_values():
                emit(fp, pid, 'R')

    elif dataset == 'chebi':
        with open('chebi-names.belns', 'w') as fp, \
                open('chebi-ids.belns', 'w') as f:
            for name, primary_id, altIds in d.get_ns_values():
                emit(fp, name, 'A')
                chebi_name_ns.add(name)
                emit(f, primary_id, 'A')
                # Record the id that was written (the name used to be added
                # here), so the alt-id duplicate check below sees it.
                chebi_id_ns.add(primary_id)
                if altIds:
                    for i in altIds:
                        if i not in chebi_id_ns:
                            emit(f, i, 'A')
                            chebi_id_ns.add(i)

    elif dataset == 'pubchem_namespace':
        with open('pubchem-ids.belns', 'w') as fp:
            for pid in d.get_ns_values():
                emit(fp, pid, 'A')
                pub_ns.add(pid)

    elif dataset == 'gobp':
        with open('go-biological-processes-names.belns', 'w') as gobp, \
                open('go-biological-processes-accession-numbers.belns', 'w') \
                as gobp_id:
            for termid, termname, altids in d.get_ns_values():
                emit(gobp, termname, 'B')
                emit(gobp_id, termid, 'B')
                if altids is not None:
                    for alt in altids:
                        emit(gobp_id, alt, 'B')

    elif dataset == 'gocc':
        with open('go-cellular-component-terms.belns', 'w') as gocc, \
                open('go-cellular-component-accession-numbers.belns', 'w') \
                as gocc_id:
            for termid, termname, altids, is_complex in d.get_ns_values():
                encoding = 'C' if is_complex else 'A'
                emit(gocc, termname, encoding)
                emit(gocc_id, termid, encoding)
                # altids may be None; the complex branch used to iterate it
                # unguarded
                if altids is not None:
                    for alt in altids:
                        emit(gocc_id, alt, encoding)

    elif dataset == 'schem':
        schem_to_chebi = parsed.load_data('schem_to_chebi')
        count = 0
        with open('selventa-legacy-chemical-names.belns', 'w') as f:
            for entry in d.get_ns_values():
                # names with a chebi equivalent are not written to the
                # new namespace
                if schem_to_chebi.has_equivalence(entry):
                    count = count + 1
                    continue
                emit(f, entry, 'A')
        if verbose:
            print('Able to resolve ' +str(count)+ ' SCHEM names to CHEBI.')

    elif dataset == 'sdis':
        sdis_to_do = parsed.load_data('sdis_to_do')
        count = 0
        with open('selventa-legacy-diseases.belns', 'w') as f:
            for entry in d.get_ns_values():
                # names with a do equivalent are not written to the
                # new namespace
                if sdis_to_do.has_equivalence(entry):
                    count = count + 1
                    continue
                emit(f, entry, 'O')
        if verbose:
            print('Able to resolve ' +str(count)+ ' SDIS names to DO.')

    elif dataset == 'mesh':
        excluded_roots = ('MN = G01', 'MN = G02', 'MN = G15', 'MN = G17')
        with open('mesh-cellular-locations.belns', 'w') as meshf, \
                open('mesh-diseases.belns', 'w') as meshd, \
                open('mesh-biological-processes.belns', 'w') as meshb:
            for ui, mh, mns, sts in d.get_ns_values():
                # all entries from A11.284 branch (abundances)
                if any('A11.284' in branch for branch in mns):
                    emit(meshf, mh, 'A')
                # all entries from the C branch (pathology)
                elif any('C' in branch for branch in mns):
                    emit(meshd, mh, 'O')
                # G branch (bio process) - exclude G01 G02 G15 G17 branches
                elif any('G' in branch for branch in mns):
                    if not any(branch.startswith(excluded_roots)
                               for branch in mns):
                        emit(meshb, mh, 'B')

    elif dataset == 'do':
        with open('disease-ontology-names.belns', 'w') as dn, \
                open('disease-ontology-ids.belns', 'w') as di:
            for name, term_id in d.get_ns_values():
                emit(dn, name, 'O')
                emit(di, term_id, 'O')