Пример #1
0
def _write_eq(fp, key, uid, eq_dict=None):
    """Write one ``key<delim>uuid`` line to fp and optionally remember the pair.

    The line is written first; when eq_dict is given, ``eq_dict[key] = uid``
    is stored afterwards so later datasets can reuse the same uuid.
    """
    fp.write(delim.join((key, str(uid))) + '\n')
    if eq_dict is not None:
        eq_dict[key] = uid


def _equiv_via_entrez(fp, symbols, prefix, eq_dict, unmapped):
    """Equivalence the symbols of one gene namespace against Entrez.

    Each symbol is resolved with ``to_entrez(prefix + symbol)``. Symbols with
    no Entrez id get a fresh uuid and are appended to unmapped; all others
    reuse whatever entrez_eq holds for the resolved Entrez id.
    """
    for symbol in symbols:
        new_id = to_entrez(prefix + symbol)
        if new_id is None:
            # keep track of which genes need new uuids (dont map to entrez)
            unmapped.append(symbol)
            uid = uuid.uuid4()
        else:
            # NOTE(review): if new_id is missing from entrez_eq this yields
            # None and the literal 'None' is written - confirm intent.
            uid = entrez_eq.get(new_id)
        _write_eq(fp, symbol, uid, eq_dict)


def equiv(d, verbose):
    """Write the .beleq equivalence file(s) for the dataset d.

    The dataset is selected by ``str(d)``; each recognised name writes one or
    more ``<value><delim><uuid>`` files into the current directory and fills
    the matching module-level equivalence dict. Unrecognised names do nothing.
    Several branches reuse uuids stored by earlier calls (e.g. 'hgnc' reads
    entrez_eq, 'mesh' reads gocc_eq_dict / do_eq_dict / gobp_eq_dict), so the
    datasets they depend on have to be processed first.

    Args:
        d: parsed dataset object; only the accessor methods used by its own
            branch are called.
        verbose: when true, the 'sdis_to_do', 'schem_to_chebi' and 'mesh'
            branches print progress / diagnostic messages.

    NOTE(review): lookups done with ``.get`` on the equivalence dicts are not
    checked everywhere; where they miss, the literal string 'None' ends up in
    the output file. That behaviour is kept as-is here.
    """
    dataset = str(d)

    if dataset == 'entrez_info':
        with open('entrez-gene-ids.beleq', 'w') as fp:
            for gene_id in d.get_eq_values():
                # the SAME uuid must go to the file and into entrez_eq,
                # every other gene namespace reuses it from there.
                _write_eq(fp, gene_id, uuid.uuid4(), entrez_eq)
        make_eq_dict(d)

    elif dataset == 'hgnc':
        with open('hgnc-approved-symbols.beleq', 'w') as fp:
            # withdrawn symbols are skipped entirely
            approved = (symbol for symbol in d.get_eq_values()
                        if '~withdrawn' not in symbol)
            _equiv_via_entrez(fp, approved, 'HGNC:', hgnc_eq, hgnc_list)

    elif dataset == 'mgi':
        with open('mgi-approved-symbols.beleq', 'w') as fp:
            _equiv_via_entrez(fp, d.get_eq_values(), 'MGI:', mgi_eq, mgi_list)

    elif dataset == 'rgd':
        with open('rgd-approved-symbols.beleq', 'w') as fp:
            _equiv_via_entrez(fp, d.get_eq_values(), 'RGD:', rgd_eq, rgd_list)

    elif dataset == 'swiss':
        target_pool = ('HGNC', 'MGI', 'RGD')
        with open('swissprot-entry-names.beleq', 'w') as fp:
            # dbrefs is a dict, i.e { reference_type : id_of_that_gene}
            for name, dbrefs, accessions in d.get_eq_values():
                gene_ids = []
                alt_ids = []
                if dbrefs is not None:
                    for ref_type, ref_ids in dbrefs.items():
                        if ref_type == 'GeneId':
                            gene_ids.extend(ref_ids)
                        if ref_type in target_pool:
                            # could be MGI or RGD or HGNC ids
                            alt_ids.extend(ref_ids)

                # uid stays None whenever no existing uuid can be reused:
                # failed lookup, no references at all, or ambiguous (> 1)
                # entrez / alt ids.
                uid = None
                no_refs = False
                if len(gene_ids) == 1:
                    uid = entrez_eq.get(gene_ids[0])
                elif not gene_ids:
                    # are there hgnc, mgi or rgd refs?
                    if not alt_ids:
                        no_refs = True
                    elif len(alt_ids) == 1:
                        a_id = alt_ids[0]
                        # SwissProt may be referring to a since-removed gene,
                        # in which case these lookups give None.
                        if 'HGNC' in a_id:
                            uid = hgnc_eq.get(namespaces.hgnc_map.get(a_id))
                        elif 'MGI' in a_id:
                            uid = mgi_eq.get(namespaces.mgi_map.get(a_id))
                        else:
                            uid = rgd_eq.get(namespaces.rgd_map.get(a_id))
                if uid is None:
                    uid = uuid.uuid4()

                # every entry is written, including those resolved through
                # HGNC / MGI.
                _write_eq(fp, name, uid, sp_eq)
                if no_refs:
                    sp_list.append(name)
                # finally, generate .beleq for accession data also
                build_acc_data(accessions, name)
            swiss_accessions_eq()

    elif dataset == 'affy':
        with open('affy-probeset-ids.beleq', 'w') as fp:
            for probe_id, gene_id in d.get_eq_values():
                if gene_id is not None and '---' not in gene_id:
                    # need the space before and after '///' because that is
                    # how it is parsed.
                    entrez_ids = gene_id.split(' /// ')
                    if len(entrez_ids) == 1:
                        # for 1 entrez mapping, use the entrez uuid
                        uid = entrez_eq.get(entrez_ids[0])
                        if uid is None:
                            uid = uuid.uuid4()
                    else:
                        # > 1 entrez mapping, resolve to one. Pair every id
                        # with its refseq status, e.g.
                        # [('5307', 0), ('104', 2)], dropping ids whose
                        # status is None.
                        ranked = [
                            (eid, ref_status.get(refseq.get(eid)))
                            for eid in entrez_ids
                        ]
                        ranked = [pair for pair in ranked
                                  if pair[1] is not None]
                        if not ranked:
                            # no mapping, generate new uuid
                            uid = uuid.uuid4()
                        else:
                            # lowest status value is the best refseq status
                            best = min(status for _, status in ranked)
                            # ties are resolved by the smallest tuple, i.e.
                            # by comparing the gene id values themselves
                            target = min(pair for pair in ranked
                                         if pair[1] == best)
                            uid = entrez_eq.get(target[0])
                else:
                    # no entrez mapping, create a new uuid
                    uid = uuid.uuid4()
                _write_eq(fp, probe_id, uid, affy_eq)

    # equiv for alt ids and names relies on the equivalence for
    # primary ids being completely generated.
    elif dataset == 'chebi':
        with open('chebi-ids.beleq', 'w') as fp, \
                open('chebi-names_eq.beleq', 'w') as f:
            # like Entrez, new uuid for primary ids only the FIRST time.
            for primary_id in d.get_primary_ids():
                _write_eq(fp, primary_id, uuid.uuid4(), chebi_id_eq)
            for alt_id in d.get_alt_ids():
                if alt_id not in chebi_id_eq:
                    # get its primary equivalent and use its uuid
                    primary = d.alt_to_primary(alt_id)
                    _write_eq(fp, alt_id, chebi_id_eq[primary], chebi_id_eq)
            for name in d.get_names():
                primary = d.name_to_primary(name)
                _write_eq(f, name, chebi_id_eq.get(primary), chebi_name_eq)

    elif dataset == 'pubchem_equiv':
        with open('pubchem_eq.beleq', 'w') as fp:
            for sid, source, cid in d.get_eq_values():
                # <-- verify that NO PubChem CID == 'None'
                if 'ChEBI' in source and cid is not None:
                    # use the CHEBI uuid
                    chebi_equiv = source.split(':')[1]
                    _write_eq(fp, sid, chebi_id_eq.get(chebi_equiv),
                              pub_eq_dict)
                else:
                    # generate a new uuid; these are written to the file
                    # only and are not stored in pub_eq_dict
                    _write_eq(fp, sid, uuid.uuid4())

    elif dataset == 'gobp':
        with open('go-biological-processes-names.beleq', 'w') as gobp, \
                open('go-biological-processes-ids.beleq', 'w') as gobp_id:
            for termid, termname in d.get_eq_values():
                uid = uuid.uuid4()
                _write_eq(gobp_id, termid, uid)
                # keyed by term NAME
                _write_eq(gobp, termname, uid, gobp_eq_dict)

    # GO is the baseline for processes, so new uuids the first time.
    elif dataset == 'gocc':
        with open('go-cellular-component-terms.beleq', 'w') as gocc, \
                open('go-cellular-component-ids.beleq', 'w') as gocc_id:
            for termid, termname in d.get_eq_values():
                uid = uuid.uuid4()
                _write_eq(gocc_id, termid, uid)
                _write_eq(gocc, termname, uid)
                # keyed by term ID (unlike gobp_eq_dict)
                gocc_eq_dict[termid] = uid

    elif dataset == 'do':
        # assign DO a new uuid and use as the primary for diseases
        with open('disease-ontology-names.beleq', 'w') as dn, \
                open('disease-ontology-ids.beleq', 'w') as di:
            for name, do_id in d.get_eq_values():
                uid = uuid.uuid4()
                _write_eq(dn, name, uid)
                _write_eq(di, do_id, uid)
                do_eq_dict[name] = uid

    elif dataset == 'sdis_to_do':
        # try to resolve sdis terms to DO. If there is not one,
        # assign a new uuid.
        count = 0
        sdis = parsed.load_data('sdis')
        with open('selventa-legacy-diseases.beleq', 'w') as dof:
            for sdis_term in sdis.get_eq_values():
                if d.has_equivalence(sdis_term):
                    count += 1
                    do_term = d.get_equivalence(sdis_term)
                    if do_term in do_eq_dict:
                        uid = do_eq_dict[do_term]
                    else:
                        # NOTE(review): raises KeyError when the lower-cased
                        # name is unknown to DO as well.
                        uid = do_eq_dict[do_term.lower()]
                else:
                    uid = uuid.uuid4()
                _write_eq(dof, sdis_term, uid)
        if verbose:
            print('Able to resolve ' + str(count) +
                  ' legacy disease terms to DO.')

    elif dataset == 'schem_to_chebi':
        # try to resolve schem terms to CHEBI. If there is not one,
        # assign a new uuid.
        count = 0
        schem = parsed.load_data('schem')
        with open('selventa-legacy-chemical-names.beleq', 'w') as schemf:
            for schem_term in schem.get_eq_values():
                uid = None
                if d.has_equivalence(schem_term):
                    count += 1
                    chebi_term = d.get_equivalence(schem_term)
                    # NOTE(review): if neither spelling is a known CHEBI
                    # name, uid stays None and 'None' is written.
                    if chebi_term in chebi_name_eq:
                        uid = chebi_name_eq[chebi_term]
                    elif chebi_term.lower() in chebi_name_eq:
                        uid = chebi_name_eq[chebi_term.lower()]
                else:
                    uid = uuid.uuid4()
                _write_eq(schemf, schem_term, uid)
        if verbose:
            print('Able to resolve ' + str(count) +
                  ' legacy chemical terms to CHEBI.')

    elif dataset == 'mesh':
        with open('mesh-cellular-locations.beleq', 'w') as mesha, \
                open('mesh-diseases.beleq', 'w') as meshc, \
                open('mesh-biological-processes.beleq', 'w') as meshg:
            do_data = parsed.load_data('do')
            # case-insensitive index of GOBP names, built once instead of
            # rescanning gobp_eq_dict for every synonym. On a lower-case
            # collision the later name wins, as in a full scan.
            gobp_by_lower = {
                name.lower(): gobp_eq_dict.get(name) for name in gobp_eq_dict
            }
            for ui, mh, mns, synonyms in d.get_eq_values():
                if any('A11.284' in branch for branch in mns):
                    # get GO equiv if there is one
                    go_id = mg_eq.get(mh)
                    # meshcs_to_gocc contains OBSOLETE GO terms at the moment.
                    # It is possible this lookup will return None, in that
                    # case generate a new uuid.
                    if go_id is not None:
                        uid = gocc_eq_dict.get(go_id)
                        # try to find out why lookups fail - maybe OBSOLETE?
                        if uid is None:
                            if verbose:
                                print('Lookup failed for: ' + str(go_id))
                            uid = uuid.uuid4()
                    else:
                        uid = uuid.uuid4()
                    _write_eq(mesha, mh, uid)
                elif any('C' in branch for branch in mns):
                    # does UI exist as a Xref in DO?
                    xref = do_data.get_xrefs('MSH:' + ui)
                    if xref:
                        uid = do_eq_dict[xref]
                    else:
                        uid = uuid.uuid4()
                    _write_eq(meshc, mh, uid)
                elif any('G' in branch for branch in mns):
                    # root 'G' branch in GOBP: match on MeSH synonyms,
                    # the last matching synonym wins.
                    uid = None
                    for syn in synonyms:
                        key = syn.lower()
                        if key in gobp_by_lower:
                            uid = gobp_by_lower[key]
                    if uid is None:
                        uid = uuid.uuid4()
                    _write_eq(meshg, mh, uid)
Пример #2
0
# Tail of Phase II: run the matching parser over every baseline dataset file
# found under working_dir and hand each parsed record to parsed.build_data,
# tagged with the parser's string name.
# NOTE(review): files are discovered by walking working_dir, but the parser
# is always given "datasets/<filename>" regardless of `root` - confirm that
# working_dir and "datasets/" refer to the same place.
for root, dirs, filenames in os.walk(working_dir):
    for f in filenames:
        # only filenames that are keys of baseline_data are processed
        if f in baseline_data:
            data_tuple = baseline_data.get(f)
            # the entry at index PARSER_TYPE is called with the file path
            parser = data_tuple[PARSER_TYPE]("datasets/" + f)
            if verbose:
                # presumably switches the parser into verbose mode - TODO confirm
                parser.is_verbose()
                print("Running " + str(parser))
            for x in parser.parse():
                parsed.build_data(x, str(parser))
# interval_time is expected to have been set before this fragment (not
# visible here); the elapsed time is reported in minutes.
print("Phase II ran in " + str(((time.time() - interval_time) / 60)) + " minutes")

print("\n======= Phase III, building namespaces =======")
# restart the timer for Phase III
interval_time = time.time()
# load parsed data to build namespaces
ei = parsed.load_data("entrez_info")
eh = parsed.load_data("entrez_history")
hg = parsed.load_data("hgnc")
mg = parsed.load_data("mgi")
rg = parsed.load_data("rgd")
sp = parsed.load_data("swiss")
af = parsed.load_data("affy")
g2 = parsed.load_data("gene2acc")
chebi = parsed.load_data("chebi")
schem = parsed.load_data("schem")
schem_to_chebi = parsed.load_data("schem_to_chebi")
sdis = parsed.load_data("sdis")
sdis_to_do = parsed.load_data("sdis_to_do")
gobp = parsed.load_data("gobp")
gocc = parsed.load_data("gocc")
# pubchem equivalence data is currently not loaded:
# pub_eq = parsed.load_data('pubchem_equiv')
Пример #3
0
def _write_eq(fp, key, uid, eq_dict=None):
    """Write one ``key<delim>uuid`` line to fp and optionally remember the pair.

    The line is written first; when eq_dict is given, ``eq_dict[key] = uid``
    is stored afterwards so later datasets can reuse the same uuid.
    """
    fp.write(delim.join((key, str(uid))) + "\n")
    if eq_dict is not None:
        eq_dict[key] = uid


def _equiv_via_entrez(fp, symbols, prefix, eq_dict, unmapped):
    """Equivalence the symbols of one gene namespace against Entrez.

    Each symbol is resolved with ``to_entrez(prefix + symbol)``. Symbols with
    no Entrez id get a fresh uuid and are appended to unmapped; all others
    reuse whatever entrez_eq holds for the resolved Entrez id.
    """
    for symbol in symbols:
        new_id = to_entrez(prefix + symbol)
        if new_id is None:
            # keep track of which genes need new uuids (dont map to entrez)
            unmapped.append(symbol)
            uid = uuid.uuid4()
        else:
            # NOTE(review): if new_id is missing from entrez_eq this yields
            # None and the literal 'None' is written - confirm intent.
            uid = entrez_eq.get(new_id)
        _write_eq(fp, symbol, uid, eq_dict)


def equiv(d, verbose):
    """Write the .beleq equivalence file(s) for the dataset d.

    The dataset is selected by ``str(d)``; each recognised name writes one or
    more ``<value><delim><uuid>`` files into the current directory and fills
    the matching module-level equivalence dict. Unrecognised names do nothing.
    Several branches reuse uuids stored by earlier calls (e.g. "hgnc" reads
    entrez_eq, "mesh" reads gocc_eq_dict / do_eq_dict / gobp_eq_dict), so the
    datasets they depend on have to be processed first.

    Args:
        d: parsed dataset object; only the accessor methods used by its own
            branch are called.
        verbose: when true, the "sdis_to_do", "schem_to_chebi" and "mesh"
            branches print progress / diagnostic messages.

    NOTE(review): lookups done with ``.get`` on the equivalence dicts are not
    checked everywhere; where they miss, the literal string 'None' ends up in
    the output file. That behaviour is kept as-is here.
    """
    dataset = str(d)

    if dataset == "entrez_info":
        with open("entrez-gene-ids.beleq", "w") as fp:
            for gene_id in d.get_eq_values():
                # the SAME uuid must go to the file and into entrez_eq,
                # every other gene namespace reuses it from there.
                _write_eq(fp, gene_id, uuid.uuid4(), entrez_eq)
        make_eq_dict(d)

    elif dataset == "hgnc":
        with open("hgnc-approved-symbols.beleq", "w") as fp:
            # withdrawn symbols are skipped entirely
            approved = (symbol for symbol in d.get_eq_values() if "~withdrawn" not in symbol)
            _equiv_via_entrez(fp, approved, "HGNC:", hgnc_eq, hgnc_list)

    elif dataset == "mgi":
        with open("mgi-approved-symbols.beleq", "w") as fp:
            _equiv_via_entrez(fp, d.get_eq_values(), "MGI:", mgi_eq, mgi_list)

    elif dataset == "rgd":
        with open("rgd-approved-symbols.beleq", "w") as fp:
            _equiv_via_entrez(fp, d.get_eq_values(), "RGD:", rgd_eq, rgd_list)

    elif dataset == "swiss":
        target_pool = ("HGNC", "MGI", "RGD")
        with open("swissprot-entry-names.beleq", "w") as fp:
            # dbrefs is a dict, i.e { reference_type : id_of_that_gene}
            for name, dbrefs, accessions in d.get_eq_values():
                gene_ids = []
                alt_ids = []
                if dbrefs is not None:
                    for ref_type, ref_ids in dbrefs.items():
                        if ref_type == "GeneId":
                            gene_ids.extend(ref_ids)
                        if ref_type in target_pool:
                            # could be MGI or RGD or HGNC ids
                            alt_ids.extend(ref_ids)

                # uid stays None whenever no existing uuid can be reused:
                # failed lookup, no references at all, or ambiguous (> 1)
                # entrez / alt ids.
                uid = None
                no_refs = False
                if len(gene_ids) == 1:
                    uid = entrez_eq.get(gene_ids[0])
                elif not gene_ids:
                    # are there hgnc, mgi or rgd refs?
                    if not alt_ids:
                        no_refs = True
                    elif len(alt_ids) == 1:
                        a_id = alt_ids[0]
                        # SwissProt may be referring to a since-removed gene,
                        # in which case these lookups give None.
                        if "HGNC" in a_id:
                            uid = hgnc_eq.get(namespaces.hgnc_map.get(a_id))
                        elif "MGI" in a_id:
                            uid = mgi_eq.get(namespaces.mgi_map.get(a_id))
                        else:
                            uid = rgd_eq.get(namespaces.rgd_map.get(a_id))
                if uid is None:
                    uid = uuid.uuid4()

                # every entry is written, including those resolved through
                # HGNC / MGI.
                _write_eq(fp, name, uid, sp_eq)
                if no_refs:
                    sp_list.append(name)
                # finally, generate .beleq for accession data also
                build_acc_data(accessions, name)
            swiss_accessions_eq()

    elif dataset == "affy":
        with open("affy-probeset-ids.beleq", "w") as fp:
            for probe_id, gene_id in d.get_eq_values():
                if gene_id is not None and "---" not in gene_id:
                    # need the space before and after '///' because that is
                    # how it is parsed.
                    entrez_ids = gene_id.split(" /// ")
                    if len(entrez_ids) == 1:
                        # for 1 entrez mapping, use the entrez uuid
                        uid = entrez_eq.get(entrez_ids[0])
                        if uid is None:
                            uid = uuid.uuid4()
                    else:
                        # > 1 entrez mapping, resolve to one. Pair every id
                        # with its refseq status, e.g.
                        # [('5307', 0), ('104', 2)], dropping ids whose
                        # status is None.
                        ranked = [(eid, ref_status.get(refseq.get(eid))) for eid in entrez_ids]
                        ranked = [pair for pair in ranked if pair[1] is not None]
                        if not ranked:
                            # no mapping, generate new uuid
                            uid = uuid.uuid4()
                        else:
                            # lowest status value is the best refseq status
                            best = min(status for _, status in ranked)
                            # ties are resolved by the smallest tuple, i.e.
                            # by comparing the gene id values themselves
                            target = min(pair for pair in ranked if pair[1] == best)
                            uid = entrez_eq.get(target[0])
                else:
                    # no entrez mapping, create a new uuid
                    uid = uuid.uuid4()
                _write_eq(fp, probe_id, uid, affy_eq)

    # equiv for alt ids and names relies on the equivalence for
    # primary ids being completely generated.
    elif dataset == "chebi":
        with open("chebi-ids.beleq", "w") as fp, open("chebi-names_eq.beleq", "w") as f:
            # like Entrez, new uuid for primary ids only the FIRST time.
            for primary_id in d.get_primary_ids():
                _write_eq(fp, primary_id, uuid.uuid4(), chebi_id_eq)
            for alt_id in d.get_alt_ids():
                if alt_id not in chebi_id_eq:
                    # get its primary equivalent and use its uuid
                    primary = d.alt_to_primary(alt_id)
                    _write_eq(fp, alt_id, chebi_id_eq[primary], chebi_id_eq)
            for name in d.get_names():
                primary = d.name_to_primary(name)
                _write_eq(f, name, chebi_id_eq.get(primary), chebi_name_eq)

    elif dataset == "pubchem_equiv":
        with open("pubchem_eq.beleq", "w") as fp:
            for sid, source, cid in d.get_eq_values():
                # <-- verify that NO PubChem CID == 'None'
                if "ChEBI" in source and cid is not None:
                    # use the CHEBI uuid
                    chebi_equiv = source.split(":")[1]
                    _write_eq(fp, sid, chebi_id_eq.get(chebi_equiv), pub_eq_dict)
                else:
                    # generate a new uuid; these are written to the file
                    # only and are not stored in pub_eq_dict
                    _write_eq(fp, sid, uuid.uuid4())

    elif dataset == "gobp":
        with open("go-biological-processes-names.beleq", "w") as gobp, open(
            "go-biological-processes-ids.beleq", "w"
        ) as gobp_id:
            for termid, termname in d.get_eq_values():
                uid = uuid.uuid4()
                _write_eq(gobp_id, termid, uid)
                # keyed by term NAME
                _write_eq(gobp, termname, uid, gobp_eq_dict)

    # GO is the baseline for processes, so new uuids the first time.
    elif dataset == "gocc":
        with open("go-cellular-component-terms.beleq", "w") as gocc, open(
            "go-cellular-component-ids.beleq", "w"
        ) as gocc_id:
            for termid, termname in d.get_eq_values():
                uid = uuid.uuid4()
                _write_eq(gocc_id, termid, uid)
                _write_eq(gocc, termname, uid)
                # keyed by term ID (unlike gobp_eq_dict)
                gocc_eq_dict[termid] = uid

    elif dataset == "do":
        # assign DO a new uuid and use as the primary for diseases
        with open("disease-ontology-names.beleq", "w") as dn, open("disease-ontology-ids.beleq", "w") as di:
            for name, do_id in d.get_eq_values():
                uid = uuid.uuid4()
                _write_eq(dn, name, uid)
                _write_eq(di, do_id, uid)
                do_eq_dict[name] = uid

    elif dataset == "sdis_to_do":
        # try to resolve sdis terms to DO. If there is not one,
        # assign a new uuid.
        count = 0
        sdis = parsed.load_data("sdis")
        with open("selventa-legacy-diseases.beleq", "w") as dof:
            for sdis_term in sdis.get_eq_values():
                if d.has_equivalence(sdis_term):
                    count += 1
                    do_term = d.get_equivalence(sdis_term)
                    if do_term in do_eq_dict:
                        uid = do_eq_dict[do_term]
                    else:
                        # NOTE(review): raises KeyError when the lower-cased
                        # name is unknown to DO as well.
                        uid = do_eq_dict[do_term.lower()]
                else:
                    uid = uuid.uuid4()
                _write_eq(dof, sdis_term, uid)
        if verbose:
            print("Able to resolve " + str(count) + " legacy disease terms to DO.")

    elif dataset == "schem_to_chebi":
        # try to resolve schem terms to CHEBI. If there is not one,
        # assign a new uuid.
        count = 0
        schem = parsed.load_data("schem")
        with open("selventa-legacy-chemical-names.beleq", "w") as schemf:
            for schem_term in schem.get_eq_values():
                uid = None
                if d.has_equivalence(schem_term):
                    count += 1
                    chebi_term = d.get_equivalence(schem_term)
                    # NOTE(review): if neither spelling is a known CHEBI
                    # name, uid stays None and 'None' is written.
                    if chebi_term in chebi_name_eq:
                        uid = chebi_name_eq[chebi_term]
                    elif chebi_term.lower() in chebi_name_eq:
                        uid = chebi_name_eq[chebi_term.lower()]
                else:
                    uid = uuid.uuid4()
                _write_eq(schemf, schem_term, uid)
        if verbose:
            print("Able to resolve " + str(count) + " legacy chemical terms to CHEBI.")

    elif dataset == "mesh":
        with open("mesh-cellular-locations.beleq", "w") as mesha, open("mesh-diseases.beleq", "w") as meshc, open(
            "mesh-biological-processes.beleq", "w"
        ) as meshg:
            do_data = parsed.load_data("do")
            # case-insensitive index of GOBP names, built once instead of
            # rescanning gobp_eq_dict for every synonym. On a lower-case
            # collision the later name wins, as in a full scan.
            gobp_by_lower = {name.lower(): gobp_eq_dict.get(name) for name in gobp_eq_dict}
            for ui, mh, mns, synonyms in d.get_eq_values():
                if any("A11.284" in branch for branch in mns):
                    # get GO equiv if there is one
                    go_id = mg_eq.get(mh)
                    # meshcs_to_gocc contains OBSOLETE GO terms at the moment.
                    # It is possible this lookup will return None, in that
                    # case generate a new uuid.
                    if go_id is not None:
                        uid = gocc_eq_dict.get(go_id)
                        # try to find out why lookups fail - maybe OBSOLETE?
                        if uid is None:
                            if verbose:
                                print("Lookup failed for: " + str(go_id))
                            uid = uuid.uuid4()
                    else:
                        uid = uuid.uuid4()
                    _write_eq(mesha, mh, uid)
                elif any("C" in branch for branch in mns):
                    # does UI exist as a Xref in DO?
                    xref = do_data.get_xrefs("MSH:" + ui)
                    if xref:
                        uid = do_eq_dict[xref]
                    else:
                        uid = uuid.uuid4()
                    _write_eq(meshc, mh, uid)
                elif any("G" in branch for branch in mns):
                    # root 'G' branch in GOBP: match on MeSH synonyms,
                    # the last matching synonym wins.
                    uid = None
                    for syn in synonyms:
                        key = syn.lower()
                        if key in gobp_by_lower:
                            uid = gobp_by_lower[key]
                    if uid is None:
                        uid = uuid.uuid4()
                    _write_eq(meshg, mh, uid)
Пример #4
0
def make_namespace(d, verbose):
    """Build and write the ``.belns`` namespace file(s) for one dataset.

    The dataset is selected by ``str(d)``; every value yielded by
    ``d.get_ns_values()`` is written as a ``value|encoding`` line to the
    file(s) belonging to that dataset, in the current working directory.
    Several branches also record what they saw in module-level
    collections (``entrez_ns``, ``hgnc_ns``, ``hgnc_map``, ``chebi_id_ns``
    ...) -- presumably sets and dicts defined elsewhere in this module.

    Args:
        d: parsed dataset object; must support ``str(d)`` and
            ``get_ns_values()``.
        verbose: when truthy, print how many SCHEM / SDIS names were
            resolved to an equivalent and therefore left out.

    Raises:
        KeyError: if a gene / locus / feature type has no entry in the
            matching ``*_encoding`` table.
    """
    delim = '|'
    dataset = str(d)

    if dataset == 'entrez_info':
        with open('entrez-gene-ids.belns', 'w') as fp:
            # tuple of (gene_id, gene_type, description)
            for gene_id, gene_type, description in d.get_ns_values():
                if gene_type == 'miscRNA':
                    # miscRNA is split into microRNA and other RNA
                    if 'microRNA' in description:
                        encoding = 'GRM'
                    else:
                        encoding = 'GR'
                else:
                    encoding = entrez_encoding[gene_type]
                fp.write(delim.join((gene_id, encoding)) + '\n')
                entrez_ns.add(gene_id)

    elif dataset == 'hgnc':
        with open('hgnc-approved-symbols.belns', 'w') as fp:
            for approved_symb, locus_type, hgnc_id in d.get_ns_values():
                # withdrawn genes NOT included in this namespace; compare
                # by value ('is not' on a str literal tests identity)
                if (locus_type != 'withdrawn'
                        and 'withdrawn' not in approved_symb):
                    fp.write(
                        delim.join((approved_symb,
                                    hgnc_encoding[locus_type])) + '\n')
                    hgnc_ns.add(approved_symb)
                # the id -> symbol map keeps withdrawn entries as well
                hgnc_map[hgnc_id] = approved_symb

    elif dataset == 'mgi':
        with open('mgi-approved-symbols.belns', 'w') as fp:
            for vals in d.get_ns_values():
                marker_symbol, feature_type, acc_id, marker_type = vals
                if marker_type in ('Gene', 'Pseudogene'):
                    fp.write(
                        delim.join((marker_symbol,
                                    mgi_encoding[feature_type])) + '\n')
                    mgi_ns.add(marker_symbol)
                mgi_map[acc_id] = marker_symbol

    elif dataset == 'rgd':
        with open('rgd-approved-symbols.belns', 'w') as fp:
            for symbol, gene_type, name, rgd_id in d.get_ns_values():
                if gene_type == 'miscrna':
                    if 'microRNA' in name:
                        encoding = 'GRM'
                    else:
                        encoding = 'GR'
                    fp.write(delim.join((symbol, encoding)) + '\n')
                    rgd_ns.add(symbol)
                elif gene_type != '':
                    # entries with an empty gene type are not written
                    fp.write(
                        delim.join((symbol, rgd_encoding[gene_type])) +
                        '\n')
                    rgd_ns.add(symbol)
                rgd_map[rgd_id] = symbol

    elif dataset == 'swiss':
        with open('swissprot-entry-names.belns', 'w') as fp, \
                open('swissprot-accession-numbers.belns', 'w') as f:
            for gene_name, accessions in d.get_ns_values():
                fp.write(delim.join((gene_name, 'GRP')) + '\n')
                sp_ns.add(gene_name)
                for acc in accessions:
                    f.write(delim.join((acc, 'GRP')) + '\n')
                    sp_acc_ns.add(acc)

    elif dataset == 'affy':
        with open('affy-probeset-ids.belns', 'w') as fp:
            for pid in d.get_ns_values():
                fp.write(delim.join((pid, 'R')) + '\n')

    elif dataset == 'chebi':
        with open('chebi-names.belns', 'w') as fp, \
                open('chebi-ids.belns', 'w') as f:
            for name, primary_id, alt_ids in d.get_ns_values():
                fp.write(delim.join((name, 'A')) + '\n')
                chebi_name_ns.add(name)
                f.write(delim.join((primary_id, 'A')) + '\n')
                # record the id (not the name) so the duplicate check on
                # alternate ids below can see primary ids too
                chebi_id_ns.add(primary_id)
                if alt_ids:
                    for alt in alt_ids:
                        if alt not in chebi_id_ns:
                            f.write(delim.join((alt, 'A')) + '\n')
                        chebi_id_ns.add(alt)

    elif dataset == 'pubchem_namespace':
        with open('pubchem-ids.belns', 'w') as fp:
            for pid in d.get_ns_values():
                fp.write(delim.join((pid, 'A')) + '\n')
                pub_ns.add(pid)

    elif dataset == 'gobp':
        with open('go-biological-processes-names.belns', 'w') as gobp, \
                open('go-biological-processes-accession-numbers.belns', 'w') \
                as gobp_id:
            for termid, termname, altids in d.get_ns_values():
                gobp.write(delim.join((termname, 'B')) + '\n')
                gobp_id.write(delim.join((termid, 'B')) + '\n')
                if altids is not None:
                    for alt in altids:
                        gobp_id.write(delim.join((alt, 'B')) + '\n')

    elif dataset == 'gocc':
        with open('go-cellular-component-terms.belns', 'w') as gocc, \
                open('go-cellular-component-accession-numbers.belns', 'w') \
                as gocc_id:
            for termid, termname, altids, is_complex in d.get_ns_values():
                # terms flagged as complexes are encoded 'C', others 'A'
                encoding = 'C' if is_complex else 'A'
                gocc.write(delim.join((termname, encoding)) + '\n')
                gocc_id.write(delim.join((termid, encoding)) + '\n')
                # altids may be None for either kind of term
                if altids is not None:
                    for alt in altids:
                        gocc_id.write(delim.join((alt, encoding)) + '\n')

    elif dataset == 'schem':
        schem_to_chebi = parsed.load_data('schem_to_chebi')
        count = 0
        with open('selventa-legacy-chemical-names.belns', 'w') as f:
            for entry in d.get_ns_values():
                # names with a chebi equivalent are counted and left out
                # of the new namespace
                if schem_to_chebi.has_equivalence(entry):
                    count += 1
                else:
                    f.write(delim.join((entry, 'A')) + '\n')
        if verbose:
            print('Able to resolve ' + str(count) + ' SCHEM names to CHEBI.')

    elif dataset == 'sdis':
        sdis_to_do = parsed.load_data('sdis_to_do')
        count = 0
        with open('selventa-legacy-diseases.belns', 'w') as f:
            for entry in d.get_ns_values():
                # names with a do equivalent are counted and left out
                # of the new namespace
                if sdis_to_do.has_equivalence(entry):
                    count += 1
                else:
                    f.write(delim.join((entry, 'O')) + '\n')
        if verbose:
            print('Able to resolve ' + str(count) + ' SDIS names to DO.')

    elif dataset == 'mesh':
        # G branches that are kept out of the biological-process file
        excluded_prefixes = ('MN = G01', 'MN = G02', 'MN = G15', 'MN = G17')
        with open('mesh-cellular-locations.belns', 'w') as meshf, \
                open('mesh-diseases.belns', 'w') as meshd, \
                open('mesh-biological-processes.belns', 'w') as meshb:
            for _ui, mh, mns, _sts in d.get_ns_values():
                # NOTE: these are substring tests on each branch string;
                # the first matching category wins.
                # all entries from A11.284 branch (abundances)
                if any('A11.284' in branch for branch in mns):
                    meshf.write(delim.join((mh, 'A')) + '\n')
                # all entries from the C branch (pathology)
                elif any('C' in branch for branch in mns):
                    meshd.write(delim.join((mh, 'O')) + '\n')
                # G branch (bio process) - exclude G01 G02 G15 G17 branches
                elif any('G' in branch for branch in mns):
                    if not any(branch.startswith(excluded_prefixes)
                               for branch in mns):
                        meshb.write(delim.join((mh, 'B')) + '\n')

    elif dataset == 'do':
        with open('disease-ontology-names.belns', 'w') as dn, \
                open('disease-ontology-ids.belns', 'w') as di:
            for name, do_id in d.get_ns_values():
                dn.write(delim.join((name, 'O')) + '\n')
                di.write(delim.join((do_id, 'O')) + '\n')
Пример #5
0
def make_namespace(d, verbose):
    """Build and write the ``.belns`` namespace file(s) for one dataset.

    The dataset is selected by ``str(d)``; every value yielded by
    ``d.get_ns_values()`` is written as a ``value|encoding`` line to the
    file(s) belonging to that dataset, in the current working directory.
    Several branches also record what they saw in module-level
    collections (``entrez_ns``, ``hgnc_ns``, ``hgnc_map``, ``chebi_id_ns``
    ...) -- presumably sets and dicts defined elsewhere in this module.

    Args:
        d: parsed dataset object; must support ``str(d)`` and
            ``get_ns_values()``.
        verbose: when truthy, print how many SCHEM / SDIS names were
            resolved to an equivalent and therefore left out.

    Raises:
        KeyError: if a gene / locus / feature type has no entry in the
            matching ``*_encoding`` table.
    """
    delim = '|'
    dataset = str(d)

    if dataset == 'entrez_info':
        with open('entrez-gene-ids.belns', 'w') as fp:
            # tuple of (gene_id, gene_type, description)
            for gene_id, gene_type, description in d.get_ns_values():
                if gene_type == 'miscRNA':
                    # miscRNA is split into microRNA and other RNA
                    if 'microRNA' in description:
                        encoding = 'GRM'
                    else:
                        encoding = 'GR'
                else:
                    encoding = entrez_encoding[gene_type]
                fp.write(delim.join((gene_id, encoding))+'\n')
                entrez_ns.add(gene_id)

    elif dataset == 'hgnc':
        with open('hgnc-approved-symbols.belns', 'w') as fp:
            for approved_symb, locus_type, hgnc_id in d.get_ns_values():
                # withdrawn genes NOT included in this namespace; compare
                # by value ('is not' on a str literal tests identity)
                if (locus_type != 'withdrawn'
                        and 'withdrawn' not in approved_symb):
                    fp.write(delim.join(
                        (approved_symb, hgnc_encoding[locus_type]))+'\n')
                    hgnc_ns.add(approved_symb)
                # the id -> symbol map keeps withdrawn entries as well
                hgnc_map[hgnc_id] = approved_symb

    elif dataset == 'mgi':
        with open('mgi-approved-symbols.belns', 'w') as fp:
            for vals in d.get_ns_values():
                marker_symbol, feature_type, acc_id, marker_type = vals
                if marker_type in ('Gene', 'Pseudogene'):
                    fp.write(delim.join(
                        (marker_symbol, mgi_encoding[feature_type]))+'\n')
                    mgi_ns.add(marker_symbol)
                mgi_map[acc_id] = marker_symbol

    elif dataset == 'rgd':
        with open('rgd-approved-symbols.belns', 'w') as fp:
            for symbol, gene_type, name, rgd_id in d.get_ns_values():
                if gene_type == 'miscrna':
                    if 'microRNA' in name:
                        encoding = 'GRM'
                    else:
                        encoding = 'GR'
                    fp.write(delim.join((symbol, encoding))+'\n')
                    rgd_ns.add(symbol)
                elif gene_type != '':
                    # entries with an empty gene type are not written
                    fp.write(delim.join(
                        (symbol, rgd_encoding[gene_type]))+'\n')
                    rgd_ns.add(symbol)
                rgd_map[rgd_id] = symbol

    elif dataset == 'swiss':
        with open('swissprot-entry-names.belns', 'w') as fp, \
                open('swissprot-accession-numbers.belns', 'w') as f:
            for gene_name, accessions in d.get_ns_values():
                fp.write(delim.join((gene_name, 'GRP'))+'\n')
                sp_ns.add(gene_name)
                for acc in accessions:
                    f.write(delim.join((acc, 'GRP'))+'\n')
                    sp_acc_ns.add(acc)

    elif dataset == 'affy':
        with open('affy-probeset-ids.belns', 'w') as fp:
            for pid in d.get_ns_values():
                fp.write(delim.join((pid, 'R'))+'\n')

    elif dataset == 'chebi':
        with open('chebi-names.belns', 'w') as fp, \
                open('chebi-ids.belns', 'w') as f:
            for name, primary_id, alt_ids in d.get_ns_values():
                fp.write(delim.join((name, 'A'))+'\n')
                chebi_name_ns.add(name)
                f.write(delim.join((primary_id, 'A'))+'\n')
                # record the id (not the name) so the duplicate check on
                # alternate ids below can see primary ids too
                chebi_id_ns.add(primary_id)
                if alt_ids:
                    for alt in alt_ids:
                        if alt not in chebi_id_ns:
                            f.write(delim.join((alt, 'A'))+'\n')
                        chebi_id_ns.add(alt)

    elif dataset == 'pubchem_namespace':
        with open('pubchem-ids.belns', 'w') as fp:
            for pid in d.get_ns_values():
                fp.write(delim.join((pid, 'A'))+'\n')
                pub_ns.add(pid)

    elif dataset == 'gobp':
        with open('go-biological-processes-names.belns', 'w') as gobp, \
                open('go-biological-processes-accession-numbers.belns', 'w') \
                as gobp_id:
            for termid, termname, altids in d.get_ns_values():
                gobp.write(delim.join((termname, 'B'))+'\n')
                gobp_id.write(delim.join((termid, 'B'))+'\n')
                if altids is not None:
                    for alt in altids:
                        gobp_id.write(delim.join((alt, 'B'))+'\n')

    elif dataset == 'gocc':
        with open('go-cellular-component-terms.belns', 'w') as gocc, \
                open('go-cellular-component-accession-numbers.belns', 'w') \
                as gocc_id:
            for termid, termname, altids, is_complex in d.get_ns_values():
                # terms flagged as complexes are encoded 'C', others 'A'
                encoding = 'C' if is_complex else 'A'
                gocc.write(delim.join((termname, encoding))+'\n')
                gocc_id.write(delim.join((termid, encoding))+'\n')
                # altids may be None for either kind of term
                if altids is not None:
                    for alt in altids:
                        gocc_id.write(delim.join((alt, encoding))+'\n')

    elif dataset == 'schem':
        schem_to_chebi = parsed.load_data('schem_to_chebi')
        count = 0
        with open('selventa-legacy-chemical-names.belns', 'w') as f:
            for entry in d.get_ns_values():
                # names with a chebi equivalent are counted and left out
                # of the new namespace
                if schem_to_chebi.has_equivalence(entry):
                    count += 1
                else:
                    f.write(delim.join((entry, 'A'))+'\n')
        if verbose:
            print('Able to resolve ' +str(count)+ ' SCHEM names to CHEBI.')

    elif dataset == 'sdis':
        sdis_to_do = parsed.load_data('sdis_to_do')
        count = 0
        with open('selventa-legacy-diseases.belns', 'w') as f:
            for entry in d.get_ns_values():
                # names with a do equivalent are counted and left out
                # of the new namespace
                if sdis_to_do.has_equivalence(entry):
                    count += 1
                else:
                    f.write(delim.join((entry, 'O'))+'\n')
        if verbose:
            print('Able to resolve ' +str(count)+ ' SDIS names to DO.')

    elif dataset == 'mesh':
        # G branches that are kept out of the biological-process file
        excluded_prefixes = ('MN = G01', 'MN = G02', 'MN = G15', 'MN = G17')
        with open('mesh-cellular-locations.belns', 'w') as meshf, \
                open('mesh-diseases.belns', 'w') as meshd, \
                open('mesh-biological-processes.belns', 'w') as meshb:
            for _ui, mh, mns, _sts in d.get_ns_values():
                # NOTE: these are substring tests on each branch string;
                # the first matching category wins.
                # all entries from A11.284 branch (abundances)
                if any('A11.284' in branch for branch in mns):
                    meshf.write(delim.join((mh, 'A'))+'\n')
                # all entries from the C branch (pathology)
                elif any('C' in branch for branch in mns):
                    meshd.write(delim.join((mh, 'O'))+'\n')
                # G branch (bio process) - exclude G01 G02 G15 G17 branches
                elif any('G' in branch for branch in mns):
                    if not any(branch.startswith(excluded_prefixes)
                               for branch in mns):
                        meshb.write(delim.join((mh, 'B'))+'\n')

    elif dataset == 'do':
        with open('disease-ontology-names.belns', 'w') as dn, \
                open('disease-ontology-ids.belns', 'w') as di:
            for name, do_id in d.get_ns_values():
                dn.write(delim.join((name, 'O'))+'\n')
                di.write(delim.join((do_id, 'O'))+'\n')