def build_compendia(concordances, identifiers):
    """:concordances: a list of files from which to read relationships
    :identifiers: a list of files from which to read identifiers and optional categories"""
    dicts = {}
    types = {}
    for ifile in identifiers:
        print('loading', ifile)
        new_identifiers, new_types = read_identifier_file(ifile)
        glom(dicts, new_identifiers, unique_prefixes=[UBERON, GO])
        types.update(new_types)
    for infile in concordances:
        print('loading', infile)
        pairs = []
        with open(infile, 'r') as inf:
            for line in inf:
                x = line.strip().split('\t')
                pairs.append(set([x[0], x[2]]))
        newpairs = remove_overused_xrefs(pairs)
        glom(dicts, newpairs, unique_prefixes=[UBERON, GO])
    typed_sets = create_typed_sets(set([frozenset(x) for x in dicts.values()]), types)
    for biotype, sets in typed_sets.items():
        baretype = biotype.split(':')[-1]
        write_compendium(sets, f'{baretype}.txt', biotype, {})
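# A minimal sketch of what `glom` (defined elsewhere in this codebase) appears to do
# as called above: fold each new group of equivalent identifiers into `dicts`, which
# maps every identifier to its shared merged set. The name, the prefix-string form of
# unique_prefixes, and the conflict guard below are assumptions for illustration, not
# the actual implementation.
def glom_sketch(dicts, groups, unique_prefixes=()):
    for group in groups:
        # Union this group with every merged set its members already belong to.
        merged = set(group)
        for ident in group:
            merged |= dicts.get(ident, set())
        # Skip a merge that would conflate two identifiers sharing a unique
        # prefix, e.g. two distinct UBERON terms.
        counts = {}
        for ident in merged:
            prefix = ident.split(':')[0]
            counts[prefix] = counts.get(prefix, 0) + 1
        if any(counts.get(p, 0) > 1 for p in unique_prefixes):
            continue
        # Point every member at the same merged set.
        for ident in merged:
            dicts[ident] = merged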
def build_protein_compendia(concordances, identifiers):
    """:concordances: a list of files from which to read relationships
    :identifiers: a list of files from which to read identifiers and optional categories"""
    dicts = {}
    types = {}
    uniques = [UNIPROTKB, PR]
    for ifile in identifiers:
        print('loading', ifile)
        new_identifiers, new_types = read_identifier_file(ifile)
        glom(dicts, new_identifiers, unique_prefixes=uniques)
        types.update(new_types)
    for infile in concordances:
        print('loading', infile)
        pairs = []
        with open(infile, 'r') as inf:
            for line in inf:
                x = line.strip().split('\t')
                pairs.append(set([x[0], x[2]]))
        glom(dicts, pairs, unique_prefixes=uniques)
    protein_sets = set([frozenset(x) for x in dicts.values()])
    #Try to preserve some memory here.
    dicts.clear()
    baretype = PROTEIN.split(':')[-1]
    write_compendium(protein_sets, f'{baretype}.txt', PROTEIN, {})
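# `read_identifier_file` is defined elsewhere; per the docstring it yields identifiers
# plus optional categories. A plausible sketch, assuming a TSV with the identifier in
# the first column and an optional category in the second; the format and return
# shapes are assumptions based on how the results are consumed above.
def read_identifier_file_sketch(ifile):
    identifiers = []  # list of single-element sets, in the shape glom() consumes
    types = {}        # identifier -> category, when one is present
    with open(ifile, 'r') as inf:
        for line in inf:
            x = line.strip().split('\t')
            identifiers.append({x[0]})
            if len(x) > 1:
                types[x[0]] = x[1]
    return identifiers, types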
def build_compendia(identifiers):
    """:identifiers: a list of files from which to read identifiers and optional categories"""
    dicts = {}
    types = {}
    uniques = []
    for ifile in identifiers:
        print('loading', ifile)
        new_identifiers, new_types = read_identifier_file(ifile)
        glom(dicts, new_identifiers, unique_prefixes=uniques)
        types.update(new_types)
    genefam_sets = set([frozenset(x) for x in dicts.values()])
    baretype = GENE_FAMILY.split(':')[-1]
    write_compendium(genefam_sets, f'{baretype}.txt', GENE_FAMILY, {})
from collections import defaultdict
from os import path

def build_compendium(concordances, identifiers, mondoclose, badxrefs):
    """:concordances: a list of files from which to read relationships
    :identifiers: a list of files from which to read identifiers and optional categories"""
    dicts = {}
    types = {}
    for ifile in identifiers:
        print('loading', ifile)
        new_identifiers, new_types = read_identifier_file(ifile)
        glom(dicts, new_identifiers, unique_prefixes=[MONDO, HP])
        types.update(new_types)
    #Load close Mondos
    close_mondos = defaultdict(set)
    with open(mondoclose, 'r') as inf:
        for line in inf:
            x = tuple(line.strip().split('\t'))
            close_mondos[x[0]].add(x[1])
    #Load and glom concords
    for infile in concordances:
        print('loading', infile)
        pairs = []
        pref = path.basename(infile)
        if pref in badxrefs:
            print('reading bad xrefs', pref)
            bad_pairs = read_badxrefs(badxrefs[pref])
        else:
            print('no bad pairs', pref)
            bad_pairs = set()
        with open(infile, 'r') as inf:
            for line in inf:
                stuff = line.strip().split('\t')
                if len(stuff) < 3:
                    #Bail out on malformed concord lines.
                    print(stuff)
                    exit()
                x = (stuff[0], stuff[2])
                if x not in bad_pairs:
                    pairs.append(x)
        if pref in ['MONDO', 'HP', 'EFO']:
            newpairs = remove_overused_xrefs(pairs)
        else:
            newpairs = pairs
        glom(dicts, newpairs, unique_prefixes=[MONDO, HP], close={MONDO: close_mondos})
        #Debugging check: watch when this OMIM id first gets merged in.
        try:
            print(dicts['OMIM:607644'])
        except KeyError:
            print('notyet')
    typed_sets = create_typed_sets(set([frozenset(x) for x in dicts.values()]), types)
    for biotype, sets in typed_sets.items():
        baretype = biotype.split(':')[-1]
        write_compendium(sets, f'{baretype}.txt', biotype, {})
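# `read_badxrefs` is defined elsewhere. Given the `x not in bad_pairs` membership test
# above, it presumably returns a set of (curie, curie) tuples. A sketch under that
# assumption; the one-pair-per-line TSV format is hypothetical.
def read_badxrefs_sketch(fname):
    bad_pairs = set()
    with open(fname, 'r') as inf:
        for line in inf:
            x = line.strip().split('\t')
            bad_pairs.add((x[0], x[1]))
    return bad_pairs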
def build_gene_compendia(concordances, identifiers):
    """:concordances: a list of files from which to read relationships
    :identifiers: a list of files from which to read identifiers and optional categories"""
    dicts = {}
    types = {}
    uniques = [NCBIGENE, HGNC, ENSEMBL, OMIM]
    for ifile in identifiers:
        print('loading', ifile)
        new_identifiers, new_types = read_identifier_file(ifile)
        glom(dicts, new_identifiers, unique_prefixes=uniques)
        types.update(new_types)
    for infile in concordances:
        print('loading', infile)
        pairs = []
        with open(infile, 'r') as inf:
            for line in inf:
                x = line.strip().split('\t')
                pairs.append(set([x[0], x[2]]))
        glom(dicts, pairs, unique_prefixes=uniques)
    gene_sets = set([frozenset(x) for x in dicts.values()])
    baretype = GENE.split(':')[-1]
    write_compendium(gene_sets, f'{baretype}.txt', GENE, {})
def build_compendia(concordances, identifiers, unichem_partial):
    """:concordances: a list of files from which to read relationships
    :identifiers: a list of files from which to read identifiers and optional categories"""
    dicts = read_partial_unichem(unichem_partial)
    types = {}
    for ifile in identifiers:
        print('loading', ifile)
        new_identifiers, new_types = read_identifier_file(ifile)
        glom(dicts, new_identifiers, unique_prefixes=[INCHIKEY])
        types.update(new_types)
    for infile in concordances:
        print('loading', infile)
        pairs = []
        with open(infile, 'r') as inf:
            for line in inf:
                x = line.strip().split('\t')
                pairs.append(set([x[0], x[2]]))
        newpairs = remove_overused_xrefs(pairs)
        glom(dicts, newpairs, unique_prefixes=[INCHIKEY])
    chem_sets = set([frozenset(x) for x in dicts.values()])
    baretype = CHEMICAL_SUBSTANCE.split(':')[-1]
    write_compendium(chem_sets, f'{baretype}.txt', CHEMICAL_SUBSTANCE, {})
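# `read_partial_unichem` seeds `dicts` before any glomming, so it presumably returns
# the same identifier -> merged-set mapping that `glom` maintains. A sketch under that
# assumption; the TSV-of-precomputed-pairs format is hypothetical.
def read_partial_unichem_sketch(unichem_partial):
    dicts = {}
    with open(unichem_partial, 'r') as inf:
        for line in inf:
            x = line.strip().split('\t')
            glom(dicts, [set(x)], unique_prefixes=[INCHIKEY])
    return dicts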
def build_compendia(concordances, identifiers):
    """:concordances: a list of files from which to read relationships
    :identifiers: a list of files from which to read identifiers and optional categories"""
    #These are concords that cause problems and are being special cased out. In disease/process we put these in
    # files, and maybe we should here too?
    #GO:0034227/EC:2.8.1.4 is a problem because that GO term is a biological process, but EC is not a valid prefix
    # for biological processes, leading to a loss of the EC term (and a unified RHEA) on output.
    bad_concords = {frozenset(['GO:0034227', 'EC:2.8.1.4'])}
    dicts = {}
    types = {}
    for ifile in identifiers:
        print('loading', ifile)
        new_identifiers, new_types = read_identifier_file(ifile)
        glom(dicts, new_identifiers, unique_prefixes=[GO])
        types.update(new_types)
    for infile in concordances:
        print('loading', infile)
        pairs = []
        with open(infile, 'r') as inf:
            for line in inf:
                x = line.strip().split('\t')
                pair = frozenset([x[0], x[2]])
                if pair not in bad_concords:
                    pairs.append(pair)
        #One kind of error is that GO->Reactome xrefs are frequently more like subclass relations. So
        # GO:0004674 (protein serine/threonine kinase) has over 400 Reactome xrefs.
        # remove_overused_xrefs assumes that we want to remove pairs where the second element is overused,
        # but in this case it's the first, so we use the bothways option.
        newpairs = remove_overused_xrefs(pairs, bothways=True)
        glom(dicts, newpairs, unique_prefixes=[GO])
    typed_sets = create_typed_sets(set([frozenset(x) for x in dicts.values()]), types)
    for biotype, sets in typed_sets.items():
        baretype = biotype.split(':')[-1]
        write_compendium(sets, f'{baretype}.txt', biotype, {})
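# The comment above pins down the intended semantics of `remove_overused_xrefs`
# (defined elsewhere): an identifier appearing in hundreds of concords is probably a
# subclass hub rather than an exact match, so pairs touching it should be dropped.
# A sketch under that reading; the ordered-pair input and the threshold value are
# assumptions, not the real function's contract.
from collections import Counter

def remove_overused_xrefs_sketch(pairs, bothways=False, max_uses=10):
    first_counts = Counter(p[0] for p in pairs)
    second_counts = Counter(p[1] for p in pairs)
    kept = []
    for a, b in pairs:
        # Always drop pairs whose xref side is overused; with bothways=True,
        # an overused subject side disqualifies the pair as well.
        if second_counts[b] > max_uses:
            continue
        if bothways and first_counts[a] > max_uses:
            continue
        kept.append((a, b))
    return kept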
def load_diseases_and_phenotypes(concords, idlists, badhpos, badhpoxrefs):
    print('get and write hp sets')
    bad_mappings = read_bad_hp_mappings(badhpos)
    more_bad_mappings = read_badxrefs(badhpoxrefs)
    for h, m in more_bad_mappings.items():
        bad_mappings[h].update(m)
    hpo_sets, labels = build_sets('HP:0000118', ignore_list=['ICD', 'NCIT'], bad_mappings=bad_mappings)
    hpo_sets = filter_out_non_unique_ids(hpo_sets)
    dump_sets(hpo_sets, 'hpo_sets.txt')
    print('get and write mondo sets')
    #MONDO has disease, and its sister term, disease susceptibility. I'm putting both in disease.
    #But this is a problem right now, because there are some things that go in both, and they are getting filtered out.
    bad_mondo_mappings = read_badxrefs('mondo')
    mondo_sets_1, labels_1 = build_exact_sets('MONDO:0000001', bad_mondo_mappings)
    mondo_sets_2, labels_2 = build_exact_sets('MONDO:0042489', bad_mondo_mappings)
    mondo_close = get_close_matches('MONDO:0000001')
    mondo_close2 = get_close_matches('MONDO:0042489')
    for k, v in mondo_close2.items():
        mondo_close[k] = v
    dump_sets(mondo_sets_1, 'mondo1.txt')
    dump_sets(mondo_sets_2, 'mondo2.txt')
    labels.update(labels_1)
    labels.update(labels_2)
    #If we just add these together, then any mondo in both lists will get filtered out in the next step,
    # so we need to put them into a set. You can't put sets directly into a set; you have to freeze them first.
    mondo_sets = combine_id_sets(mondo_sets_1, mondo_sets_2)
    mondo_sets = filter_out_non_unique_ids(mondo_sets)
    dump_sets(mondo_sets, 'mondo_sets.txt')
    print('get and write umls sets')
    bad_umls = read_badxrefs('umls')
    meddra_umls, secondary_meddra_umls = read_meddra(bad_umls)
    meddra_umls = filter_umls(meddra_umls, mondo_sets + hpo_sets, 'filtered.txt')
    secondary_meddra_umls = filter_umls(secondary_meddra_umls, mondo_sets + hpo_sets, 'filtered_secondary.txt')
    #If we just use all the secondary links, things get too agglomerated,
    # so filter these again.
    meddra_umls += filter_secondaries(secondary_meddra_umls, 'double_filter.txt')
    dump_sets(meddra_umls, 'meddra_umls_sets.txt')
    dicts = {}
    #EFO has 3 parts that we want here:
    #disease
    efo_sets_1, l = build_exact_sets('EFO:0000408')
    labels.update(l)
    #phenotype
    efo_sets_2, l = build_exact_sets('EFO:0000651')
    labels.update(l)
    #measurement
    efo_sets_3, l = build_exact_sets('EFO:0001444')
    labels.update(l)
    efo_sets_a = combine_id_sets(efo_sets_1, efo_sets_2)
    efo_sets = combine_id_sets(efo_sets_a, efo_sets_3)
    efo_sets = filter_out_non_unique_ids(efo_sets)
    dump_sets(efo_sets, 'efo_sets.txt')
    print('put it all together')
    print('mondo')
    glom(dicts, mondo_sets, unique_prefixes=['MONDO'])
    dump_dicts(dicts, 'mondo_dicts.txt')
    print('hpo')
    glom(dicts, hpo_sets, unique_prefixes=['MONDO'], pref='HP')
    dump_dicts(dicts, 'mondo_hpo_dicts.txt')
    print('umls')
    glom(dicts, meddra_umls, unique_prefixes=['MONDO', 'HP'], pref='UMLS', close={'MONDO': mondo_close})
    dump_dicts(dicts, 'mondo_hpo_meddra_dicts.txt')
    print('efo')
    glom(dicts, efo_sets, unique_prefixes=['MONDO', 'HP'], pref='EFO')
    dump_dicts(dicts, 'mondo_hpo_meddra_efo_dicts.txt')
    print('dump it')
    fs = set([frozenset(x) for x in dicts.values()])
    diseases, phenotypes = create_typed_sets(fs)
    write_compendium(diseases, 'disease.txt', 'biolink:Disease', labels)
    write_compendium(phenotypes, 'phenotypes.txt', 'biolink:PhenotypicFeature', labels)
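# `combine_id_sets` is defined elsewhere; the comment above explains the trick it has
# to perform: dedupe via frozensets so a MONDO term appearing in both input lists is
# not misread downstream as a non-unique identifier. A minimal sketch of that
# behavior; the name and list-of-sets return type are assumptions.
def combine_id_sets_sketch(sets_a, sets_b):
    # Sets are unhashable, so freeze them to dedupe, then thaw back to sets.
    frozen = set(frozenset(s) for s in sets_a) | set(frozenset(s) for s in sets_b)
    return [set(s) for s in frozen]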