Example #1
def build_compendia(concordances, identifiers):
    """:concordances: a list of files from which to read relationships
       :identifiers: a list of files from which to read identifiers and optional categories"""
    dicts = {}
    types = {}
    for ifile in identifiers:
        print(ifile)
        new_identifiers, new_types = read_identifier_file(ifile)
        glom(dicts, new_identifiers, unique_prefixes=[UBERON, GO])
        types.update(new_types)
    for infile in concordances:
        print('loading', infile)
        pairs = []
        with open(infile, 'r') as inf:
            for line in inf:
                x = line.strip().split('\t')
                pairs.append(set([x[0], x[2]]))
        newpairs = remove_overused_xrefs(pairs)
        glom(dicts, newpairs, unique_prefixes=[UBERON, GO])
    typed_sets = create_typed_sets(set([frozenset(x) for x in dicts.values()]),
                                   types)
    for biotype, sets in typed_sets.items():
        baretype = biotype.split(':')[-1]
        write_compendium(sets, f'{baretype}.txt', biotype, {})
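Here create_typed_sets is consumed as a mapping from a Biolink type (e.g. 'biolink:Disease') to the merged identifier sets of that type. A minimal sketch consistent with that usage follows; the body, the name create_typed_sets_sketch, and the rule for typing a set from its members are assumptions, not the project's implementation.

from collections import defaultdict

def create_typed_sets_sketch(eqsets, types):
    # Bucket each merged frozenset under the type recorded for its members.
    typed = defaultdict(set)
    for eqset in eqsets:
        member_types = {types[i] for i in eqset if i in types}
        if len(member_types) == 1:
            # Unambiguously typed; mixed or untyped sets would need a
            # resolution policy this sketch doesn't guess at.
            typed[member_types.pop()].add(eqset)
    return typed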
Example #2
def test_uberon():
    uberon = [('UBERON:123',)]
    d = {}  # avoid shadowing the built-in dict
    glom(d, uberon, unique_prefixes=['UBERON'])
    uber2 = [set(['UBERON:123', 'SOME:other'])]
    glom(d, uber2, unique_prefixes=['UBERON'])
    print(d)
Example #3
def build_protein_compendia(concordances, identifiers):
    """:concordances: a list of files from which to read relationships
       :identifiers: a list of files from which to read identifiers and optional categories"""
    dicts = {}
    types = {}
    uniques = [UNIPROTKB, PR]
    for ifile in identifiers:
        print(ifile)
        new_identifiers, new_types = read_identifier_file(ifile)
        glom(dicts, new_identifiers, unique_prefixes=uniques)
        types.update(new_types)
    for infile in concordances:
        print('loading', infile)
        pairs = []
        with open(infile, 'r') as inf:
            for line in inf:
                x = line.strip().split('\t')
                pairs.append(set([x[0], x[2]]))
        glom(dicts, pairs, unique_prefixes=uniques)
    protein_sets = set([frozenset(x) for x in dicts.values()])
    #Try to preserve some memory here.
    dicts.clear()
    baretype = PROTEIN.split(':')[-1]
    write_compendium(protein_sets, f'{baretype}.txt', PROTEIN, {})
Example #4
def test_simple():
    """Given 3 sets, 2 of which share a member, output 2 sets, with the sharing sets combined"""
    d = {}
    eqs = [('1', '2'), ('2', '3'), ('4', '5')]
    glom(d, eqs)
    assert len(d) == 5
    assert d['1'] == d['2'] == d['3'] == {'1', '2', '3'}
    assert d['4'] == d['5'] == {'4', '5'}
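test_simple pins down glom's core contract: after merging, every identifier maps to the same shared set as its equivalents. A minimal sketch of that merging behavior, ignoring the unique_prefixes, pref, and close options used elsewhere (glom_core is a hypothetical name, not the project's implementation):

def glom_core(d, eqsets):
    # Fold each equivalence group into d, which maps every identifier
    # to the merged set that contains it.
    for group in eqsets:
        merged = set(group)
        # Absorb any sets already recorded for these identifiers.
        for ident in group:
            if ident in d:
                merged |= d[ident]
        # Point every member at the same merged set, so that
        # d['1'] == d['2'] == d['3'] holds, as the tests assert.
        for ident in merged:
            d[ident] = merged

Running test_simple with glom_core in place of glom satisfies all three assertions; the real glom presumably adds the prefix-uniqueness checks the other examples rely on.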
Example #5
def test_sets():
    """Test using set() as opposed to {}"""
    d = {}
    eqs = [{'1', '2'}, set(['2', '3']), set(['4', '5']), set(['6', '7'])]
    oeqs = [{'5', '7'}]
    glom(d, eqs)
    glom(d, oeqs)
    assert d['1'] == d['2'] == d['3'] == {'1', '2', '3'}
    assert d['4'] == d['5'] == d['6'] == d['7'] == {'4', '5', '6', '7'}
Example #6
def test_two_calls():
    """Test using glom iteratively. The first call joins the first two sets, then the second call joins
    the next two and the new set."""
    d = {}
    eqs = [('1', '2'), ('2', '3'), ('4', '5'), ('6', '7')]
    oeqs = [('5', '7')]
    glom(d, eqs)
    glom(d, oeqs)
    assert d['1'] == d['2'] == d['3'] == {'1', '2', '3'}
    assert d['4'] == d['5'] == d['6'] == d['7'] == {'4', '5', '6', '7'}
Example #7
def test_bigger_sets():
    """Test when the sets have more than two members"""
    d = {}
    eqs = [{'1', '2', '3'}, {'4', '5', '6'}]
    glom(d, eqs)
    assert d['1'] == d['2'] == d['3'] == {'1', '2', '3'}
    assert d['4'] == d['5'] == d['6'] == {'4', '5', '6'}
    eqs = [{'3', '4', '6', '7'}]
    glom(d, eqs)
    assert (d['1'] == d['2'] == d['3'] == d['4'] == d['5'] == d['6'] == d['7']
            == {'1', '2', '3', '4', '5', '6', '7'})
Example #8
def build_compendia(identifiers):
    """:identifiers: a list of files from which to read identifiers and optional categories"""
    dicts = {}
    types = {}
    uniques = []
    for ifile in identifiers:
        print('loading', ifile)
        new_identifiers, new_types = read_identifier_file(ifile)
        glom(dicts, new_identifiers, unique_prefixes=uniques)
        types.update(new_types)
    genefam_sets = set([frozenset(x) for x in dicts.values()])
    baretype = GENE_FAMILY.split(':')[-1]
    write_compendium(genefam_sets, f'{baretype}.txt', GENE_FAMILY, {})
Example #9
def build_compendium(concordances, identifiers, mondoclose, badxrefs):
    """:concordances: a list of files from which to read relationships
       :identifiers: a list of files from which to read identifiers and optional categories"""
    dicts = {}
    types = {}
    for ifile in identifiers:
        print(ifile)
        new_identifiers, new_types = read_identifier_file(ifile)
        glom(dicts, new_identifiers, unique_prefixes=[MONDO, HP])
        types.update(new_types)
    #Load close Mondos
    with open(mondoclose, 'r') as inf:
        close_mondos = defaultdict(set)
        for line in inf:
            x = tuple(line.strip().split('\t'))
            close_mondos[x[0]].add(x[1])
    #Load and glom concords
    for infile in concordances:
        print(infile)
        pairs = []
        pref = path.basename(infile)
        if pref in badxrefs:
            print('reading bad xrefs', pref)
            bad_pairs = read_badxrefs(badxrefs[pref])
        else:
            print('no bad pairs', pref)
            bad_pairs = set()
        with open(infile, 'r') as inf:
            for line in inf:
                stuff = line.strip().split('\t')
                # Guard against malformed lines before indexing column 2.
                if len(stuff) < 3:
                    print(stuff)
                    exit()
                x = (stuff[0], stuff[2])
                if x not in bad_pairs:
                    pairs.append(x)
        if pref in ['MONDO', 'HP', 'EFO']:
            newpairs = remove_overused_xrefs(pairs)
        else:
            newpairs = pairs
        glom(dicts, newpairs, unique_prefixes=[MONDO, HP], close={MONDO: close_mondos})
        try:
            print(dicts['OMIM:607644'])
        except KeyError:
            print('not yet')
    typed_sets = create_typed_sets(set([frozenset(x) for x in dicts.values()]), types)
    for biotype, sets in typed_sets.items():
        baretype = biotype.split(':')[-1]
        write_compendium(sets, f'{baretype}.txt', biotype, {})
Example #10
def combine_unichem(concordances, output):
    dicts = {}
    for infile in concordances:
        print('loading', infile)
        pairs = []
        with open(infile, 'r') as inf:
            for line in inf:
                x = line.strip().split('\t')
                pairs.append(set([x[0], x[2]]))
        newpairs = remove_overused_xrefs(pairs)
        glom(dicts, newpairs, unique_prefixes=[INCHIKEY])
    chem_sets = set([frozenset(x) for x in dicts.values()])
    with jsonlines.open(output, mode='w') as writer:
        for chemset in chem_sets:
            writer.write(list(chemset))
Example #11
def build_gene_compendia(concordances, identifiers):
    """:concordances: a list of files from which to read relationships
       :identifiers: a list of files from which to read identifiers and optional categories"""
    dicts = {}
    types = {}
    uniques = [NCBIGENE, HGNC, ENSEMBL, OMIM]
    for ifile in identifiers:
        print('loading', ifile)
        new_identifiers, new_types = read_identifier_file(ifile)
        glom(dicts, new_identifiers, unique_prefixes=uniques)
        types.update(new_types)
    for infile in concordances:
        print('loading', infile)
        pairs = []
        with open(infile, 'r') as inf:
            for line in inf:
                x = line.strip().split('\t')
                pairs.append(set([x[0], x[2]]))
        glom(dicts, pairs, unique_prefixes=uniques)
    gene_sets = set([frozenset(x) for x in dicts.values()])
    baretype = GENE.split(':')[-1]
    write_compendium(gene_sets, f'{baretype}.txt', GENE, {})
Example #12
def build_compendia(concordances, identifiers, unichem_partial):
    """:concordances: a list of files from which to read relationships
       :identifiers: a list of files from which to read identifiers and optional categories"""
    dicts = read_partial_unichem(unichem_partial)
    types = {}
    for ifile in identifiers:
        print(ifile)
        new_identifiers, new_types = read_identifier_file(ifile)
        glom(dicts, new_identifiers, unique_prefixes=[INCHIKEY])
        types.update(new_types)
    for infile in concordances:
        print('loading', infile)
        pairs = []
        with open(infile, 'r') as inf:
            for line in inf:
                x = line.strip().split('\t')
                pairs.append(set([x[0], x[2]]))
        newpairs = remove_overused_xrefs(pairs)
        glom(dicts, newpairs, unique_prefixes=[INCHIKEY])
    chem_sets = set([frozenset(x) for x in dicts.values()])
    baretype = CHEMICAL_SUBSTANCE.split(':')[-1]
    write_compendium(chem_sets, f'{baretype}.txt', CHEMICAL_SUBSTANCE, {})
Example #13
def build_compendia(concordances, identifiers):
    """:concordances: a list of files from which to read relationships
       :identifiers: a list of files from which to read identifiers and optional categories"""
    #These are concords that cause problems and are being special-cased out.  In disease/process we put these in some
    # files, and maybe we should here too?
    #GO:0034227/EC:2.8.1.4 is excluded because that GO term is a biological process, but EC is not a valid prefix for
    #  one, leading to a loss of the EC term (and a unified RHEA) on output.
    bad_concords = set([frozenset(['GO:0034227', 'EC:2.8.1.4'])])
    dicts = {}
    types = {}
    for ifile in identifiers:
        print(ifile)
        new_identifiers, new_types = read_identifier_file(ifile)
        glom(dicts, new_identifiers, unique_prefixes=[GO])
        types.update(new_types)
    for infile in concordances:
        print(infile)
        print('loading', infile)
        pairs = []
        with open(infile, 'r') as inf:
            for line in inf:
                x = line.strip().split('\t')
                pair = frozenset([x[0], x[2]])
                if pair not in bad_concords:
                    pairs.append(pair)
        #One kind of error is that GO->Reactome xrefs are frequently more like subclass relations. So
        # GO:0004674 (protein serine/threonine kinase) has over 400 Reactome xrefs.
        # remove_overused_xrefs assumes that we want to remove pairs where the second member of the pair is
        # overused, but in this case it's the first, so we use the bothways option.
        newpairs = remove_overused_xrefs(pairs, bothways=True)
        glom(dicts, newpairs, unique_prefixes=[GO])
    typed_sets = create_typed_sets(set([frozenset(x) for x in dicts.values()]),
                                   types)
    for biotype, sets in typed_sets.items():
        baretype = biotype.split(':')[-1]
        write_compendium(sets, f'{baretype}.txt', biotype, {})
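The comment in this example describes remove_overused_xrefs and its bothways option. A rough sketch of that filtering, assuming each pair is an ordered (term, xref) 2-tuple and that 'overused' means appearing in more than one pair; both the threshold and the body are assumptions, and note that some callers above pass sets or frozensets, whose member order is arbitrary:

from collections import Counter

def remove_overused_xrefs_sketch(pairs, bothways=False):
    # Count how often each identifier appears on each side of a pair.
    firsts = Counter(a for a, _ in pairs)
    seconds = Counter(b for _, b in pairs)
    kept = []
    for a, b in pairs:
        if seconds[b] > 1:
            continue  # the xref side is claimed by many terms; drop it
        if bothways and firsts[a] > 1:
            continue  # with bothways=True, also drop overused left-hand terms
        kept.append((a, b))
    return kept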
Example #14
def load_diseases_and_phenotypes(concords, idlists, badhpos, badhpoxrefs):
    print('disease/phenotype')
    print('get and write hp sets')
    bad_mappings = read_bad_hp_mappings(badhpos)
    more_bad_mappings = read_badxrefs(badhpoxrefs)
    for h, m in more_bad_mappings.items():
        bad_mappings[h].update(m)
    hpo_sets, labels = build_sets('HP:0000118', ignore_list=['ICD', 'NCIT'], bad_mappings=bad_mappings)
    print('filter')
    hpo_sets = filter_out_non_unique_ids(hpo_sets)
    dump_sets(hpo_sets, 'hpo_sets.txt')
    print('get and write mondo sets')
    #MONDO has disease, and its sister, disease susceptibility.  I'm putting both in disease.
    #But this is a problem right now, because there are some things that go in both, and they are getting filtered out.
    bad_mondo_mappings = read_badxrefs('mondo')
    mondo_sets_1, labels_1 = build_exact_sets('MONDO:0000001', bad_mondo_mappings)
    mondo_sets_2, labels_2 = build_exact_sets('MONDO:0042489', bad_mondo_mappings)
    mondo_close = get_close_matches('MONDO:0000001')
    mondo_close2 = get_close_matches('MONDO:0042489')
    for k, v in mondo_close2.items():
        mondo_close[k] = v
    dump_sets(mondo_sets_1, 'mondo1.txt')
    dump_sets(mondo_sets_2, 'mondo2.txt')
    labels.update(labels_1)
    labels.update(labels_2)
    #If we just add these together, then any MONDO in both lists will get filtered out in the next step,
    # so we need to put them into a set.  You can't put sets directly into a set; you have to freeze them first.
    mondo_sets = combine_id_sets(mondo_sets_1, mondo_sets_2)
    mondo_sets = filter_out_non_unique_ids(mondo_sets)
    dump_sets(mondo_sets, 'mondo_sets.txt')
    print('get and write umls sets')
    bad_umls = read_badxrefs('umls')
    meddra_umls, secondary_meddra_umls = read_meddra(bad_umls)
    meddra_umls = filter_umls(meddra_umls, mondo_sets + hpo_sets, 'filtered.txt')
    secondary_meddra_umls = filter_umls(secondary_meddra_umls, mondo_sets + hpo_sets, 'filtered_secondary.txt')
    #Now, if we just use all the secondary links, things get too agglomerated,
    # so instead let's filter these again.
    meddra_umls += filter_secondaries(secondary_meddra_umls, 'double_filter.txt')
    dump_sets(meddra_umls, 'meddra_umls_sets.txt')
    dicts = {}
    #EFO has 3 parts that we want here:
    # Disease
    efo_sets_1, l = build_exact_sets('EFO:0000408')
    labels.update(l)
    #phenotype
    efo_sets_2, l = build_exact_sets('EFO:0000651')
    labels.update(l)
    #measurement
    efo_sets_3, l = build_exact_sets('EFO:0001444')
    labels.update(l)
    efo_sets_a = combine_id_sets(efo_sets_1, efo_sets_2)
    efo_sets = combine_id_sets(efo_sets_a, efo_sets_3)
    efo_sets = filter_out_non_unique_ids(efo_sets)
    dump_sets(efo_sets, 'efo_sets.txt')
    print('put it all together')
    print('mondo')
    glom(dicts, mondo_sets, unique_prefixes=['MONDO'])
    dump_dicts(dicts, 'mondo_dicts.txt')
    print('hpo')
    glom(dicts, hpo_sets, unique_prefixes=['MONDO'], pref='HP')
    dump_dicts(dicts, 'mondo_hpo_dicts.txt')
    print('umls')
    glom(dicts, meddra_umls, unique_prefixes=['MONDO', 'HP'], pref='UMLS', close={'MONDO': mondo_close})
    dump_dicts(dicts, 'mondo_hpo_meddra_dicts.txt')
    print('efo')
    glom(dicts, efo_sets, unique_prefixes=['MONDO', 'HP'], pref='EFO')
    dump_dicts(dicts, 'mondo_hpo_meddra_efo_dicts.txt')
    print('dump it')
    fs = set([frozenset(x) for x in dicts.values()])
    diseases, phenotypes = create_typed_sets(fs)
    write_compendium(diseases, 'disease.txt', 'biolink:Disease', labels)
    write_compendium(phenotypes, 'phenotypes.txt', 'biolink:PhenotypicFeature', labels)
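The comment above about freezing sets before deduplicating suggests what combine_id_sets does. The following is a guess at that helper under that assumption, not the project's code:

def combine_id_sets_sketch(sets_a, sets_b):
    # Plain sets aren't hashable, so freeze each group before putting it
    # into a set, which deduplicates groups appearing in both inputs.
    frozen = {frozenset(s) for s in list(sets_a) + list(sets_b)}
    return [set(fs) for fs in frozen]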