Пример #1
0
def test_missingsym():
    """Check a GAF whose annotation lines lack the required DB_Symbol text."""
    # Reduced copy of the original gaf file (mgi.gaf)
    gaf_path = "tests/data/gaf_missingsym.mgi"
    reader = GafReader(gaf_path, hdr_only=False)
    # Gene products missing the required DB_Symbol are expected to be ignored,
    # so the association check must come back falsy.
    chk_result = reader.chk_associations('gaf_missingsym.err')
    assert not chk_result
Пример #2
0
def read_gaf(fin_gaf, prt=sys.stdout, **kws):
    """Read Gene Association File (GAF). Return data.

    Args:
        fin_gaf: GAF file name, handed to GafReader.
        prt: Stream handed to GafReader (defaults to sys.stdout).
        **kws: Optional keywords. 'hdr_only' (default None) is given to
            GafReader positionally; the remaining keywords are forwarded to
            the GafReader constructor, and every keyword (including
            'hdr_only') is forwarded to GafReader.read_gaf.

    Returns:
        The value returned by GafReader.read_gaf(**kws).
    """
    hdr_only = kws.get('hdr_only', None)
    # 'hdr_only' is already passed positionally below. Leaving it in the
    # constructor keywords as well would supply the same argument twice
    # (TypeError: multiple values), so strip it from that one call only.
    kws_reader = {key: val for key, val in kws.items() if key != 'hdr_only'}
    gafobj = GafReader(fin_gaf, hdr_only, prt, **kws_reader)
    return gafobj.read_gaf(**kws)
Пример #3
0
def test_gaf_illegal(prt=sys.stdout):
    """Check that illegal GAF lines seen in the field are found and reported."""
    gaf_path = os.path.join(REPO, 'data/gaf/goa_human_illegal.gaf')
    reader = GafReader(gaf_path, hdr_only=False, prt=prt)
    # The association check must come back falsy for this file
    chk_result = reader.chk_associations('goa_human_illegal.err')
    assert not chk_result
Пример #4
0
def test_gaf_illegal(prt=sys.stdout):
    """Exercise the association check on a GAF holding illegal lines from the field."""
    reader = GafReader(
        os.path.join(REPO, 'data/gaf/goa_human_illegal.gaf'),
        hdr_only=False,
        prt=prt,
    )
    # A falsy result from the check is the passing condition
    has_issue = reader.chk_associations('goa_human_illegal.err')
    assert not has_issue
Пример #5
0
def test_gaf_illegal(prt=sys.stdout):
    """Read a GAF holding illegal lines; check the ID, ignored and illegal-line counts."""
    exp_num_ids = 15
    gaf_path = os.path.join(REPO, 'data/gaf/goa_human_illegal.gaf')
    reader = GafReader(gaf_path, hdr_only=False, prt=prt)

    # Read associations
    id2gos = reader.read_gaf()
    act_num_ids = len(id2gos)
    assert act_num_ids == exp_num_ids, "IDS FOUND: EXP({E}) ACT({A})".format(
        E=exp_num_ids, A=act_num_ids)

    # Exactly one ignored entry and one line filed under 'ILLEGAL TAXON'
    datobj = reader.datobj
    assert len(datobj.ignored) == 1
    assert len(datobj.illegal_lines['ILLEGAL TAXON']) == 1
Пример #6
0
def test_semantic_similarity(usr_assc=None):
    """Print GO term-count information for each annotation file.

    Every association file in ASSOCIATIONS (minus the two uniprot "all" files)
    is processed unless ``usr_assc`` names a single file to use instead.
    Files that are not present under REPO are downloaded first.
    """
    excluded = {'goa_uniprot_all.gaf', 'goa_uniprot_all_noiea.gaf'}
    default_asscs = sorted(ASSOCIATIONS.difference(excluded))
    go2obj = get_go2obj()
    # Annotation files come from http://current.geneontology.org/annotations/
    associations = default_asscs if usr_assc is None else [usr_assc]
    not_found = set()
    errs = []  # never filled in this function; the report below stays dormant
    for assc_name in associations:
        tic = timeit.default_timer()
        fin_gaf = os.path.join(REPO, assc_name)
        if not os.path.exists(fin_gaf):
            dnld_annotation(fin_gaf)
        assc_gene2gos = GafReader(fin_gaf).get_id2gos('all')
        # Remember files that produced no associations and move on
        if not assc_gene2gos:
            not_found.add(assc_name)
            continue

        # Initialize the counts of each GO term
        tcntobj = TermCounts(go2obj, assc_gene2gos)
        go_cnt = tcntobj.gocnts.most_common()
        if go_cnt:
            print("{ASSC}".format(ASSC=assc_name))
            print(tcntobj.aspect_counts)
            # Count of the most common GO term
            gocnt_max = go_cnt[0][1]
            for cnt_val in (None, gocnt_max / 2.0, gocnt_max / 10.0):
                prt_info(tcntobj, go_cnt, cnt_val)
        # Elapsed time overall (TIC) and for this file (tic)
        print("{HMS} {hms} {ASSC}\n".format(ASSC=assc_name,
                                            HMS=_hms(TIC),
                                            hms=_hms(tic)))
    print('{HMS} {N} Associations'.format(HMS=_hms(TIC), N=len(associations)))
    if not_found:
        _prt_not_found(not_found)
    if errs:
        fout_err = 'namespace_errors.txt'
        with open(fout_err, 'w') as prt:
            prt.writelines(errs)
            print('  {N} ERRORS WROTE: {TXT}'.format(N=len(errs),
                                                     TXT=fout_err))
Пример #7
0
def test_i148b_semsim_lin(do_plt=False):
    """Test for issue 148, Lin Similarity if a term has no annotations.

    Resnik and Lin similarities are computed and printed for every pair of
    GO IDs in the DAG (pairs drawn with replacement); the Lin values are
    then handed to _chk_lin.

    Args:
        do_plt: When true, call _do_plt with the term counts and the DAG.
    """
    fin_gaf = os.path.join(REPO, 'tests/data/yangRWC/fig2a_nonleaf0.gaf')
    godag = GODag(os.path.join(REPO, "tests/data/yangRWC/fig2a.obo"))
    annoobj = GafReader(fin_gaf, godag=godag)

    associations = annoobj.get_id2gos('CC')
    tcntobj = TermCounts(godag, associations)

    if do_plt:
        _do_plt(tcntobj, godag)

    goids = list(godag.keys())

    # Calculate Resnik values; frozenset keys make (a, b) and (b, a) one entry
    p2r = {
        frozenset([a, b]): resnik_sim(a, b, godag, tcntobj)
        for a, b in combo_w_rplc(goids, 2)
    }
    _prt_values('Resnik', goids, p2r)

    # Calculate Lin values
    p2l = {
        frozenset([a, b]): lin_sim(a, b, godag, tcntobj)
        for a, b in combo_w_rplc(goids, 2)
    }
    _prt_values('Lin', goids, p2l)
    _chk_lin(p2l)
Пример #8
0
def read_gaf(fin_gaf, prt=sys.stdout, **kws):
    """Read Gene Association File (GAF). Return simple associations.

    Keywords read from ``kws``: taxid2asscs, go2geneids, evidence_set,
    keep_ND, keep_NOT and hdr_only.

    Returns a defaultdict(set) mapping gene ID to GO IDs, or GO ID to gene
    IDs when ``go2geneids`` is true.
    """
    # Keywords for choosing which GO IDs to keep
    taxid2asscs = kws.get('taxid2asscs', None)
    want_go2geneids = kws.get('go2geneids', False)
    evidence_codes = kws.get('evidence_set', None)
    pass_nd = get_nd(kws.get('keep_ND', False))
    pass_not = get_not(kws.get('keep_NOT', False))
    # Keyword for what is read from the GAF; None reads all data
    hdr_only = kws.get('hdr_only', None)
    from goatools.anno.gaf_reader import GafReader
    assc = defaultdict(set)
    reader = GafReader(fin_gaf, hdr_only, prt)
    for ntgaf in reader.associations:
        # Both the keep_ND and keep_NOT predicates must accept the annotation
        if not (pass_nd(ntgaf) and pass_not(ntgaf)):
            continue
        # Optionally restrict to a subset of evidence codes
        if evidence_codes is not None and ntgaf.Evidence_Code not in evidence_codes:
            continue
        taxid = ntgaf.Taxon[0]
        geneid, go_id = ntgaf.DB_ID, ntgaf.GO_ID
        if want_go2geneids:
            assc[go_id].add(geneid)
        else:
            assc[geneid].add(go_id)
        # Optional detailed associations, split by taxid, in both directions
        if taxid2asscs is not None:
            by_taxid = taxid2asscs[taxid]
            by_taxid['ID2GOs'][geneid].add(go_id)
            by_taxid['GO2IDs'][go_id].add(geneid)
    return assc
Пример #9
0
def test_semantic_similarity(usr_assc=None):
    """Compare each annotation's namespace with the namespace of its GO term.

    Every association file in ASSOCIATIONS (minus the two uniprot "all" files)
    is read unless ``usr_assc`` names a single file to use instead. Annotations
    whose namespace disagrees with the GO term are collected per file; GO IDs
    absent from the GO objects are collected separately.
    """
    excluded = {'goa_uniprot_all.gaf', 'goa_uniprot_all_noiea.gaf'}
    default_names = sorted(ASSOCIATIONS.difference(excluded))
    go2obj = get_go2obj()
    # Annotation files come from http://current.geneontology.org/annotations/
    assc_names = default_names if usr_assc is None else [usr_assc]
    goids_not_found = set()
    gaf2errs = cx.defaultdict(list)
    for assc_name in assc_names:
        tic = timeit.default_timer()  # start time for this file (not reported)
        fin_gaf = os.path.join(REPO, assc_name)
        # Fetch the annotation file when it is not available locally
        if not os.path.exists(fin_gaf):
            dnld_annotation(fin_gaf)
        for nta in GafReader(fin_gaf).associations:
            if nta.GO_ID not in go2obj:
                goids_not_found.add(nta.GO_ID)
                continue
            goterm = go2obj[nta.GO_ID]
            if NS2NAMESPACE.get(nta.NS) != goterm.namespace:
                gaf2errs[assc_name].append(nta)
    print('{HMS} {N} Associations'.format(HMS=_hms(TIC), N=len(assc_names)))
    if goids_not_found:
        _prt_not_found(goids_not_found)
    if gaf2errs:
        _wr_errs('namespace_errors.txt', gaf2errs, go2obj)
Пример #10
0
def get_objanno(fin_anno, anno_type=None, **kws):
    """Read annotations in GAF, GPAD, Entrez gene2go, or text format.

    The annotation type is resolved with get_anno_desc, the matching reader
    class is chosen, and only the keywords named in that class's ``exp_kws``
    are passed on to it.

    Raises:
        RuntimeError: the resolved annotation type has no reader.
    """
    # kws get_objanno: taxids hdr_only prt allow_missing_symbol
    anno_type = get_anno_desc(fin_anno, anno_type)
    type2reader = {
        'gene2go': Gene2GoReader,  # kws: taxid taxids
        'gaf': GafReader,
        'gpad': GpadReader,
        'id2gos': IdToGosReader,
    }
    reader_cls = type2reader.get(anno_type)
    if reader_cls is None:
        raise RuntimeError('UNEXPECTED ANNOTATION FILE FORMAT: {F} {D}'.format(
            F=fin_anno, D=anno_type))
    # Keep only the keywords this reader expects
    kws_reader = {
        key: kws[key]
        for key in reader_cls.exp_kws.intersection(kws.keys())
    }
    return reader_cls(fin_anno, **kws_reader)
Пример #11
0
def read_gaf(fin_gaf, prt=sys.stdout, hdr_only=False, namespace='BP',
             allow_missing_symbol=False, **kws):
    """Read Gene Association File (GAF). Return data.

    A GafReader is built from the named arguments plus the optional 'godag'
    keyword; the result of its get_id2gos(namespace, **kws) is returned.
    """
    reader = GafReader(
        fin_gaf,
        hdr_only=hdr_only,
        prt=prt,
        allow_missing_symbol=allow_missing_symbol,
        godag=kws.get('godag'),
    )
    return reader.get_id2gos(namespace, **kws)
Пример #12
0
def test_i195():
    """Investigate GAF reading error on saccharomyces"""
    # Download both SGD annotation files
    gaf_go = join(REPO, 'sgd.gaf')
    url_go = 'http://current.geneontology.org/annotations/sgd.gaf.gz'
    _dnld_gaf(gaf_go, url_go)

    gaf_yeast = join(REPO, 'gene_association.sgd.gaf')
    url_yeast = ('http://downloads.yeastgenome.org/curation/literature/'
                 'gene_association.sgd.gaf.gz')
    _dnld_gaf(gaf_yeast, url_yeast)

    # Both names are announced, but only the second file is actually read
    msg = 'READING: {GAF}'
    print(msg.format(GAF=basename(gaf_go)))
    print(msg.format(GAF=basename(gaf_yeast)))
    _objanno_yeast = GafReader(gaf_yeast)
Пример #13
0
def my_read_gaf(fin_gaf, prt=sys.stdout, before_date=None, **kws):
    """Read Gene Association File (GAF).

    # Arguments
        fin_gaf: GAF file name, handed to GafReader
        prt: stream handed to GafReader
        before_date: int (YYYYMMDD); annotations dated later than this are
            skipped. A falsy value (the default) disables the date filter.
        **kws: optional keywords read here: taxid2asscs, go2geneids,
            evidence_set, keep_ND, keep_NOT, hdr_only. All except hdr_only
            are also forwarded to GafReader.

    # Returns
        dict: maps gene IDs to the GO terms they are annotated with
            (GO IDs to gene IDs when go2geneids is true)
    """
    # keyword arguments for choosing which GO IDs to keep
    taxid2asscs = kws.get('taxid2asscs', None)
    b_geneid2gos = not kws.get('go2geneids', False)
    evs = kws.get('evidence_set', None)
    eval_nd = get_nd(kws.get('keep_ND', False))
    eval_not = get_not(kws.get('keep_NOT', False))
    # keyword argument for what is read from the GAF; None reads all data
    hdr_only = kws.get('hdr_only', None)
    # 'hdr_only' is passed positionally below; also leaving it in the
    # constructor keywords would supply the same argument twice.
    kws_reader = {key: val for key, val in kws.items() if key != 'hdr_only'}
    # Simple associations
    id2gos = defaultdict(set)
    gafobj = GafReader(fin_gaf, hdr_only, prt, **kws_reader)
    for ntgaf in gafobj.associations:
        if not (eval_nd(ntgaf) and eval_not(ntgaf)):
            continue
        # Optionally keep only a subset of GOs based on their evidence
        if evs is not None and ntgaf.Evidence_Code not in evs:
            continue
        # Date filter (addition to the GOATOOLS function). A leftover
        # debugging `return` used to sit here and made the filter unreachable.
        # NOTE(review): assumes ntgaf.Date converts with int() (YYYYMMDD) -- confirm
        if before_date and int(ntgaf.Date) > before_date:
            continue

        taxid = ntgaf.Taxon[0]
        geneid = ntgaf.DB_ID
        go_id = ntgaf.GO_ID
        if b_geneid2gos:
            id2gos[geneid].add(go_id)
        else:
            id2gos[go_id].add(geneid)
        # Optional detailed associations split by taxid, in both directions
        if taxid2asscs is not None:
            taxid2asscs[taxid]['ID2GOs'][geneid].add(go_id)
            taxid2asscs[taxid]['GO2IDs'][go_id].add(geneid)
    return id2gos  # return simple associations
Пример #14
0
def test_anno_read():
    """Test reading annotation file.

    Associations from the reader's get_id2gos_nss are compared with those
    from read_gaf for several keyword combinations.
    """
    fin_anno = os.path.join(REPO, 'goa_human.gaf')
    dnld_annofile(fin_anno, 'gaf')

    print('\nTEST STORING ONLY ONE SPECIES')
    obj = GafReader(fin_anno)
    obj.prt_summary_anno2ev()

    def _new_and_old(**kws):
        """Get associations both ways, print their differences, return both."""
        new = obj.get_id2gos_nss(**kws)
        old = read_gaf(obj, **kws)
        _prt_differences(new, old, obj)
        return new, old

    def _chk_same_len(new, old):
        """Require both association dicts to have the same number of keys."""
        assert len(new) == len(old), 'new({N}) != old({O})'.format(N=len(new), O=len(old))

    new, old = _new_and_old()
    print('{N} NEW'.format(N=len(new)))
    print('{N} OLD'.format(N=len(old)))
    _chk_same_len(new, old)

    print('\nTEST KWS: keep_ND and keep_NOT')
    # All four True/False combinations, keep_ND varying slowest
    kws_lst = [
        {'keep_ND': keep_nd, 'keep_NOT': keep_not}
        for keep_nd in (False, True)
        for keep_not in (False, True)
    ]
    for kws in kws_lst:
        print('\nTEST KWS:', kws)
        new, old = _new_and_old(**kws)
        _chk_same_len(new, old)

    print('\nTEST GETTING REVERSE ASSOCIATIONS: GO2GENES')
    new, old = _new_and_old(go2geneids=True)
    _chk_same_len(new, old)

    print('\nTEST RETURNING ASSOCIATIONS FOR SELECTED EVIDENCE CODES')
    evcodes = {'ISO', 'IKR'}
    print("\nTEST 9606 ev_include={CODES}".format(CODES=' '.join(evcodes)))
    new, old = _new_and_old(ev_include=evcodes)
    assert new == old
Пример #15
0
def main(planteome_dir, output_dir, file_prefix):
    """Collect gene names and synonyms from the .assoc (GAF) files in a directory.

    Writes two files into ``output_dir``:
      * ``{file_prefix}_keywords.json``: maps each readable file path to the
        list of its DB_Name and DB_Synonym values
      * ``{file_prefix}_problem_files.txt``: one path per line for every file
        that could not be parsed

    Args:
        planteome_dir: Directory searched (non-recursively) for .assoc files.
        output_dir: Directory that receives the two output files.
        file_prefix: Prefix for the output file names.
    """
    # Get names of GAF files
    print('\nGetting names of .assoc files...')
    gaf_files = [
        join(planteome_dir, fname)
        for fname in listdir(planteome_dir)
        if fname.endswith('.assoc')
    ]

    # Read in GAF files
    print('\nReading in files...')
    gaf_dicts = {}
    problem_files = []
    for path in tqdm(gaf_files):
        try:
            gaf = GafReader(path)
        # A bare `except:` also swallowed KeyboardInterrupt, so a Ctrl-C was
        # recorded as a bad file and the loop carried on.
        # NOTE(review): SystemExit stays caught in case the parser exits on
        # fatal errors instead of raising -- confirm against GafReader.
        except (Exception, SystemExit) as err:
            print(f'Exception raised in GAF parsing, skipping file {path}: {err!r}')
            problem_files.append(path)
        else:
            gaf_dicts[path] = gaf

    # Get keywords
    print('\nGetting keywords...')
    names = defaultdict(set)
    for path, gaf in tqdm(gaf_dicts.items()):
        for nt_anno in gaf.associations:
            # NOTE(review): update() expects DB_Name and DB_Synonym to be
            # collections of strings; a plain str would be split into
            # characters -- confirm against the reader.
            names[path].update(nt_anno.DB_Name)
            names[path].update(nt_anno.DB_Synonym)
    # Sets are not JSON serializable; convert to lists
    keywords = {path: list(vals) for path, vals in names.items()}

    # Write out files
    print('\nWriting output files...')
    with open(f'{output_dir}/{file_prefix}_keywords.json', 'w') as fout:
        json.dump(keywords, fout)
    with open(f'{output_dir}/{file_prefix}_problem_files.txt', 'w') as fout:
        fout.write('\n'.join(problem_files))

    print('\nDone!')
Пример #16
0
    network = 'thr'  # 'complete', 'backbone', 'threshold'
    threshold = 0.5
    layer = 'DM'
    threshold_str = str(threshold).replace('.', 'p')

    # GO Information
    dict_annotation_file = {
        'HS': 'goa_human.gaf',
        'MM': 'mgi.gaf',
        'DM': 'fb.gaf'
    }
    annotation = '../data/GeneOntology/' + dict_annotation_file[layer]
    ontology = '../data/GeneOntology/go-basic.obo'
    #
    godag = obo_parser.GODag(ontology)
    gaf = GafReader(name='GAF ' + layer, filename=annotation, godag=godag)
    # Dict of Associations
    ns2assoc = gaf.get_ns2assc()

    # Load Population of Genes (for background comparison)
    rFPKMFile = '../02-core_genes/results/FPKM/{layer:s}/{layer:s}-FPKM-{celltype:s}.csv.gz'.format(
        celltype=celltype, layer=layer)
    dfP = pd.read_csv(rFPKMFile, usecols=['id_gene', 'gene'])

    # Load PCA
    rPCAFile = 'results/pca/{celltype:s}/pca-{celltype:s}-{network:s}-{threshold:s}-{layer:s}-dim.csv.gz'.format(
        celltype=celltype,
        network=network,
        threshold=threshold_str,
        layer=layer)
    df_pca = pd.read_csv(rPCAFile, index_col=0, encoding='utf-8')
Пример #17
0
def get_gaf_hdr(fin_gaf):
    """Return the header (GAF version and data info) of a Gene Association File."""
    # hdr_only=True is requested because only the header is needed here
    reader = GafReader(fin_gaf, hdr_only=True)
    return reader.hdr
Пример #18
0
def test_anno_read():
    """Test reading annotation file.

    For several keyword combinations, the associations returned by the
    reader's get_id2gos_nss must match those returned by read_gaf.
    """
    len_msg = 'new({N}) != old({O})'
    fin_anno = os.path.join(REPO, 'goa_human.gaf')
    dnld_annofile(fin_anno, 'gaf')

    print('\nTEST STORING ONLY ONE SPECIES')
    obj = GafReader(fin_anno)
    obj.prt_summary_anno2ev()
    assc_new = obj.get_id2gos_nss()
    assc_old = read_gaf(obj)
    _prt_differences(assc_new, assc_old, obj)
    for label, assc in (('NEW', assc_new), ('OLD', assc_old)):
        print('{N} '.format(N=len(assc)) + label)
    assert len(assc_new) == len(assc_old), len_msg.format(N=len(assc_new),
                                                          O=len(assc_old))

    print('\nTEST KWS: keep_ND and keep_NOT')
    # Every True/False pairing of the two flags, keep_ND varying slowest
    flag_combos = [
        {'keep_ND': keep_nd, 'keep_NOT': keep_not}
        for keep_nd in (False, True)
        for keep_not in (False, True)
    ]
    for kws in flag_combos:
        print('\nTEST KWS:', kws)
        assc_new = obj.get_id2gos_nss(**kws)
        assc_old = read_gaf(obj, **kws)
        _prt_differences(assc_new, assc_old, obj)
        assert len(assc_new) == len(assc_old), len_msg.format(N=len(assc_new),
                                                              O=len(assc_old))

    print('\nTEST GETTING REVERSE ASSOCIATIONS: GO2GENES')
    assc_new = obj.get_id2gos_nss(go2geneids=True)
    assc_old = read_gaf(obj, go2geneids=True)
    _prt_differences(assc_new, assc_old, obj)
    assert len(assc_new) == len(assc_old), len_msg.format(N=len(assc_new),
                                                          O=len(assc_old))

    print('\nTEST RETURNING ASSOCIATIONS FOR SELECTED EVIDENCE CODES')
    evcodes = {'ISO', 'IKR'}
    print("\nTEST 9606 ev_include={CODES}".format(CODES=' '.join(evcodes)))
    assc_new = obj.get_id2gos_nss(ev_include=evcodes)
    assc_old = read_gaf(obj, ev_include=evcodes)
    _prt_differences(assc_new, assc_old, obj)
    assert assc_new == assc_old
Пример #19
0
def get_gaf_hdr(fin_gaf):
    """Return the header (GAF version and data info) of a Gene Association File."""
    # Imported locally, as in the original, rather than at module level
    from goatools.anno.gaf_reader import GafReader
    reader = GafReader(fin_gaf, hdr_only=True)
    return reader.hdr
Пример #20
0
        pop_uniprot = set(dfQ['UniProtKB/Swiss-Prot ID'].dropna().tolist())
        pop = pop_flybase.union(pop_uniprot)
    elif layer == 'MM':
        pop_mgi = set(dfQ['MGI ID'].dropna().tolist())
        pop_uniprot = set(dfQ['UniProtKB/Swiss-Prot ID'].dropna().tolist())
        pop = pop_mgi.union(pop_uniprot)
    elif layer == 'HS':
        pop_uniprot = set(dfQ['UniProtKB/Swiss-Prot ID'].dropna().tolist())
        pop = pop_uniprot

    # Load GO
    print("Load GO files")
    godag = obo_parser.GODag(ontology)

    # GAF files for both MM and DM
    gaf_species = GafReader(name='GAF ' + layer + ' Specie', filename=annotation, godag=godag, namespaces=set(['BP']))
    gaf_reactome = GafReader(name='GAF ' + layer + ' Reactome', filename=annotation_reactome, godag=godag, namespaces=set(['BP']))
    # Dict of Associations
    ns2assoc_species = gaf_species.get_ns2assc()
    n_assoc_species = sum([len(v) for k, v in ns2assoc_species['BP'].items()])
    print('Specie associations: {n:d}'.format(n=n_assoc_species))

    # We also need to add the multi-species annotations
    ns2assoc_reactome = gaf_reactome.get_ns2assc()
    n_assoc_reactome = sum([len(v) for k, v in ns2assoc_reactome['BP'].items()])
    print('Reactome associations: {n:d}'.format(n=n_assoc_reactome))

    # combine associations
    ns2assoc_combined = merge(ns2assoc_species, ns2assoc_reactome)
    n_assoc_combined = sum([len(v) for k, v in ns2assoc_combined['BP'].items()])
    print('Combined associations: {n:d}'.format(n=n_assoc_combined))