def test_knownGene():
    """Load the knownGene table in its default, 'cds' and 'exons' modes.

    The table is downloaded once to a temporary file and then read three
    times through a ``file://`` URL. The temporary file is removed when the
    test finishes, whether it passes or fails. Requires network access.
    """
    # Upstream source, kept for reference:
    #   http://hgdownload.cse.ucsc.edu/goldenpath/hg19/database/knownGene.txt.gz
    # Mirror. Slightly faster and more stable, I believe:
    knownGene_url = 'http://kt.era.ee/distribute/pyintervaltree/knownGene.txt.gz'

    # To speed up testing, we'll download the file and reuse the downloaded copy
    knownGene_file, _ = urlretrieve(knownGene_url)
    try:
        knownGene_localurl = 'file:///%s' % os.path.abspath(knownGene_file)

        # Default mode.
        # Py3 downloads .gz files to local files with names not ending with .gz
        knownGene = GenomeIntervalTree.from_table(url=knownGene_localurl, decompress=True)
        assert len(knownGene) == 82960
        result = knownGene[b'chr1'].search(100000, 138529)
        assert len(result) == 1
        assert list(result)[0].data['name'] == b'uc021oeg.2'

        # 'cds' mode: same number of entries, but nothing overlaps the query range.
        knownGene = GenomeIntervalTree.from_table(url=knownGene_localurl, mode='cds', decompress=True)
        assert len(knownGene) == 82960
        assert not knownGene[b'chr1'].overlaps(100000, 138529)

        # 'exons' mode: the three hits in the query range all carry equal data.
        knownGene = GenomeIntervalTree.from_table(url=knownGene_localurl, mode='exons', decompress=True)
        assert len(knownGene) == 742493
        result = list(knownGene[b'chr1'].search(134772, 140566))
        assert len(result) == 3
        assert result[0].data == result[1].data and result[0].data == result[2].data
    finally:
        # urlretrieve() without a target filename leaves a temporary file
        # behind; remove it so repeated runs do not pile up downloads.
        try:
            os.remove(knownGene_file)
        except OSError:
            # Best-effort cleanup only: a failure to delete must not mask
            # the actual test outcome.
            pass
# Example no. 2
# 0
def _test_promotorsearch():
    """Realistic example: find the promotor segments around a given gene.

    Uses 'NANOG' as the gene. Downloads the refGene table and a genome
    segmentation table, then prints the promotor ('P') and promotor-flanking
    ('PF') segments within 10kb of the gene.

    It is slow, so you don't want to run it too much.
    """
    from intervaltree.bio import GenomeIntervalTree, UCSCTable

    # Download refGene table
    refGene = GenomeIntervalTree.from_table(url='http://hgdownload.cse.ucsc.edu/goldenpath/hg19/database/refGene.txt.gz', parser=UCSCTable.REF_GENE)

    # Find the NANOG gene
    # NOTE(review): other tests in this file compare table fields against
    # bytes (e.g. b'uc021oeg.2'); on Python 3 this str comparison may never
    # match -- confirm against the parser's output type.
    nanog = [i for chrom in refGene for i in refGene[chrom] if i.data['name2'] == 'NANOG']
    nanog = nanog[0]

    # Download genome segmentation table
    # NOTE(review): Encode is not imported in this function; it is presumably
    # provided at module level -- verify.
    e = Encode()
    segments = e.AwgSegmentation.CombinedHepg2.fetch().read_as_intervaltree()

    # Find the segmentation of the NANOG transcript +- 10kb
    results = segments[nanog.data['chrom']].search(nanog.begin-10000, nanog.end+10000)

    # Leave the promotor/promotor flanking segments only
    results = [i for i in results if i.data[0] in ['PF', 'P']]

    # Function-call form: valid on Python 3 and prints the same single value
    # on Python 2.
    print(results)
def test_refGene():
    """Smoke-test: the refGene table can be read in 'tx' mode."""
    # Both locations are listed; only the mirror is actually fetched.
    ucsc_url = 'http://hgdownload.cse.ucsc.edu/goldenpath/hg19/database/refGene.txt.gz'
    mirror_url = 'http://kt.era.ee/distribute/pyintervaltree/refGene.txt.gz'

    tree = GenomeIntervalTree.from_table(
        url=mirror_url,
        mode='tx',
        parser=UCSCTable.REF_GENE,
    )

    # NB: Some time ago the expected size was 50919, so the table data seems
    # to change on UCSC; eventually the mirror and UCSC won't be the same.
    total = len(tree)
    assert total == 52350
def test_ensGene():
    """Smoke-test: the ensGene table can at least be read in 'cds' mode."""
    # Both locations are listed; only the mirror is actually fetched.
    ucsc_url = 'http://hgdownload.cse.ucsc.edu/goldenpath/hg19/database/ensGene.txt.gz'
    mirror_url = 'http://kt.era.ee/distribute/pyintervaltree/ensGene.txt.gz'

    tree = GenomeIntervalTree.from_table(
        url=mirror_url,
        mode='cds',
        parser=UCSCTable.ENS_GENE,
    )

    total = len(tree)
    assert total == 204940