def test_knownGene():
    """Check interval counts and a few lookups on the hg19 knownGene table.

    The table is downloaded once with ``urlretrieve`` and then parsed three
    times (default mode, ``'cds'`` and ``'exons'``) from the local copy, to
    speed up testing.
    """
    # Original UCSC location, kept for reference only.
    ucsc_url = 'http://hgdownload.cse.ucsc.edu/goldenpath/hg19/database/knownGene.txt.gz'
    # Mirror. Slightly faster and more stable, I believe:
    mirror_url = 'http://kt.era.ee/distribute/pyintervaltree/knownGene.txt.gz'

    # Download once; every parse below reuses the downloaded copy.
    local_path, _headers = urlretrieve(mirror_url)
    local_url = 'file:///%s' % os.path.abspath(local_path)

    def load(**options):
        # Py3 downloads .gz files to local files with names not ending with
        # .gz -- presumably why decompress=True is passed explicitly.
        return GenomeIntervalTree.from_table(
            url=local_url, decompress=True, **options)

    # Default mode.
    tree = load()
    assert len(tree) == 82960
    hits = tree[b'chr1'].search(100000, 138529)
    assert len(hits) == 1
    assert next(iter(hits)).data['name'] == b'uc021oeg.2'

    # Coding regions only.
    tree = load(mode='cds')
    assert len(tree) == 82960
    assert not tree[b'chr1'].overlaps(100000, 138529)

    # One interval per exon.
    tree = load(mode='exons')
    assert len(tree) == 742493
    exon_hits = list(tree[b'chr1'].search(134772, 140566))
    assert len(exon_hits) == 3
    # All three hits must carry the same data record.
    assert all(exon_hits[0].data == hit.data for hit in exon_hits[1:])
def test_knownGene(base_url):
    """Check interval counts and a few lookups on the knownGene table.

    ``base_url`` is the URL prefix under which ``knownGene.txt.gz`` is
    served.  The file is fetched once with ``urlretrieve`` and parsed three
    times (default mode, ``'cds'`` and ``'exons'``) from the local copy.
    """
    remote_url = base_url + 'knownGene.txt.gz'

    # To speed up testing, download the file once and reuse the local copy.
    local_path, _headers = urlretrieve(remote_url)
    local_url = 'file:///%s' % os.path.abspath(local_path)

    # Py3 downloads .gz files to local files with names not ending with .gz
    tree = GenomeIntervalTree.from_table(url=local_url, decompress=True)
    assert len(tree) == 82960
    hits = tree[b'chr1'].search(100000, 138529)
    assert len(hits) == 1
    (only_hit,) = hits
    assert only_hit.data['name'] == b'uc021oeg.2'

    tree = GenomeIntervalTree.from_table(
        url=local_url, mode='cds', decompress=True)
    assert len(tree) == 82960
    assert not tree[b'chr1'].overlaps(100000, 138529)

    tree = GenomeIntervalTree.from_table(
        url=local_url, mode='exons', decompress=True)
    assert len(tree) == 742493
    first, second, *extra = list(tree[b'chr1'].search(134772, 140566))
    assert len(extra) == 1
    # All three hits must carry the same data record.
    assert first.data == second.data
    assert first.data == extra[0].data
def _test_promotorsearch():
    """Realistic example: find the promoter segments of a gene ('NANOG').

    Downloads the hg19 refGene table, locates the first interval whose
    ``name2`` field is ``'NANOG'``, fetches a genome segmentation table via
    ``Encode`` and prints the segments labelled ``'PF'`` or ``'P'`` that lie
    within 10 kb of that interval.

    It is slow, so you don't want to run it too much (presumably why the
    name carries a leading underscore).

    Raises:
        IndexError: if no interval with ``name2 == 'NANOG'`` is found.
    """
    from intervaltree_bio import GenomeIntervalTree, UCSCTable

    # Download refGene table
    refGene = GenomeIntervalTree.from_table(
        url='http://hgdownload.cse.ucsc.edu/goldenpath/hg19/database/refGene.txt.gz',
        parser=UCSCTable.REF_GENE)

    # Find the NANOG gene
    # NOTE(review): other tests in this area key chromosomes with bytes
    # (b'chr1'); confirm that 'name2' is str and not bytes on Python 3.
    nanog = [
        i for chrom in refGene for i in refGene[chrom]
        if i.data['name2'] == 'NANOG'
    ]
    nanog = nanog[0]

    # Download genome segmentation table
    e = Encode()
    segments = e.AwgSegmentation.CombinedHepg2.fetch().read_as_intervaltree()

    # Find the segmentation of the NANOG transcript +- 10kb
    results = segments[nanog.data['chrom']].search(nanog.begin - 10000,
                                                   nanog.end + 10000)

    # Leave the promoter/promoter flanking segments only
    results = [i for i in results if i.data[0] in ['PF', 'P']]

    # print() with a single argument behaves identically on Python 2 and 3;
    # the former ``print results`` statement is a SyntaxError on Python 3.
    print(results)
def create_gene_tree(bed_file_path):
    """Collect the IDs of non-overlapping records from a tab-separated file.

    Only rows with exactly nine fields whose first field does not start with
    ``'##'`` are considered.  Field 0 is the sequence id, fields 1 and 2 the
    integer start and end, fields 4 and 5 the two identifiers that are
    recorded.  Fields 6, 7 and 8 are parsed as float, float and int but not
    otherwise used.

    A row is dropped when its ``[start, end)`` range overlaps an interval
    already kept for the same sequence id; otherwise it is added to an
    internal ``GenomeIntervalTree`` and both of its identifiers are recorded.

    Args:
        bed_file_path: path of the file to read.

    Returns:
        dict mapping every recorded identifier to ``1``.  The interval tree
        itself is not returned.

    Raises:
        ValueError: if a numeric field of a considered row cannot be parsed.
    """
    kept_ids = {}
    intervals = GenomeIntervalTree()

    with open(bed_file_path, 'r') as handle:
        for fields in csv.reader(handle, delimiter='\t'):
            # Skip anything that is not a 9-column data row.
            if len(fields) != 9 or fields[0].startswith('##'):
                continue

            seqid = fields[0]
            begin, finish = int(fields[1]), int(fields[2])
            model_id, gene_id = fields[4], fields[5]
            # Unused downstream, but still parsed so malformed rows raise.
            _cov, _idty, _matches = (
                float(fields[6]), float(fields[7]), int(fields[8]))

            # First come, first served: later overlapping rows are ignored.
            if intervals[seqid].overlaps(begin, finish):
                continue

            kept_ids[model_id] = 1
            kept_ids[gene_id] = 1
            intervals[seqid].addi(
                begin, finish, data={"ID": model_id, "Parent": gene_id})

    return kept_ids
def test_ensGene(base_url):
    """Smoke-test that the ensGene table can at least be read.

    ``base_url`` is the URL prefix under which ``ensGene.txt.gz`` is served.
    """
    table_url = base_url + 'ensGene.txt.gz'
    tree = GenomeIntervalTree.from_table(
        url=table_url,
        mode='cds',
        parser=UCSCTable.ENS_GENE,
    )
    assert len(tree) == 204940
def test_genepred():
    """Smoke-test for output from gtfToGenePred.

    Parses ``test_data/test_genepred.txt`` (located next to this module) in
    ``'cds'`` mode with the ``UCSCTable.GENEPRED`` parser and checks the
    number of intervals read.
    """
    testdir = os.path.join(os.path.dirname(__file__), 'test_data')
    genepred_path = os.path.abspath(os.path.join(testdir, "test_genepred.txt"))
    # Use a context manager so the file handle is closed even if parsing
    # fails; previously it was left open.
    with open(genepred_path) as kg:
        gtree = GenomeIntervalTree.from_table(
            fileobj=kg, mode='cds', parser=UCSCTable.GENEPRED)
    assert len(gtree) == 100
def test_refGene(base_url):
    """Smoke-test for the refGene table.

    ``base_url`` is the URL prefix under which ``refGene.txt.gz`` is served.
    """
    table_url = base_url + 'refGene.txt.gz'
    tree = GenomeIntervalTree.from_table(
        url=table_url,
        mode='tx',
        parser=UCSCTable.REF_GENE,
    )
    # NB: Some time ago it was 50919, hence it seems the table data changes
    # on UCSC and eventually the mirror and UCSC won't be the same.
    assert len(tree) == 52350
def test_pickling():
    """Round-trip a small GenomeIntervalTree through pickle.

    Checks that the overall length and the length of the ``'a'`` entry are
    the same before and after pickling.
    """
    fixture = (
        ('a', 1, 2, ['some', 'data']),
        ('a', 1.5, 2.5, ['more', 'data']),
        ('b', 10, 12, ['even', 'more', 'data']),
    )

    original = GenomeIntervalTree()
    for chrom, begin, end, payload in fixture:
        original[chrom][begin:end] = payload

    restored = pickle.loads(pickle.dumps(original))

    assert len(original) == len(restored)
    assert len(original['a']) == len(restored['a'])
def gtf2tree(gtf_path):
    """Build a GenomeIntervalTree from a GTF file via two intermediate files.

    Two files are derived from ``gtf_path`` by replacing its last extension:
    ``<stem>.genePred`` (passed to ``gtf_to_genepred``) and
    ``<stem>.UCSCTable.gz`` (passed to ``genepred_to_UCSCtable``).  The
    gzipped table is then parsed in ``'tx'`` mode with the
    ``UCSCTable.ENS_GENE`` parser.

    Args:
        gtf_path: path of the input GTF file.

    Returns:
        The ``GenomeIntervalTree`` produced by ``from_table``.
    """
    stem = os.path.splitext(gtf_path)[0]
    genepred_annot = stem + ".genePred"
    ucsc_annot = stem + ".UCSCTable.gz"

    gtf_to_genepred(gtf_path, genepred_annot)
    genepred_to_UCSCtable(genepred_annot, ucsc_annot)

    # Close the gzip handle once the table has been read; it used to be
    # left open for the lifetime of the process.
    with gzip.open(ucsc_annot) as kg:
        gtree = GenomeIntervalTree.from_table(
            fileobj=kg, mode='tx', parser=UCSCTable.ENS_GENE)
    return gtree
def read_as_intervaltree(self):
    r'''
    Loads this entry's file into an ``intervaltree_bio.GenomeIntervalTree``.

    The entry's ``'type'`` must be one of ``bed``, ``narrowPeak`` or
    ``broadPeak``; anything else trips an assertion.  The file is opened via
    ``open_text`` and, similarly to ``open`` and ``open_text``, it won't be
    downloaded to the cache if it is not there.  The whole file is read
    into memory during the call.

    The ``data`` field of each interval will contain the result of
    ``ln.split('\t')[3:]`` applied to the corresponding line of the file.

    Returns:
        a GenomeIntervalTree instance.
    '''
    supported_types = ['bed', 'narrowPeak', 'broadPeak']
    assert self['type'] in supported_types
    with self.open_text() as bed_stream:
        return GenomeIntervalTree.from_bed(fileobj=bed_stream)
def _test_promotorsearch():
    """Realistic example: find the promoter segments of a gene ('NANOG').

    Steps: download the hg19 refGene table, pick the first interval whose
    ``name2`` field equals ``'NANOG'``, fetch a genome segmentation table
    through ``Encode``, search it 10 kb either side of that interval and
    print the hits whose first data field is ``'PF'`` or ``'P'``.

    It is slow, so you don't want to run it too much (presumably why the
    name carries a leading underscore).

    Raises:
        IndexError: if no interval with ``name2 == 'NANOG'`` is found.
    """
    from intervaltree_bio import GenomeIntervalTree, UCSCTable

    # Download refGene table
    refGene = GenomeIntervalTree.from_table(
        url='http://hgdownload.cse.ucsc.edu/goldenpath/hg19/database/refGene.txt.gz',
        parser=UCSCTable.REF_GENE,
    )

    # Find the NANOG gene
    # NOTE(review): confirm 'name2' values are str (not bytes) on Python 3.
    nanog = [i for chrom in refGene for i in refGene[chrom]
             if i.data['name2'] == 'NANOG']
    nanog = nanog[0]

    # Download genome segmentation table
    e = Encode()
    segments = e.AwgSegmentation.CombinedHepg2.fetch().read_as_intervaltree()

    # Find the segmentation of the NANOG transcript +- 10kb
    results = segments[nanog.data['chrom']].search(
        nanog.begin - 10000, nanog.end + 10000)

    # Leave the promoter/promoter flanking segments only
    results = [i for i in results if i.data[0] in ['PF', 'P']]

    # The old ``print results`` statement does not compile on Python 3;
    # print() with one argument produces the same output on Python 2 and 3.
    print(results)
class AnnoTestCase(unittest.TestCase):
    """Tests annotation"""

    # The fixture below is built while the class body executes and publishes
    # the parsed tree as the module-level name ``gtree``, which the test
    # methods read.
    mygtf = os.path.realpath("test_data/test.gtf.gz")
    genepred_annot = os.path.splitext(mygtf)[0] + ".genePred"
    ucsc_annot = os.path.splitext(mygtf)[0] + ".UCSCTable.gz"
    # Each conversion is run once; the original ran both steps twice with
    # identical arguments.
    su.gtf_convert.gtf_to_genepred(mygtf, genepred_annot)
    su.gtf_convert.genepred_to_UCSCtable(genepred_annot, ucsc_annot)
    global gtree
    # Close the gzip handle as soon as the table has been parsed.
    with gzip.open(ucsc_annot) as kg:
        gtree = GenomeIntervalTree.from_table(
            fileobj=kg, mode='tx', parser=UCSCTable.ENS_GENE)

    @staticmethod
    def _make_jxn(name, dist):
        """Return the one-row junction DataFrame shared by the tests."""
        return pd.DataFrame(
            {'name': name,
             'jxn_reads': 'a1,a2,a3',
             'jxn_counts': 3,
             'spans': 5,
             'spanreads': 's1,s2,s3,s4,s5',
             'dist': dist,
             'ann_format': 'Symbol:Transcript:Strand:Exon_No:Dist_to_Exon:Frame:CDS_Length'},
            index=pd.Series(0))

    @staticmethod
    def _annotate_side(jxn_filt, prefix, side):
        """Add ``<prefix>_symbol/_annot/_strand/_cdslen`` columns for one side."""
        annotated = jxn_filt.apply(
            lambda x: su.annotate_sv.get_jxnside_anno(x['name'], gtree, side),
            axis=1)
        (jxn_filt[prefix + '_symbol'],
         jxn_filt[prefix + '_annot'],
         jxn_filt[prefix + '_strand'],
         jxn_filt[prefix + '_cdslen']) = zip(*annotated)

    def test_get_jxnside_anno_v1(self):
        """test get_jxnside_anno"""
        jxn_filt = self._make_jxn('chr1:871160:+:chr1:985950:-:0:0', 114790)
        self._annotate_side(jxn_filt, 'left', 1)
        self._annotate_side(jxn_filt, 'right', 2)
        self.assertEqual(jxn_filt['left_symbol'].iloc[0], 'SAMD11')
        self.assertEqual(jxn_filt['right_symbol'].iloc[0], 'AGRN')
        self.assertEqual(jxn_filt['left_strand'].iloc[0], '+')
        self.assertEqual(jxn_filt['right_strand'].iloc[0], '+')
        self.assertEqual(jxn_filt['left_cdslen'].iloc[0], 9852)
        self.assertEqual(jxn_filt['right_cdslen'].iloc[0], 3395)

    def test_get_jxnside_anno_v2(self):
        """test get_jxnside_anno..
        Tests where no transcripts found"""
        jxn_filt = self._make_jxn('chr19:560462:+:chr8:560462:-:0:0', 13082084)
        self._annotate_side(jxn_filt, 'left', 1)
        self.assertEqual(jxn_filt['left_symbol'].iloc[0], 'NA')
        self.assertEqual(jxn_filt['left_strand'].iloc[0], 'NA')
        self.assertEqual(jxn_filt['left_cdslen'].iloc[0], 'NA')

    def test_get_jxnside_anno_v3(self):
        """test get_jxnside_anno.. Tests non-coding transcripts for sorting"""
        jxn_filt = self._make_jxn('chr12:15704606:+:chr1:11918527:-:2:1', 'NA')
        self._annotate_side(jxn_filt, 'left', 1)
        self.assertEqual(jxn_filt['left_symbol'].iloc[0], 'NA')
        self.assertEqual(jxn_filt['left_strand'].iloc[0], 'NA')
        self.assertEqual(jxn_filt['left_cdslen'].iloc[0], 'NA')

    def test_get_jxn_genes(self):
        """test_get_jxn_genes"""
        jxn_filt = self._make_jxn('chr1:871160:+:chr1:985950:-:0:0', 114790)
        jxn_filt['left_all'], jxn_filt['right_all'] = zip(
            *jxn_filt.apply(
                lambda x: su.annotate_sv.get_jxn_genes(x['name'], gtree),
                axis=1))
        self.assertEqual(jxn_filt['left_all'].iloc[0], ['SAMD11'])
        # The frame has a single row labelled 0, so .iloc[0] selects the same
        # cell the former .loc[0] did; use it for consistency.
        self.assertEqual(jxn_filt['right_all'].iloc[0], ['AGRN'])

    def test_get_pos_genes(self):
        """test_get_pos_genes"""
        self.assertEqual(
            su.annotate_sv.get_pos_genes('chr1', 871160, gtree), ['SAMD11'])
def test_refGene():
    """Smoke-test for the refGene table, read from the mirror."""
    # Original UCSC location, kept for reference only; the mirror is used.
    ucsc_url = 'http://hgdownload.cse.ucsc.edu/goldenpath/hg19/database/refGene.txt.gz'
    mirror_url = 'http://kt.era.ee/distribute/pyintervaltree/refGene.txt.gz'

    tree = GenomeIntervalTree.from_table(
        url=mirror_url,
        mode='tx',
        parser=UCSCTable.REF_GENE,
    )
    # NB: Some time ago it was 50919, hence it seems the table data changes
    # on UCSC and eventually the mirror and UCSC won't be the same.
    assert len(tree) == 52350
def test_ensGene():
    """Smoke-test that the ensGene table can at least be read from the mirror."""
    # Original UCSC location, kept for reference only; the mirror is used.
    ucsc_url = 'http://hgdownload.cse.ucsc.edu/goldenpath/hg19/database/ensGene.txt.gz'
    mirror_url = 'http://kt.era.ee/distribute/pyintervaltree/ensGene.txt.gz'

    tree = GenomeIntervalTree.from_table(
        url=mirror_url,
        mode='cds',
        parser=UCSCTable.ENS_GENE,
    )
    assert len(tree) == 204940
def bed_to_tree(bed):
    """Open the file at path ``bed`` and parse it with ``GenomeIntervalTree.from_bed``.

    Args:
        bed: path of the file to read (opened in text mode).

    Returns:
        The ``GenomeIntervalTree`` built by ``from_bed``.
    """
    with open(bed, 'r') as bed_handle:
        return GenomeIntervalTree.from_bed(fileobj=bed_handle)