def test_negative_strand(self):
    """
    Split-implicit run against a minus-strand copy of the annotation.

    Field 6 (the strand column) of every interval from ``self.gtf_data``
    is replaced with '-', and the single read carries flag 16 (reverse
    strand in SAM). The earlier description of this test said the read
    lies in a single segment of a single transcript that borders on
    intergenic (downstream).
    NOTE(review): that wording is identical to test_implicit_inter_tr;
    confirm it describes this fixture.
    """
    # Same rows as ``self.gtf_data`` with the strand field forced to '-'.
    minus_strand_rows = [
        row[:6] + ['-'] + row[7:]
        for row in intervals_to_list(self.gtf_data)
    ]
    gtf_neg = make_file_from_list(minus_strand_rows)

    segments = [
        # (qname, flag, refname, pos, mapq, cigar, tags)
        ('name2:rbc:CCCC', 16, 0, 549, 255, [(0, 30)], {'NH': 1}),
    ]
    bam = make_bam_file({
        'chromosomes': [('1', 1000)],
        'segments': segments,
    })

    expected = [
        ['RNAmap', 'type', 'position', 'all', 'explicit'],
        ['CDS-intergenic', '20', '0.5', '0'],
        ['intergenic-CDS', '-80', '0.5', '0'],
    ]

    rnamaps.run(
        bam,
        gtf_neg,
        self.out,
        self.strange,
        self.cross_tr,
        mismatches=1,
        implicit_handling='split',
    )

    observed = make_list_from_file(self.out)
    self.assertEqual(expected, observed)
def test_explicit_whole_in(self):
    """
    Explicit reads that lie entirely inside one transcript.

    Each read crosses the exon-intron landmark, which makes it explicit.
    Three reads are supplied, giving two different cross-links; one of
    the cross-links is supported by two distinct randomers.
    """
    segments = [
        # (qname, flag, refname, pos, mapq, cigar, tags)
        ('name2:rbc:CCCC', 0, 0, 140, 255, [(0, 50)], {'NH': 1}),
        ('name2:rbc:AAAA', 0, 0, 142, 255, [(0, 50)], {'NH': 1}),
        ('name2:rbc:CCCC', 0, 0, 142, 255, [(0, 50)], {'NH': 1}),
    ]
    bam = make_bam_file({
        'chromosomes': [('1', 1000)],
        'segments': segments,
    })

    expected = [
        ['RNAmap', 'type', 'position', 'all', 'explicit'],
        ['UTR5-intron', '-10', '1', '1'],
        ['UTR5-intron', '-8', '2', '2'],
    ]

    rnamaps.run(
        bam,
        self.gtf,
        self.out,
        self.strange,
        self.cross_tr,
        mismatches=1,
    )

    observed = make_list_from_file(self.out)
    self.assertEqual(expected, observed)
def test_implicit_exons(self):
    """
    One implicit read inside a "middle" segment of EXON_TYPE.

    The whole read sits in a single transcript and a single segment.
    The run uses ``implicit_handling='split'`` and the expected table
    lists four RNAmap types, each with value '0.25'.
    """
    segments = [
        # (qname, flag, refname, pos, mapq, cigar, tags)
        ('name2:rbc:CCCC', 0, 0, 205, 255, [(0, 20)], {'NH': 1}),
    ]
    bam = make_bam_file({
        'chromosomes': [('1', 1000)],
        'segments': segments,
    })

    expected = [
        ['RNAmap', 'type', 'position', 'all', 'explicit'],
        ['CDS-UTR3', '-25', '0.25', '0'],
        ['CDS-intron', '-25', '0.25', '0'],
        ['UTR5-CDS', '5', '0.25', '0'],
        ['intron-CDS', '5', '0.25', '0'],
    ]

    rnamaps.run(
        bam,
        self.gtf,
        self.out,
        self.strange,
        self.cross_tr,
        mismatches=1,
        implicit_handling='split',
    )

    observed = make_list_from_file(self.out)
    self.assertEqual(expected, observed)
def test_implicit_intergenic(self):
    """
    One implicit read that lies entirely in intergenic.

    The run uses ``implicit_handling='split'`` and the expected table
    lists two RNAmap types, each with value '0.5'.
    """
    segments = [
        # (qname, flag, refname, pos, mapq, cigar, tags)
        ('name2:rbc:CCCC', 0, 0, 530, 255, [(0, 30)], {'NH': 1}),
    ]
    bam = make_bam_file({
        'chromosomes': [('1', 1000)],
        'segments': segments,
    })

    expected = [
        ['RNAmap', 'type', 'position', 'all', 'explicit'],
        ['CDS-intergenic', '30', '0.5', '0'],
        ['intergenic-CDS', '-70', '0.5', '0'],
    ]

    rnamaps.run(
        bam,
        self.gtf,
        self.out,
        self.strange,
        self.cross_tr,
        mismatches=1,
        implicit_handling='split',
    )

    observed = make_list_from_file(self.out)
    self.assertEqual(expected, observed)
def test_implicit_whole_in(self):
    """
    Implicit reads inside the "middle" segment of one transcript.

    Every read sits in a single transcript and a single segment. Three
    reads are supplied, giving two different cross-links; one of the
    cross-links is supported by two distinct randomers.
    """
    segments = [
        # (qname, flag, refname, pos, mapq, cigar, tags)
        ('name2:rbc:CCCC', 0, 0, 160, 255, [(0, 30)], {'NH': 1}),
        ('name2:rbc:CCCC', 0, 0, 163, 255, [(0, 30)], {'NH': 1}),
        ('name2:rbc:GGGG', 0, 0, 163, 255, [(0, 30)], {'NH': 1}),
    ]
    bam = make_bam_file({
        'chromosomes': [('1', 1000)],
        'segments': segments,
    })

    expected = [
        ['RNAmap', 'type', 'position', 'all', 'explicit'],
        ['UTR5-intron', '10', '1', '0'],
        ['UTR5-intron', '13', '2', '0'],
    ]

    rnamaps.run(
        bam,
        self.gtf,
        self.out,
        self.strange,
        self.cross_tr,
        mismatches=1,
    )

    observed = make_list_from_file(self.out)
    self.assertEqual(expected, observed)
def test_cross_transcript_read(self):
    """
    A read that is half in a transcript region and half in intergenic.

    Unlike the RNAmap tests, the assertion here reads the file at
    ``self.cross_tr`` rather than ``self.out``.
    """
    segments = [
        # (qname, flag, refname, pos, mapq, cigar, tags)
        ('name2:rbc:CCCC', 0, 0, 235, 255, [(0, 50)], {'NH': 1}),
    ]
    bam = make_bam_file({
        'chromosomes': [('1', 1000)],
        'segments': segments,
    })

    header = [
        'chrom',
        'strand',
        'xlink',
        'second-start',
        'end-position',
        'read_len',
    ]
    expected = [
        header,
        ['1', '+', '234', '0', '284', '50'],
    ]

    rnamaps.run(
        bam,
        self.gtf,
        self.out,
        self.strange,
        self.cross_tr,
        mismatches=1,
    )

    observed = make_list_from_file(self.cross_tr)
    self.assertEqual(expected, observed)
def test_explicit_intergenic_right(self):
    """
    A read that is half in a transcript region and half in intergenic.

    The expected table holds a single 'CDS-intergenic' row whose
    'explicit' column is '1'.
    """
    segments = [
        # (qname, flag, refname, pos, mapq, cigar, tags)
        ('name2:rbc:CCCC', 0, 0, 480, 255, [(0, 50)], {'NH': 1}),
    ]
    bam = make_bam_file({
        'chromosomes': [('1', 1000)],
        'segments': segments,
    })

    expected = [
        ['RNAmap', 'type', 'position', 'all', 'explicit'],
        ['CDS-intergenic', '-20', '1', '1'],
    ]

    rnamaps.run(
        bam,
        self.gtf,
        self.out,
        self.strange,
        self.cross_tr,
        mismatches=1,
    )

    observed = make_list_from_file(self.out)
    self.assertEqual(expected, observed)
def test_implicit_inter_tr(self):
    """
    One implicit read in a segment that borders on intergenic.

    The whole read is in a single transcript and a single segment; that
    segment borders on intergenic (downstream). The BAM file is built
    with ``rnd_seed=0`` and the run uses ``implicit_handling='split'``.
    """
    segments = [
        # (qname, flag, refname, pos, mapq, cigar, tags)
        ('name2:rbc:CCCC', 0, 0, 610, 255, [(0, 30)], {'NH': 1}),
    ]
    bam_spec = {
        'chromosomes': [('1', 1000)],
        'segments': segments,
    }
    bam = make_bam_file(bam_spec, rnd_seed=0)

    expected = [
        ['RNAmap', 'type', 'position', 'all', 'explicit'],
        ['CDS-CDS', '-40', '0.3333', '0'],
        ['CDS-intron', '-40', '0.3333', '0'],
        ['intergenic-CDS', '10', '0.3333', '0'],
    ]

    rnamaps.run(
        bam,
        self.gtf,
        self.out,
        self.strange,
        self.cross_tr,
        mismatches=1,
        implicit_handling='split',
    )

    observed = make_list_from_file(self.out)
    self.assertEqual(expected, observed)
def test_run(self):
    """
    Run rnamaps on landmark and site files and check the outputs exist.

    After the run, ``self.outdir`` must be a directory, and for every
    entry of ``rnamaps.RNAMAP_TYPES`` a '.tsv' and a '.png' file must
    exist there with a size greater than 1 byte.
    """
    landmark_rows = [
        ['chr1', '210', '211', 'gene-start;A', '.', '+'],
        ['chr1', '270', '271', 'translation-start;A', '.', '+'],
        ['chr1', '299', '300', 'noncoding-gene-end;B', '.', '-'],
        ['chr1', '330', '331', 'exon-intron;A', '.', '+'],
        ['chr1', '490', '491', 'intron-exon;A', '.', '+'],
        ['chr1', '550', '551', 'translation-end;A', '.', '+'],
        ['chr1', '749', '750', 'noncoding-gene-start;B', '.', '-'],
        ['chr1', '760', '761', 'gene-end;A', '.', '+'],
    ]
    landmarks = make_file_from_list(sort=True, data=landmark_rows)

    site_rows = [
        ['chr1', '220', '221', '.', '1', '+'],
        ['chr1', '350', '351', '.', '1', '+'],
        ['chr1', '350', '351', '.', '1', '-'],
        ['chr1', '550', '551', '.', '1', '+'],
        ['chr1', '740', '741', '.', '1', '+'],
        ['chr1', '750', '751', '.', '1', '-'],
    ]
    sites = make_file_from_list(site_rows)

    rnamaps.run(sites, landmarks, outdir=self.outdir)
    self.assertTrue(os.path.isdir(self.outdir))

    sites_name = remove_extension(sites, ['.bed', '.bed.gz'])
    for maptype in rnamaps.RNAMAP_TYPES:
        stem = '{}_{}'.format(sites_name, maptype)
        basename = os.path.join(self.outdir, stem)
        # NOTE: a '_plot_data.txt' output is currently not checked here.
        for extension in ['.tsv', '.png']:
            produced = basename + extension
            self.assertTrue(os.path.isfile(produced))
            self.assertGreater(os.path.getsize(produced), 1)
def test_run(self):
    """
    Run rnamaps on region and site files and check the outputs exist.

    After the run, ``self.outdir`` must be a directory, and for every
    entry of ``rnamaps.MAP_TYPES`` a '.tsv' and a '.png' file must exist
    there with a size greater than 1 byte.
    """
    region_rows = [
        ['chr1', '.', 'intergenic', '1', '210', '.', '+', '.',
         'gene_name "None";'],
        ['chr1', '.', 'UTR5', '211', '270', '.', '+', '.',
         'gene_name "A";'],
        ['chr1', '.', 'CDS', '271', '330', '.', '+', '.',
         'gene_name "A";'],
        ['chr1', '.', 'intron', '331', '490', '.', '+', '.',
         'gene_name "A";'],
        ['chr1', '.', 'CDS', '491', '550', '.', '+', '.',
         'gene_name "A";'],
        ['chr1', '.', 'UTR3', '551', '760', '.', '+', '.',
         'gene_name "A";'],
        ['chr1', '.', 'intergenic', '761', '1100', '.', '+', '.',
         'gene_name "None";'],
        ['chr1', '.', 'intergenic', '1', '300', '.', '-', '.',
         'gene_name "None";'],
        ['chr1', '.', 'ncRNA', '301', '500', '.', '-', '.',
         'gene_name "B";'],
        ['chr1', '.', 'intron', '501', '600', '.', '-', '.',
         'gene_name "B";'],
        ['chr1', '.', 'ncRNA', '601', '750', '.', '-', '.',
         'gene_name "B";'],
        ['chr1', '.', 'intergenic', '751', '1000', '.', '-', '.',
         'gene_name "None";'],
    ]
    regions = make_file_from_list(sort=True, data=region_rows)

    site_rows = [
        ['chr1', '220', '221', '.', '1', '+'],
        ['chr1', '350', '351', '.', '1', '+'],
        ['chr1', '350', '351', '.', '1', '-'],
        ['chr1', '550', '551', '.', '1', '+'],
        ['chr1', '740', '741', '.', '1', '+'],
        ['chr1', '750', '751', '.', '1', '-'],
    ]
    sites = make_file_from_list(site_rows)

    rnamaps.run(sites, regions, outdir=self.outdir)
    self.assertTrue(os.path.isdir(self.outdir))

    sites_name = remove_extension(sites, ['.bed', '.bed.gz'])
    for maptype in rnamaps.MAP_TYPES:
        stem = '{}_{}'.format(sites_name, maptype)
        basename = os.path.join(self.outdir, stem)
        # NOTE: a '_plot_data.txt' output is currently not checked here.
        for extension in ['.tsv', '.png']:
            produced = basename + extension
            self.assertTrue(os.path.isfile(produced))
            self.assertGreater(os.path.getsize(produced), 1)