def test_clusters(self):
    """Run ``clusters.run`` on small sites/peaks inputs and check the output rows."""
    site_rows = [
        ['1', '1', '2', '.', '1', '+'],
        ['1', '2', '3', '.', '1', '+'],
        ['1', '3', '4', '.', '1', '+'],
        ['1', '4', '5', '.', '2', '+'],
        ['1', '4', '5', '.', '1', '-'],
        ['1', '5', '6', '.', '1', '+'],
        ['1', '6', '7', '.', '1', '-'],
        ['1', '7', '8', '.', '1', '-'],
        ['1', '10', '11', '.', '1', '+'],
        ['1', '11', '12', '.', '2', '+'],
        ['1', '12', '13', '.', '1', '+'],
    ]
    peak_rows = [
        ['1', '4', '5', 'cl1', '1', '+'],
        ['1', '4', '5', 'cl2', '1', '-'],
        ['1', '5', '6', 'cl3', '1', '+'],
        ['1', '11', '12', 'cl4', '2', '+'],
    ]

    fin_sites = make_file_from_list(site_rows)
    fin_peaks = make_file_from_list(peak_rows)
    fout_clusters = get_temp_file_name()

    clusters.run(fin_sites, fin_peaks, fout_clusters, dist=3, slop=2)

    actual = make_list_from_file(fout_clusters, fields_separator='\t')
    wanted = [
        ['1', '2', '6', 'cl1,cl3', '5', '+'],
        ['1', '4', '7', 'cl2', '2', '-'],
        ['1', '10', '13', 'cl4', '4', '+'],
    ]
    self.assertEqual(wanted, actual)
def template(cross_links, annotation, subtype='biotype', excluded_types=None):
    """
    Run ``annotate.annotate_cross_links`` on in-memory data.

    Test helper for iCount.analysis.annotate. The inputs are given as lists
    and written to temporary files on the fly, so tests need neither a pile
    of small fixture files nor one large shared file (which would work
    against test isolation). Any test that calls this function serves as a
    usage example.

    Parameters
    ----------
    cross_links : list
        List representation of cross-links file.
    annotation : list
        List representation of annotation file.
    subtype : str
        Passed through unchanged as the ``subtype`` keyword argument.
    excluded_types : list or None
        Passed through unchanged as the ``excluded_types`` keyword argument.

    Returns
    -------
    list
        List representation of the file written by
        ``annotate.annotate_cross_links``, split on tabs.

    """
    xlinks_path = make_file_from_list(cross_links, extension='bed.gz')
    gtf_path = make_file_from_list(annotation, extension='gtf.gz')
    result_path = get_temp_file_name(extension='bed.gz')

    annotate.annotate_cross_links(
        gtf_path,
        xlinks_path,
        result_path,
        subtype=subtype,
        excluded_types=excluded_types,
    )

    return make_list_from_file(result_path, fields_separator='\t')
def test_rnamaps(self):
    """The ``iCount rnamaps`` command line call should exit with status 0."""
    region_rows = [
        ['1', '.', 'intergenic', '1', '100', '.', '+', '.', ''],
        ['1', '.', 'intergenic', '1', '610', '.', '-', '.', ''],
        ['1', '.', 'UTR5', '101', '160', '.', '+', '.', ''],
        ['1', '.', 'intron', '161', '320', '.', '+', '.', ''],
        ['1', '.', 'CDS', '321', '380', '.', '+', '.', ''],
        ['1', '.', 'intron', '381', '540', '.', '+', '.', ''],
        ['1', '.', 'UTR3', '541', '600', '.', '+', '.', ''],
        ['1', '.', 'intergenic', '601', '1000', '.', '+', '.', ''],
        ['1', '.', 'ncRNA', '611', '700', '.', '-', '.', ''],
        ['1', '.', 'intron', '701', '800', '.', '-', '.', ''],
        ['1', '.', 'ncRNA', '801', '900', '.', '-', '.', ''],
        ['1', '.', 'intergenic', '901', '1000', '.', '-', '.', ''],
    ]
    xlink_rows = [
        ['1', '120', '121', '.', '1', '+'],
        ['1', '350', '351', '.', '1', '+'],
        ['1', '550', '551', '.', '1', '+'],
        ['1', '750', '751', '.', '1', '-'],
    ]
    regions = make_file_from_list(region_rows)
    cross_links = make_file_from_list(xlink_rows)

    # '-S 40' suppresses messages below ERROR level.
    command_basic = ['iCount', 'rnamaps', cross_links, regions, '-S', '40']

    exit_status = subprocess.call(command_basic)
    self.assertEqual(exit_status, 0)
def test_limits_downstream(self):
    """Landmarks with too short downstream segment should not be used."""
    # Plus strand, exon-intron: the intron lies downstream of the landmark.
    plus_rows = [
        ['chr1', '.', 'CDS', '150', '200', '.', '+', '.', 'gene_name "A";'],
        ['chr1', '.', 'intron', '201', '350', '.', '+', '.', 'gene_name "A";'],
    ]
    regions = make_file_from_list(plus_rows)
    landmarks_file = rnamaps.make_landmarks_file(regions, 'exon-intron')
    self.assertEqual(make_list_from_file(landmarks_file), [])

    # Minus strand, intron-exon: the CDS lies downstream of the landmark.
    minus_rows = [
        ['chr1', '.', 'CDS', '151', '200', '.', '-', '.', 'gene_name "A";'],
        ['chr1', '.', 'intron', '201', '351', '.', '-', '.', 'gene_name "A";'],
    ]
    regions = make_file_from_list(minus_rows)
    landmarks_file = rnamaps.make_landmarks_file(regions, 'intron-exon')
    self.assertEqual(make_list_from_file(landmarks_file), [])
def test_basic(self):
    """A CDS/intron pair yields exactly one landmark on either strand."""
    # Plus strand: expect the exon-intron landmark at 200-201.
    plus_rows = [
        ['chr1', '.', 'CDS', '150', '200', '.', '+', '.', 'gene_name "A";'],
        ['chr1', '.', 'intron', '201', '351', '.', '+', '.', 'gene_name "A";'],
    ]
    regions = make_file_from_list(plus_rows)
    landmarks_file = rnamaps.make_landmarks_file(regions, 'exon-intron')
    plus_expected = [
        ['chr1', '200', '201', 'A', '.', '+'],
    ]
    self.assertEqual(make_list_from_file(landmarks_file), plus_expected)

    # Minus strand: expect the intron-exon landmark at 199-200.
    minus_rows = [
        ['chr1', '.', 'CDS', '150', '200', '.', '-', '.', 'gene_name "A";'],
        ['chr1', '.', 'intron', '201', '351', '.', '-', '.', 'gene_name "A";'],
    ]
    regions = make_file_from_list(minus_rows)
    landmarks_file = rnamaps.make_landmarks_file(regions, 'intron-exon')
    minus_expected = [
        ['chr1', '199', '200', 'A', '.', '-'],
    ]
    self.assertEqual(make_list_from_file(landmarks_file), minus_expected)
def setUp(self):
    """Create the temporary paths and small input files shared by the tests."""
    warnings.simplefilter("ignore", ResourceWarning)

    # Temporary file names and directories to use for output:
    self.tmp1 = get_temp_file_name()
    self.tmp2 = get_temp_file_name()
    self.dir = get_temp_dir()
    self.dir2 = get_temp_dir()

    xlink_rows = [
        ['1', '16', '17', '.', '5', '+'],
        ['1', '14', '15', '.', '5', '+'],
        ['1', '15', '16', '.', '5', '+'],
    ]
    self.cross_links = make_file_from_list(xlink_rows, extension='bed')

    peak_rows = [
        ['1', '15', '16', '.', '15', '+'],
    ]
    self.peaks = make_file_from_list(peak_rows)

    annotation_rows = [
        ['1', '.', 'CDS', '10', '20', '.', '+', '.', 'biotype "A";'],
        ['1', '.', 'ncRNA', '10', '20', '.', '+', '.', 'biotype "A";'],
        ['1', '.', 'CDS', '10', '20', '.', '+', '.', 'biotype "A";'],
        ['1', '.', 'CDS', '10', '20', '.', '+', '.', 'biotype "B";'],
        ['1', '.', 'CDS', '10', '20', '.', '-', '.', 'biotype "C";'],
        ['1', '.', 'CDS', '12', '18', '.', '+', '.', 'biotype "A";'],
        ['1', '.', 'CDS', '30', '40', '.', '+', '.', 'biotype "D";'],
    ]
    self.annotation = make_file_from_list(annotation_rows)

    gtf_rows = [
        ['1', '.', 'gene', '10', '20', '.', '+', '.', 'gene_id "A";'],
        ['1', '.', 'transcript', '10', '20', '.', '+', '.',
         'gene_id "A"; transcript_id "AA";'],
        ['1', '.', 'exon', '10', '20', '.', '+', '.',
         'gene_id "A"; transcript_id "AA"; exon_number "1";'],
    ]
    self.gtf = make_file_from_list(gtf_rows)

    bam_spec = {
        'chromosomes': [
            ('1', 3000),
            ('2', 2000),
        ],
        'segments': [
            ('name3:rbc:CCCC:', 0, 0, 100, 20, [(0, 100)], {'NH': 1}),
            ('name4:ABC', 0, 0, 300, 20, [(0, 200)], {'NH': 11}),
        ],
    }
    self.bam = make_bam_file(bam_spec, rnd_seed=0)
def test_all_good(self):
    """Check the records returned by ``segment.get_regions`` for one gene."""
    gene_attrs = 'gene_id "G2";'
    transcript_attrs = 'gene_id "G2"; transcript_id "T3";'
    gtf_in_data = list_to_intervals([
        ['1', '.', 'gene', '400', '500', '.', '+', '.', gene_attrs],
        ['1', '.', 'transcript', '400', '500', '.', '+', '.', transcript_attrs],
        ['1', '.', 'exon', '400', '430', '.', '+', '.',
         'gene_id "G2"; transcript_id "T3"; exon_number "1"'],
        ['1', '.', 'CDS', '410', '430', '.', '+', '.', transcript_attrs],
        ['1', '.', 'exon', '470', '500', '.', '+', '.',
         'gene_id "G2"; transcript_id "T3"; exon_number "2"'],
        ['1', '.', 'CDS', '470', '490', '.', '+', '.', transcript_attrs],
    ])
    gtf_in_file = make_file_from_list(intervals_to_list(gtf_in_data))

    # Only the path is needed, so the handle is closed straight away.
    with tempfile.NamedTemporaryFile(mode='w+', delete=False) as gtf_out:
        pass

    genome_file = make_file_from_list(
        [
            ['1', '2000'],
            ['MT', '500'],
        ],
        bedtool=False,
    )

    regions_path = segment.get_regions(gtf_in_file, gtf_out.name, genome_file)
    gtf_out_data = list_to_intervals(
        make_list_from_file(regions_path, fields_separator='\t'))

    intergenic_attrs = 'gene_id "."; transcript_id ".";'
    feature_attrs = 'gene_id "G2";transcript_id "T3"; biotype ".";'
    expected = list_to_intervals([
        ['1', '.', 'intergenic', '1', '399', '.', '+', '.', intergenic_attrs],
        ['1', '.', 'intergenic', '1', '2000', '.', '-', '.', intergenic_attrs],
        ['1', '.', 'transcript', '400', '500', '.', '+', '.', feature_attrs],
        ['1', '.', 'UTR5', '400', '409', '.', '+', '.',
         'gene_id "G2";exon_number "1";transcript_id "T3"; biotype ".";'],
        ['1', '.', 'gene', '400', '500', '.', '+', '.',
         'gene_id "G2"; biotype "[.]";'],
        ['1', '.', 'CDS', '410', '430', '.', '+', '.', feature_attrs],
        ['1', '.', 'intron', '431', '469', '.', '+', '.',
         'gene_id "G2"; transcript_id "T3"; biotype ".";'],
        ['1', '.', 'CDS', '470', '490', '.', '+', '.', feature_attrs],
        ['1', '.', 'UTR3', '491', '500', '.', '+', '.',
         'gene_id "G2";exon_number "2";transcript_id "T3"; biotype ".";'],
        ['1', '.', 'intergenic', '501', '2000', '.', '+', '.', intergenic_attrs],
        ['MT', '.', 'intergenic', '1', '500', '.', '+', '.', intergenic_attrs],
        ['MT', '.', 'intergenic', '1', '500', '.', '-', '.', intergenic_attrs],
    ])

    self.assertEqual(expected, gtf_out_data)
def get_summary_reports(self, annotation, cross_links):
    """
    Produce the summary reports for list inputs and read them back.

    Writes ``annotation`` and ``cross_links`` to temporary files, runs
    ``segment.summary_templates`` and ``summary.summary_reports`` with
    ``self.out_dir`` as the working directory, and returns the contents
    of the type, subtype and gene reports (in that order) as lists.
    """
    annotation_path = make_file_from_list(annotation)
    xlinks_path = make_file_from_list(cross_links)

    segment.summary_templates(annotation_path, self.out_dir)
    summary.summary_reports(annotation_path, xlinks_path, self.out_dir, self.out_dir)

    report_names = (
        segment.SUMMARY_TYPE,
        segment.SUMMARY_SUBTYPE,
        segment.SUMMARY_GENE,
    )
    return [
        make_list_from_file(os.path.join(self.out_dir, report_name), '\t')
        for report_name in report_names
    ]
def test_complement(self):
    """Check the plus-strand intergenic records from ``segment._complement``."""
    genome_file = make_file_from_list(
        [
            ['1', '2000'],
            ['2', '1000'],
            ['MT', '500'],
        ],
        bedtool=False,
    )
    genes = list_to_intervals([
        ['1', '.', 'gene1', '200', '400', '.', '+', '.', '.'],
        ['1', '.', 'gene2', '300', '600', '.', '+', '.', '.'],
        ['1', '.', 'gene3', '200', '500', '.', '+', '.', '.'],
        ['2', '.', 'gene4', '100', '200', '.', '+', '.', '.'],
        ['2', '.', 'gene5', '100', '300', '.', '-', '.', '.'],
    ])

    complement_path = segment._complement(genes, genome_file, '+')
    complement = make_list_from_file(complement_path, fields_separator='\t')

    # Expected gaps as (chromosome, start, stop); the expected IDs are
    # numbered consecutively from P00000 in this order.
    gaps = [
        ('1', '1', '199'),
        ('1', '601', '2000'),
        ('2', '1', '99'),
        ('2', '201', '1000'),
        ('MT', '1', '500'),
    ]
    empty_col8 = 'ID "inter%s"; gene_id "."; transcript_id ".";'
    expected = [
        [chrom, '.', 'intergenic', start, stop, '.', '+', '.',
         empty_col8 % 'P{:05d}'.format(index)]
        for index, (chrom, start, stop) in enumerate(gaps)
    ]

    self.assertEqual(complement, expected)
def test_negative_strand(self):
    """
    Whole read is in single transcript, single segment.

    But the segment borders on intergenic (downstream). The annotation is
    ``self.gtf_data`` with every record flipped to the negative strand.
    """
    gtf_neg_data = []
    for record in intervals_to_list(self.gtf_data):
        # Index 6 holds the strand; replace it and keep everything else.
        gtf_neg_data.append(record[:6] + ['-'] + record[7:])
    gtf_neg = make_file_from_list(gtf_neg_data)

    bam_spec = {
        'chromosomes': [('1', 1000)],
        'segments': [
            # (qname, flag, refname, pos, mapq, cigar, tags)
            ('name2:rbc:CCCC', 16, 0, 549, 255, [(0, 30)], {'NH': 1}),
        ],
    }
    bam = make_bam_file(bam_spec)

    expected = [
        ['RNAmap', 'type', 'position', 'all', 'explicit'],
        ['CDS-intergenic', '20', '0.5', '0'],
        ['intergenic-CDS', '-80', '0.5', '0'],
    ]

    rnamaps.run(
        bam,
        gtf_neg,
        self.out,
        self.strange,
        self.cross_tr,
        mismatches=1,
        implicit_handling='split',
    )

    self.assertEqual(expected, make_list_from_file(self.out))
def test_segment(self):
    """``iCount segment`` should exit with 0, with and without progress reporting."""
    fai = make_file_from_list(
        [
            ['1', '2000'],
            ['MT', '500'],
        ],
        bedtool=False,
    )

    positional = ['iCount', 'segment', self.gtf, self.tmp1, fai]
    # '-S 40' suppresses messages below ERROR level.
    quiet = ['-S', '40']

    command_basic = positional + quiet
    command_full = positional + ['--report_progress'] + quiet

    for command in (command_basic, command_full):
        self.assertEqual(subprocess.call(command), 0)
def test_run(self):
    """Run ``peaks.run`` and compare both the peaks and the scores output."""
    annotation_rows = [
        ['1', '.', 'gene', '10', '20', '.', '+', '.',
         'gene_name "A"; gene_id "1";'],
        ['1', '.', 'transcript', '10', '20', '.', '+', '.',
         'gene_name "B"; gene_id "1";'],
        ['2', '.', 'CDS', '10', '20', '.', '+', '.',
         'gene_name "C"; gene_id "1";'],
    ]
    site_rows = [
        ['1', '14', '15', '.', '3', '+'],
        ['1', '16', '17', '.', '5', '+'],
        ['2', '16', '17', '.', '5', '+'],
    ]
    fin_annotation = make_file_from_list(annotation_rows)
    fin_sites = make_file_from_list(site_rows)
    fout_peaks = get_temp_file_name(extension='.bed.gz')
    fout_scores = get_temp_file_name(extension='.tsv.gz')

    peaks.run(fin_annotation, fin_sites, fout_peaks, scores=fout_scores)

    out_peaks = make_list_from_file(fout_peaks, fields_separator='\t')
    # The first row of the scores file is a header; drop it.
    out_scores = make_list_from_file(fout_scores, fields_separator='\t')[1:]

    expected_peaks = [
        ['1', '14', '15', 'A-1', '3', '+'],
        ['1', '16', '17', 'A-1', '5', '+'],
    ]
    expected_scores = [
        ['1', '14', '+', 'A', '1', '3', '8', '0.036198'],
        ['1', '16', '+', 'A', '1', '5', '8', '0.036198'],
        ['2', '16', '+', 'not_annotated', 'not_annotated', '5',
         'not_calculated', '1'],
    ]

    self.assertEqual(out_peaks, expected_peaks)
    self.assertEqual(out_scores, expected_scores)
def test_limits_downstream(self):
    """Landmarks with too short downstream segment should not be used."""
    # Plus strand, exon-intron: the intron lies downstream of the landmark.
    plus_rows = [
        ['chr1', '.', 'CDS', '150', '200', '.', '+', '.', 'gene_name "A";'],
        ['chr1', '.', 'intron', '201', '350', '.', '+', '.', 'gene_name "A";'],
    ]
    regions = make_file_from_list(plus_rows)
    found = landmark.make_single_type_landmarks(regions, 'exon-intron')
    found = [[str(field) for field in item] for item in found]
    self.assertEqual(found, [])

    # Minus strand, intron-exon: the CDS lies downstream of the landmark.
    minus_rows = [
        ['chr1', '.', 'CDS', '151', '200', '.', '-', '.', 'gene_name "A";'],
        ['chr1', '.', 'intron', '201', '351', '.', '-', '.', 'gene_name "A";'],
    ]
    regions = make_file_from_list(minus_rows)
    found = landmark.make_single_type_landmarks(regions, 'intron-exon')
    found = [[str(field) for field in item] for item in found]
    self.assertEqual(found, [])
def test_no_landmark(self):
    """Landmark is missing on this chromosome / strand."""
    xlink_rows = [
        ['chrX', '22', '23', '.', '3', '+'],
    ]
    xlinks = make_file_from_list(xlink_rows)

    result = rnamaps.compute_distances(self.landmarks, xlinks, 'exon-intron')
    distances, total_cdna = result

    # The score is still counted, but no distance can be assigned.
    self.assertEqual(total_cdna, 3)
    self.assertEqual(distances, {})
def test_basic(self):
    """Check the intervals written by ``region.construct_borders``."""
    attrs = 'biotype "A"; gene_name "X";'
    segmentation = [
        # Transcript #1
        ['1', '.', 'ncRNA', '1', '10', '.', '+', '.', attrs],
        ['1', '.', 'intron', '11', '20', '.', '+', '.', attrs],
        ['1', '.', 'CDS', '21', '30', '.', '+', '.', attrs],
        ['1', '.', 'UTR3', '31', '40', '.', '+', '.', attrs],
        # Transcript #2
        ['1', '.', 'CDS', '5', '14', '.', '+', '.', attrs],
        ['1', '.', 'intron', '15', '24', '.', '+', '.', attrs],
        ['1', '.', 'CDS', '25', '34', '.', '+', '.', attrs],
        # Also negative strand:
        ['1', '.', 'CDS', '3', '32', '.', '-', '.', attrs],
    ]
    expected = [
        ['1', '0', '4', '.', '.', '+'],
        ['1', '4', '10', '.', '.', '+'],
        ['1', '10', '14', '.', '.', '+'],
        ['1', '14', '20', '.', '.', '+'],
        ['1', '20', '24', '.', '.', '+'],
        ['1', '24', '30', '.', '.', '+'],
        ['1', '30', '34', '.', '.', '+'],
        ['1', '34', '40', '.', '.', '+'],
        ['1', '2', '32', '.', '.', '-'],
    ]

    def by_position(row):
        """Sort key: chrom, strand, then numeric start and stop."""
        return (row[0], row[-1], int(row[1]), int(row[2]))

    segmentation_file = make_file_from_list(segmentation)
    borders_file = region.construct_borders(BedTool(segmentation_file))
    results = make_list_from_file(borders_file, fields_separator='\t')

    self.assertEqual(expected, sorted(results, key=by_position))
def _make_types_length(annotation, subtype='biotype', excluded_types=None):
    """
    Run ``summary.make_types_length_file`` on list input.

    ``annotation`` is written to a temporary file, a fixed chromosome-length
    file (chromosomes 1, 6 and 20, each of length 100) is supplied, and the
    file named by the first element of the function's return value is read
    back as a list of tab-separated rows.
    """
    annotation_path = make_file_from_list(annotation)
    output_path = get_temp_file_name()
    chrom_lengths = [
        ['1', '100'],
        ['6', '100'],
        ['20', '100'],
    ]
    fai = make_file_from_list(bedtool=False, data=chrom_lengths)

    # Only the first returned element is needed here.
    result, _ = summary.make_types_length_file(
        annotation_path,
        fai,
        output_path,
        subtype=subtype,
        excluded_types=excluded_types,
    )
    return make_list_from_file(result, fields_separator='\t')
def setUp(self):
    """Write the landmarks file shared by the tests in this case."""
    warnings.simplefilter("ignore", ResourceWarning)

    landmark_rows = [
        ['chr1', '10', '11', 'G1', '.', '+'],
        ['chr1', '20', '21', 'G1', '.', '+'],
        ['chr1', '20', '21', 'G2', '.', '-'],
        ['chr2', '10', '11', 'G3', '.', '+'],
    ]
    self.landmarks = make_file_from_list(landmark_rows)
def merge_bed_wrapper(data):
    """
    Run ``merge_bed`` on list input and return its output as a list.

    Each element of ``data`` is the list representation of one input file.
    The elements are written to temporary files, merged into a fresh
    temporary output file, and that file is read back as tab-separated rows.
    """
    input_paths = [make_file_from_list(rows) for rows in data]
    merged_path = get_temp_file_name()
    merge_bed(merged_path, input_paths)
    return make_list_from_file(merged_path, fields_separator='\t')
def test_all_good(self):
    """Check the file written by ``segment.get_segments`` and its side files."""
    gene_attrs = 'gene_id "G2";'
    transcript_attrs = 'gene_id "G2"; transcript_id "T3";'
    gtf_in_data = list_to_intervals([
        ['1', '.', 'gene', '400', '500', '.', '+', '.', gene_attrs],
        ['1', '.', 'transcript', '400', '500', '.', '+', '.', transcript_attrs],
        ['1', '.', 'exon', '400', '430', '.', '+', '.',
         'gene_id "G2"; transcript_id "T3"; exon_number "1"'],
        ['1', '.', 'CDS', '410', '430', '.', '+', '.', transcript_attrs],
        ['1', '.', 'exon', '470', '500', '.', '+', '.',
         'gene_id "G2"; transcript_id "T3"; exon_number "2"'],
        ['1', '.', 'CDS', '470', '490', '.', '+', '.', transcript_attrs],
    ])
    gtf_in_file = make_file_from_list(intervals_to_list(gtf_in_data))
    gtf_out = get_temp_file_name()
    genome_file = make_file_from_list(
        [
            ['1', '2000'],
            ['MT', '500'],
        ],
        bedtool=False,
    )

    segment.get_segments(gtf_in_file, gtf_out, genome_file)

    written_rows = make_list_from_file(gtf_out, fields_separator='\t')
    gtf_out_data = list_to_intervals(written_rows)

    intergenic_attrs = 'gene_id "."; transcript_id ".";'
    feature_attrs = 'gene_id "G2";transcript_id "T3"; biotype ".";'
    expected = list_to_intervals([
        ['1', '.', 'intergenic', '1', '399', '.', '+', '.', intergenic_attrs],
        ['1', '.', 'intergenic', '1', '2000', '.', '-', '.', intergenic_attrs],
        ['1', '.', 'transcript', '400', '500', '.', '+', '.', feature_attrs],
        ['1', '.', 'UTR5', '400', '409', '.', '+', '.',
         'gene_id "G2";exon_number "1";transcript_id "T3"; biotype ".";'],
        ['1', '.', 'gene', '400', '500', '.', '+', '.',
         'gene_id "G2"; biotype "[.]";'],
        ['1', '.', 'CDS', '410', '430', '.', '+', '.', feature_attrs],
        ['1', '.', 'intron', '431', '469', '.', '+', '.',
         'gene_id "G2"; transcript_id "T3"; biotype ".";'],
        ['1', '.', 'CDS', '470', '490', '.', '+', '.', feature_attrs],
        ['1', '.', 'UTR3', '491', '500', '.', '+', '.',
         'gene_id "G2";exon_number "2";transcript_id "T3"; biotype ".";'],
        ['1', '.', 'intergenic', '501', '2000', '.', '+', '.', intergenic_attrs],
        ['MT', '.', 'intergenic', '1', '500', '.', '+', '.', intergenic_attrs],
        ['MT', '.', 'intergenic', '1', '500', '.', '-', '.', intergenic_attrs],
    ])
    self.assertEqual(expected, gtf_out_data)

    # Two additional files are expected next to the main output file.
    out_dir = os.path.dirname(os.path.abspath(gtf_out))
    for side_file in (region.REGIONS_FILE, 'landmarks.bed.gz'):
        self.assertTrue(os.path.isfile(os.path.join(out_dir, side_file)))
def merge_bed_wrapper(data):
    """
    Run ``merge_bed`` on list input and return its output as a list.

    Each element of ``data`` is the list representation of one input file.
    The elements are written to temporary files and passed to ``merge_bed``
    together with a fresh temporary output path; the file named by the
    return value of ``merge_bed`` is read back as tab-separated rows.
    """
    files = [make_file_from_list(file_) for file_ in data]

    # Only the path is needed. Close the handle explicitly instead of
    # leaving it open until garbage collection (which raises a
    # ResourceWarning and keeps the file locked on some platforms).
    with tempfile.NamedTemporaryFile(delete=False) as handle:
        out_file = handle.name

    return make_list_from_file(merge_bed(out_file, files), fields_separator='\t')
def setUp(self):
    """Build the shared annotation (two genes, three transcripts) and output paths."""
    warnings.simplefilter("ignore", (ResourceWarning, ImportWarning))

    intergenic = 'gene_id "."; transcript_id ".";'
    t1 = 'gene_id "G1"; transcript_id "T1";'
    t2 = 'gene_id "G1"; transcript_id "T2";'
    t3 = 'gene_id "G2"; transcript_id "T3";'
    rows = [
        ['1', '.', 'intergenic', '1', '99', '.', '+', '.', intergenic],
        # Gene #1:
        ['1', '.', 'gene', '100', '499', '.', '+', '.', 'gene_id "G1";'],
        # Transcript #1
        ['1', '.', 'transcript', '100', '249', '.', '+', '.', t1],
        ['1', '.', 'UTR5', '100', '149', '.', '+', '.',
         'gene_id "G1"; transcript_id "T1"; exon_number "1";'],
        ['1', '.', 'intron', '150', '199', '.', '+', '.', t1],
        ['1', '.', 'CDS', '200', '229', '.', '+', '.',
         'gene_id "G1"; transcript_id "T1"; exon_number "2";'],
        ['1', '.', 'intron', '230', '239', '.', '+', '.', t1],
        ['1', '.', 'UTR3', '240', '249', '.', '+', '.',
         'gene_id "G1"; transcript_id "T1"; exon_number "3";'],
        # Transcript #2
        ['1', '.', 'transcript', '240', '499', '.', '+', '.', t2],
        ['1', '.', 'CDS', '240', '299', '.', '+', '.',
         'gene_id "G1"; transcript_id "T2"; exon_number "1";'],
        ['1', '.', 'intron', '300', '399', '.', '+', '.', t2],
        ['1', '.', 'CDS', '400', '499', '.', '+', '.',
         'gene_id "G1"; transcript_id "T2"; exon_number "2";'],
        # intergenic
        ['1', '.', 'intergenic', '500', '599', '.', '+', '.', intergenic],
        # Gene #2:
        ['1', '.', 'gene', '600', '999', '.', '+', '.', 'gene_id "G2";'],
        # Transcript #3
        ['1', '.', 'transcript', '600', '799', '.', '+', '.', t3],
        ['1', '.', 'CDS', '600', '649', '.', '+', '.',
         'gene_id "G2"; transcript_id "T3"; exon_number "1";'],
        ['1', '.', 'intron', '650', '749', '.', '+', '.', t3],
        ['1', '.', 'CDS', '750', '799', '.', '+', '.',
         'gene_id "G2"; transcript_id "T3"; exon_number "2";'],
    ]
    self.gtf_data = list_to_intervals(rows)
    self.gtf = make_file_from_list(intervals_to_list(self.gtf_data))

    # Output paths handed to the code under test:
    self.strange = get_temp_file_name()
    self.cross_tr = get_temp_file_name()
    self.out = get_temp_file_name()
def test_no_required_attributes(self):
    """
    Raise error if transcript_id attribute is not present.
    """
    rows = [
        ['1', '.', 'transcript', '500', '600', '.', '+', '.', 'gene_id "G1";'],
    ]
    gtf = make_file_from_list(rows)

    message = "First element in gene content is neither gene or transcript!"
    with self.assertRaisesRegex(Exception, message):
        # list() forces the returned iterable to be fully consumed.
        list(segment._get_gene_content(gtf, ['1', 'MT']))
def test_basic(self):
    """A CDS/intron pair yields exactly one named landmark on either strand."""
    # Plus strand: expect the exon-intron landmark at 200-201.
    plus_rows = [
        ['chr1', '.', 'CDS', '150', '200', '.', '+', '.', 'gene_name "A";'],
        ['chr1', '.', 'intron', '201', '351', '.', '+', '.', 'gene_name "A";'],
    ]
    regions = make_file_from_list(plus_rows)
    found = landmark.make_single_type_landmarks(regions, 'exon-intron')
    found = [[str(field) for field in item] for item in found]
    plus_expected = [
        ['chr1', '200', '201', 'exon-intron;A', '.', '+'],
    ]
    self.assertEqual(found, plus_expected)

    # Minus strand: expect the intron-exon landmark at 199-200.
    minus_rows = [
        ['chr1', '.', 'CDS', '150', '200', '.', '-', '.', 'gene_name "A";'],
        ['chr1', '.', 'intron', '201', '351', '.', '-', '.', 'gene_name "A";'],
    ]
    regions = make_file_from_list(minus_rows)
    found = landmark.make_single_type_landmarks(regions, 'intron-exon')
    found = [[str(field) for field in item] for item in found]
    minus_expected = [
        ['chr1', '199', '200', 'intron-exon;A', '.', '-'],
    ]
    self.assertEqual(found, minus_expected)
def setUp(self):
    """Write a small BED input file and reserve a bedgraph output path."""
    warnings.simplefilter("ignore", ResourceWarning)

    rows = [
        ['1', '4', '5', '.', '5', '+'],
        ['1', '5', '6', '.', '1', '+'],
        ['1', '5', '6', '.', '1', '-'],
        ['2', '5', '6', '.', '3', '+'],
    ]
    self.bed = make_file_from_list(rows, extension='bed')
    self.bedgraph = get_temp_file_name(extension='bedgraph')
def _make_summary_report(annotation, cross_links, chrom_lengths, subtype='biotype', excluded_types=None):
    """
    Run function `make_summary_report` with input/output data as lists.

    Parameters
    ----------
    annotation : list
        List representation of the annotation file.
    cross_links : list
        List representation of the cross-links file.
    chrom_lengths : list
        List representation of the chromosome-length file.
    subtype : str
        Passed through unchanged as the ``subtype`` keyword argument.
    excluded_types : list or None
        Passed through unchanged as the ``excluded_types`` keyword argument.

    Returns
    -------
    list
        Content of the report file, split on tabs.

    """
    annotation_file = make_file_from_list(annotation)
    cross_links_file = make_file_from_list(cross_links)
    chrom_length_file = make_file_from_list(chrom_lengths, bedtool=False)

    # Only the path is needed. Close the handle explicitly instead of
    # leaving it open until garbage collection (which raises a
    # ResourceWarning and keeps the file locked on some platforms).
    with tempfile.NamedTemporaryFile(delete=False) as handle:
        out_file = handle.name

    summary.make_summary_report(
        annotation_file,
        cross_links_file,
        out_file,
        chrom_length_file,
        subtype=subtype,
        excluded_types=excluded_types,
    )
    return make_list_from_file(out_file, fields_separator='\t')
def test_basic(self):
    """One cross-link near a landmark gives a single distance entry."""
    xlink_rows = [
        ['chr1', '12', '13', '.', '3', '+'],
    ]
    xlinks = make_file_from_list(xlink_rows)

    result = rnamaps.compute_distances(self.landmarks, xlinks, 'exon-intron')
    distances, total_cdna = result

    expected_distances = {
        'chr1__+__10__G1': {2: 3},
    }
    self.assertEqual(total_cdna, 3)
    self.assertEqual(distances, expected_distances)
def test_basic(self):
    """
    Check the records written by ``region.merge_regions``.

    The first two input records are expected to come out as one record
    spanning 1-20; the remaining three records are expected unchanged.
    """
    id1_attrs = 'biotype "lncRNA";gene_id "id1";'
    id2_attrs = 'biotype "lncRNA";gene_id "id2";'
    nonmerged = make_file_from_list([
        ['1', '.', 'UTR3', '1', '10', '.', '+', '.', id1_attrs],
        ['1', '.', 'UTR3', '11', '20', '.', '+', '.', id1_attrs],
        ['1', '.', 'UTR3', '21', '30', '.', '+', '.', id2_attrs],
        ['1', '.', 'UTR3', '31', '40', '.', '+', '.', id1_attrs],
        ['1', '.', 'UTR3', '31', '40', '.', '-', '.', id1_attrs],
    ])
    expected = [
        ['1', '.', 'UTR3', '1', '20', '.', '+', '.', id1_attrs],
        ['1', '.', 'UTR3', '21', '30', '.', '+', '.', id2_attrs],
        ['1', '.', 'UTR3', '31', '40', '.', '+', '.', id1_attrs],
        ['1', '.', 'UTR3', '31', '40', '.', '-', '.', id1_attrs],
    ]

    region.merge_regions(nonmerged, self.tmp)
    results = make_list_from_file(self.tmp, fields_separator='\t')

    # zip() stops at the shorter sequence, so without this check an empty
    # or truncated output file would pass the loop below unnoticed.
    self.assertEqual(len(results), len(expected))

    # Since order of attrs can be arbitrary, equality checks are more complex:
    for res, exp in zip(results, expected):
        self.assertEqual(res[:8], exp[:8])
        self.assertEqual(
            ';'.join(sorted(res[8].split(';'))),
            ';'.join(sorted(exp[8].split(';'))),
        )
def test_all_good(self):
    """
    Check the per-gene content yielded by ``segment._get_gene_content``.

    * second gene has no 'gene' interval - but it is present in output as it
      should
    * last interval is on chromosome 2, but it is not in the output
    """
    t1 = 'gene_id "G1"; transcript_id "T1";'
    t2 = 'gene_id "G1"; transcript_id "T2";'
    t3 = 'gene_id "G2"; transcript_id "T3";'
    gtf_data = list_to_intervals([
        ['1', '.', 'gene', '100', '300', '.', '+', '.', 'gene_id "G1";'],
        ['1', '.', 'transcript', '100', '250', '.', '+', '.', t1],
        ['1', '.', 'exon', '100', '150', '.', '+', '.',
         'gene_id "G1"; transcript_id "T1"; exon_number "1";'],
        ['1', '.', 'exon', '200', '250', '.', '+', '.',
         'gene_id "G1"; transcript_id "T1"; exon_number "2";'],
        ['1', '.', 'transcript', '150', '300', '.', '+', '.', t2],
        ['1', '.', 'exon', '150', '200', '.', '+', '.',
         'gene_id "G1"; transcript_id "T2"; exon_number "1";'],
        ['1', '.', 'exon', '250', '300', '.', '+', '.',
         'gene_id "G1"; transcript_id "T2"; exon_number "2";'],
        ['1', '.', 'transcript', '400', '500', '.', '+', '.', t3],
        ['1', '.', 'exon', '400', '430', '.', '+', '.',
         'gene_id "G2"; transcript_id "T3"; exon_number "1"'],
        ['1', '.', 'CDS', '410', '430', '.', '+', '.', t3],
        ['1', '.', 'exon', '470', '500', '.', '+', '.',
         'gene_id "G2"; transcript_id "T3"; exon_number "2"'],
        ['1', '.', 'CDS', '470', '490', '.', '+', '.', t3],
        ['2', '.', 'CDS', '470', '490', '.', '+', '.',
         'gene_id "G3"; transcript_id "T4";'],
    ])
    gtf = make_file_from_list(intervals_to_list(gtf_data))

    contents = list(
        segment._get_gene_content(gtf, ['1', 'MT'], report_progress=True))
    # Exactly two genes are expected; unpacking fails otherwise.
    gene1, gene2 = contents

    expected1 = {
        'gene': gtf_data[0],
        'T1': gtf_data[1:4],
        'T2': gtf_data[4:7],
    }
    # The input has no 'gene' record for G2, so the expected one is built here.
    extra_gene = create_interval_from_list(
        ['1', '.', 'gene', '400', '500', '.', '+', '.', 'gene_id "G2";'])
    expected2 = {
        'gene': extra_gene,
        'T3': gtf_data[7:-1],
    }

    self.assertEqual(gene1, expected1)
    self.assertEqual(gene2, expected2)
def setUp(self):
    """Build a tiny annotation (one gene, two transcripts) for the tests."""
    warnings.simplefilter("ignore", (ResourceWarning, ImportWarning))

    intergenic = 'gene_id "."; transcript_id ".";'
    t1 = 'gene_id "G1"; transcript_id "T1";'
    t2 = 'gene_id "G1"; transcript_id "T2";'
    rows = [
        ['1', '.', 'intergenic', '1', '2', '.', '+', '.', intergenic],
        # Gene #1:
        ['1', '.', 'gene', '3', '7', '.', '+', '.', 'gene_id "G1";'],
        # Transcript #1
        ['1', '.', 'transcript', '3', '6', '.', '+', '.', t1],
        ['1', '.', 'CDS', '3', '3', '.', '+', '.',
         'gene_id "G1"; transcript_id "T1"; exon_number "2";'],
        ['1', '.', 'intron', '4', '6', '.', '+', '.', t1],
        ['1', '.', 'UTR3', '5', '6', '.', '+', '.',
         'gene_id "G1"; transcript_id "T1"; exon_number "3";'],
        # Transcript #2
        ['1', '.', 'transcript', '4', '7', '.', '+', '.', t2],
        ['1', '.', 'ncRNA', '4', '5', '.', '+', '.',
         'gene_id "G1"; transcript_id "T2"; exon_number "1";'],
        ['1', '.', 'intron', '6', '6', '.', '+', '.', t2],
        ['1', '.', 'ncRNA', '7', '7', '.', '+', '.',
         'gene_id "G1"; transcript_id "T2"; exon_number "2";'],
        # intergenic
        ['1', '.', 'intergenic', '8', '9', '.', '+', '.', intergenic],
    ]
    self.gtf_data = list_to_intervals(rows)
    self.gtf = make_file_from_list(intervals_to_list(self.gtf_data))
def test_basic(self):
    """CDS-intron-CDS regions yield an exon-intron and an intron-exon landmark."""
    region_rows = [
        ['chr1', '.', 'CDS', '150', '200', '.', '+', '.', 'gene_name "A";'],
        ['chr1', '.', 'intron', '201', '400', '.', '+', '.', 'gene_name "A";'],
        ['chr1', '.', 'CDS', '401', '600', '.', '+', '.', 'gene_name "A";'],
    ]
    regions = make_file_from_list(region_rows)
    landmarks = get_temp_file_name(extension='bed')

    landmark.make_landmarks(regions, landmarks)

    expected = [
        ['chr1', '200', '201', 'exon-intron;A', '.', '+'],
        ['chr1', '400', '401', 'intron-exon;A', '.', '+'],
    ]
    self.assertEqual(make_list_from_file(landmarks), expected)