示例#1
0
    def test_clusters(self):
        """Run ``clusters.run`` on small fixtures and compare its output rows."""
        # Six-column rows (presumably BED6: chrom, start, end, name, score,
        # strand -- confirm against the helpers).
        site_rows = [
            ['1', '1', '2', '.', '1', '+'],
            ['1', '2', '3', '.', '1', '+'],
            ['1', '3', '4', '.', '1', '+'],
            ['1', '4', '5', '.', '2', '+'],
            ['1', '4', '5', '.', '1', '-'],
            ['1', '5', '6', '.', '1', '+'],
            ['1', '6', '7', '.', '1', '-'],
            ['1', '7', '8', '.', '1', '-'],
            ['1', '10', '11', '.', '1', '+'],
            ['1', '11', '12', '.', '2', '+'],
            ['1', '12', '13', '.', '1', '+'],
        ]
        peak_rows = [
            ['1', '4', '5', 'cl1', '1', '+'],
            ['1', '4', '5', 'cl2', '1', '-'],
            ['1', '5', '6', 'cl3', '1', '+'],
            ['1', '11', '12', 'cl4', '2', '+'],
        ]
        expected = [
            ['1', '2', '6', 'cl1,cl3', '5', '+'],
            ['1', '4', '7', 'cl2', '2', '-'],
            ['1', '10', '13', 'cl4', '4', '+'],
        ]

        sites_path = make_file_from_list(site_rows)
        peaks_path = make_file_from_list(peak_rows)
        clusters_path = get_temp_file_name()

        clusters.run(sites_path, peaks_path, clusters_path, dist=3, slop=2)
        observed = make_list_from_file(clusters_path, fields_separator='\t')

        self.assertEqual(expected, observed)
示例#2
0
def template(cross_links, annotation, subtype='biotype',
             excluded_types=None):
    """
    Run ``annotate.annotate_cross_links`` on list-based inputs.

    The inputs are given as lists and written to temporary files on the fly,
    so tests do not need to ship many small fixture files (or one big shared
    file, which would go against test isolation).

    Parameters
    ----------
    cross_links : list
        List representation of cross-links file.
    annotation : list
        List representation of annotation file.
    subtype : str
        Forwarded unchanged to ``annotate.annotate_cross_links``.
    excluded_types : list or None
        Forwarded unchanged to ``annotate.annotate_cross_links``.

    Returns
    -------
    list
        List representation of the output file, read with tab as the field
        separator.

    """
    xlinks_path = make_file_from_list(cross_links, extension='bed.gz')
    gtf_path = make_file_from_list(annotation, extension='gtf.gz')
    result_path = get_temp_file_name(extension='bed.gz')

    annotate.annotate_cross_links(
        gtf_path,
        xlinks_path,
        result_path,
        subtype=subtype,
        excluded_types=excluded_types,
    )

    return make_list_from_file(result_path, fields_separator='\t')
示例#3
0
    def test_rnamaps(self):
        """Invoke the ``iCount rnamaps`` command and expect exit code 0."""
        # (feature type, start, end, strand) of every annotation row; all rows
        # are on chromosome '1' and carry an empty attribute column.
        segments = [
            ('intergenic', '1', '100', '+'),
            ('intergenic', '1', '610', '-'),
            ('UTR5', '101', '160', '+'),
            ('intron', '161', '320', '+'),
            ('CDS', '321', '380', '+'),
            ('intron', '381', '540', '+'),
            ('UTR3', '541', '600', '+'),
            ('intergenic', '601', '1000', '+'),
            ('ncRNA', '611', '700', '-'),
            ('intron', '701', '800', '-'),
            ('ncRNA', '801', '900', '-'),
            ('intergenic', '901', '1000', '-'),
        ]
        regions = make_file_from_list([
            ['1', '.', feature, start, end, '.', strand, '.', '']
            for feature, start, end, strand in segments
        ])

        # (start, end, strand) of every cross-link row; the fifth column is
        # always '1'.
        sites = [
            ('120', '121', '+'),
            ('350', '351', '+'),
            ('550', '551', '+'),
            ('750', '751', '-'),
        ]
        cross_links = make_file_from_list([
            ['1', start, end, '.', '1', strand]
            for start, end, strand in sites
        ])

        command_basic = ['iCount', 'rnamaps', cross_links, regions]
        # Suppress messages below ERROR level.
        command_basic += ['-S', '40']

        exit_code = subprocess.call(command_basic)
        self.assertEqual(exit_code, 0)
示例#4
0
    def test_limits_downstream(self):
        """Expect no landmarks when a flanking segment is too short to be used."""
        attrs = 'gene_name "A";'
        # (landmark type, strand, CDS start, intron end) for each scenario.
        scenarios = [
            ('exon-intron', '+', '150', '350'),
            ('intron-exon', '-', '151', '351'),
        ]
        for kind, strand, cds_start, intron_end in scenarios:
            regions = make_file_from_list([
                ['chr1', '.', 'CDS', cds_start, '200', '.', strand, '.', attrs],
                ['chr1', '.', 'intron', '201', intron_end, '.', strand, '.', attrs],
            ])
            landmarks_path = rnamaps.make_landmarks_file(regions, kind)
            self.assertEqual(make_list_from_file(landmarks_path), [])
示例#5
0
    def test_basic(self):
        """Check the landmark produced for a CDS/intron pair on each strand."""

        def landmarks_for(strand, kind):
            # Same two regions for both strands; only the strand column and
            # the requested landmark type differ.
            regions = make_file_from_list([
                ['chr1', '.', 'CDS', '150', '200', '.', strand, '.', 'gene_name "A";'],
                ['chr1', '.', 'intron', '201', '351', '.', strand, '.', 'gene_name "A";'],
            ])
            landmarks_path = rnamaps.make_landmarks_file(regions, kind)
            return make_list_from_file(landmarks_path)

        self.assertEqual(
            landmarks_for('+', 'exon-intron'),
            [['chr1', '200', '201', 'A', '.', '+']],
        )
        self.assertEqual(
            landmarks_for('-', 'intron-exon'),
            [['chr1', '199', '200', 'A', '.', '-']],
        )
示例#6
0
    def setUp(self):
        """Prepare temporary output paths and small input fixtures."""
        warnings.simplefilter("ignore", ResourceWarning)

        # Output locations: two temporary files and two temporary directories.
        self.tmp1 = get_temp_file_name()
        self.tmp2 = get_temp_file_name()
        self.dir = get_temp_dir()
        self.dir2 = get_temp_dir()

        xlink_rows = [
            ['1', '16', '17', '.', '5', '+'],
            ['1', '14', '15', '.', '5', '+'],
            ['1', '15', '16', '.', '5', '+'],
        ]
        self.cross_links = make_file_from_list(xlink_rows, extension='bed')

        self.peaks = make_file_from_list([['1', '15', '16', '.', '15', '+']])

        annotation_rows = [
            ['1', '.', 'CDS', '10', '20', '.', '+', '.', 'biotype "A";'],
            ['1', '.', 'ncRNA', '10', '20', '.', '+', '.', 'biotype "A";'],
            ['1', '.', 'CDS', '10', '20', '.', '+', '.', 'biotype "A";'],
            ['1', '.', 'CDS', '10', '20', '.', '+', '.', 'biotype "B";'],
            ['1', '.', 'CDS', '10', '20', '.', '-', '.', 'biotype "C";'],
            ['1', '.', 'CDS', '12', '18', '.', '+', '.', 'biotype "A";'],
            ['1', '.', 'CDS', '30', '40', '.', '+', '.', 'biotype "D";'],
        ]
        self.annotation = make_file_from_list(annotation_rows)

        gene_attrs = 'gene_id "A";'
        transcript_attrs = 'gene_id "A"; transcript_id "AA";'
        exon_attrs = 'gene_id "A"; transcript_id "AA"; exon_number "1";'
        self.gtf = make_file_from_list([
            ['1', '.', 'gene', '10', '20', '.', '+', '.', gene_attrs],
            ['1', '.', 'transcript', '10', '20', '.', '+', '.', transcript_attrs],
            ['1', '.', 'exon', '10', '20', '.', '+', '.', exon_attrs],
        ])

        bam_spec = {
            'chromosomes': [('1', 3000), ('2', 2000)],
            'segments': [
                ('name3:rbc:CCCC:', 0, 0, 100, 20, [(0, 100)], {'NH': 1}),
                ('name4:ABC', 0, 0, 300, 20, [(0, 200)], {'NH': 11}),
            ],
        }
        self.bam = make_bam_file(bam_spec, rnd_seed=0)
示例#7
0
    def test_all_good(self):
        """Compare ``segment.get_regions`` output with the expected rows."""
        # Attribute strings that repeat in the input annotation.
        in_transcript = 'gene_id "G2"; transcript_id "T3";'
        gtf_in_data = list_to_intervals([
            ['1', '.', 'gene', '400', '500', '.', '+', '.', 'gene_id "G2";'],
            ['1', '.', 'transcript', '400', '500', '.', '+', '.', in_transcript],
            ['1', '.', 'exon', '400', '430', '.', '+', '.',
             'gene_id "G2"; transcript_id "T3"; exon_number "1"'],
            ['1', '.', 'CDS', '410', '430', '.', '+', '.', in_transcript],
            ['1', '.', 'exon', '470', '500', '.', '+', '.',
             'gene_id "G2"; transcript_id "T3"; exon_number "2"'],
            ['1', '.', 'CDS', '470', '490', '.', '+', '.', in_transcript],
        ])
        gtf_in_file = make_file_from_list(intervals_to_list(gtf_in_data))

        # Only the name of an existing, closed temporary file is needed.
        with tempfile.NamedTemporaryFile(mode='w+', delete=False) as gtf_out:
            pass

        chrom_sizes = [['1', '2000'], ['MT', '500']]
        genome_file = make_file_from_list(chrom_sizes, bedtool=False)

        regions_path = segment.get_regions(gtf_in_file, gtf_out.name, genome_file)
        gtf_out_data = list_to_intervals(
            make_list_from_file(regions_path, fields_separator='\t'))

        # Attribute strings that repeat in the expected output.
        no_gene = 'gene_id "."; transcript_id ".";'
        out_transcript = 'gene_id "G2";transcript_id "T3"; biotype ".";'
        expected = list_to_intervals([
            ['1', '.', 'intergenic', '1', '399', '.', '+', '.', no_gene],
            ['1', '.', 'intergenic', '1', '2000', '.', '-', '.', no_gene],
            ['1', '.', 'transcript', '400', '500', '.', '+', '.', out_transcript],
            ['1', '.', 'UTR5', '400', '409', '.', '+', '.',
             'gene_id "G2";exon_number "1";transcript_id "T3"; biotype ".";'],
            ['1', '.', 'gene', '400', '500', '.', '+', '.',
             'gene_id "G2"; biotype "[.]";'],
            ['1', '.', 'CDS', '410', '430', '.', '+', '.', out_transcript],
            ['1', '.', 'intron', '431', '469', '.', '+', '.',
             'gene_id "G2"; transcript_id "T3"; biotype ".";'],
            ['1', '.', 'CDS', '470', '490', '.', '+', '.', out_transcript],
            ['1', '.', 'UTR3', '491', '500', '.', '+', '.',
             'gene_id "G2";exon_number "2";transcript_id "T3"; biotype ".";'],
            ['1', '.', 'intergenic', '501', '2000', '.', '+', '.', no_gene],
            ['MT', '.', 'intergenic', '1', '500', '.', '+', '.', no_gene],
            ['MT', '.', 'intergenic', '1', '500', '.', '-', '.', no_gene],
        ])

        self.assertEqual(expected, gtf_out_data)
示例#8
0
    def get_summary_reports(self, annotation, cross_links):
        """
        Run the summary pipeline on list inputs and return the three reports.

        Writes ``annotation`` and ``cross_links`` to temporary files, calls
        ``segment.summary_templates`` and ``summary.summary_reports`` with
        ``self.out_dir``, and returns the type, subtype and gene summaries (in
        that order) as lists of rows.
        """
        annotation_path = make_file_from_list(annotation)
        xlinks_path = make_file_from_list(cross_links)

        segment.summary_templates(annotation_path, self.out_dir)
        summary.summary_reports(annotation_path, xlinks_path, self.out_dir, self.out_dir)

        report_names = (
            segment.SUMMARY_TYPE,
            segment.SUMMARY_SUBTYPE,
            segment.SUMMARY_GENE,
        )
        return [
            make_list_from_file(os.path.join(self.out_dir, report_name), '\t')
            for report_name in report_names
        ]
示例#9
0
    def test_complement(self):
        """Compare ``segment._complement`` output for strand '+' with expected rows."""
        chrom_sizes = [
            ['1', '2000'],
            ['2', '1000'],
            ['MT', '500'],
        ]
        genome_file = make_file_from_list(chrom_sizes, bedtool=False)

        genes = list_to_intervals([
            ['1', '.', 'gene1', '200', '400', '.', '+', '.', '.'],
            ['1', '.', 'gene2', '300', '600', '.', '+', '.', '.'],
            ['1', '.', 'gene3', '200', '500', '.', '+', '.', '.'],
            ['2', '.', 'gene4', '100', '200', '.', '+', '.', '.'],
            ['2', '.', 'gene5', '100', '300', '.', '-', '.', '.'],
        ])

        complement_path = segment._complement(genes, genome_file, '+')
        complement = make_list_from_file(complement_path, fields_separator='\t')

        attrs_template = 'ID "inter%s"; gene_id "."; transcript_id ".";'
        # (chromosome, start, end, ID suffix) of each expected intergenic row.
        gaps = [
            ('1', '1', '199', "P00000"),
            ('1', '601', '2000', "P00001"),
            ('2', '1', '99', "P00002"),
            ('2', '201', '1000', "P00003"),
            ('MT', '1', '500', "P00004"),
        ]
        expected = [
            [chrom, '.', 'intergenic', start, end, '.', '+', '.', attrs_template % suffix]
            for chrom, start, end, suffix in gaps
        ]

        self.assertEqual(complement, expected)
示例#10
0
    def test_negative_strand(self):
        """
        Read lies within a single transcript and a single segment, and that
        segment borders on intergenic (downstream).

        The annotation is ``self.gtf_data`` with column 7 (index 6) of every
        row replaced by '-'.
        """
        flipped_rows = [
            row[:6] + ['-'] + row[7:]
            for row in intervals_to_list(self.gtf_data)
        ]
        gtf_neg = make_file_from_list(flipped_rows)

        # Segment layout: (qname, flag, refname, pos, mapq, cigar, tags)
        read = ('name2:rbc:CCCC', 16, 0, 549, 255, [(0, 30)], {'NH': 1})
        bam = make_bam_file({
            'chromosomes': [('1', 1000)],
            'segments': [read],
        })

        expected = [
            ['RNAmap', 'type', 'position', 'all', 'explicit'],
            ['CDS-intergenic', '20', '0.5', '0'],
            ['intergenic-CDS', '-80', '0.5', '0'],
        ]

        rnamaps.run(
            bam,
            gtf_neg,
            self.out,
            self.strange,
            self.cross_tr,
            mismatches=1,
            implicit_handling='split',
        )
        observed = make_list_from_file(self.out)
        self.assertEqual(expected, observed)
示例#11
0
    def test_segment(self):
        """Invoke ``iCount segment`` with and without ``--report_progress``."""
        chrom_sizes = [['1', '2000'], ['MT', '500']]
        fai = make_file_from_list(chrom_sizes, bedtool=False)

        positional = ['iCount', 'segment', self.gtf, self.tmp1, fai]
        # Suppress messages below ERROR level.
        quiet = ['-S', '40']

        command_basic = positional + quiet
        command_full = positional + ['--report_progress'] + quiet

        self.assertEqual(subprocess.call(command_basic), 0)
        self.assertEqual(subprocess.call(command_full), 0)
示例#12
0
    def test_run(self):
        """Run ``peaks.run`` and compare both the peaks and the scores output."""
        annotation_rows = [
            ['1', '.', 'gene', '10', '20', '.', '+', '.', 'gene_name "A"; gene_id "1";'],
            ['1', '.', 'transcript', '10', '20', '.', '+', '.', 'gene_name "B"; gene_id "1";'],
            ['2', '.', 'CDS', '10', '20', '.', '+', '.', 'gene_name "C"; gene_id "1";'],
        ]
        site_rows = [
            ['1', '14', '15', '.', '3', '+'],
            ['1', '16', '17', '.', '5', '+'],
            ['2', '16', '17', '.', '5', '+'],
        ]
        fin_annotation = make_file_from_list(annotation_rows)
        fin_sites = make_file_from_list(site_rows)

        fout_peaks = get_temp_file_name(extension='.bed.gz')
        fout_scores = get_temp_file_name(extension='.tsv.gz')

        peaks.run(fin_annotation, fin_sites, fout_peaks, scores=fout_scores)

        out_peaks = make_list_from_file(fout_peaks, fields_separator='\t')
        # The first row of the scores file is a header; drop it.
        out_scores = make_list_from_file(fout_scores, fields_separator='\t')[1:]

        self.assertEqual(out_peaks, [
            ['1', '14', '15', 'A-1', '3', '+'],
            ['1', '16', '17', 'A-1', '5', '+'],
        ])
        self.assertEqual(out_scores, [
            ['1', '14', '+', 'A', '1', '3', '8', '0.036198'],
            ['1', '16', '+', 'A', '1', '5', '8', '0.036198'],
            ['2', '16', '+', 'not_annotated', 'not_annotated', '5', 'not_calculated', '1'],
        ])
示例#13
0
    def test_limits_downstream(self):
        """Expect no landmarks when a flanking segment is too short to be used."""
        attrs = 'gene_name "A";'
        # (landmark type, strand, CDS start, intron end) for each scenario.
        scenarios = (
            ('exon-intron', '+', '150', '350'),
            ('intron-exon', '-', '151', '351'),
        )
        for kind, strand, cds_start, intron_end in scenarios:
            regions = make_file_from_list([
                ['chr1', '.', 'CDS', cds_start, '200', '.', strand, '.', attrs],
                ['chr1', '.', 'intron', '201', intron_end, '.', strand, '.', attrs],
            ])
            found = landmark.make_single_type_landmarks(regions, kind)
            # Normalise every field to ``str`` before comparing.
            found = [[str(field) for field in item] for item in found]
            self.assertEqual(found, [])
示例#14
0
 def test_no_landmark(self):
     """Landmark is missing on this chromosome / strand."""
     xlink_rows = [['chrX', '22', '23', '.', '3', '+']]
     xlinks = make_file_from_list(xlink_rows)

     outcome = rnamaps.compute_distances(self.landmarks, xlinks, 'exon-intron')
     distances, total_cdna = outcome

     self.assertEqual(total_cdna, 3)
     self.assertEqual(distances, {})
示例#15
0
    def test_basic(self):
        """Compare ``region.construct_borders`` output with the expected rows."""
        attrs = 'biotype "A"; gene_name "X";'
        # (feature type, start, end, strand) of every segmentation row.
        layout = [
            # Transcript #1
            ('ncRNA', '1', '10', '+'),
            ('intron', '11', '20', '+'),
            ('CDS', '21', '30', '+'),
            ('UTR3', '31', '40', '+'),
            # Transcript #2
            ('CDS', '5', '14', '+'),
            ('intron', '15', '24', '+'),
            ('CDS', '25', '34', '+'),
            # Also negative strand:
            ('CDS', '3', '32', '-'),
        ]
        segmentation = [
            ['1', '.', feature, start, end, '.', strand, '.', attrs]
            for feature, start, end, strand in layout
        ]
        expected = [
            ['1', '0', '4', '.', '.', '+'],
            ['1', '4', '10', '.', '.', '+'],
            ['1', '10', '14', '.', '.', '+'],
            ['1', '14', '20', '.', '.', '+'],
            ['1', '20', '24', '.', '.', '+'],
            ['1', '24', '30', '.', '.', '+'],
            ['1', '30', '34', '.', '.', '+'],
            ['1', '34', '40', '.', '.', '+'],
            ['1', '2', '32', '.', '.', '-'],
        ]

        def by_position(row):
            # Order by chrom, strand, then numeric start and stop.
            return (row[0], row[-1], int(row[1]), int(row[2]))

        segmentation_file = make_file_from_list(segmentation)
        borders_file = region.construct_borders(BedTool(segmentation_file))
        results = make_list_from_file(borders_file, fields_separator='\t')

        self.assertEqual(expected, sorted(results, key=by_position))
示例#16
0
def _make_types_length(annotation, subtype='biotype', excluded_types=None):
    """
    Run ``summary.make_types_length_file`` on a list-based annotation.

    ``annotation`` is written to a temporary file, and a fixed three-row
    chromosome-length file ('1', '6' and '20', each '100') is passed along
    with it. ``subtype`` and ``excluded_types`` are forwarded unchanged.
    Returns the rows of the first element of the function's result, read with
    tab as the field separator.
    """
    chrom_sizes = [
        ['1', '100'],
        ['6', '100'],
        ['20', '100'],
    ]
    annotation_file = make_file_from_list(annotation)
    out_file = get_temp_file_name()
    fai = make_file_from_list(bedtool=False, data=chrom_sizes)

    types_length_path, _ = summary.make_types_length_file(
        annotation_file,
        fai,
        out_file,
        subtype=subtype,
        excluded_types=excluded_types,
    )
    return make_list_from_file(types_length_path, fields_separator='\t')
示例#17
0
    def setUp(self):
        """Silence ``ResourceWarning`` and write the landmarks fixture file."""
        warnings.simplefilter("ignore", ResourceWarning)

        landmark_rows = [
            ['chr1', '10', '11', 'G1', '.', '+'],
            ['chr1', '20', '21', 'G1', '.', '+'],
            ['chr1', '20', '21', 'G2', '.', '-'],
            ['chr2', '10', '11', 'G3', '.', '+'],
        ]
        self.landmarks = make_file_from_list(landmark_rows)
示例#18
0
def merge_bed_wrapper(data):
    """
    Run ``merge_bed`` on list-based inputs and return the output as a list.

    Each element of ``data`` is written to its own temporary file; the merged
    output file is then read back with tab as the field separator.
    """
    input_paths = [make_file_from_list(rows) for rows in data]
    merged_path = get_temp_file_name()
    merge_bed(merged_path, input_paths)
    return make_list_from_file(merged_path, fields_separator='\t')
示例#19
0
    def test_all_good(self):
        """Compare ``segment.get_segments`` output and check its side files exist."""
        # Attribute strings that repeat in the input annotation.
        in_transcript = 'gene_id "G2"; transcript_id "T3";'
        gtf_in_data = list_to_intervals([
            ['1', '.', 'gene', '400', '500', '.', '+', '.', 'gene_id "G2";'],
            ['1', '.', 'transcript', '400', '500', '.', '+', '.', in_transcript],
            ['1', '.', 'exon', '400', '430', '.', '+', '.',
             'gene_id "G2"; transcript_id "T3"; exon_number "1"'],
            ['1', '.', 'CDS', '410', '430', '.', '+', '.', in_transcript],
            ['1', '.', 'exon', '470', '500', '.', '+', '.',
             'gene_id "G2"; transcript_id "T3"; exon_number "2"'],
            ['1', '.', 'CDS', '470', '490', '.', '+', '.', in_transcript],
        ])
        gtf_in_file = make_file_from_list(intervals_to_list(gtf_in_data))

        gtf_out = get_temp_file_name()

        chrom_sizes = [['1', '2000'], ['MT', '500']]
        genome_file = make_file_from_list(chrom_sizes, bedtool=False)

        segment.get_segments(gtf_in_file, gtf_out, genome_file)
        written_rows = make_list_from_file(gtf_out, fields_separator='\t')
        gtf_out_data = list_to_intervals(written_rows)

        # Attribute strings that repeat in the expected output.
        no_gene = 'gene_id "."; transcript_id ".";'
        out_transcript = 'gene_id "G2";transcript_id "T3"; biotype ".";'
        expected = list_to_intervals([
            ['1', '.', 'intergenic', '1', '399', '.', '+', '.', no_gene],
            ['1', '.', 'intergenic', '1', '2000', '.', '-', '.', no_gene],
            ['1', '.', 'transcript', '400', '500', '.', '+', '.', out_transcript],
            ['1', '.', 'UTR5', '400', '409', '.', '+', '.',
             'gene_id "G2";exon_number "1";transcript_id "T3"; biotype ".";'],
            ['1', '.', 'gene', '400', '500', '.', '+', '.', 'gene_id "G2"; biotype "[.]";'],
            ['1', '.', 'CDS', '410', '430', '.', '+', '.', out_transcript],
            ['1', '.', 'intron', '431', '469', '.', '+', '.',
             'gene_id "G2"; transcript_id "T3"; biotype ".";'],
            ['1', '.', 'CDS', '470', '490', '.', '+', '.', out_transcript],
            ['1', '.', 'UTR3', '491', '500', '.', '+', '.',
             'gene_id "G2";exon_number "2";transcript_id "T3"; biotype ".";'],
            ['1', '.', 'intergenic', '501', '2000', '.', '+', '.', no_gene],
            ['MT', '.', 'intergenic', '1', '500', '.', '+', '.', no_gene],
            ['MT', '.', 'intergenic', '1', '500', '.', '-', '.', no_gene],
        ])

        self.assertEqual(expected, gtf_out_data)

        # Two additional files are expected next to the main output file.
        out_dir = os.path.dirname(os.path.abspath(gtf_out))
        for side_file in (region.REGIONS_FILE, 'landmarks.bed.gz'):
            self.assertTrue(os.path.isfile(os.path.join(out_dir, side_file)))
示例#20
0
def merge_bed_wrapper(data):
    """
    Run ``merge_bed`` on list-based inputs and return the output as a list.

    Parameters
    ----------
    data : list
        One list of rows per input file; each is written to a temporary file
        with ``make_file_from_list``.

    Returns
    -------
    list
        Rows of the file whose path ``merge_bed`` returns, read with tab as
        the field separator.

    """
    files = [make_file_from_list(file_) for file_ in data]

    # Only the file name is needed. Close the handle deterministically instead
    # of dropping it and relying on garbage collection to release it.
    with tempfile.NamedTemporaryFile(delete=False) as handle:
        out_file = handle.name

    return make_list_from_file(merge_bed(out_file, files),
                               fields_separator='\t')
示例#21
0
    def setUp(self):
        """Build the shared annotation fixture and temporary output paths."""
        warnings.simplefilter("ignore", (ResourceWarning, ImportWarning))

        # Attribute strings that repeat across rows.
        no_gene = 'gene_id "."; transcript_id ".";'
        t1 = 'gene_id "G1"; transcript_id "T1";'
        t2 = 'gene_id "G1"; transcript_id "T2";'
        t3 = 'gene_id "G2"; transcript_id "T3";'

        annotation_rows = [
            ['1', '.', 'intergenic', '1', '99', '.', '+', '.', no_gene],

            # Gene #1:
            ['1', '.', 'gene', '100', '499', '.', '+', '.', 'gene_id "G1";'],
            # Transcript #1
            ['1', '.', 'transcript', '100', '249', '.', '+', '.', t1],
            ['1', '.', 'UTR5', '100', '149', '.', '+', '.',
             'gene_id "G1"; transcript_id "T1"; exon_number "1";'],
            ['1', '.', 'intron', '150', '199', '.', '+', '.', t1],
            ['1', '.', 'CDS', '200', '229', '.', '+', '.',
             'gene_id "G1"; transcript_id "T1"; exon_number "2";'],
            ['1', '.', 'intron', '230', '239', '.', '+', '.', t1],
            ['1', '.', 'UTR3', '240', '249', '.', '+', '.',
             'gene_id "G1"; transcript_id "T1"; exon_number "3";'],
            # Transcript #2
            ['1', '.', 'transcript', '240', '499', '.', '+', '.', t2],
            ['1', '.', 'CDS', '240', '299', '.', '+', '.',
             'gene_id "G1"; transcript_id "T2"; exon_number "1";'],
            ['1', '.', 'intron', '300', '399', '.', '+', '.', t2],
            ['1', '.', 'CDS', '400', '499', '.', '+', '.',
             'gene_id "G1"; transcript_id "T2"; exon_number "2";'],

            # intergenic
            ['1', '.', 'intergenic', '500', '599', '.', '+', '.', no_gene],

            # Gene #2:
            ['1', '.', 'gene', '600', '999', '.', '+', '.', 'gene_id "G2";'],
            # Transcript #3
            ['1', '.', 'transcript', '600', '799', '.', '+', '.', t3],
            ['1', '.', 'CDS', '600', '649', '.', '+', '.',
             'gene_id "G2"; transcript_id "T3"; exon_number "1";'],
            ['1', '.', 'intron', '650', '749', '.', '+', '.', t3],
            ['1', '.', 'CDS', '750', '799', '.', '+', '.',
             'gene_id "G2"; transcript_id "T3"; exon_number "2";'],
        ]
        self.gtf_data = list_to_intervals(annotation_rows)
        self.gtf = make_file_from_list(intervals_to_list(self.gtf_data))

        # Output paths passed to the function under test by individual tests.
        self.strange = get_temp_file_name()
        self.cross_tr = get_temp_file_name()
        self.out = get_temp_file_name()
示例#22
0
    def test_no_required_attributes(self):
        """
        Expect an error for a transcript row that lacks ``transcript_id``.
        """
        transcript_row = ['1', '.', 'transcript', '500', '600', '.', '+', '.', 'gene_id "G1";']
        gtf = make_file_from_list([transcript_row])

        expected_error = "First element in gene content is neither gene or transcript!"
        with self.assertRaisesRegex(Exception, expected_error):
            gene_content = segment._get_gene_content(gtf, ['1', 'MT'])
            # Consume the result fully (presumably a lazy iterator -- confirm)
            # so that the error surfaces inside this block.
            list(gene_content)
示例#23
0
    def test_basic(self):
        """Check the single landmark produced for a CDS/intron pair on each strand."""

        def landmarks_for(strand, kind):
            # Same two regions for both strands; only the strand column and
            # the requested landmark type differ.
            regions = make_file_from_list([
                ['chr1', '.', 'CDS', '150', '200', '.', strand, '.', 'gene_name "A";'],
                ['chr1', '.', 'intron', '201', '351', '.', strand, '.', 'gene_name "A";'],
            ])
            found = landmark.make_single_type_landmarks(regions, kind)
            # Normalise every field to ``str`` before comparing.
            return [[str(field) for field in item] for item in found]

        self.assertEqual(
            landmarks_for('+', 'exon-intron'),
            [['chr1', '200', '201', 'exon-intron;A', '.', '+']],
        )
        self.assertEqual(
            landmarks_for('-', 'intron-exon'),
            [['chr1', '199', '200', 'intron-exon;A', '.', '-']],
        )
示例#24
0
    def setUp(self):
        """Write the input fixture file and reserve an output file name."""
        warnings.simplefilter("ignore", ResourceWarning)

        self.bed = make_file_from_list(
            [
                ['1', '4', '5', '.', '5', '+'],
                ['1', '5', '6', '.', '1', '+'],
                ['1', '5', '6', '.', '1', '-'],
                ['2', '5', '6', '.', '3', '+'],
            ],
            extension='bed',
        )
        self.bedgraph = get_temp_file_name(extension='bedgraph')
示例#25
0
def _make_summary_report(annotation,
                         cross_links,
                         chrom_lengths,
                         subtype='biotype',
                         excluded_types=None):
    """
    Run function `make_summary_report` with input/output data as lists.

    The three list inputs are written to temporary files, the report is
    generated into another temporary file and that file is read back.

    Parameters
    ----------
    annotation : list
        List representation of annotation file.
    cross_links : list
        List representation of cross-links file.
    chrom_lengths : list
        List representation of chromosome-lengths file (written with
        ``bedtool=False``).
    subtype : str
        Forwarded unchanged to `summary.make_summary_report`.
    excluded_types : optional
        Forwarded unchanged to `summary.make_summary_report`.

    Returns
    -------
    list
        Content of the report file as returned by `make_list_from_file`
        with tab as the field separator.
    """
    annotation_file = make_file_from_list(annotation)
    cross_links_file = make_file_from_list(cross_links)
    chrom_length_file = make_file_from_list(chrom_lengths, bedtool=False)

    # Only the file *name* is needed. Close the handle immediately so it is
    # not leaked; delete=False keeps the (empty) file on disk for the report.
    with tempfile.NamedTemporaryFile(delete=False) as handle:
        out_file = handle.name

    summary.make_summary_report(annotation_file,
                                cross_links_file,
                                out_file,
                                chrom_length_file,
                                subtype=subtype,
                                excluded_types=excluded_types)
    return make_list_from_file(out_file, fields_separator='\t')
示例#26
0
 def test_basic(self):
     """
     Check distances and total cDNA for a single cross-link.

     One cross-link with score 3 is compared against ``self.landmarks``
     using the ``exon-intron`` landmark type.
     """
     single_xlink = ['chr1', '12', '13', '.', '3', '+']
     xlinks = make_file_from_list([single_xlink])

     outcome = rnamaps.compute_distances(self.landmarks, xlinks, 'exon-intron')
     distances, total_cdna = outcome

     expected_distances = {'chr1__+__10__G1': {2: 3}}
     self.assertEqual(total_cdna, 3)
     self.assertEqual(distances, expected_distances)
示例#27
0
    def test_basic(self):
        """
        Check the output of ``region.merge_regions`` on five UTR3 intervals.

        In the expected output the first two input intervals (1-10 and 11-20,
        gene ``id1``, ``+`` strand) appear as a single 1-20 interval, while
        the remaining three intervals are unchanged.
        """
        # seg is composition of BED6 and GTF interval:
        nonmerged = make_file_from_list([
            [
                '1', '.', 'UTR3', '1', '10', '.', '+', '.',
                'biotype "lncRNA";gene_id "id1";'
            ],
            [
                '1', '.', 'UTR3', '11', '20', '.', '+', '.',
                'biotype "lncRNA";gene_id "id1";'
            ],
            [
                '1', '.', 'UTR3', '21', '30', '.', '+', '.',
                'biotype "lncRNA";gene_id "id2";'
            ],
            [
                '1', '.', 'UTR3', '31', '40', '.', '+', '.',
                'biotype "lncRNA";gene_id "id1";'
            ],
            [
                '1', '.', 'UTR3', '31', '40', '.', '-', '.',
                'biotype "lncRNA";gene_id "id1";'
            ],
        ])

        expected = [
            [
                '1', '.', 'UTR3', '1', '20', '.', '+', '.',
                'biotype "lncRNA";gene_id "id1";'
            ],
            [
                '1', '.', 'UTR3', '21', '30', '.', '+', '.',
                'biotype "lncRNA";gene_id "id2";'
            ],
            [
                '1', '.', 'UTR3', '31', '40', '.', '+', '.',
                'biotype "lncRNA";gene_id "id1";'
            ],
            [
                '1', '.', 'UTR3', '31', '40', '.', '-', '.',
                'biotype "lncRNA";gene_id "id1";'
            ],
        ]

        region.merge_regions(nonmerged, self.tmp)
        results = make_list_from_file(self.tmp, fields_separator='\t')

        # zip() stops at the shorter sequence, so without this check missing
        # or extra output rows (even an empty output) would go unnoticed.
        self.assertEqual(len(results), len(expected))

        # Since order of attrs can be arbitrary, equality checks are more complex:
        for res, exp in zip(results, expected):
            # The first eight columns must match exactly.
            self.assertEqual(res[:8], exp[:8])
            # The attribute column is compared irrespective of attribute order.
            self.assertEqual(
                ';'.join(sorted(res[8].split(';'))),
                ';'.join(sorted(exp[8].split(';'))),
            )
示例#28
0
    def test_all_good(self):
        """
        Check gene content extracted from a well-formed GTF.

        * the second gene (G2) has no 'gene' interval in the input, yet a
          'gene' entry is expected for it in the output
        * the last interval lies on chromosome 2 and is not expected in the
          output
        """
        gene1_rows = [
            ['1', '.', 'gene', '100', '300', '.', '+', '.',
             'gene_id "G1";'],
            ['1', '.', 'transcript', '100', '250', '.', '+', '.',
             'gene_id "G1"; transcript_id "T1";'],
            ['1', '.', 'exon', '100', '150', '.', '+', '.',
             'gene_id "G1"; transcript_id "T1"; exon_number "1";'],
            ['1', '.', 'exon', '200', '250', '.', '+', '.',
             'gene_id "G1"; transcript_id "T1"; exon_number "2";'],
            ['1', '.', 'transcript', '150', '300', '.', '+', '.',
             'gene_id "G1"; transcript_id "T2";'],
            ['1', '.', 'exon', '150', '200', '.', '+', '.',
             'gene_id "G1"; transcript_id "T2"; exon_number "1";'],
            ['1', '.', 'exon', '250', '300', '.', '+', '.',
             'gene_id "G1"; transcript_id "T2"; exon_number "2";'],
        ]
        # Deliberately without a 'gene' row.
        gene2_rows = [
            ['1', '.', 'transcript', '400', '500', '.', '+', '.',
             'gene_id "G2"; transcript_id "T3";'],
            ['1', '.', 'exon', '400', '430', '.', '+', '.',
             'gene_id "G2"; transcript_id "T3"; exon_number "1"'],
            ['1', '.', 'CDS', '410', '430', '.', '+', '.',
             'gene_id "G2"; transcript_id "T3";'],
            ['1', '.', 'exon', '470', '500', '.', '+', '.',
             'gene_id "G2"; transcript_id "T3"; exon_number "2"'],
            ['1', '.', 'CDS', '470', '490', '.', '+', '.',
             'gene_id "G2"; transcript_id "T3";'],
        ]
        # Chromosome '2' is not among the chromosomes requested below.
        other_chromosome_rows = [
            ['2', '.', 'CDS', '470', '490', '.', '+', '.',
             'gene_id "G3"; transcript_id "T4";'],
        ]
        gtf_data = list_to_intervals(gene1_rows + gene2_rows + other_chromosome_rows)
        gtf = make_file_from_list(intervals_to_list(gtf_data))

        genes = list(segment._get_gene_content(gtf, ['1', 'MT'], report_progress=True))
        # Exactly two genes are expected; unpacking fails otherwise.
        gene1, gene2 = genes

        expected1 = dict(
            gene=gtf_data[0],
            T1=gtf_data[1:4],
            T2=gtf_data[4:7],
        )

        # 'gene' interval expected for G2 even though the input has none.
        g2_gene_interval = create_interval_from_list(
            ['1', '.', 'gene', '400', '500', '.', '+', '.', 'gene_id "G2";'])
        expected2 = dict(
            gene=g2_gene_interval,
            # Everything from the T3 transcript up to, but excluding, the
            # final chromosome-2 interval.
            T3=gtf_data[7:-1],
        )

        self.assertEqual(gene1, expected1)
        self.assertEqual(gene2, expected2)
示例#29
0
    def setUp(self):
        """
        Build the GTF fixture shared by the tests of this class.

        ``self.gtf_data`` holds the intervals (two intergenic regions around
        one gene, G1, with transcripts T1 and T2) and ``self.gtf`` is the file
        created from them. ``ResourceWarning`` and ``ImportWarning`` are
        silenced beforehand.
        """
        # Register one filter per category: the documented ``category``
        # argument of warnings.simplefilter is a single Warning subclass, so
        # do not rely on a tuple being accepted.
        for category in (ResourceWarning, ImportWarning):
            warnings.simplefilter("ignore", category)

        self.gtf_data = list_to_intervals([
            [
                '1', '.', 'intergenic', '1', '2', '.', '+', '.',
                'gene_id "."; transcript_id ".";'
            ],
            # Gene #1:
            ['1', '.', 'gene', '3', '7', '.', '+', '.', 'gene_id "G1";'],
            # Transcript #1
            [
                '1', '.', 'transcript', '3', '6', '.', '+', '.',
                'gene_id "G1"; transcript_id "T1";'
            ],
            [
                '1', '.', 'CDS', '3', '3', '.', '+', '.',
                'gene_id "G1"; transcript_id "T1"; exon_number "2";'
            ],
            [
                '1', '.', 'intron', '4', '6', '.', '+', '.',
                'gene_id "G1"; transcript_id "T1";'
            ],
            [
                '1', '.', 'UTR3', '5', '6', '.', '+', '.',
                'gene_id "G1"; transcript_id "T1"; exon_number "3";'
            ],

            # Transcript #2
            [
                '1', '.', 'transcript', '4', '7', '.', '+', '.',
                'gene_id "G1"; transcript_id "T2";'
            ],
            [
                '1', '.', 'ncRNA', '4', '5', '.', '+', '.',
                'gene_id "G1"; transcript_id "T2"; exon_number "1";'
            ],
            [
                '1', '.', 'intron', '6', '6', '.', '+', '.',
                'gene_id "G1"; transcript_id "T2";'
            ],
            [
                '1', '.', 'ncRNA', '7', '7', '.', '+', '.',
                'gene_id "G1"; transcript_id "T2"; exon_number "2";'
            ],

            # intergenic
            [
                '1', '.', 'intergenic', '8', '9', '.', '+', '.',
                'gene_id "."; transcript_id ".";'
            ],
        ])
        self.gtf = make_file_from_list(intervals_to_list(self.gtf_data))
示例#30
0
    def test_basic(self):
        """
        Check landmarks written by ``landmark.make_landmarks``.

        The input is a CDS, an intron and another CDS of gene "A" on the
        ``+`` strand; two landmarks are expected in the output file.
        """
        regions = make_file_from_list([
            ['chr1', '.', 'CDS', '150', '200', '.', '+', '.', 'gene_name "A";'],
            ['chr1', '.', 'intron', '201', '400', '.', '+', '.', 'gene_name "A";'],
            ['chr1', '.', 'CDS', '401', '600', '.', '+', '.', 'gene_name "A";'],
        ])

        # Output goes to a temporary file that is read back for comparison.
        landmarks = get_temp_file_name(extension='bed')
        landmark.make_landmarks(regions, landmarks)
        self.assertEqual(make_list_from_file(landmarks), [
            ['chr1', '200', '201', 'exon-intron;A', '.', '+'],
            ['chr1', '400', '401', 'intron-exon;A', '.', '+'],
        ])