def test_exons_returns_exon_sequence_ranges(self):
    """Check exons() on the fixture transcript against its four ranges."""
    actual = self.transcript.exons()
    expected = [
        SequenceRange('0', 3, 5),
        SequenceRange('0', 10, 20),
        SequenceRange('0', 30, 40),
        SequenceRange('0', 50, 60),
    ]
    self.assertEqual(actual, expected)
def test_condense(self):
    """Check condense() with two and then three fixture ranges."""
    # range_12 and range_24 are expected to collapse into one range.
    pair = SequenceRange.condense(self.range_12, self.range_24)
    self.assertEqual(pair, [SequenceRange('0', 1, 4)])

    # range_56 is expected to survive as its own entry.
    triple = SequenceRange.condense(
        self.range_12, self.range_24, self.range_56)
    self.assertEqual(triple, [SequenceRange('0', 1, 4), self.range_56])
def test_concat(self):
    """Check concat() in both argument orders and its ValueError case."""
    joined = SequenceRange('0', 1, 4)
    self.assertEqual(self.range_12.concat(self.range_24), joined)
    self.assertEqual(self.range_24.concat(self.range_12), joined)

    # Concatenating range_12 with range_56 is expected to be rejected.
    with self.assertRaises(ValueError):
        self.range_12.concat(self.range_56)
def test_transcript_range(self):
    """Check transcript_range() for spans inside and across exons."""
    cases = (
        ((1, 2), [SequenceRange('0', 11, 12)]),
        ((1, 6), [SequenceRange('0', 11, 16)]),
        # The last span is expected to come back as two genomic ranges.
        ((1, 16), [SequenceRange('0', 11, 20), SequenceRange('0', 30, 36)]),
    )
    for (start, end), expected in cases:
        self.assertEqual(
            self.transcript.transcript_range(start, end), expected)
def test_bases_returns_base_pair_range(self):
    """Check reference.bases() for three ranges of the fixture genome."""
    cases = (
        (('1', 2, 8), 'AACCCC'),
        (('X', 15, 16), 't'),
        (('1', 0, 16), 'AAAACCCCGGGGTTTT'),
    )
    for (chromosome, start, end), expected in cases:
        sequence_range = SequenceRange(chromosome, start, end)
        self.assertEqual(
            reference.bases(sequence_range, self.ref_genome), expected)
def test_exons_returns_reversed_positions_when_strand_minus(self):
    """Check the order of exons() after plus_strand is set to False."""
    # Note: never do this outside of a test.
    self.transcript.plus_strand = False

    actual = self.transcript.exons()
    expected = [
        SequenceRange('0', 50, 60),
        SequenceRange('0', 30, 40),
        SequenceRange('0', 10, 20),
        SequenceRange('0', 3, 5),
    ]
    self.assertEqual(actual, expected)
def get_ranges(self):
    """Return the two sequence ranges that make up an exon probe.

    Each half-sequence is reverse-complemented where necessary so that
    the breakpoint sits in the centre of the probe. The first half is
    reverse-complemented if it is the start of an exon on the plus
    strand, or the end of an exon on the minus strand. The second half
    is reverse-complemented if it is the start of an exon on the minus
    strand or the end of an exon on the plus strand. In the code this is
    `side1 == strand1` for the first half and `strand2 != side2` for the
    second.

    If the arrow separator ('->') is used and the first exon is on the
    minus strand, the two sides of the probe are exchanged so that the
    reading frames of the two exons are preserved. For example:

                BAR
                |=========>
        ..............................................
        ..............................................
                           <-------|
                                 FOO

        FOO-/BAR+     <----|====
        FOO->BAR      ====|<----
    """
    spec = self._spec
    first = (spec["chromosome1"], spec["strand1"], spec["side1"])
    second = (spec["chromosome2"], spec["strand2"], spec["side2"])

    start1, end1, start2, end2 = self._get_ranges()
    span1, span2 = (start1, end1), (start2, end2)

    # Arrow notation with a minus-strand first exon: swap the two halves
    # wholesale (coordinates, chromosome, strand and side together).
    if spec['separator'] == '->' and spec['strand1'] == '-':
        first, second = second, first
        span1, span2 = span2, span1

    chromosome1, strand1, side1 = first
    chromosome2, strand2, side2 = second
    return (
        SequenceRange(
            chromosome1,
            span1[0],
            span1[1],
            reverse_complement=(side1 == strand1),
        ),
        SequenceRange(
            chromosome2,
            span2[0],
            span2[1],
            reverse_complement=(strand2 != side2),
        ),
    )
def test_nucleotide_index(self):
    """Check nucleotide_index() on the first three ANNOTATION transcripts."""
    transcript1, transcript2, transcript3, *_ = ANNOTATION

    self.assertEqual(transcript1.nucleotide_index(1),
                     SequenceRange('1', 1, 2))
    self.assertEqual(transcript2.nucleotide_index(2),
                     SequenceRange('2', 9, 10))

    # Expected genomic position of transcript3's bases 1..8, in order.
    transcript_3_indices = [22, 21, 20, 14, 13, 12, 11, 10]
    for index, base_pair in enumerate(transcript_3_indices, start=1):
        self.assertEqual(transcript3.nucleotide_index(index),
                         SequenceRange('3', base_pair, base_pair + 1))
def codon_index(self, index):
    """Return a SequenceRange covering the codon at 1-based `index`.

    The codon's third transcript base (`index * 3`) is mapped through
    `_transcript_index`, and a three-base window is built around the
    result: ending on it for the plus strand, starting on it otherwise.
    """
    # NOTE(review): the window is always three contiguous genomic bases;
    # presumably a codon split across two exons is not handled -- confirm.
    third_base = self._transcript_index(index * 3)
    if not self.plus_strand:
        return SequenceRange(self.chromosome, third_base, third_base + 3)
    return SequenceRange(self.chromosome, third_base - 2, third_base + 1)
def test_bases_with_reverse_complement(self):
    """Check reference.bases() for ranges flagged reverse_complement."""
    cases = (
        (('1', 2, 8), 'GGGGTT'),
        (('X', 15, 16), 'a'),
        # This sequence is its own reverse complement.
        (('1', 0, 16), 'AAAACCCCGGGGTTTT'),
    )
    for (chromosome, start, end), expected in cases:
        sequence_range = SequenceRange(
            chromosome, start, end, reverse_complement=True)
        self.assertEqual(
            reference.bases(sequence_range, self.ref_genome), expected)
def test_condense(self):
    """Check condense() output for two and for three fixture ranges."""
    cases = (
        ((self.range_12, self.range_24),
         [SequenceRange('0', 1, 4)]),
        ((self.range_12, self.range_24, self.range_56),
         [SequenceRange('0', 1, 4), self.range_56]),
    )
    for ranges, expected in cases:
        self.assertEqual(SequenceRange.condense(*ranges), expected)
def get_ranges(self):
    """Return the two SequenceRange halves described by the probe spec.

    The start and end of each half come from `_parse_range`, applied to
    that side's index, operation and bases entries. Each half's
    chromosome and reverse-complement flag are read directly from the
    spec.
    """
    spec = self._spec
    start1, end1 = _parse_range(
        spec['index1'], spec['operation1'], spec['bases1'])
    start2, end2 = _parse_range(
        spec['index2'], spec['operation2'], spec['bases2'])

    first_half = SequenceRange(
        spec['chromosome1'], start1, end1,
        reverse_complement=spec['rc_side_1'])
    second_half = SequenceRange(
        spec['chromosome2'], start2, end2,
        reverse_complement=spec['rc_side_2'])
    return (first_half, second_half)
def nucleotide_index(self, index):
    """Return a one-base SequenceRange for the 1-based transcript `index`.

    The position is resolved with `_transcript_index`; the returned range
    runs from that position to the position after it.
    """
    position = self._transcript_index(index)
    following = position + 1
    return SequenceRange(self.chromosome, position, following)
def coding_exons(self):
    """As in `exons`, but with the UTRs trimmed out.

    Exons are walked in ascending coordinate order and clipped to the
    interval given by the spec's 'cdsStart' and 'cdsEnd'. Exons that end
    before `cdsStart` are dropped, and the walk stops at the first exon
    that reaches or passes `cdsEnd`. The result is returned in the same
    order `exons` uses (reversed when the transcript is not on the plus
    strand).

    Raises AssertionError if an exon matches none of the expected cases,
    which should not happen for well-formed coordinates.
    """
    cds_start = int(self._spec['cdsStart'])
    cds_end = int(self._spec['cdsEnd'])
    exon_positions = self.exons()
    positions = []

    # exons() hands back minus-strand exons in reverse; undo that so the
    # clipping below always sees ascending coordinates.
    if not self.plus_strand:
        exon_positions.reverse()

    for exon in exon_positions:
        if exon.end < cds_start:
            # Entirely before the coding region.
            # NOTE(review): an exon with end == cds_start falls through
            # to the third branch and yields a zero-length range --
            # confirm whether that is intended.
            pass
        elif exon.start <= cds_start <= cds_end <= exon.end:
            # The whole coding region lies inside this one exon.
            positions.append((cds_start, cds_end))
            break
        elif exon.start <= cds_start <= exon.end:
            # Coding region begins inside this exon.
            positions.append((cds_start, exon.end))
        elif cds_start <= exon.start <= exon.end <= cds_end:
            # Exon lies entirely within the coding region.
            positions.append((exon.start, exon.end))
        elif exon.start <= cds_end <= exon.end:
            # Coding region ends inside this exon; nothing further codes.
            # NOTE(review): an exon with start == cds_end lands here and
            # yields a zero-length range -- confirm whether intended.
            positions.append((exon.start, cds_end))
            break
        elif cds_end <= exon.start:
            # Entirely after the coding region.
            break
        else:
            # Raised explicitly rather than via `assert False`, so the
            # guard is not stripped when Python runs with -O.
            raise AssertionError(
                "unreachable: {}/{}".format(self.name, self.gene_id))

    # Restore the transcription-order convention used by exons().
    if not self.plus_strand:
        positions.reverse()

    return [SequenceRange(self.chromosome, start, end)
            for start, end in positions]
def exons(self):
    """Return the exons of this UCSC annotation feature as SequenceRanges.

    In a UCSC annotation file, the starts and ends of exons are stored
    as comma-separated strings, e.g.:

        '20,30,40,'

    The 'exonStarts' and 'exonEnds' entries of the spec are split on
    commas and paired up; pairs with an empty start or end (such as the
    one produced by a trailing comma) are skipped, and the rest are
    converted to integers.

    If the transcript is not on the plus strand, the exons are returned
    in reversed order. In that case the first exon relative to the
    direction of transcription (which is probably what the user means)
    is the last exon along the chromosome reading from left to right
    along the '+' strand (which is how the data are stored in UCSC
    tables).
    """
    starts = self._spec['exonStarts'].split(',')
    ends = self._spec['exonEnds'].split(',')

    coordinates = [
        (int(start), int(end))
        for start, end in zip(starts, ends)
        if start != '' and end != ''
    ]
    if not self.plus_strand:
        coordinates.reverse()

    return [SequenceRange(self.chromosome, start, end)
            for start, end in coordinates]
def get_ranges(self):
    """Return three ranges: left flank, the mutated base, right flank.

    The spec's 'index' is reduced by one before use. The left flank
    covers `bases // 2 - 1` positions, the middle range is the single
    position carrying the spec's 'mutation', and the right flank covers
    the remainder, so the three ranges together span `bases` positions.
    """
    spec = self._spec
    total = spec['bases']
    chromosome = spec['chromosome']
    position = spec['index'] - 1  # Convert from 1- to 0-based indexing

    left_buffer = total // 2 - 1
    right_buffer = total - left_buffer

    left_flank = SequenceRange(
        chromosome, position - left_buffer, position)
    mutated_base = SequenceRange(
        chromosome, position, position + 1, mutation=spec["mutation"])
    right_flank = SequenceRange(
        chromosome, position + 1, position + right_buffer)
    return (left_flank, mutated_base, right_flank)
class TestSequenceRange(unittest.TestCase):
    """Tests for SequenceRange.concat, .adjacent and .condense."""

    def setUp(self):
        # range_12 ends where range_24 begins; range_56 starts after a gap.
        self.range_12 = SequenceRange('0', 1, 2)
        self.range_24 = SequenceRange('0', 2, 4)
        self.range_56 = SequenceRange('0', 5, 6)

    def test_concat(self):
        """Check concat() in both argument orders and its ValueError case."""
        joined = SequenceRange('0', 1, 4)
        self.assertEqual(self.range_12.concat(self.range_24), joined)
        self.assertEqual(self.range_24.concat(self.range_12), joined)
        with self.assertRaises(ValueError):
            self.range_12.concat(self.range_56)

    def test_adjacent(self):
        """Check adjacent() in both argument orders and for a gap."""
        for first, second in ((self.range_12, self.range_24),
                              (self.range_24, self.range_12)):
            self.assertTrue(first.adjacent(second))
        self.assertFalse(self.range_12.adjacent(self.range_56))

    def test_condense(self):
        """Check condense() with two and then three ranges."""
        pair = SequenceRange.condense(self.range_12, self.range_24)
        self.assertEqual(pair, [SequenceRange('0', 1, 4)])

        triple = SequenceRange.condense(
            self.range_12, self.range_24, self.range_56)
        self.assertEqual(triple, [SequenceRange('0', 1, 4), self.range_56])
def transcript_range(self, start, end):
    """Return the genomic SequenceRange(s) for transcript bases
    `start` to `end`.

    `start` and `end` are 1-based, left-inclusive, right-exclusive. Each
    base is looked up with `nucleotide_index` and the per-base ranges
    are merged by `SequenceRange.condense`, so more than one range is
    returned if the requested span crosses exon boundaries.
    """
    per_base = map(self.nucleotide_index, range(start, end))
    return SequenceRange.condense(*per_base)
def sequence_ranges(self):
    """Return a list of three SequenceRange objects: the variant and the
    genomic buffer sequence on either side of it.

    The buffer is `len(self)` minus the mutation length, split so that
    the right side takes the extra base when the total is odd. The
    middle range spans the reference allele, carries the mutation, and
    is reverse-complemented when the transcript is not on the plus
    strand.
    """
    # The index holds five fields; only the first two are needed here.
    chromosome, start, _, _, _ = self.index

    reference_end = start + len(self.reference)
    total_buffer = len(self) - len(self.mutation)
    left_buffer = total_buffer // 2
    right_buffer = total_buffer - left_buffer

    upstream = SequenceRange(chromosome, start - left_buffer, start)
    variant = SequenceRange(
        chromosome, start, reference_end,
        mutation=self.mutation,
        reverse_complement=not self.transcript.plus_strand)
    downstream = SequenceRange(
        chromosome, reference_end, reference_end + right_buffer)
    return [upstream, variant, downstream]
def assert_mock_gene_in_file(self, annotation_file):
    """Assert that the mock gene is found in the `annotation_file`.

    `annotation_file` is a handle to one of the UCSC annotation files
    used for testing. The file is parsed, "MOCK_GENE" is looked up, and
    the test fails unless exactly one feature matches and its exons
    equal the single expected range.
    """
    annotations = annotation.parse_ucsc_file(annotation_file)
    matching_features = annotation.lookup_gene("MOCK_GENE", annotations)

    # Only the unpacking is guarded: a ValueError here means the lookup
    # did not yield exactly one feature. A ValueError raised later by
    # exons() must not be reported as a wrong gene count.
    try:
        mock_row, = tuple(matching_features)
    except ValueError as error:
        self.fail("Unexpected number of mock genes: {}".format(error))
    else:
        self.assertEqual(mock_row.exons(), [SequenceRange('0', 2, 3)])
def sequence_ranges(self):
    """Return a list of SequenceRange objects representing the variant
    and a buffer sequence taken from the surrounding transcript sequence
    (i.e., intronic sequences are skipped).

    The buffer is `len(self)` minus the mutation length, split so that
    the second side takes the extra base when the total is odd. For a
    transcript that is not on the plus strand, the two buffer sizes are
    swapped, the variant range is reverse-complemented, and the order of
    the returned ranges is reversed.

    A list is returned for both strands.

    Raises an OutOfRange exception when the buffer sequences strays
    outside the range of the transcript.
    """
    # The index holds five fields; only the first two are needed here.
    chromosome, start, _, _, _ = self.index
    reference_length = len(self.reference)
    total_buffer = len(self) - len(self.mutation)
    left_buffer = total_buffer // 2
    right_buffer = total_buffer - left_buffer

    plus_strand = self.transcript.plus_strand
    base = self.transcript.base_index(self.index)

    if not plus_strand:
        left_buffer, right_buffer = right_buffer, left_buffer

    variant = SequenceRange(
        chromosome, start, start + reference_length,
        mutation=self.mutation,
        reverse_complement=not plus_strand)
    sequence = (
        self.transcript.transcript_range(base - left_buffer, base)
        + [variant]
        + self.transcript.transcript_range(
            base + reference_length,
            base + reference_length + right_buffer))

    if plus_strand:
        return sequence
    # Materialise the reversal so both strands return a list, not a
    # one-shot `reversed` iterator.
    return list(reversed(sequence))
class TestSequenceRange(unittest.TestCase):
    """Exercise concat, adjacent and condense on three fixture ranges."""

    def setUp(self):
        self.range_12 = SequenceRange('0', 1, 2)
        self.range_24 = SequenceRange('0', 2, 4)
        self.range_56 = SequenceRange('0', 5, 6)

    def test_concat(self):
        """concat() gives the same result in either order, or raises."""
        forward = self.range_12.concat(self.range_24)
        backward = self.range_24.concat(self.range_12)
        self.assertEqual(forward, SequenceRange('0', 1, 4))
        self.assertEqual(backward, SequenceRange('0', 1, 4))

        with self.assertRaises(ValueError):
            self.range_12.concat(self.range_56)

    def test_adjacent(self):
        """adjacent() is true in either order, and false across the gap."""
        self.assertTrue(self.range_12.adjacent(self.range_24))
        self.assertTrue(self.range_24.adjacent(self.range_12))
        self.assertFalse(self.range_12.adjacent(self.range_56))

    def test_condense(self):
        """condense() merges the first two ranges and keeps range_56."""
        cases = (
            ((self.range_12, self.range_24),
             [SequenceRange('0', 1, 4)]),
            ((self.range_12, self.range_24, self.range_56),
             [SequenceRange('0', 1, 4), self.range_56]),
        )
        for ranges, expected in cases:
            self.assertEqual(SequenceRange.condense(*ranges), expected)
def test_bases_raises_NonContainedRange_on_range_outside_of_chromosome(
        self):
    """Check reference.bases() raises NonContainedRange, with the
    expected message, for a range ending at 100 on chromosome '1'.
    """
    # Raw string: the brackets are escaped for the regex match, and "\["
    # is not a valid escape sequence in an ordinary string literal.
    message = r"range \[1:100\] outside the range of chromosome '1'"
    with self.assertRaisesRegex(reference.NonContainedRange, message):
        reference.bases(SequenceRange('1', 1, 100), self.ref_genome)
def test_bases_raises_MissingChromosome_when_chromosome_key_missing(self):
    """Check reference.bases() raises MissingChromosome, with the
    expected message, for the chromosome name 'banana'.
    """
    expected_error = reference.MissingChromosome
    pattern = "no such chromosome: 'banana'"
    with self.assertRaisesRegex(expected_error, pattern):
        reference.bases(SequenceRange('banana', 1, 2), self.ref_genome)
def setUp(self):
    """Create the three fixture ranges on chromosome '0'."""
    # Each attribute name encodes its (start, end) pair.
    self.range_12, self.range_24, self.range_56 = (
        SequenceRange('0', 1, 2),
        SequenceRange('0', 2, 4),
        SequenceRange('0', 5, 6),
    )
def test_codon_index(self):
    """Check codon_index() for the first two codons of the third
    ANNOTATION transcript.
    """
    _, _, transcript3, *_ = ANNOTATION
    expected_by_codon = (
        (1, SequenceRange('3', 20, 23)),
        (2, SequenceRange('3', 12, 15)),
    )
    for codon, expected in expected_by_codon:
        self.assertEqual(transcript3.codon_index(codon), expected)
def test_coding_exons_returns_coding_sequence_ranges(self):
    """Check coding_exons() on the fixture transcript against its three
    expected ranges.
    """
    actual = self.transcript.coding_exons()
    expected = [
        SequenceRange('0', 11, 20),
        SequenceRange('0', 30, 40),
        SequenceRange('0', 50, 59),
    ]
    self.assertEqual(actual, expected)
def test_exon_returns_exon_sequence_range_at_one_based_index(self):
    """Check that exon(2) on the fixture transcript is the 10-20 range."""
    second_exon = self.transcript.exon(2)
    expected = SequenceRange('0', 10, 20)
    self.assertEqual(second_exon, expected)