예제 #1
0
    def test_contig_coords(self):
        '''contig_coords() should report the coordinates of every contig in a sequence'''
        # Each case pairs the bases of a sequence with the (start, end) coords
        # of the contigs expected back, covering gaps at the start, the end
        # and in the middle of the sequence.
        cases = [
            ('ACGT', [(0, 3)]),
            ('NACGT', [(1, 4)]),
            ('NNACGT', [(2, 5)]),
            ('ACGTN', [(0, 3)]),
            ('ACGTNN', [(0, 3)]),
            ('NANNCGT', [(1, 1), (4, 6)]),
            ('ANNCGTNNAAAAA', [(0, 0), (3, 5), (8, 12)]),
        ]

        for bases, spans in cases:
            expected = [genome_intervals.Interval(lo, hi) for lo, hi in spans]
            found = fastn.Fasta('ID', bases).contig_coords()
            self.assertListEqual(expected, found)
 def test_intersection(self):
     '''intersection() gives None for disjoint intervals, else the shared interval'''
     Interval = genome_intervals.Interval
     left = Interval(5, 10)
     middle = Interval(8, 15)
     right = Interval(12, 20)

     # left and right have no position in common
     self.assertEqual(left.intersection(right), None)
     # left and middle share positions 8 to 10
     self.assertEqual(left.intersection(middle), Interval(8, 10))
    def test_length_sum_from_list(self):
        '''length_sum_from_list() should add up the lengths of all intervals in a list'''
        spans = [(1, 2), (4, 5), (10, 19)]
        intervals = [genome_intervals.Interval(lo, hi) for lo, hi in spans]

        # 2 + 2 + 10 when both ends of each interval are counted
        total = genome_intervals.length_sum_from_list(intervals)
        self.assertEqual(14, total)
 def test_init(self):
     '''Interval() must raise Error for non-int coordinates or when end < start'''
     # the last pair has valid ints but the end is before the start
     bad_coords = [('a', 1), (1, 'a'), ('a', 'a'), (3, 2)]

     for start, end in bad_coords:
         with self.assertRaises(genome_intervals.Error):
             genome_intervals.Interval(start, end)
 def test_comparisons(self):
     '''< and <= between intervals should behave as expected'''
     Interval = genome_intervals.Interval
     smaller = Interval(1, 2)
     larger = Interval(2, 2)

     self.assertTrue(smaller < larger)
     self.assertTrue(smaller <= larger)
     self.assertFalse(larger <= smaller)
     self.assertFalse(larger < smaller)
     # an interval is not strictly less than an equal interval
     self.assertFalse(larger < Interval(2, 2))
 def test_union_flll_gap(self):
     '''union_fill_gap() should span both intervals whether or not they intersect'''
     Interval = genome_intervals.Interval
     a = Interval(5, 10)
     b = Interval(8, 15)
     c = Interval(12, 20)
     d = Interval(21, 22)

     # (first, second, expected result); every pair is checked in both orders
     cases = [
         (a, c, Interval(5, 20)),
         (a, b, Interval(5, 15)),
         (c, d, Interval(12, 22)),
     ]

     for first, second, expected in cases:
         self.assertEqual(first.union_fill_gap(second), expected)
         self.assertEqual(second.union_fill_gap(first), expected)
예제 #7
0
 def gaps(self, min_length=1):
     '''Find the runs of N (upper or lower case) in self.seq.

     Only runs that are at least min_length bases long are reported.

     Returns a list of genome_intervals.Interval, one per run, in order of
     position. Coordinates are zero-based and the end is inclusive.
     '''
     gaps = []
     regex = re.compile('N+', re.IGNORECASE)
     for m in regex.finditer(self.seq):
         start, end = m.span()
         # span() has an exclusive end, so the length of the run is simply
         # end - start (adding 1 would let runs one base too short through)
         if end - start >= min_length:
             gaps.append(genome_intervals.Interval(start, end - 1))
     return gaps
 def test_union(self):
     '''union() should give None when there is no union, else the merged interval'''
     Interval = genome_intervals.Interval
     a = Interval(5, 10)
     b = Interval(8, 15)
     c = Interval(12, 20)
     d = Interval(21, 22)

     # a and c are separated by a gap, so neither order gives a union
     self.assertEqual(a.union(c), None)
     self.assertEqual(c.union(a), None)

     # a/b overlap and c/d sit directly next to each other; both pairs
     # merge, and the order of the operands must not matter
     mergeable = [
         (a, b, Interval(5, 15)),
         (c, d, Interval(12, 22)),
     ]
     for first, second, expected in mergeable:
         self.assertEqual(first.union(second), expected)
         self.assertEqual(second.union(first), expected)
    def test_merge_overlapping_in_list(self):
        '''merge_overlapping_in_list() should merge the intervals of a list in place'''
        Interval = genome_intervals.Interval

        # unsorted input with a duplicate; everything between 10 and 60
        # overlaps or touches and is expected to collapse into one interval
        unmerged_spans = [(1, 2), (51, 60), (10, 20), (20, 30), (20, 30),
                          (29, 50), (65, 70)]
        merged_spans = [(1, 2), (10, 60), (65, 70)]

        intervals = [Interval(lo, hi) for lo, hi in unmerged_spans]
        expected = [Interval(lo, hi) for lo, hi in merged_spans]

        genome_intervals.merge_overlapping_in_list(intervals)
        self.assertSequenceEqual(intervals, expected)
예제 #10
0
    def contig_coords(self):
        '''Return the coordinates of the contigs of this sequence.

        Contigs are the stretches of sequence that lie outside the intervals
        returned by self.gaps(). The result is a list of
        genome_intervals.Interval in order of position, using the same
        coordinate system as gaps(). A sequence that consists only of gaps
        gives an empty list.
        '''
        # contigs are the opposite of gaps, so work out the coords from the gap coords
        gaps = self.gaps()

        if len(gaps) == 0:
            return [genome_intervals.Interval(0, len(self) - 1)]

        # coords is a flat list: start, end, start, end, ... of the contigs
        coords = [0]
        for g in gaps:
            if g.start == 0:
                # sequence begins with a gap: first contig starts after it
                coords = [g.end + 1]
            else:
                coords += [g.start - 1, g.end + 1]

        # The last entry is the position just after the final gap. If that
        # position is still inside the sequence a final contig starts there
        # and runs to the last base. Comparing with "+ 1" here would lose a
        # final contig that is exactly one base long.
        if coords[-1] < len(self):
            coords.append(len(self) - 1)

        # pair up the entries; an unpaired trailing start is ignored
        return [
            genome_intervals.Interval(coords[i], coords[i + 1])
            for i in range(0,
                           len(coords) - 1, 2)
        ]
def get_nucmer_hits(coords_file):
    '''Load the hits of a nucmer coords file, grouped by sequence name.

    Returns a tuple (ref_hits, qry_hits). Each is a dict that maps a
    reference (respectively query) sequence name to a list of
    genome_intervals.Interval with zero-based coordinates, after the list has
    been passed through genome_intervals.merge_overlapping_in_list().
    '''
    hits_on_ref = {}
    hits_on_qry = {}

    for hit in nucmer.file_reader(coords_file):
        # nucmer hits are 1-based. Inside the script, use 0-based. Sorting
        # makes sure start <= end whichever way round the hit is reported.
        ref_lo, ref_hi = sorted([hit.ref_start - 1, hit.ref_end - 1])
        ref_interval_list = hits_on_ref.setdefault(hit.ref_name, [])
        ref_interval_list.append(genome_intervals.Interval(ref_lo, ref_hi))

        qry_lo, qry_hi = sorted([hit.qry_start - 1, hit.qry_end - 1])
        qry_interval_list = hits_on_qry.setdefault(hit.qry_name, [])
        qry_interval_list.append(genome_intervals.Interval(qry_lo, qry_hi))

    # merge the hits of each sequence, reference lists first
    for interval_list in hits_on_ref.values():
        genome_intervals.merge_overlapping_in_list(interval_list)
    for interval_list in hits_on_qry.values():
        genome_intervals.merge_overlapping_in_list(interval_list)

    return hits_on_ref, hits_on_qry
def file2regions(fname):
    '''Read a file of regions into a dict of interval lists.

    Lines that start with '#' are skipped. Every other line must consist of
    exactly three whitespace-separated fields: name, start and end.

    Returns a dict that maps each name to a list of
    genome_intervals.Interval, in the order the regions appear in the file.

    Raises ValueError if a line does not have exactly three fields. Whatever
    genome_intervals.Interval raises for bad coordinates is passed on.
    '''
    regions = {}

    f = utils.open_file_read(fname)

    try:
        for line in f:
            if line.startswith('#'):
                continue

            (name, start, end) = line.rstrip().split()

            # NOTE(review): start and end are handed over as the strings read
            # from the file - confirm that Interval converts them to int
            regions.setdefault(name, []).append(
                genome_intervals.Interval(start, end))
    finally:
        # release the file handle even when a malformed line raises
        utils.close(f)

    return regions
    def test_intersects(self):
        '''intersects() should be True only for intervals that share a position'''
        Interval = genome_intervals.Interval
        query = Interval(5, 10)

        # intervals that end before the query starts or start after it ends
        disjoint = [Interval(3, 4), Interval(11, 20)]

        # touching either end, overlapping either end, nested inside, enclosing
        overlapping = [Interval(lo, hi) for lo, hi in
                       [(3, 5), (3, 6), (9, 12), (10, 12), (6, 7), (1, 20)]]

        for other in disjoint:
            message = 'shouldn\'t intersect: ' + str(query) + ', ' + str(other)
            self.assertFalse(query.intersects(other), message)

        for other in overlapping:
            message = 'should intersect: ' + str(query) + ', ' + str(other)
            self.assertTrue(query.intersects(other), message)
예제 #14
0
    def test_gaps(self):
        '''gaps() should report every run of Ns in a sequence'''
        # Each case pairs the bases of a sequence with the (start, end)
        # coords of the gaps expected back.
        cases = [
            ('ACGT', []),
            ('NACGT', [(0, 0)]),
            ('NACGTN', [(0, 0), (5, 5)]),
            ('ANNCGT', [(1, 2)]),
            ('NANNCGTNN', [(0, 0), (2, 3), (7, 8)]),
        ]

        for bases, spans in cases:
            expected = [genome_intervals.Interval(lo, hi) for lo, hi in spans]
            found = fastn.Fasta('ID', bases).gaps()
            self.assertListEqual(expected, found)
예제 #15
0
    # For records that are not on the forward strand the CIGAR is reversed
    # first (presumably so that the first/last operations below refer to the
    # start/end of the read as it was given - confirm).
    if not sam_record.is_forward_strand():
        sam_record.cigar.reverse()

    # 1-based start and end of the hit within the read; the default is the
    # whole read
    hit_start = 1
    hit_end = len(sam_record.seq)

    # a leading 'S' operation moves the start of the hit
    # NOTE(review): this uses the length of the 'S' operation itself as the
    # start, not that length + 1 - confirm this is intended
    if sam_record.cigar.operations[0].operator == 'S':
        hit_start = sam_record.cigar.operations[0].number

    # a trailing 'S' operation moves the end of the hit towards the start
    if sam_record.cigar.operations[-1].operator == 'S':
        hit_end = len(sam_record.seq) - sam_record.cigar.operations[-1].number

    if sam_record.id not in read_hit_coords:
        read_hit_coords[sam_record.id] = []

    # stored with 1 subtracted from both coords, i.e. zero-based
    read_hit_coords[sam_record.id].append(genome_intervals.Interval(hit_start - 1, hit_end - 1))

# tidy up: bwa_index_clean presumably removes the bwa index files (confirm);
# the SAM file is deleted
external_progs.bwa_index_clean(bwa_index)
os.unlink(bwa_sam)


# second pass over the input reads, writing <outprefix>.fq and <outprefix>.log
seq_reader = fastn.file_reader(options.reads_in)
f_fa = utils.open_file_write(options.outprefix + '.fq')
f_log = utils.open_file_write(options.outprefix + '.log')

for seq in seq_reader:
    if seq.id not in read_hit_coords:
        # no hit recorded for this read: write it out as it is and log it
        print(seq, file=f_fa)
        print(seq.id, 'no hit', sep='\t', file=f_log)
    else:
        # list of Intervals recorded for this read
        hits = read_hit_coords[seq.id]
예제 #16
0
    d['step'] = int(step)
    return d


# range2dic turns each range option into the value it returns (see range2dic)
delete_range = range2dic(options.delete_range)
insert_range = range2dic(options.insert_range)

# convert the -d regions into sequence name, start and end coords
# to_delete: key=sequence name, value=list of Intervals to delete
to_delete = {}
if options.delete:
    for s in options.delete:
        # Each entry is name:start-end. Split on the last colon only, so that
        # a sequence name which itself contains ':' is still parsed (the
        # insert parsing below also limits its split).
        id, coords = s.rsplit(':', 1)
        # 1 is subtracted from both coords (presumably 1-based input to
        # 0-based internal coords - confirm)
        start, end = [int(x) - 1 for x in coords.split('-')]
        if id not in to_delete:
            to_delete[id] = []
        to_delete[id].append(genome_intervals.Interval(start, end))

# to_insert: key=sequence name, value=list of (position, number of bases)
to_insert = {}
if options.insert:
    for s in options.insert:
        # Each entry is name:position:bases, split from the right
        id, pos, bases = s.rsplit(':', 2)
        pos = int(pos) - 1
        bases = int(bases)
        if id not in to_insert:
            to_insert[id] = []
        to_insert[id].append((pos, bases))

# deletions and insertions cannot both be given: at least one must be empty
assert len(to_delete) * len(to_insert) == 0

# merge overlapping regions to be deleted
for l in to_delete.values():
    def test_contains(self):
        '''contains() should be True only for intervals lying entirely inside'''
        Interval = genome_intervals.Interval
        outer = Interval(5, 10)

        # intervals that are outside the outer interval or stick out of
        # either end of it
        outside_spans = [(1, 2), (4, 5), (4, 10), (4, 11), (5, 11), (1, 2),
                         (9, 11), (10, 11), (11, 20)]
        # intervals that are inside, including the outer interval itself and
        # single positions at both ends
        inside_spans = [(5, 5), (5, 10), (6, 7), (6, 10), (10, 10)]

        for lo, hi in outside_spans:
            other = Interval(lo, hi)
            message = 'shouldn\'t contain: ' + str(outer) + ', ' + str(other)
            self.assertFalse(outer.contains(other), message)

        for lo, hi in inside_spans:
            other = Interval(lo, hi)
            message = 'should contain: ' + str(outer) + ', ' + str(other)
            self.assertTrue(outer.contains(other), message)
 def test_len(self):
     '''len() of an Interval should count both of its end positions'''
     # (start, end, expected length)
     cases = [(1, 2, 2), (1, 1, 1), (10, 20, 11)]

     for start, end, expected in cases:
         self.assertEqual(len(genome_intervals.Interval(start, end)), expected)
# load hits into hash. key=ref_name, value=another hash with key=qry_name, value=list of hit positions in that ref seq
nucmer_hits = {}
# key=ref_name, value=list of merged hits that are long enough to keep
contigs_to_print = {}

nucmer_reader = nucmer.file_reader(nucmer_out_coords)

for hit in nucmer_reader:
    if hit.ref_name not in nucmer_hits:
        nucmer_hits[hit.ref_name] = {}

    if hit.qry_name not in nucmer_hits[hit.ref_name]:
        nucmer_hits[hit.ref_name][hit.qry_name] = []

    # min/max so that start <= end whichever way round the hit is reported
    nucmer_hits[hit.ref_name][hit.qry_name].append(
        genome_intervals.Interval(min(hit.ref_start, hit.ref_end),
                                  max(hit.ref_start, hit.ref_end)))

# merge all the overlapping hits for each list of hits corresponding to one contig
for ref_name, d in nucmer_hits.items():
    for qry_name, hits in d.items():
        genome_intervals.merge_overlapping_in_list(hits)

        for hit in hits:
            # keep merged hits that cover at least options.min_seq_length
            # positions (both ends counted)
            if hit.end - hit.start + 1 >= options.min_seq_length:
                if ref_name not in contigs_to_print:
                    contigs_to_print[ref_name] = []

                # a copy is stored, presumably so that later changes to
                # contigs_to_print leave nucmer_hits untouched
                contigs_to_print[ref_name].append(copy.copy(hit))

# remove any contigs that are completely contained in another contig
for ref, l in contigs_to_print.items():
# get query sequence lengths and gap positions - add each gap coord to the
# list of covered positions for each sequence
for seq in seq_reader:
    # every sequence name may only be seen once
    assert seq.id not in seq_lengths
    seq_lengths[seq.id] = len(seq)
    # gaps count as covered from the start
    covered_regions[seq.id] = seq.gaps()

nucmer_reader = nucmer.file_reader(options.nucmer_coords)

for hit in nucmer_reader:
    # every hit must be to a sequence that was read in the loop above
    assert hit.qry_name in seq_lengths

    # gaps are stored with coords starting from zero. Nucmer starts at 1, so need to decrement the coords
    # (sorting makes sure start <= end whichever way round the hit is reported)
    start, end = sorted([hit.qry_start - 1, hit.qry_end - 1])
    covered_regions[hit.qry_name].append(genome_intervals.Interval(start, end))

# merge the covered regions
for l in covered_regions.values():
    genome_intervals.merge_overlapping_in_list(l)

f = utils.open_file_write(options.outfile)

# get the regions that are not covered
for id, covered in covered_regions.items():
    not_covered = []

    if len(covered) == 0:
        not_covered = [[1, seq_lengths[id]]]
    else:
        if covered[0].start != 0:
    def test_remove_contained_in_list(self):
        '''remove_contained_in_list() should drop intervals contained in another one'''
        Interval = genome_intervals.Interval

        # input includes nested intervals, exact duplicates and intervals
        # that only partly overlap their neighbours
        input_spans = [
            (1, 2), (4, 4), (4, 5), (5, 6), (7, 9), (8, 10), (9, 11),
            (20, 25), (20, 24), (20, 26),
            (30, 38), (30, 37), (30, 36), (30, 35), (30, 35), (32, 33),
            (38, 50), (65, 70), (67, 70),
        ]
        # partly overlapping intervals stay; contained ones are gone
        kept_spans = [
            (1, 2), (4, 5), (5, 6), (7, 9), (8, 10), (9, 11),
            (20, 26), (30, 38), (38, 50), (65, 70),
        ]

        intervals = [Interval(lo, hi) for lo, hi in input_spans]
        expected = [Interval(lo, hi) for lo, hi in kept_spans]

        # the list is modified in place
        genome_intervals.remove_contained_in_list(intervals)
        self.assertSequenceEqual(intervals, expected)
    def test_intersection(self):
        '''intersection() of two interval lists should give the shared regions'''
        Interval = genome_intervals.Interval

        first_spans = [(1, 2), (10, 20), (51, 52), (54, 55), (57, 58)]
        second_spans = [(5, 6), (9, 11), (13, 14), (17, 18), (20, 25),
                        (50, 60)]
        shared_spans = [(10, 11), (13, 14), (17, 18), (20, 20), (51, 52),
                        (54, 55), (57, 58)]

        first = [Interval(lo, hi) for lo, hi in first_spans]
        second = [Interval(lo, hi) for lo, hi in second_spans]
        expected = [Interval(lo, hi) for lo, hi in shared_spans]

        # the result must not depend on the order of the arguments
        self.assertSequenceEqual(
            genome_intervals.intersection(first, second), expected)
        self.assertSequenceEqual(
            genome_intervals.intersection(second, first), expected)
    'Makes a random genome with sequence lengths and names determined by an fai file. IMPORTANT: not really random, at the moment every base will be an A (or an N if --gaps_file used)',
    usage='%(prog)s [options] <fai file> <outfile>')
parser.add_argument(
    '--gaps_file',
    help='File of gaps, each line in the form: "chr start end" (tab separated)'
)
parser.add_argument('fai_file', help='Name of fai file')
parser.add_argument('outfile', help='Name of output fasta file')
options = parser.parse_args()

# gaps: key=sequence name, value=list of gap Intervals read from --gaps_file
gaps = {}
if options.gaps_file:
    f = utils.open_file_read(options.gaps_file)
    for line in f:
        # every line must have exactly three tab-separated fields
        (id, start, end) = line.rstrip().split('\t')
        # 1 is subtracted from both coords (presumably 1-based coords in the
        # file to 0-based internal coords - confirm)
        gap = genome_intervals.Interval(int(start) - 1, int(end) - 1)
        if id not in gaps:
            gaps[id] = []
        gaps[id].append(gap)
    utils.close(f)

f_in = utils.open_file_read(options.fai_file)
f_out = utils.open_file_write(options.outfile)

for line in f_in:
    a = line.rstrip().split()
    fa = fastn.Fasta(a[0], 'A' * int(a[1]))

    if fa.id in gaps:
        fa.seq = list(fa.seq)
        for gap in gaps[fa.id]: