def test_get_cutouts_basic(): intervals = Localizer(seedsize=10) intervals.add_seed_match('bogus-genome-chr2', 10) refrstream = open(data_file('bogus-genome/refr.fa'), 'r') seqs = kevlar.seqio.parse_seq_dict(refrstream) cutouts = list(intervals.get_cutouts(refrseqs=seqs)) assert len(cutouts) == 1 assert cutouts[0].defline == 'bogus-genome-chr2_10-20' assert cutouts[0].sequence == 'GTTACATTAC'
def test_get_cutouts_large_span(): refrstream = open(data_file('simple-genome-ctrl1.fa'), 'r') seqs = kevlar.seqio.parse_seq_dict(refrstream) intervals = Localizer(seedsize=21, delta=25) intervals.add_seed_match('simple', 100) intervals.add_seed_match('simple', 200) cutouts = intervals.get_cutouts(refrseqs=seqs, clusterdist=50) deflines = [c.defline for c in cutouts] assert deflines == ['simple_75-146', 'simple_175-246'] intervals = Localizer(seedsize=21, delta=50) intervals.add_seed_match('simple', 100) intervals.add_seed_match('simple', 200) cutouts = intervals.get_cutouts(refrseqs=seqs, clusterdist=100) deflines = [c.defline for c in cutouts] assert deflines == ['simple_50-271']
def test_get_cutouts_basic_2(): intervals = Localizer(seedsize=21) intervals.add_seed_match('simple', 49) intervals.add_seed_match('simple', 52) intervals.add_seed_match('simple', 59) refrstream = open(data_file('simple-genome-ctrl1.fa'), 'r') seqs = kevlar.seqio.parse_seq_dict(refrstream) cutouts = list(intervals.get_cutouts(refrseqs=seqs, delta=5)) assert len(cutouts) == 1 assert cutouts[0].defline == 'simple_44-85' assert cutouts[0].sequence == 'AATACTATGCCGATTTATTCTTACACAATTAAATTGCTAGT'
def test_get_cutouts_basic_3(): intervals = Localizer(seedsize=21, delta=10) intervals.add_seed_match('simple', 40) intervals.add_seed_match('simple', 80) intervals.add_seed_match('simple', 120) intervals.add_seed_match('simple', 500) refrstream = open(data_file('simple-genome-ctrl1.fa'), 'r') seqs = kevlar.seqio.parse_seq_dict(refrstream) cutouts = list(intervals.get_cutouts(refrseqs=seqs, clusterdist=None)) assert len(cutouts) == 1 assert cutouts[0].defline == 'simple_30-531' assert len(cutouts[0].sequence) == 501
def test_localizer_incl_excl(): intervals = Localizer(seedsize=25) intervals.add_seed_match('1', 100) intervals.add_seed_match('1', 120) intervals.add_seed_match('12', 200) intervals.add_seed_match('12', 209) intervals.add_seed_match('12', 213) intervals.add_seed_match('X', 1234) intervals.add_seed_match('X', 1245) intervals.add_seed_match('Un', 13579) intervals.add_seed_match('Un', 13597) testint = [c.interval for c in intervals.get_cutouts()] assert sorted(testint) == [ ('1', 100, 145), ('12', 200, 238), ('Un', 13579, 13622), ('X', 1234, 1270), ] intervals.exclpattern = 'Un' testint = [c.interval for c in intervals.get_cutouts()] assert sorted(testint) == [ ('1', 100, 145), ('12', 200, 238), ('X', 1234, 1270), ] intervals.inclpattern = r'^\d+$' testint = [c.interval for c in intervals.get_cutouts()] assert sorted(testint) == [ ('1', 100, 145), ('12', 200, 238), ]
def test_localizer_simple(): intervals = Localizer(seedsize=25) assert list(intervals.get_cutouts()) == [] intervals.add_seed_match('chr1', 100) intervals.add_seed_match('chr1', 115) intervals.add_seed_match('chr2', 200) intervals.add_seed_match('chr2', 205) intervals.add_seed_match('chr2', 207) intervals.add_seed_match('chr2', 235008) intervals.add_seed_match('chr2', 235075) testint = [c.interval for c in intervals.get_cutouts()] print('DEBUG', testint) assert testint == [('chr1', 100, 140), ('chr2', 200, 232), ('chr2', 235008, 235100)]
def test_extract_regions_boundaries(): refrstream = open(data_file('simple-genome-ctrl1.fa'), 'r') seqs = kevlar.seqio.parse_seq_dict(refrstream) intervals = Localizer(seedsize=31) intervals.add_seed_match('simple', 15) cutouts = list(intervals.get_cutouts(refrseqs=seqs, delta=20)) assert len(cutouts) == 1 assert cutouts[0].defline == 'simple_0-66' intervals = Localizer(seedsize=31) intervals.add_seed_match('simple', 925) intervals.add_seed_match('simple', 955) intervals.add_seed_match('simple', 978) cutouts = list(intervals.get_cutouts(refrseqs=seqs, delta=20)) assert len(cutouts) == 1 assert cutouts[0].defline == 'simple_905-1000'
def test_get_cutouts_missing_seq(): intervals = Localizer(seedsize=21) intervals.add_seed_match('simple', 100) intervals.add_seed_match('simple', 200) intervals.add_seed_match('TheCakeIsALie', 42) intervals.add_seed_match('TheCakeIsALie', 100) intervals.add_seed_match('TheCakeIsALie', 77) refrstream = open(data_file('simple-genome-ctrl1.fa'), 'r') seqs = kevlar.seqio.parse_seq_dict(refrstream) with pytest.raises(KevlarRefrSeqNotFoundError, match=r'TheCakeIsALie'): list(intervals.get_cutouts(refrseqs=seqs))