示例#1
0
def test_get_cutouts_basic():
    """Check that a single seed match yields a single cutout.

    One seed match is registered at position 10 of 'bogus-genome-chr2' with
    a seed size of 10; the test expects exactly one cutout labeled
    'bogus-genome-chr2_10-20' carrying the 10 bp sequence asserted below.
    """
    intervals = Localizer(seedsize=10)
    intervals.add_seed_match('bogus-genome-chr2', 10)
    # Use a context manager so the reference file handle is always closed.
    # NOTE(review): assumes parse_seq_dict fully consumes the stream before
    # returning -- confirm against kevlar.seqio.
    with open(data_file('bogus-genome/refr.fa'), 'r') as refrstream:
        seqs = kevlar.seqio.parse_seq_dict(refrstream)
    cutouts = list(intervals.get_cutouts(refrseqs=seqs))
    assert len(cutouts) == 1
    assert cutouts[0].defline == 'bogus-genome-chr2_10-20'
    assert cutouts[0].sequence == 'GTTACATTAC'
示例#2
0
def test_get_cutouts_large_span():
    """Check how `delta` and `clusterdist` affect merging of distant matches.

    The same two seed matches (positions 100 and 200 on 'simple') are used
    twice: with delta=25 / clusterdist=50 two separate cutouts are expected,
    while with delta=50 / clusterdist=100 a single merged cutout is expected.
    """
    # Use a context manager so the reference file handle is always closed.
    # NOTE(review): assumes parse_seq_dict fully consumes the stream before
    # returning -- confirm against kevlar.seqio.
    with open(data_file('simple-genome-ctrl1.fa'), 'r') as refrstream:
        seqs = kevlar.seqio.parse_seq_dict(refrstream)

    # Smaller padding and cluster distance: matches stay in separate cutouts.
    intervals = Localizer(seedsize=21, delta=25)
    intervals.add_seed_match('simple', 100)
    intervals.add_seed_match('simple', 200)
    cutouts = intervals.get_cutouts(refrseqs=seqs, clusterdist=50)
    deflines = [c.defline for c in cutouts]
    assert deflines == ['simple_75-146', 'simple_175-246']

    # Larger padding and cluster distance: matches collapse into one cutout.
    intervals = Localizer(seedsize=21, delta=50)
    intervals.add_seed_match('simple', 100)
    intervals.add_seed_match('simple', 200)
    cutouts = intervals.get_cutouts(refrseqs=seqs, clusterdist=100)
    deflines = [c.defline for c in cutouts]
    assert deflines == ['simple_50-271']
示例#3
0
def test_get_cutouts_basic_2():
    """Check that three nearby seed matches produce one padded cutout.

    Matches at positions 49, 52 and 59 (seed size 21) are requested with
    delta=5; the test expects a single cutout 'simple_44-85', which is
    consistent with 49 - 5 and 59 + 21 + 5.
    """
    intervals = Localizer(seedsize=21)
    intervals.add_seed_match('simple', 49)
    intervals.add_seed_match('simple', 52)
    intervals.add_seed_match('simple', 59)
    # Use a context manager so the reference file handle is always closed.
    # NOTE(review): assumes parse_seq_dict fully consumes the stream before
    # returning -- confirm against kevlar.seqio.
    with open(data_file('simple-genome-ctrl1.fa'), 'r') as refrstream:
        seqs = kevlar.seqio.parse_seq_dict(refrstream)
    cutouts = list(intervals.get_cutouts(refrseqs=seqs, delta=5))
    assert len(cutouts) == 1
    assert cutouts[0].defline == 'simple_44-85'
    assert cutouts[0].sequence == 'AATACTATGCCGATTTATTCTTACACAATTAAATTGCTAGT'
示例#4
0
def test_get_cutouts_basic_3():
    """Check that `clusterdist=None` yields one cutout spanning all matches.

    Four seed matches (positions 40, 80, 120 and 500; seed size 21,
    delta=10) are expected to be reported as the single cutout
    'simple_30-531' with a 501 bp sequence.
    """
    intervals = Localizer(seedsize=21, delta=10)
    intervals.add_seed_match('simple', 40)
    intervals.add_seed_match('simple', 80)
    intervals.add_seed_match('simple', 120)
    intervals.add_seed_match('simple', 500)
    # Use a context manager so the reference file handle is always closed.
    # NOTE(review): assumes parse_seq_dict fully consumes the stream before
    # returning -- confirm against kevlar.seqio.
    with open(data_file('simple-genome-ctrl1.fa'), 'r') as refrstream:
        seqs = kevlar.seqio.parse_seq_dict(refrstream)
    cutouts = list(intervals.get_cutouts(refrseqs=seqs, clusterdist=None))
    assert len(cutouts) == 1
    assert cutouts[0].defline == 'simple_30-531'
    assert len(cutouts[0].sequence) == 501
示例#5
0
def test_localizer_incl_excl():
    """Check cutout intervals before and after setting the sequence filters.

    Seed matches are registered on sequences '1', '12', 'X' and 'Un'. The
    sorted cutout intervals are verified three times: with no filter, after
    assigning `exclpattern = 'Un'` (the 'Un' interval is expected to drop
    out), and after additionally assigning `inclpattern = r'^\\d+$'` (only
    the '1' and '12' intervals are expected to remain).
    """
    localizer = Localizer(seedsize=25)
    seed_matches = (
        ('1', 100),
        ('1', 120),
        ('12', 200),
        ('12', 209),
        ('12', 213),
        ('X', 1234),
        ('X', 1245),
        ('Un', 13579),
        ('Un', 13597),
    )
    for seqid, position in seed_matches:
        localizer.add_seed_match(seqid, position)

    def sorted_intervals():
        # Re-query the localizer each time so the current filters apply.
        observed = [cutout.interval for cutout in localizer.get_cutouts()]
        return sorted(observed)

    numeric_only = [
        ('1', 100, 145),
        ('12', 200, 238),
    ]
    chrom_x = ('X', 1234, 1270)
    unplaced = ('Un', 13579, 13622)

    # No filters: every sequence contributes one interval.
    assert sorted_intervals() == numeric_only + [unplaced, chrom_x]

    # Exclusion pattern set: the 'Un' interval is no longer reported.
    localizer.exclpattern = 'Un'
    assert sorted_intervals() == numeric_only + [chrom_x]

    # Inclusion pattern set as well: only all-digit sequence IDs remain.
    localizer.inclpattern = r'^\d+$'
    assert sorted_intervals() == numeric_only
示例#6
0
def test_localizer_simple():
    """Check interval computation without any reference sequences.

    An empty localizer is expected to yield no cutouts. After seed matches
    are added on 'chr1' and 'chr2' (seed size 25), three intervals are
    expected, in the order asserted below.
    """
    intervals = Localizer(seedsize=25)
    assert list(intervals.get_cutouts()) == []

    intervals.add_seed_match('chr1', 100)
    intervals.add_seed_match('chr1', 115)
    intervals.add_seed_match('chr2', 200)
    intervals.add_seed_match('chr2', 205)
    intervals.add_seed_match('chr2', 207)
    intervals.add_seed_match('chr2', 235008)
    intervals.add_seed_match('chr2', 235075)
    testint = [c.interval for c in intervals.get_cutouts()]
    # No debug print needed: pytest's assertion introspection reports the
    # observed value of `testint` when the comparison fails.
    assert testint == [('chr1', 100, 140), ('chr2', 200, 232),
                       ('chr2', 235008, 235100)]
示例#7
0
def test_extract_regions_boundaries():
    """Check cutout coordinates near the start and end of the reference.

    With seed size 31 and delta=20, a match at position 15 is expected to
    give 'simple_0-66' (start not negative), and matches at 925, 955 and 978
    are expected to give 'simple_905-1000'.
    """
    # Use a context manager so the reference file handle is always closed.
    # NOTE(review): assumes parse_seq_dict fully consumes the stream before
    # returning -- confirm against kevlar.seqio.
    with open(data_file('simple-genome-ctrl1.fa'), 'r') as refrstream:
        seqs = kevlar.seqio.parse_seq_dict(refrstream)

    # Left boundary: 15 - 20 would be negative; expected start is 0.
    intervals = Localizer(seedsize=31)
    intervals.add_seed_match('simple', 15)
    cutouts = list(intervals.get_cutouts(refrseqs=seqs, delta=20))
    assert len(cutouts) == 1
    assert cutouts[0].defline == 'simple_0-66'

    # Right boundary: 978 + 31 + 20 exceeds the expected end of 1000.
    # NOTE(review): presumably 1000 is the length of 'simple' -- confirm.
    intervals = Localizer(seedsize=31)
    intervals.add_seed_match('simple', 925)
    intervals.add_seed_match('simple', 955)
    intervals.add_seed_match('simple', 978)
    cutouts = list(intervals.get_cutouts(refrseqs=seqs, delta=20))
    assert len(cutouts) == 1
    assert cutouts[0].defline == 'simple_905-1000'
示例#8
0
def test_get_cutouts_missing_seq():
    """Check that a missing reference sequence raises an error.

    Seed matches are registered on 'simple' and on 'TheCakeIsALie'; the
    test expects consuming `get_cutouts` to raise
    KevlarRefrSeqNotFoundError with a message matching 'TheCakeIsALie'.
    """
    intervals = Localizer(seedsize=21)
    intervals.add_seed_match('simple', 100)
    intervals.add_seed_match('simple', 200)
    intervals.add_seed_match('TheCakeIsALie', 42)
    intervals.add_seed_match('TheCakeIsALie', 100)
    intervals.add_seed_match('TheCakeIsALie', 77)
    # Use a context manager so the reference file handle is always closed.
    # NOTE(review): assumes parse_seq_dict fully consumes the stream before
    # returning -- confirm against kevlar.seqio.
    with open(data_file('simple-genome-ctrl1.fa'), 'r') as refrstream:
        seqs = kevlar.seqio.parse_seq_dict(refrstream)
    with pytest.raises(KevlarRefrSeqNotFoundError, match=r'TheCakeIsALie'):
        # list() forces the cutouts to be produced inside the raises block.
        list(intervals.get_cutouts(refrseqs=seqs))