Example #1
0
def make_call_from_reads(queue,
                         idx,
                         calls,
                         refrfile,
                         ksize=31,
                         delta=50,
                         seedsize=31,
                         maxdiff=None,
                         match=1,
                         mismatch=2,
                         gapopen=5,
                         gapextend=0,
                         min_ikmers=None,
                         refrseqs=None,
                         logstream=sys.stderr):
    """Worker loop: assemble reads, localize contigs, and make variant calls.

    Repeatedly pulls a partition of reads from `queue`, assembles the reads
    into contigs, localizes the contigs against the reference genome in
    `refrfile`, aligns them to the localized targets, and appends the
    resulting variant calls to `calls`. Intended to run forever in a worker
    thread (index `idx`); this function never returns.

    :param queue: work queue of read partitions; `task_done()` is invoked
        once per partition pulled with `get()`
    :param idx: worker thread index, used only in status messages
    :param calls: shared list to which variant calls are appended
    :param refrfile: path to the reference genome
    :param min_ikmers: if set, discard contigs annotated with fewer than
        this many interesting k-mers
    :param logstream: stream for status/log output
    """
    while True:
        # Poll-and-sleep rather than a blocking get(); NOTE(review): a
        # blocking `queue.get()` would avoid the 3-second poll latency, but
        # the polling behavior is preserved here.
        if queue.empty():
            sleep(3)
            continue
        reads = queue.get()
        # Partition (connected component) label embedded in the read name,
        # e.g. "kvcc=42"; None if absent.
        ccmatch = re.search(r'kvcc=(\d+)', reads[0].name)
        cc = ccmatch.group(1) if ccmatch else None
        message = '[kevlar::alac::make_call_from_reads'
        message += ' (thread={:d})]'.format(idx)
        message += ' grabbed partition={} from queue,'.format(cc)
        message += ' queue size now {:d}'.format(queue.qsize())
        # BUGFIX: honor the `logstream` parameter (previously hard-coded to
        # sys.stderr), consistent with the assembly/localization calls below.
        # Default behavior is unchanged since logstream defaults to stderr.
        print(message, file=logstream)

        # Assemble partitioned reads into contig(s)
        contigs = list(assemble_fml_asm(reads, logstream=logstream))
        if min_ikmers is not None:
            # Apply min ikmer filter if it's set
            contigs = [c for c in contigs if len(c.annotations) >= min_ikmers]
        if len(contigs) == 0:
            queue.task_done()
            continue

        # Identify the genomic region(s) associated with each contig
        localizer = localize(contigs,
                             refrfile,
                             seedsize,
                             delta=delta,
                             maxdiff=maxdiff,
                             refrseqs=refrseqs,
                             logstream=logstream)
        targets = list(localizer)
        if len(targets) == 0:
            queue.task_done()
            continue

        # Align contigs to genomic targets to make variant calls
        caller = call(targets, contigs, match, mismatch, gapopen, gapextend,
                      ksize, refrfile)
        for varcall in caller:
            if cc:
                varcall.annotate('PART', cc)
            calls.append(varcall)
        queue.task_done()
Example #2
0
def test_localize_no_match(capsys):
    """Localization yields nothing and warns when no seeds hit the reference."""
    reference = data_file('fiveparts-refr.fa.gz')
    contigpath = data_file('wasp-pass.contig.augfasta')
    records = kevlar.parse_augmented_fastx(kevlar.open(contigpath, 'r'))
    partitions = kevlar.parse_partitioned_reads(records)
    cutouts = list(localize(partitions, reference, seedsize=41, debug=True))
    assert cutouts == []
    # The warning is emitted on stderr in debug mode.
    out, err = capsys.readouterr()
    assert 'WARNING: no reference matches' in err
Example #3
0
def test_localize_new_single_partition(partid, testdeflines):
    """Cutout deflines for one partition match the expected fixture."""
    reference = data_file('fiveparts-refr.fa.gz')
    contigpath = data_file('fiveparts.contigs.augfasta.gz')
    records = kevlar.parse_augmented_fastx(kevlar.open(contigpath, 'r'))
    partition = kevlar.parse_single_partition(records, partid)
    cutouts = list(localize(partition, reference, seedsize=51))
    # Each cutout is a (partition id, genomic DNA cutout) pair; only the
    # deflines are compared against the fixture.
    observed = sorted(gdna.defline for _, gdna in cutouts)
    assert observed == testdeflines
Example #4
0
File: alac.py  Project: jchow32/kevlar
def alac(pstream, refrfile, ksize=31, delta=25, maxdiff=10000, match=1,
         mismatch=2, gapopen=5, gapextend=0, greedy=False,
         logstream=sys.stderr):
    """Assemble, localize, align, and call variants for each partition.

    For every partition of reads in `pstream`: assemble the reads into
    contigs, localize the contigs against the reference in `refrfile`, align
    them to the localized targets, and yield the resulting variant calls.

    :param pstream: stream of read partitions
    :param refrfile: path to the reference genome
    :param greedy: use the greedy assembler instead of fermi-lite
    :param logstream: stream for assembler status output
    """
    assembler = assemble_greedy if greedy else assemble_fml_asm

    # NOTE(review): `maxdiff` is accepted but not forwarded to `localize`;
    # confirm whether it should be passed through (left unchanged here to
    # preserve behavior).
    for partition in pstream:
        contigs = list(assembler(partition, logstream=logstream))
        targets = list(localize(contigs, refrfile, ksize, delta=delta))
        caller = call(
            targets, contigs, match, mismatch, gapopen, gapextend, ksize
        )
        yield from caller
Example #5
0
def test_localize_new():
    """All five partitions localize to the expected reference cutouts."""
    reference = data_file('fiveparts-refr.fa.gz')
    contigpath = data_file('fiveparts.contigs.augfasta.gz')
    records = kevlar.parse_augmented_fastx(kevlar.open(contigpath, 'r'))
    partitions = kevlar.parse_partitioned_reads(records)
    cutouts = list(localize(partitions, reference, seedsize=51, debug=True))
    # Partition ids come back in order; partition 1 yields two cutouts.
    observed_partids = [pid for pid, _ in cutouts]
    observed_deflines = [gdna.defline for _, gdna in cutouts]
    assert observed_partids == ['1', '1', '2', '3', '4', '5']
    expected = [
        'seq1_284663-284950', 'seq1_1924681-1925049', 'seq1_1660589-1660884',
        'seq1_2315741-2316037', 'seq1_2321099-2321322', 'seq1_593102-593389'
    ]
    assert sorted(observed_deflines) == sorted(expected)