def make_call_from_reads(queue, idx, calls, refrfile, ksize=31, delta=50,
                         seedsize=31, maxdiff=None, match=1, mismatch=2,
                         gapopen=5, gapextend=0, min_ikmers=None,
                         refrseqs=None, logstream=sys.stderr):
    while True:
        if queue.empty():
            sleep(3)
            continue
        reads = queue.get()
        ccmatch = re.search(r'kvcc=(\d+)', reads[0].name)
        cc = ccmatch.group(1) if ccmatch else None
        message = '[kevlar::alac::make_call_from_reads'
        message += ' (thread={:d})]'.format(idx)
        message += ' grabbed partition={} from queue,'.format(cc)
        message += ' queue size now {:d}'.format(queue.qsize())
        print(message, file=sys.stderr)

        # Assemble partitioned reads into contig(s)
        contigs = list(assemble_fml_asm(reads, logstream=logstream))
        if min_ikmers is not None:
            # Apply min ikmer filter if it's set
            contigs = [c for c in contigs if len(c.annotations) >= min_ikmers]
        if len(contigs) == 0:
            queue.task_done()
            continue

        # Identify the genomic region(s) associated with each contig
        localizer = localize(contigs, refrfile, seedsize, delta=delta,
                             maxdiff=maxdiff, refrseqs=refrseqs,
                             logstream=logstream)
        targets = list(localizer)
        if len(targets) == 0:
            queue.task_done()
            continue

        # Align contigs to genomic targets to make variant calls
        caller = call(targets, contigs, match, mismatch, gapopen, gapextend,
                      ksize, refrfile)
        for varcall in caller:
            if cc:
                varcall.annotate('PART', cc)
            calls.append(varcall)
        queue.task_done()

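# A minimal wiring sketch for the worker above, assuming a `partitions`
# iterable of read lists and the same localize/call machinery used in
# make_call_from_reads; the thread count and queue bound are arbitrary
# illustrative choices, not taken from kevlar itself.
import threading
from queue import Queue

def run_call_workers(partitions, refrfile, numthreads=4):
    queue = Queue(maxsize=numthreads * 10)
    calls = list()
    for idx in range(numthreads):
        worker = threading.Thread(
            target=make_call_from_reads,
            args=(queue, idx, calls, refrfile),
        )
        worker.daemon = True  # workers loop forever; let them exit with main
        worker.start()
    for reads in partitions:
        queue.put(reads)
    queue.join()  # blocks until every partition is marked task_done()
    return calls
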
def test_localize_no_match(capsys):
    # A contig from an unrelated data set should produce no reference cutouts
    # against this reference, and should trigger a warning on stderr.
    refr_file = data_file('fiveparts-refr.fa.gz')
    contig_file = data_file('wasp-pass.contig.augfasta')
    contigstream = kevlar.parse_augmented_fastx(kevlar.open(contig_file, 'r'))
    pstream = kevlar.parse_partitioned_reads(contigstream)
    localizer = localize(pstream, refr_file, seedsize=41, debug=True)
    cutoutdata = list(localizer)
    assert cutoutdata == []
    out, err = capsys.readouterr()
    assert 'WARNING: no reference matches' in err

def test_localize_new_single_partition(partid, testdeflines):
    # Localize contigs from a single partition and check the deflines of the
    # resulting reference cutouts against the expected values for that
    # partition.
    refr_file = data_file('fiveparts-refr.fa.gz')
    contig_file = data_file('fiveparts.contigs.augfasta.gz')
    contigstream = kevlar.parse_augmented_fastx(kevlar.open(contig_file, 'r'))
    pstream = kevlar.parse_single_partition(contigstream, partid)
    localizer = localize(pstream, refr_file, seedsize=51)
    cutoutdata = list(localizer)
    partids = [partid for partid, gdna in cutoutdata]
    gdnas = [gdna for partid, gdna in cutoutdata]
    deflines = sorted([g.defline for g in gdnas])
    assert deflines == testdeflines

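# The test above relies on pytest injecting `partid` and `testdeflines`; a
# parametrization along the lines sketched below is assumed. The values here
# are placeholders, not the real expected deflines from the fiveparts test
# data.
import pytest

@pytest.mark.parametrize('partid,testdeflines', [
    ('1', ['seq1_placeholder_a', 'seq1_placeholder_b']),
    ('2', ['seq1_placeholder_c']),
])
def test_parametrization_sketch(partid, testdeflines):
    # Stands in for test_localize_new_single_partition; checks only that each
    # case delivers a partition ID string and its list of expected deflines.
    assert isinstance(partid, str)
    assert isinstance(testdeflines, list)
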
def alac(pstream, refrfile, ksize=31, delta=25, maxdiff=10000, match=1,
         mismatch=2, gapopen=5, gapextend=0, greedy=False,
         logstream=sys.stderr):
    # For each partition of reads: assemble contigs, localize them to target
    # region(s) of the reference, then align and call variants.
    assembler = assemble_greedy if greedy else assemble_fml_asm
    for partition in pstream:
        contigs = [c for c in assembler(partition, logstream=logstream)]
        targets = [t for t in localize(contigs, refrfile, ksize, delta=delta)]
        caller = call(
            targets, contigs, match, mismatch, gapopen, gapextend, ksize
        )
        for varcall in caller:
            yield varcall

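# A minimal usage sketch for alac(), assuming the partition-parsing helpers
# used by the tests in this section (kevlar.open, kevlar.parse_augmented_fastx,
# kevlar.parse_partitioned_reads); the file paths are placeholders.
def example_alac_run(readfile, refrfile):
    readstream = kevlar.parse_augmented_fastx(kevlar.open(readfile, 'r'))
    pstream = kevlar.parse_partitioned_reads(readstream)
    for varcall in alac(pstream, refrfile, ksize=31, delta=50):
        print(varcall)  # downstream code would format and sort these calls
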
def test_localize_new():
    # Localize contigs from all five partitions and check both the partition
    # IDs and the deflines of the reference cutouts.
    refr_file = data_file('fiveparts-refr.fa.gz')
    contig_file = data_file('fiveparts.contigs.augfasta.gz')
    contigstream = kevlar.parse_augmented_fastx(kevlar.open(contig_file, 'r'))
    pstream = kevlar.parse_partitioned_reads(contigstream)
    localizer = localize(pstream, refr_file, seedsize=51, debug=True)
    cutoutdata = list(localizer)
    partids = [partid for partid, gdna in cutoutdata]
    gdnas = [gdna for partid, gdna in cutoutdata]
    deflines = [g.defline for g in gdnas]
    assert partids == ['1', '1', '2', '3', '4', '5']
    assert sorted(deflines) == sorted([
        'seq1_284663-284950', 'seq1_1924681-1925049', 'seq1_1660589-1660884',
        'seq1_2315741-2316037', 'seq1_2321099-2321322', 'seq1_593102-593389'
    ])

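# The functions and tests in this section assume module-level imports along
# these lines; the kevlar-internal paths follow the project's usual layout
# but are inferred here, not shown in the excerpts above.
import re
import sys
from time import sleep

import kevlar
from kevlar.assemble import assemble_fml_asm, assemble_greedy
from kevlar.call import call
from kevlar.localize import localize
from kevlar.tests import data_file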