Example #1
File: filter.py Project: jchow32/kevlar
def main(args):
    timer = kevlar.Timer()
    timer.start()

    mask = load_mask(args.mask,
                     args.ksize,
                     args.mask_memory,
                     maxfpr=args.mask_max_fpr,
                     savefile=args.save_mask,
                     logstream=args.logfile)
    readstream = kevlar.seqio.afxstream(args.augfastq)
    outstream = kevlar.open(args.out, 'w')
    filterstream = filter(readstream,
                          mask,
                          minabund=args.min_abund,
                          ksize=args.ksize,
                          memory=args.abund_memory,
                          maxfpr=args.abund_max_fpr,
                          logstream=args.logfile)
    for record in filterstream:
        kevlar.print_augmented_fastx(record, outstream)

    total = timer.stop()
    message = 'Total time: {:.2f} seconds'.format(total)
    print('[kevlar::filter]', message, file=args.logfile)
Example #2
File: novel.py Project: jchow32/kevlar
def main(args):
    timer = kevlar.Timer()
    timer.start()
    # --num-bands and --band must be given together; this is an
    # exclusive-or check on their truthiness.
    if (not args.num_bands) is not (not args.band):
        raise ValueError('Must specify --num-bands and --band together')
    myband = args.band - 1 if args.band else None

    timer.start('loadall')
    print('[kevlar::novel] Loading control samples', file=args.logfile)
    timer.start('loadctrl')
    controls = load_samples(args.control_counts, args.control, args.ksize,
                            args.memory, args.max_fpr, args.num_bands, myband,
                            args.threads, args.logfile)
    elapsed = timer.stop('loadctrl')
    message = 'Control samples loaded in {:.2f} sec'.format(elapsed)
    print('[kevlar::novel]', message, file=args.logfile)

    print('[kevlar::novel] Loading case samples', file=args.logfile)
    timer.start('loadcases')
    cases = load_samples(args.case_counts, args.case, args.ksize, args.memory,
                         args.max_fpr, args.num_bands, myband, args.threads,
                         args.logfile)
    elapsed = timer.stop('loadcases')
    print('[kevlar::novel] Case samples loaded in {:.2f} sec'.format(elapsed),
          file=args.logfile)
    elapsed = timer.stop('loadall')
    print('[kevlar::novel] All samples loaded in {:.2f} sec'.format(elapsed),
          file=args.logfile)

    timer.start('iter')
    ncases = len(args.case)
    message = 'Iterating over reads from {:d} case sample(s)'.format(ncases)
    print('[kevlar::novel]', message, file=args.logfile)
    outstream = kevlar.open(args.out, 'w')
    infiles = [f for filelist in args.case for f in filelist]
    caserecords = kevlar.multi_file_iter_screed(infiles)
    readstream = novel(
        caserecords,
        cases,
        controls,
        ksize=args.ksize,
        abundscreen=args.abund_screen,
        casemin=args.case_min,
        ctrlmax=args.ctrl_max,
        numbands=args.num_bands,
        band=myband,
        skipuntil=args.skip_until,
        updateint=args.upint,
        logstream=args.logfile,
    )
    for augmented_read in readstream:
        kevlar.print_augmented_fastx(augmented_read, outstream)

    elapsed = timer.stop('iter')
    message = 'Iterated over all case reads in {:.2f} seconds'.format(elapsed)
    print('[kevlar::novel]', message, file=args.logfile)

    total = timer.stop()
    message = 'Total time: {:.2f} seconds'.format(total)
    print('[kevlar::novel]', message, file=args.logfile)
Example #3
def main(args):
    readstream = kevlar.parse_augmented_fastx(kevlar.open(args.augfastq, 'r'))
    if args.part_id:
        pstream = kevlar.parse_single_partition(readstream, args.part_id)
    else:
        pstream = kevlar.parse_partitioned_reads(readstream)
    outstream = kevlar.open(args.out, 'w')
    assembler = assemble(pstream, maxreads=args.max_reads)
    for partid, contig in assembler:
        kevlar.print_augmented_fastx(contig, outstream)
Example #4
def assemble_with_greed(graph, ccindex, debugout=None):
    """Find shortest common superstring using a greedy assembly algorithm."""
    count = 0
    while len(graph.edges()) > 0:
        count += 1

        # Merge the pair of reads/contigs with the largest overlap into a
        # new contig record.
        pair = fetch_largest_overlapping_pair(graph)
        newname = 'contig{:d}:cc={:d}'.format(count, ccindex)
        newrecord = merge_and_reannotate(pair, newname)
        if debugout:
            print('### DEBUG',
                  pair.tail.name,
                  pair.head.name,
                  pair.offset,
                  pair.overlap,
                  pair.sameorient,
                  file=debugout)
            kevlar.print_augmented_fastx(newrecord, debugout)
        # For each interesting k-mer in the merged contig, look for other
        # reads sharing that k-mer and compute new candidate overlaps.
        for kmer in newrecord.ikmers:
            kmerseq = kevlar.revcommin(kmer.sequence)
            for readname in graph.ikmers[kmerseq]:
                # Skip reads already merged away, as well as the two reads
                # just merged and the new contig itself.
                already_merged = readname not in graph
                current_contig = readname in [
                    pair.tail.name, pair.head.name, newname
                ]
                if already_merged or current_contig:
                    continue
                otherrecord = graph.get_record(readname)
                newpair = kevlar.overlap.calc_offset(newrecord, otherrecord,
                                                     kmerseq, debugout)
                if newpair == kevlar.overlap.INCOMPATIBLE_PAIR:
                    continue
                tn, hn = newpair.tail.name, newpair.head.name
                if tn in graph and hn in graph[tn]:
                    # Edge already present: sanity-check that the stored
                    # overlap (and offset, when the tail matches) agrees.
                    assert graph[tn][hn]['overlap'] == newpair.overlap
                    if graph[tn][hn]['tail'] == newpair.tail:
                        assert graph[tn][hn]['offset'] == newpair.offset
                else:
                    graph.add_edge(tn,
                                   hn,
                                   offset=newpair.offset,
                                   overlap=newpair.overlap,
                                   ikmer=kmerseq,
                                   orient=newpair.sameorient,
                                   tail=tn,
                                   swapped=newpair.swapped)
            graph.ikmers[kmerseq].add(newrecord.name)
        graph.add_node(newrecord.name, record=newrecord)
        graph.remove_node(pair.tail.name)
        graph.remove_node(pair.head.name)
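
The docstring above names a greedy shortest-common-superstring strategy: repeatedly pick the pair of sequences with the largest overlap and merge it into one. As a rough standalone illustration of that idea (it does not use kevlar's graph, record, or overlap machinery; every name below is hypothetical), a minimal sketch on plain strings:

def _overlap(a, b):
    """Length of the longest suffix of `a` that is a prefix of `b`."""
    for n in range(min(len(a), len(b)), 0, -1):
        if a.endswith(b[:n]):
            return n
    return 0


def greedy_scs(seqs):
    """Greedily merge the most-overlapping pair until one string remains."""
    seqs = list(seqs)
    while len(seqs) > 1:
        # Pick the ordered pair (i, j) with the largest suffix/prefix overlap.
        i, j, n = max(
            ((i, j, _overlap(seqs[i], seqs[j]))
             for i in range(len(seqs))
             for j in range(len(seqs)) if i != j),
            key=lambda triple: triple[2],
        )
        merged = seqs[i] + seqs[j][n:]
        seqs = [s for k, s in enumerate(seqs) if k not in (i, j)] + [merged]
    return seqs[0]


# greedy_scs(['GATTACA', 'TACAGGT', 'GGTTT']) == 'GATTACAGGTTT'
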
Example #5
def test_kmer_rep_in_read(capsys):
    from sys import stdout
    read = ('AGGATGAGGATGAGGATGAGGATGAGGATGAGGATGAGGATGAGGATGAGGATGAGGATGAGGAT'
            'GAGGATGAGGATGAGGAT')
    record = screed.Record(name='reqseq', sequence=read, ikmers=list())

    k1 = kevlar.KmerOfInterest(sequence='GATGAGGATGAGGATGAGGATGAGG',
                               offset=2,
                               abund=[11, 1, 0])
    k2 = kevlar.KmerOfInterest(sequence='GATGAGGATGAGGATGAGGATGAGG',
                               offset=8,
                               abund=[11, 1, 0])
    record.ikmers.extend([k1, k2])

    kevlar.print_augmented_fastx(record, stdout)
    out, err = capsys.readouterr()
    assert read in out
Example #6
def test_augfastx_writer():
    output = StringIO()
    record = Record(
        name='BasiliscusVulgarisRead84467/1',
        sequence='TTAACTCTAGATTAGGGGCGTGACTTAATAAGGTGTGGGCCTAAGCGTCT',
        quality='BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB',
        annotations=[
            KmerOfInterest(ksize=19, offset=13, abund=(12, 1, 1)),
            KmerOfInterest(ksize=19, offset=15, abund=(20, 0, 1)),
        ],
    )
    kevlar.print_augmented_fastx(record, output)
    record = Record(
        name='BasiliscusVulgarisRead90577/2',
        sequence='CTGTAATCCCAGCACTTTGGGAGGCCGAGGCAAGCAGATGATGCGGTCAG',
        quality='BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB',
        annotations=[
            KmerOfInterest(ksize=19, offset=1, abund=(5, 7, 9)),
            KmerOfInterest(ksize=19, offset=2, abund=(7, 10, 9)),
        ],
        mates=['CAGATGTGTCTTGTGGGCAGTGCAGCGGAGAGGTGCAAATATGGGTTTGG']
    )
    kevlar.print_augmented_fastx(record, output)
    record = Record(
        name='BasiliscusVulgarisRead99037/1',
        sequence='AGCACTTTGGGAGGCCGAGGCAAGCAGATGATGCGGTCAGGATTACAGAT',
        quality='BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB'
    )
    kevlar.print_augmented_fastx(record, output)

    assert output.getvalue() == """@BasiliscusVulgarisRead84467/1
Example #7
File: filter.py Project: scchess/kevlar
def validate_and_print(readset,
                       countgraph,
                       mask=None,
                       minabund=5,
                       outfile=sys.stdout,
                       augout=None,
                       logfile=sys.stderr):
    readset.validate(countgraph, mask=mask, minabund=minabund)
    n = 0  # Get an unbound var error later (printing report) without this?!?!
    for n, record in enumerate(readset):
        khmer.utils.write_record(record, outfile)
        if augout:
            kevlar.print_augmented_fastx(record, augout)

    int_distinct = readset.masked[0] + readset.lowabund[0] + readset.valid[0]
    int_instances = readset.masked[1] + readset.lowabund[1] + readset.valid[1]

    message = '    processed {:d} instances'.format(int_instances)
    message += ' of {:d} distinct "interesting" k-mers'.format(int_distinct)
    message += ' in {:d} reads'.format(len(readset))
    message += '\n        '
    message += '{:d} instances'.format(readset.masked[1])
    message += ' of {:d} distinct k-mers'.format(readset.masked[0])
    message += ' masked by the reference genome'
    message += '\n        '
    message += '{:d} instances'.format(readset.lowabund[1])
    message += ' of {:d} distinct k-mers'.format(readset.lowabund[0])
    message += ' discarded due to low abundance'
    message += '\n        '
    message += '{:d} instances'.format(readset.valid[1])
    message += ' of {:d} distinct k-mers'.format(readset.valid[0])
    message += ' validated as novel'
    message += '\n        '
    message += '{:d} reads'.format(readset.discarded)
    message += ' with no surviving valid k-mers ignored'
    message += '\n        '
    message += '{:d} reads written to output'.format(n + 1)
    print(message, file=logfile)
Example #8
def main(args):
    augseqs = kevlar.parse_augmented_fastx(kevlar.open(args.augseqs, 'r'))
    nakedseqs = kevlar.parse_augmented_fastx(kevlar.open(args.seqs, 'r'))
    outstream = kevlar.open(args.out, 'w')
    for record in augment(augseqs, nakedseqs):
        kevlar.print_augmented_fastx(record, outstream)
Example #9
    '--out-pattern',
    metavar='REGEX',
    help='out file name pattern with a {} placeholder for partition ID')
parser.add_argument('augfastx')
parser.add_argument('partition', nargs='+')
args = parser.parse_args()

if args.out and args.out_pattern:
    raise Exception('cannot give outfile and outpattern together')
elif args.out and not args.out_pattern:
    args.out = kevlar.open(args.out, 'w')
elif not args.out and not args.out_pattern:
    args.out = sys.stdout

partids = set(args.partition)
fh = kevlar.open(args.augfastx, 'r')
reader = kevlar.parse_augmented_fastx(fh)
preader = kevlar.parse_partitioned_reads(reader)
for partid, partition in preader:
    if partid not in partids:
        continue
    if args.out_pattern:
        pattern = str(args.out_pattern)
        outfile = pattern.format(partid)
        with kevlar.open(outfile, 'w') as out:
            for read in partition:
                kevlar.print_augmented_fastx(read, out)
    else:
        for read in partition:
            kevlar.print_augmented_fastx(read, args.out)
Example #10
def main(args):
    reads = kevlar.parse_augmented_fastx(kevlar.open(args.augfastq, 'r'))
    outstream = kevlar.open(args.out, 'w')
    for contig in assemble_fml_asm(reads):
        kevlar.print_augmented_fastx(contig, outstream)
Example #11
def split(pstream, outstreams):
    """Split the partitions across the N outstreams."""
    for partition, outstream in zip(pstream, cycle(outstreams)):
        for read in partition:
            kevlar.print_augmented_fastx(read, outstream)
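
In kevlar's source `cycle` is presumably itertools.cycle (the import is not shown in this snippet); zipping the partition stream against a cycled list of output streams hands partitions out round-robin. A small self-contained sketch of the same pattern, with plain lists standing in for output streams:

from itertools import cycle

partitions = [['read1', 'read2'], ['read3'], ['read4', 'read5'], ['read6']]
sinks = [[], [], []]  # stand-ins for three output streams

for partition, sink in zip(partitions, cycle(sinks)):
    for read in partition:
        sink.append(read)

# sinks is now [['read1', 'read2', 'read6'], ['read3'], ['read4', 'read5']]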