Exemplo n.º 1
0
def assemble_jca(readstream,
                 memory,
                 maxfpr=0.01,
                 collapse=True,
                 kmers_to_ignore=set(),
                 logstream=sys.stderr):
    """Assemble contigs seeded by the interesting k-mers of a read stream.

    This is a generator function: no reads are loaded and nothing is logged
    until the first contig is requested.

    Args:
        readstream: iterable of read records. Each record must provide
            ``name``, ``sequence`` and ``ikmers``; every item of ``ikmers``
            must provide a ``sequence`` attribute.
        memory: size budget for the count graph. It is handed to
            ``khmer.Countgraph`` as ``memory / 4`` together with the value
            4 (presumably per-table size and table count -- confirm against
            the khmer API).
        maxfpr: highest acceptable estimated false positive rate of the
            count graph.
        collapse: when true, ``variants.collapse()`` is called before the
            contigs are emitted.
        kmers_to_ignore: container of k-mers that must not be used as
            assembly seeds. It is only membership-tested, never mutated, so
            sharing the default instance between calls is harmless.
        logstream: file-like object receiving all progress messages.

    Yields:
        ``screed.Record`` objects whose name has the form
        ``contigN:length=..:nkmers=..:nreads=..`` and whose sequence is the
        contig.

    Raises:
        kevlar.sketch.KevlarUnsuitableFPRError: if the estimated false
            positive rate of the count graph exceeds ``maxfpr``.
    """
    print('[kevlar::assemble::jca] loading reads', file=logstream)
    countgraph = None
    variants = kevlar.VariantSet()
    for record in readstream:
        for kmer in record.ikmers:
            variants.add_kmer(kmer.sequence, record.name)
            if countgraph is None:
                # The k-mer size is not a parameter: it is taken from the
                # first interesting k-mer encountered.
                ksize = len(kmer.sequence)
                countgraph = khmer.Countgraph(ksize, memory / 4, 4)
        # NOTE(review): if no interesting k-mer has been seen by this point
        # (e.g. the first record has an empty ``ikmers``), countgraph is
        # still None here -- confirm callers always annotate their reads.
        countgraph.consume(record.sequence)
    fpr = kevlar.sketch.estimate_fpr(countgraph)
    msg = '[kevlar::assemble::jca]    done loading reads'
    msg += ', {:d} distinct k-mers stored'.format(countgraph.n_unique_kmers())
    msg += '; estimated false positive rate is {:1.3f}'.format(fpr)
    if fpr > maxfpr:
        msg += ' (FPR too high, bailing out!!!)'
        raise kevlar.sketch.KevlarUnsuitableFPRError(msg)
    print(msg, file=logstream)

    asm = khmer.JunctionCountAssembler(countgraph)
    for kmer in variants.kmers:
        if kmer in kmers_to_ignore:
            continue
        contigs = asm.assemble(kmer)
        for contig in contigs:
            # The assembler may hand back bytes; normalise to str.
            if hasattr(contig, 'decode'):
                contig = contig.decode()
            if contig == '':
                # Report through the caller-supplied stream; this function
                # has no ``args`` object to take a log file from.
                print('    WARNING: no assembly found for k-mer',
                      kmer,
                      file=logstream)
                continue
            variants.add_contig(contig, kmer)

    print('    {:d} linear paths'.format(variants.ncontigs), file=logstream)

    if collapse:
        print('[kevlar::assemble::jca] Collapsing contigs', file=logstream)
        variants.collapse()
        print('    {:d} collapsed contigs'.format(variants.ncontigs),
              file=logstream)

    # Contigs are numbered from 1 in iteration order of the variant set.
    for n, contigdata in enumerate(variants, 1):
        contig, contigrc, kmers, reads = contigdata
        contigname = 'contig{:d}:length={:d}:nkmers={:d}:nreads={:d}'.format(
            n, len(contig), len(kmers), len(reads))
        contig = screed.Record(name=contigname, sequence=contig)
        yield contig
Exemplo n.º 2
0
    def test_beginning_to_end_across_tip(self, right_tip_structure):
        """Seeding at the contig's first k-mer must recover the whole contig.

        The contig is fed to the assembler three times before assembling,
        so the assembly is expected to run straight past the branch point
        instead of stopping there.
        """
        graph, contig, _L, _HDN, _R, tip = right_tip_structure

        assembler = khmer.JunctionCountAssembler(graph)
        for _ in range(3):
            assembler.consume(contig)

        seed = contig[:K]
        paths = assembler.assemble(seed)

        # Diagnostic output, shown by pytest when the test fails.
        print('P:', paths[0])
        print('T:', tip)
        print('C:', contig)
        assert len(paths) == 1, "there should only be one path"

        assembled = paths[0]  # @CTB
        assert len(assembled) == len(contig)
        assert utils._equals_rc(assembled, contig)
Exemplo n.º 3
0
def main():
    """Stream reads through a count graph, assembling contigs on the fly.

    Command-line arguments are the standard counting arguments from
    ``build_counting_args`` plus one or more ``fastq_files`` and an output
    ``--prefix`` (default ``transcriptome``).

    Three files are written: ``<prefix>.transcripts.fa`` (assembled
    contigs), ``<prefix>.orfs.fa`` (ORFs not written before) and
    ``<prefix>.stats.fa`` (CSV rows, no header line, one per handled read).

    Each read at least ``ksize`` long is handled according to ``cov``, the
    first value returned by ``cg.get_median_count``:

    * ``cov < 20``  -- read is added to the count graph (action ``c``);
    * ``cov < 30``  -- read is abundance-trimmed and fed to the assembler
      (action ``t``); nothing is logged if the trimmed read is shorter
      than ``ksize``;
    * ``cov == 30`` -- contigs are assembled from the read's first k-mer
      and written out with their ORFs (action ``a``, one row per contig);
    * otherwise     -- the read is only logged (action ``s``).
    """
    p = build_counting_args(descr='Streaming assembly with tracking info')
    p.add_argument('fastq_files', nargs='+')
    p.add_argument('--prefix', default='transcriptome')
    args = p.parse_args()

    cg = create_countgraph(args)
    asm = khmer.JunctionCountAssembler(cg)

    tr_fn = '{0}.transcripts.fa'.format(args.prefix)
    orf_fn = '{0}.orfs.fa'.format(args.prefix)
    stats_fn = '{0}.stats.fa'.format(args.prefix)

    # The csv module requires files given to a writer to be opened with
    # newline=''; otherwise rows end in '\r\r\n' on Windows.
    with open(tr_fn, 'w') as tr_fp,\
         open(orf_fn, 'w') as orf_fp,\
         open(stats_fn, 'w', newline='') as stats_fp:

        # NOTE(review): `kept` is counted but never reported in the code
        # visible here -- confirm whether it is still needed.
        kept = 0
        next_contig = 1
        next_orf = 1
        # Hashes of the ORFs already written; hashes rather than the
        # sequences themselves are stored.
        output = set()
        statswriter = csv.DictWriter(
            stats_fp,
            delimiter=',',
            fieldnames=['read_n', 'action', 'cov', 'n_junctions', 'contig_n'])

        def log_read(read_n, action, cov, n_junctions=None, contig_n=None):
            """Append one row describing how a read was handled."""
            statswriter.writerow({
                'read_n': read_n,
                'action': action,
                'cov': cov,
                'n_junctions': n_junctions,
                'contig_n': contig_n
            })

        for filename in args.fastq_files:
            # n restarts at 0 for every input file.
            for n, record in enumerate(screed.open(filename)):
                if n and n % 10000 == 0:
                    print('...', n, file=sys.stderr)

                if len(record.sequence) < args.ksize:
                    continue

                cov, _, _ = cg.get_median_count(record.sequence)
                if cov < 20:
                    kept += 1
                    cg.consume(record.sequence)
                    log_read(n, 'c', cov)
                elif cov < 30:
                    seq, _ = cg.trim_on_abundance(record.sequence, 3)
                    if len(seq) < args.ksize:
                        continue

                    n_junctions = asm.consume(seq)
                    log_read(n, 't', cov, n_junctions=n_junctions)
                elif cov == 30:
                    contigs = asm.assemble(record.sequence[:args.ksize])
                    for contig_n, contig in enumerate(contigs):
                        log_read(n, 'a', cov,
                                 contig_n=(next_contig, contig_n))
                        tr_fp.write('>contig%d\n%s\n' % (next_contig, contig))
                        next_contig += 1

                        for t in translate(contig):
                            for o in extract_orfs(t):
                                orf_key = hash(o)
                                if orf_key in output:
                                    continue
                                output.add(orf_key)
                                orf_fp.write('>orf%d\n%s\n' %
                                             (next_orf, o))
                                next_orf += 1
                else:
                    log_read(n, 's', cov)