Exemplo n.º 1
0
def cutout(contigs, refrseqs, seed_matches, seedsize=51, delta=50,
           maxdiff=None, inclpattern=None, exclpattern=None, debug=False):
    """Compute reference target sequences for a set of partitioned contigs.

    Decompose each contig into seeds, look up the genomic location(s) of
    each seed in `seed_matches`, register every location with a
    `Localizer`, and then ask the localizer to cut out the corresponding
    intervals of the reference (passing `delta` through as the extension).

    `seed_matches` is indexed by the `kevlar.revcommin` form of each seed
    and maps to an iterable of `(seqid, position)` pairs. Seeds without an
    entry are skipped; with `debug=True` a warning is logged for each one.

    If `maxdiff` is None, the clustering distance handed to the localizer
    defaults to three times the length of the longest contig. In that case
    an empty `contigs` raises ValueError (from `max` on an empty sequence).

    This is a generator: it yields each item produced by
    `Localizer.get_cutouts`, and no work happens until it is iterated.
    """
    # `contigs` is traversed twice (seed lookup, then the length scan), so
    # materialize it in case the caller supplied a single-pass iterator.
    contigs = list(contigs)
    localizer = kevlar.localize.Localizer(
        seedsize, incl=inclpattern, excl=exclpattern
    )
    for contig in contigs:
        for seed in decompose_seeds(contig.sequence, seedsize):
            seed = kevlar.revcommin(seed)
            if seed not in seed_matches:
                if debug:  # pragma: no cover
                    message = 'WARNING: no position for seed {}'.format(seed)
                    kevlar.plog('[kevlar::localize]', message)
                continue
            for seqid, position in seed_matches[seed]:
                localizer.add_seed_match(seqid, position)
    if maxdiff is None:
        maxcontiglen = max(len(c.sequence) for c in contigs)
        maxdiff = maxcontiglen * 3

    cutter = localizer.get_cutouts(refrseqs=refrseqs, delta=delta,
                                   clusterdist=maxdiff)
    for gdna in cutter:
        yield gdna
Exemplo n.º 2
0
def test_ctrl3(mask, nkmers, nkmerinstances):
    """Filter the trio1 novel reads and check the interesting k-mer tallies.

    `nkmers` is the expected number of distinct k-mers (after
    `kevlar.revcommin` normalization) and `nkmerinstances` the expected
    total number of annotations across all reads that survive the filter.
    """
    tally = defaultdict(int)
    infile = data_file('trio1/novel_3_1,2.txt')
    surviving_reads = kevlar.filter.filter(infile, memory=1e7, mask=mask)
    for read in surviving_reads:
        for annotation in read.annotations:
            tally[kevlar.revcommin(read.ikmerseq(annotation))] += 1
    assert len(tally) == nkmers
    assert sum(tally.values()) == nkmerinstances
Exemplo n.º 3
0
def get_unique_seeds(recordstream, seedsize):
    """Yield each distinct seed found in the given stream of records.

    Seeds are considered duplicates when their `kevlar.revcommin` forms are
    equal. The seed is yielded exactly as first encountered, not in its
    `revcommin` form. The count table is only used to split sequences into
    seeds of length `seedsize`.
    """
    splitter = Counttable(seedsize, 1, 1)
    seen = set()
    for record in recordstream:
        for seed in splitter.get_kmers(record.sequence):
            key = kevlar.revcommin(seed)
            if key in seen:
                continue
            seen.add(key)
            yield seed
Exemplo n.º 4
0
    def load(self, readstream, minabund=None, maxabund=None, dedup=False):
        """
        Populate the graph with reads and their interesting k-mers.

        Every record in `readstream` becomes a node (keyed by read name), and
        for each interesting k-mer the set of names of reads containing it
        is collected. When neither `minabund` nor `maxabund` is given, that
        collection replaces `self.ikmers` outright. Otherwise only k-mers
        whose read count satisfies the given thresholds are copied into the
        existing `self.ikmers`.

        With `dedup=True`, a record is skipped when a read with the same
        `kevlar.revcommin` sequence has already been loaded. This is a naive
        way to drop PCR duplicates: it does not check both mates against the
        genome, and it keeps every distinct read sequence in memory, so it is
        best reserved for small (e.g. already partitioned) inputs.
        """
        reads_by_kmer = defaultdict(set)
        seen_sequences = set()

        for record in readstream:
            if dedup:
                seqkey = kevlar.revcommin(record.sequence)
                if seqkey in seen_sequences:
                    continue
                seen_sequences.add(seqkey)

            self.add_node(record.name, record=record)
            self.readnames.add(record.name)
            for ikmer in record.ikmers:
                kmerkey = kevlar.revcommin(ikmer.sequence)
                reads_by_kmer[kmerkey].add(record.name)

        if minabund is None and maxabund is None:
            self.ikmers = reads_by_kmer
            return

        # Second, in-memory pass: keep only k-mers within the thresholds.
        for kmerkey, readset in reads_by_kmer.items():
            abund = len(readset)
            too_rare = minabund and abund < minabund
            too_common = maxabund and abund > maxabund
            if too_rare or too_common:
                continue
            self.ikmers[kmerkey] = readset
Exemplo n.º 5
0
    def add(self, newrecord):
        """Add a read record to the set, merging with any same-named read.

        A read name seen for the first time is stored and its sequence is
        consumed into `self._counts`. For a name already present, the stored
        record must have the same sequence (asserted) and the new record's
        interesting k-mers are appended to it. In both cases the per-read and
        per-k-mer instance counters are incremented.
        """
        name = newrecord.name
        if name not in self._reads:
            self._reads[name] = newrecord
            self._counts.consume(newrecord.sequence)
        else:
            known = self._reads[name]
            assert known.sequence == newrecord.sequence
            known.ikmers.extend(newrecord.ikmers)

        self._readcounts[name] += 1
        for ikmer in newrecord.ikmers:
            self._ikmercounts[kevlar.revcommin(ikmer.sequence)] += 1
Exemplo n.º 6
0
def test_partition_dedup(capsys):
    """Run `kevlar partition --split` on `dup.augfastq` and check its output.

    Asserts that the log reports 16 reads grouped into a single connected
    component, and that the reads written to the first component file
    match the expected 16 sequences once both sides are normalized with
    `kevlar.revcommin`.
    """
    infile = kevlar.tests.data_file('dup.augfastq')
    tempdir = tempfile.mkdtemp()
    # Clean up the scratch directory even when an assertion fails.
    try:
        arglist = ['partition', '--split', tempdir + '/dedup', infile]
        args = kevlar.cli.parser().parse_args(arglist)
        # Point kevlar's log at the stderr that capsys captures, and always
        # restore the previous stream so a failure here cannot leak into
        # other tests.
        kevlar.logstream, logstream = sys.stderr, kevlar.logstream
        try:
            kevlar.partition.main(args)
        finally:
            kevlar.logstream = logstream
        out, err = capsys.readouterr()
        assert 'grouped 16 reads into 1 connected components' in err

        outfile = tempdir + '/dedup.cc1.augfastq.gz'
        with kevlar.open(outfile, 'r') as stream:
            parser = kevlar.parse_augmented_fastx(stream)
            readseqs = [r.sequence for r in parser]
        uniquereadseqs = {kevlar.revcommin(s) for s in readseqs}
        testreads = [
            'AACGAACCACCTCAATGATGACCTTTATGCTTCCACGGCAAATGGTGCGG',
            'ACGAACCACCTCAATGATGACCTTTATGCTTCCACGGCAAATGGTGCGGT',
            'AGGGCACACCTAACCGCACCATTTGCCGTGGAAGCATAAAGGTCATCATT',
            'ATCGGAACGAACCACCTCAATGATGACCTTTATGCTTCCACGGCAAATGG',
            'CCACCTCAATGATGACCTTTATGCTTCCACGGCAAATGGTGCGGTTAGGT',
            'CCTCAATGATGACCTTTATGCTTCCACGGCAAATGGTGCGGTTAGGTGTG',
            'CGCACCATTTGCCGTGGAAGCATAAAGGTCATCATTGAGGTGGTTCGTTC',
            'CGGAAGGGCACACCTAACCGCACCATTTGCCGTGGAAGCATAAAGGTCAT',
            'CGGCTATGGCGGAAGGGCACACCTAACCGCACCATTTGCCGTGGAAGCAT',
            'CTATGGCGGAAGGGCACACCTAACCGCACCATTTGCCGTGGAAGCATAAA',
            'GCTTCCACGGCAAATGGTGCGGTTAGGTGTGCCCTTCCGCCATAGCCGGA',
            'GGAACGAACCACCTCAATGATGACCTTTATGCTTCCACGGCAAATGGTGC',
            'GGCAAATGGTGCGGTTAGGTGTGCCCTTCCGCCATAGCCGGATCGTGGCA',
            'TATGCTTCCACGGCAAATGGTGCGGTTAGGTGTGCCCTTCCGCCATAGCC',
            'TTATGCTTCCACGGCAAATGGTGCGGTTAGGTGTGCCCTTCCGCCATAGC',
            'TTGGTGCCACGATCCGGCTATGGCGGAAGGGCACACCTAACCGCACCATT',
        ]
        testreadseqs = {kevlar.revcommin(s) for s in testreads}
        assert uniquereadseqs == testreadseqs
    finally:
        shutil.rmtree(tempdir)
Exemplo n.º 7
0
def test_filter_abundfilt():
    """Filter `worm.augfasta` with abundance thresholds and check the result.

    Expects five reads to survive the filter, all annotated with the same
    single interesting k-mer (five annotations in total).
    """
    readfile = data_file('worm.augfasta')
    validated = list(
        kevlar.filter.filter(readfile, memory=1000, casemin=5, ctrlmax=0)
    )
    assert len(validated) == 5

    tally = defaultdict(int)
    for read in validated:
        for annotation in read.annotations:
            tally[kevlar.revcommin(read.ikmerseq(annotation))] += 1
    assert len(tally) == 1
    assert sum(tally.values()) == 5
Exemplo n.º 8
0
def get_seed_matches(seedfile, refrfile, seedsize=51):
    """Determine the position of all seeds with a single system call to BWA.

    Runs `bwa mem` (via `bwa_align`) with `seedsize` as both the `-k` and
    `-T` values, `refrfile` as the index and `seedfile` as the query.

    Returns a dictionary that maps the `kevlar.revcommin` form of each
    aligned sequence to the set of `(seqid, start)` pairs reported for it.
    """
    kevlar.plog('[kevlar::localize] computing seed matches')
    # Build the argument vector directly instead of formatting a command
    # string and splitting it on whitespace: splitting would break any file
    # path that contains spaces into several arguments.
    k = str(seedsize)
    bwa_args = [
        'bwa', 'mem', '-k', k, '-T', k, '-a', '-c', '5000',
        str(refrfile), str(seedfile),
    ]
    seed_index = defaultdict(set)
    for seqid, start, end, seq in bwa_align(bwa_args, seqfilename=seedfile):
        minseq = kevlar.revcommin(seq)
        seed_index[minseq].add((seqid, start))
    message = 'found positions for {} seeds'.format(len(seed_index))
    kevlar.plog('[kevlar::localize]', message)
    return seed_index
Exemplo n.º 9
0
def assemble_with_greed(graph, ccindex, debugout=None):
    """Find shortest common superstring using a greedy assembly algorithm.

    While the graph still has edges, fetch the pair selected by
    `fetch_largest_overlapping_pair`, merge it into a new record named
    `contig<count>:cc=<ccindex>`, connect the new record to the remaining
    reads that share one of its interesting k-mers, and replace the two
    merged nodes with the new one.

    The graph is modified in place and nothing is returned. If `debugout`
    is given, a debug line and the merged record are written to it for
    every merge.
    """
    count = 0
    while len(graph.edges()) > 0:
        count += 1

        pair = fetch_largest_overlapping_pair(graph)
        newname = 'contig{:d}:cc={:d}'.format(count, ccindex)
        newrecord = merge_and_reannotate(pair, newname)
        if debugout:
            print('### DEBUG',
                  pair.tail.name,
                  pair.head.name,
                  pair.offset,
                  pair.overlap,
                  pair.sameorient,
                  file=debugout)
            kevlar.print_augmented_fastx(newrecord, debugout)
        # Look for overlaps between the new contig and every read that
        # shares one of its interesting k-mers.
        for kmer in newrecord.ikmers:
            kmerseq = kevlar.revcommin(kmer.sequence)
            for readname in graph.ikmers[kmerseq]:
                # Skip reads no longer in the graph, and the records
                # involved in the current merge.
                already_merged = readname not in graph
                current_contig = readname in [
                    pair.tail.name, pair.head.name, newname
                ]
                if already_merged or current_contig:
                    continue
                otherrecord = graph.get_record(readname)
                newpair = kevlar.overlap.calc_offset(newrecord, otherrecord,
                                                     kmerseq, debugout)
                if newpair == kevlar.overlap.INCOMPATIBLE_PAIR:
                    continue
                tn, hn = newpair.tail.name, newpair.head.name
                if tn in graph and hn in graph[tn]:
                    # An edge already exists: check it agrees with the
                    # overlap just computed rather than overwriting it.
                    assert graph[tn][hn]['overlap'] == newpair.overlap
                    # NOTE(review): edges added below store `tail` as the
                    # name `tn`, whereas this compares against
                    # `newpair.tail` itself -- confirm the comparison can
                    # succeed as intended.
                    if graph[tn][hn]['tail'] == newpair.tail:
                        assert graph[tn][hn]['offset'] == newpair.offset
                else:
                    graph.add_edge(tn,
                                   hn,
                                   offset=newpair.offset,
                                   overlap=newpair.overlap,
                                   ikmer=kmerseq,
                                   orient=newpair.sameorient,
                                   tail=tn,
                                   swapped=newpair.swapped)
            # Register the new contig under this k-mer only after the loop
            # over the k-mer's read set has finished.
            graph.ikmers[kmerseq].add(newrecord.name)
        # Swap the two merged records for the new contig.
        graph.add_node(newrecord.name, record=newrecord)
        graph.remove_node(pair.tail.name)
        graph.remove_node(pair.head.name)
Exemplo n.º 10
0
def get_unique_kmers(recordstream, ksize=31):
    """
    Yield each distinct k-mer found in the given stream of records.

    Input is expected to be an iterable containing screed or khmer sequence
    records. K-mers are considered duplicates when their `kevlar.revcommin`
    forms are equal; the k-mer is yielded exactly as first encountered, not
    in its `revcommin` form.
    """
    splitter = khmer.Counttable(ksize, 1, 1)
    seen = set()
    for record in recordstream:
        for kmer in splitter.get_kmers(record.sequence):
            key = kevlar.revcommin(kmer)
            if key in seen:
                continue
            seen.add(key)
            yield kmer
Exemplo n.º 11
0
def contigs_2_seeds(partstream, seedstream, seedsize=51):
    """Convert a stream of partitioned contigs to seeds and write to a file.

    Every contig of every partition in `partstream` is decomposed into
    seeds of length `seedsize`. The distinct seeds (after
    `kevlar.revcommin` normalization) are written to `seedstream` in sorted
    order as FASTA records named `seed0`, `seed1`, and so on, and the stream
    is flushed. The number of seeds written is logged.
    """
    message = 'decomposing contigs into seeds of length {}'.format(seedsize)
    kevlar.plog('[kevlar::localize]', message)
    seeds = set()
    for partition in partstream:
        for contig in partition:
            for seed in decompose_seeds(contig.sequence, seedsize):
                seeds.add(kevlar.revcommin(seed))
    for n, seed in enumerate(sorted(seeds)):
        print('>seed{}\n{}'.format(n, seed), file=seedstream)
    seedstream.flush()
    # Report the number of seeds, not the index of the last one written
    # (which is one less than the count).
    message = 'contigs decomposed into {} seeds'.format(len(seeds))
    kevlar.plog('[kevlar::localize]', message)
Exemplo n.º 12
0
    def validate(self, counts, mask=None, minabund=5):
        """Re-check every read's interesting k-mers against `counts`.

        For each k-mer (looked up by its `kevlar.revcommin` form):
        - if a mask is given and reports a count above zero, the k-mer is
          tallied as masked and dropped;
        - otherwise, if `counts` reports fewer than `minabund`, it is tallied
          as low abundance and dropped;
        - otherwise its first abundance value is replaced with the count
          from `counts`, and it is kept and tallied as valid.

        Each record's `ikmers` is replaced with the kept k-mers, and reads
        left with none are counted in `self._novalidkmers_count`.
        """
        for record in self._reads.values():
            keepers = []
            for ikmer in record.ikmers:
                kmerseq = kevlar.revcommin(ikmer.sequence)
                if mask and mask.get(kmerseq) > 0:
                    self._masked[kmerseq] += 1
                    continue
                if counts.get(kmerseq) < minabund:
                    self._lowabund[kmerseq] += 1
                    continue
                ikmer.abund[0] = counts.get(kmerseq)
                keepers.append(ikmer)
                self._valid[kmerseq] += 1
            record.ikmers = keepers
            if not keepers:
                self._novalidkmers_count += 1
Exemplo n.º 13
0
def load_reads_and_kmers(instream, logstream=None):
    """
    Parse augmented reads from `instream` into two lookup tables.

    Returns a `(reads, kmers)` pair: `reads` maps each read name to its
    record, and `kmers` maps each interesting k-mer (in its
    `kevlar.revcommin` form) to the set of names of reads containing it.
    If `logstream` is given, a progress line is written to it every 10000
    reads.
    """
    reads = {}
    kmers = defaultdict(set)
    nloaded = 0
    for record in kevlar.parse_augmented_fastx(instream):
        nloaded += 1
        if logstream and nloaded % 10000 == 0:  # pragma: no cover
            progress = '[kevlar::seqio]    loaded {:d} reads'.format(nloaded)
            print(progress, file=logstream)
        reads[record.name] = record
        for ikmer in record.ikmers:
            kmers[kevlar.revcommin(ikmer.sequence)].add(record.name)
    return reads, kmers
Exemplo n.º 14
0
def load_input(filelist, ksize, memory, maxfpr=0.001, logfile=sys.stderr):
    """
    Load augmented reads from the given files into two data structures.

    First, read sequences are loaded into a countgraph so that k-mer
    abundances can be recomputed with (effectively) exact precision; a
    sequence is only consumed the first time its read name is seen. Second,
    the reads and their "interesting" k-mers are loaded into an
    AnnotatedReadSet to de-duplicate reads and group k-mers by read.

    A summary line is written to `logfile`. If the estimated false positive
    rate of the countgraph exceeds `maxfpr`, a message is logged and the
    process exits with status 1. Otherwise returns `(readset, countgraph)`.
    """
    countgraph = khmer.Countgraph(ksize, memory / 4, 4)
    nreadinstances = 0
    nkmerinstances = 0
    distinct_kmers = set()
    readset = kevlar.seqio.AnnotatedReadSet()
    for filename in filelist:
        print('    -', filename, file=logfile)
        with kevlar.open(filename, 'r') as infile:
            for record in kevlar.parse_augmented_fastx(infile):
                first_sighting = record.name not in readset._reads
                if first_sighting:
                    countgraph.consume(record.sequence)
                readset.add(record)
                nreadinstances += 1
                for ikmer in record.ikmers:
                    nkmerinstances += 1
                    distinct_kmers.add(kevlar.revcommin(ikmer.sequence))
    ndistinct = len(distinct_kmers)

    fpr = kevlar.sketch.estimate_fpr(countgraph)
    summary = ''.join([
        '    {:d} instances'.format(nreadinstances),
        ' of {:d} reads consumed'.format(len(readset)),
        ', annotated with {:d} instances '.format(nkmerinstances),
        'of {:d} distinct "interesting" k-mers'.format(ndistinct),
        '; estimated false positive rate is {:1.3f}'.format(fpr),
    ])
    print(summary, file=logfile)
    if fpr > maxfpr:
        print('[kevlar::filter] FPR too high, bailing out', file=logfile)
        sys.exit(1)
    return readset, countgraph
Exemplo n.º 15
0
 def add_kmer(self, kmer, read_id):
     """Record that `read_id` contains `kmer` and count the instance.

     The k-mer is stored under its `kevlar.revcommin` form.
     """
     key = kevlar.revcommin(kmer)
     readset = self.kmers[key]
     readset.add(read_id)
     self._kmer_instances += 1
Exemplo n.º 16
0
def novel(casestream,
          casecounts,
          controlcounts,
          ksize=31,
          abundscreen=None,
          casemin=5,
          ctrlmax=0,
          numbands=None,
          band=None,
          skipuntil=None):
    """Yield annotated copies of case reads that contain interesting k-mers.

    Each read in `casestream` is split into k-mers using the first case
    sketch. Every k-mer is classified by `kmer_is_interesting`. A k-mer
    flagged `discard` causes the whole read to be dropped, and interesting
    k-mers are annotated (with their case + control abundances) on a copy
    of the read. Reads shorter than `ksize`, reads containing characters
    other than A/C/G/T, and reads with no interesting k-mer are skipped.

    `numbands` and `band` must be given together (ValueError otherwise,
    also for a negative `band`). When set, only k-mers whose hash satisfies
    `hash & (numbands - 1) == band - 1` are examined.

    If `skipuntil` is set, all reads up to and including the one with that
    name are skipped. Progress and a final summary are logged.
    """
    have_numbands = bool(numbands)
    have_band = bool(band) or band == 0
    if have_numbands != have_band:
        raise ValueError('Must specify `numbands` and `band` together')

    if band is not None and band < 0:
        errmsg = '`band` must be a value between 0 and {:d}'.format(
            numbands - 1
        )
        errmsg += ' (`numbands` - 1), inclusive'
        raise ValueError(errmsg)

    timer = kevlar.Timer()
    timer.start()
    update_message = '[kevlar::novel]     processed {counter} reads'
    if skipuntil:
        suffix = '; skipping reads in search of {read}'.format(read=skipuntil)
        first_message = update_message + suffix
    else:
        first_message = update_message
    progress = kevlar.ProgressIndicator(
        first_message,
        interval=1e6,
        breaks=[1e7, 1e8, 1e9],
        usetimer=True,
    )
    kmer_instances = 0
    reads_kept = 0
    distinct_kmers = set()
    for n, record in enumerate(casestream, 1):
        progress.update()
        if skipuntil:  # pragma: no cover
            if record.name == skipuntil:
                found = 'Found read {:s}'.format(skipuntil)
                found += ' (skipped {:d} reads)'.format(n)
                kevlar.plog('[kevlar::novel]', found)
                skipuntil = False
                progress.message = update_message
            continue

        sequence = record.sequence
        # The non-ACGT check should be temporary; hopefully khmer will
        # handle this soon.
        if len(sequence) < ksize or re.search('[^ACGT]', sequence):
            continue

        annotated = None
        rejected = False
        sketch = casecounts[0]
        for offset, kmer in enumerate(sketch.get_kmers(sequence)):
            if numbands and sketch.hash(kmer) & (numbands - 1) != band - 1:
                continue
            interesting, discard, caseabund, ctrlabund = kmer_is_interesting(
                kmer,
                casecounts,
                controlcounts,
                case_min=casemin,
                ctrl_max=ctrlmax,
                screen_thresh=abundscreen,
            )
            if discard:
                rejected = True
                break
            if not interesting:
                continue
            if annotated is None:
                annotated = kevlar.sequence.copy_record(record)
            annotated.annotate(kmer, offset, tuple(caseabund + ctrlabund))
            distinct_kmers.add(kevlar.revcommin(kmer))

        if rejected or annotated is None:
            continue

        reads_kept += 1
        kmer_instances += len(annotated.annotations)
        yield annotated

    elapsed = timer.stop()
    summary = ''.join([
        'Found {:d} instances'.format(kmer_instances),
        ' of {:d} unique novel kmers'.format(len(distinct_kmers)),
        ' in {:d} reads'.format(reads_kept),
        ' in {:.2f} seconds'.format(elapsed),
    ])
    kevlar.plog('[kevlar::novel]', summary)
Exemplo n.º 17
0
def novel(casestream,
          casecounts,
          controlcounts,
          ksize=31,
          abundscreen=None,
          casemin=5,
          ctrlmax=0,
          numbands=None,
          band=None,
          skipuntil=None,
          updateint=10000,
          logstream=sys.stderr):
    """Yield annotated copies of case reads that contain interesting k-mers.

    Reads (and their mates, if any) come from `kevlar.paired_reader`. Each
    read is split into k-mers using the first case sketch, and every k-mer
    is classified by `kmer_is_interesting`. A k-mer flagged `discard`
    causes the whole read to be dropped, and interesting k-mers are
    annotated (with their case + control abundances) on a copy of the read.
    Reads shorter than `ksize`, reads containing characters other than
    A/C/G/T, and reads with no interesting k-mer are skipped. A mate's
    sequence is attached to the yielded record.

    `numbands` and `band` must be given together (ValueError otherwise,
    also for a negative `band`). When set, only k-mers whose hash satisfies
    `hash & (numbands - 1) == band - 1` are examined.

    If `skipuntil` is set, all reads up to and including the one with that
    name are skipped. A progress line is written to `logstream` roughly
    every `updateint` reads, followed by a final summary.
    """
    have_numbands = bool(numbands)
    have_band = bool(band) or band == 0
    if have_numbands != have_band:
        raise ValueError('Must specify `numbands` and `band` together')

    if band is not None and band < 0:
        errmsg = '`band` must be a value between 0 and {:d}'.format(
            numbands - 1
        )
        errmsg += ' (`numbands` - 1), inclusive'
        raise ValueError(errmsg)

    timer = kevlar.Timer()
    timer.start()
    kmer_instances = 0
    reads_kept = 0
    nextupdate = updateint
    distinct_kmers = set()
    for n, record, mate in kevlar.paired_reader(casestream):
        if skipuntil:  # pragma: no cover
            if record.name == skipuntil:
                found = 'Found read {:s}'.format(skipuntil)
                found += ' (skipped {:d} reads)'.format(n)
                print('[kevlar::novel]', found, file=logstream)
                skipuntil = False
            continue
        if n >= nextupdate:
            nextupdate += updateint
            sofar = timer.probe()
            progress = ('    processed {} reads'.format(n) +
                        ' in {:.2f} seconds...'.format(sofar))
            print(progress, file=logstream)

        sequence = record.sequence
        # The non-ACGT check should be temporary; hopefully khmer will
        # handle this soon.
        if len(sequence) < ksize or re.search('[^ACGT]', sequence):
            continue

        annotated = None
        rejected = False
        sketch = casecounts[0]
        for offset, kmer in enumerate(sketch.get_kmers(sequence)):
            if numbands and sketch.hash(kmer) & (numbands - 1) != band - 1:
                continue
            interesting, discard, caseabund, ctrlabund = kmer_is_interesting(
                kmer,
                casecounts,
                controlcounts,
                case_min=casemin,
                ctrl_max=ctrlmax,
                screen_thresh=abundscreen,
            )
            if discard:
                rejected = True
                break
            if not interesting:
                continue
            if annotated is None:
                annotated = kevlar.sequence.copy_record(record)
            annotated.annotate(kmer, offset, tuple(caseabund + ctrlabund))
            distinct_kmers.add(kevlar.revcommin(kmer))

        if rejected or annotated is None:
            continue

        reads_kept += 1
        kmer_instances += len(annotated.annotations)
        if mate:
            annotated.add_mate(mate.sequence)
        yield annotated

    elapsed = timer.stop()
    summary = ''.join([
        'Found {:d} instances'.format(kmer_instances),
        ' of {:d} unique novel kmers'.format(len(distinct_kmers)),
        ' in {:d} reads'.format(reads_kept),
        ' in {:.2f} seconds'.format(elapsed),
    ])
    print('[kevlar::novel]', summary, file=logstream)
Exemplo n.º 18
0
 def add_contig(self, contig, kmer):
     """Associate `kmer` with `contig`.

     Both sequences are stored in their `kevlar.revcommin` form: the contig
     as the key into `self.contigs`, the k-mer as a member of its set.
     """
     contig_key, kmer_key = kevlar.revcommin(contig), kevlar.revcommin(kmer)
     self.contigs[contig_key].add(kmer_key)
Exemplo n.º 19
0
# Command-line interface: build a new, smaller sketch holding only the
# k-mers of the given sequences, with counts copied from an existing sketch.
cli = argparse.ArgumentParser()
cli.add_argument(
    '--sketch-type',
    metavar='T',
    choices=allocators.keys(),
    default='counttable',
    help='Sketch type to use for output',
)
cli.add_argument('--num-tables', type=int, default=4, metavar='N')
cli.add_argument('--table-size', type=int, default=1000, metavar='X')
cli.add_argument('sketch', help='original sketch')
cli.add_argument('subsketch', help='new sketch to create')
cli.add_argument('sequence', nargs='+', help='sequences to sample from sketch')
args = cli.parse_args()

# Load the source sketch and allocate an empty one of the requested type
# with the same k-mer size.
sketch = kevlar.sketch.load(args.sketch)
allocfunc = allocators[args.sketch_type]
subsketch = allocfunc(sketch.ksize(), args.table_size, args.num_tables)

# Collect the distinct k-mers (in `kevlar.revcommin` form) of all sequences.
kmers = set()
for seq in args.sequence:
    kmers.update(kevlar.revcommin(kmer) for kmer in sketch.get_kmers(seq))

# Replay each k-mer into the new sketch as many times as the original
# sketch reports for it.
for kmer in kmers:
    for _ in range(sketch.get(kmer)):
        subsketch.add(kmer)

subsketch.save(args.subsketch)
fpr = khmer.calc_expected_collisions(subsketch, max_false_pos=100.0)
print('Estimated FPR: {:.4f}'.format(fpr))