def test_lookup_set():
    """lookup_many() given a set returns a non-None value for every key."""
    keys = list(range(10))
    mphf = bbhash.PyMPHF(keys, 10, 1, 1.0)

    results = mphf.lookup_many(set(range(10)))
    assert all(value is not None for value in results)
def test_all(tmpdir):
    """Save an MPHF to disk, reload it, and check a lookup still matches."""
    original = bbhash.PyMPHF(list(range(10)), 10, 1, 1.0)
    assert original.lookup(9) == 8

    # Round-trip through a file inside the per-test temporary directory.
    path = str(tmpdir.join('xxx'))
    original.save(path)

    restored = bbhash.load_mphf(path)
    assert restored.lookup(9) == 8
def build_mphf(kh, records_iter_fn):
    """Build an MPHF over every k-mer in a cDBG plus its lookup tables.

    Two passes are made over the records: the first collects all k-mer
    hashes so the MPHF can be constructed, the second fills arrays that
    are indexed by MPHF value.

    Args:
        kh: object providing ``get_kmer_hashes(sequence)``.
        records_iter_fn: zero-argument callable returning a fresh iterable
            of records with ``name`` and ``sequence`` attributes. It is
            called once per pass. Record names must be integer node IDs
            running from 0 to total-1.

    Returns:
        Tuple ``(mphf, mphf_to_kmer, mphf_to_cdbg, sizes)`` where
        ``mphf`` is the ``bbhash.PyMPHF`` object, ``mphf_to_kmer`` maps
        MPHF value -> k-mer hash (uint64), ``mphf_to_cdbg`` maps MPHF
        value -> cDBG node ID (uint32), and ``sizes`` maps cDBG node ID ->
        number of k-mers in that node (uint32).

    Raises:
        ValueError: if the first pass yields no records.
    """
    # pass 1: build a list of all k-mers in the cDBG
    all_kmers = []
    n_contigs = 0
    for n, record in enumerate(records_iter_fn()):
        if n % 50000 == 0 and n:
            print('... contig', n, end='\r')

        all_kmers.extend(kh.get_kmer_hashes(record.sequence))
        n_contigs = n + 1

    # Without this guard an empty input would surface as an opaque
    # UnboundLocalError on the loop variable.
    if not n_contigs:
        raise ValueError('no contigs found; cannot build MPHF')

    print('loaded {} contigs.\n'.format(n_contigs))

    # build MPHF (this is the CPU intensive bit)
    print('building MPHF for {} k-mers in {} nodes.'.format(len(all_kmers),
                                                            n_contigs))
    x = bbhash.PyMPHF(all_kmers, len(all_kmers), 4, 1.0)

    # build tables linking:
    # * mphf hash to k-mer hash (for checking exactness)
    # * mphf hash to cDBG ID
    # * cDBG ID to node size (in k-mers)
    mphf_to_kmer = numpy.zeros(len(all_kmers), numpy.uint64)
    mphf_to_cdbg = numpy.zeros(len(all_kmers), numpy.uint32)
    sizes = numpy.zeros(n_contigs, numpy.uint32)

    print('second pass.')
    for n, record in enumerate(records_iter_fn()):
        if n % 50000 == 0 and n:
            print('... contig {} of {}'.format(n, n_contigs), end='\r')

        # node ID is record name, must go from 0 to total-1
        cdbg_id = int(record.name)

        # get 64-bit numbers for each k-mer (doesn't really matter what hash)
        kmers = kh.get_kmer_hashes(record.sequence)

        # for each k-mer, find its MPHF hashval, & link to info.
        for kmer in kmers:
            mphf = x.lookup(kmer)
            mphf_to_kmer[mphf] = kmer
            mphf_to_cdbg[mphf] = cdbg_id

        # record each node size, while we're here.
        sizes[cdbg_id] = len(kmers)

    print('loaded {} contigs in pass2.\n'.format(n_contigs))

    # Sanity check: the largest node ID seen must equal the last record
    # index. ndarray.max() avoids a Python-level scan of the whole array.
    max_cdbg_id = mphf_to_cdbg.max()
    assert n == max_cdbg_id, (n, max_cdbg_id)

    return x, mphf_to_kmer, mphf_to_cdbg, sizes
def main(argv):
    """Index the cDBG contigs under a catlas prefix with an MPHF.

    Reads ``contigs.fa.gz`` from the ``catlas_prefix`` directory twice:
    once to collect every k-mer hash and build the MPHF, and once to fill
    the tables indexed by MPHF value. Writes ``contigs.fa.gz.mphf`` (the
    saved MPHF) and ``contigs.fa.gz.indices`` (a compressed numpy archive)
    into the same directory.

    Args:
        argv: list of command-line arguments (without the program name).

    Raises:
        SystemExit: on argument errors, or if the contigs file holds no
            records.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('catlas_prefix')
    parser.add_argument('-k', '--ksize', default=31, type=int)
    args = parser.parse_args(argv)

    kh = khmer.Nodetable(args.ksize, 1, 1)

    contigs_filename = os.path.join(args.catlas_prefix, 'contigs.fa.gz')
    mphf_filename = os.path.join(args.catlas_prefix, 'contigs.fa.gz.mphf')
    array_filename = os.path.join(args.catlas_prefix,
                                  'contigs.fa.gz.indices')

    # build a list of all k-mers in the cDBG
    all_kmers = []
    n_contigs = 0
    print('reading cDBG nodes from {}'.format(contigs_filename))
    for n, record in enumerate(screed.open(contigs_filename)):
        if n % 50000 == 0 and n:
            print('... contig', n, end='\r')

        all_kmers.extend(kh.get_kmer_hashes(record.sequence))
        n_contigs = n + 1

    # Without this guard an empty file would surface as an opaque
    # UnboundLocalError on the loop variable.
    if not n_contigs:
        raise SystemExit(
            'error: no contigs found in {}'.format(contigs_filename))

    print('loaded {} contigs.\n'.format(n_contigs))

    # build MPHF (this is the CPU intensive bit)
    print('building MPHF for {} k-mers in {} nodes.'.format(
        len(all_kmers), n_contigs))
    x = bbhash.PyMPHF(all_kmers, len(all_kmers), 4, 1.0)

    # build tables linking:
    # * mphf hash to k-mer hash (for checking exactness)
    # * mphf hash to cDBG ID
    # * cDBG ID to node size (in k-mers)
    mphf_to_kmer = numpy.zeros(len(all_kmers), numpy.uint64)
    mphf_to_cdbg = numpy.zeros(len(all_kmers), numpy.uint32)
    sizes = numpy.zeros(n_contigs, numpy.uint32)

    print('second pass; reading cDBG nodes from {}'.format(contigs_filename))
    for n, record in enumerate(screed.open(contigs_filename)):
        if n % 50000 == 0 and n:
            print('... contig {} of {}'.format(n, n_contigs), end='\r')

        # node ID is record name, must go from 0 to total-1
        cdbg_id = int(record.name)

        # get 64-bit numbers for each k-mer (doesn't really matter what hash)
        kmers = kh.get_kmer_hashes(record.sequence)

        # for each k-mer, find its MPHF hashval, & link to info.
        for kmer in kmers:
            mphf = x.lookup(kmer)
            mphf_to_kmer[mphf] = kmer
            mphf_to_cdbg[mphf] = cdbg_id

        # record each node size, while we're here.
        sizes[cdbg_id] = len(kmers)

    print('loaded {} contigs in pass2.\n'.format(n_contigs))

    # Sanity check: the largest node ID seen must equal the last record
    # index. ndarray.max() avoids a Python-level scan of the whole array.
    max_cdbg_id = mphf_to_cdbg.max()
    assert n == max_cdbg_id, (n, max_cdbg_id)

    print('done! saving to {} and {}'.format(mphf_filename, array_filename))

    x.save(mphf_filename)
    with open(array_filename, 'wb') as fp:
        # NOTE: the archive key 'kmer_to_cdbg' stores the mphf_to_cdbg
        # table; the key name is kept as-is for existing readers.
        numpy.savez_compressed(fp,
                               mphf_to_kmer=mphf_to_kmer,
                               kmer_to_cdbg=mphf_to_cdbg,
                               sizes=sizes)
def main():
    """Link cDBG nodes to transcript families via an MPHF over unitig k-mers.

    Steps:
      1. Collect all k-mer hashes from the unitigs file and build an MPHF.
      2. Re-read the unitigs to fill MPHF-indexed tables of k-mer hash and
         cDBG node ID (the first space-separated token of the record name).
      3. Walk every transcriptome file; the family name is the second
         '|'-separated field of each record name. For each k-mer that is
         present in the MPHF, add the family ID to that k-mer's cDBG node.
      4. Save the MPHF to ``<output>.mphf`` and pickle the tuple
         ``(mphf_to_kmer, mphf_to_cdbg, family_ids, cdbg_to_family_id)``
         to ``<output>.arr``.

    Raises:
        SystemExit: if ``-o/--output`` is missing or the unitigs file
            contains no records.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('unitigs')
    parser.add_argument('transcriptomes', nargs='+')
    parser.add_argument('-k', '--ksize', type=int, default=31)
    parser.add_argument('-o', '--output')
    args = parser.parse_args()

    # Validate explicitly: an assert would be stripped under ``python -O``.
    if not args.output:
        parser.error('-o/--output is required')

    kh = khmer.Nodetable(args.ksize, 1, 1)

    # pass 1: gather every k-mer hash in the cDBG.
    all_kmers = []
    n_nodes = 0
    for n, record in enumerate(screed.open(args.unitigs)):
        if n % 10000 == 0:
            print('... cdbg', n)
        all_kmers.extend(kh.get_kmer_hashes(record.sequence))
        n_nodes = n + 1

    # Without this guard an empty file would surface as an opaque
    # UnboundLocalError on the loop variable.
    if not n_nodes:
        raise SystemExit(
            'error: no unitigs found in {}'.format(args.unitigs))

    # Report the number of nodes, not the index of the last one.
    print('building MPHF for {} k-mers in {} nodes.'.format(len(all_kmers),
                                                            n_nodes))
    x = bbhash.PyMPHF(all_kmers, len(all_kmers), 4, 1.0)

    # pass 2: fill the MPHF-indexed tables.
    mphf_to_kmer = numpy.zeros(len(all_kmers), numpy.uint64)
    mphf_to_cdbg = numpy.zeros(len(all_kmers), numpy.uint32)

    for n, record in enumerate(screed.open(args.unitigs)):
        if n % 10000 == 0:
            print('... cdbg', n)

        cdbg_id = int(record.name.split(' ')[0])
        kmers = kh.get_kmer_hashes(record.sequence)

        for kmer in kmers:
            mphf = x.lookup(kmer)
            mphf_to_kmer[mphf] = kmer
            mphf_to_cdbg[mphf] = cdbg_id

    print('walking the transcriptome')

    family_ids = {}
    family_counter = 0
    cdbg_to_family_id = defaultdict(set)

    n = 0
    for tr_filename in args.transcriptomes:
        for record in screed.open(tr_filename):
            n += 1
            if n % 1000 == 0:
                print('...', tr_filename, n)

            # get the family name
            family_name = record.name.split('|')[1]

            # convert to family ID, generating a new one if we need one
            family_id = family_ids.get(family_name)
            if family_id is None:
                family_id = family_counter
                family_counter += 1
                family_ids[family_name] = family_id

            # for all k-mers,
            for hashval in kh.get_kmer_hashes(record.sequence):
                # find cDBG ID; k-mers absent from the MPHF are skipped.
                mphf = x.lookup(hashval)
                if mphf is None:
                    continue

                cdbg_id = mphf_to_cdbg[mphf]

                # link cDBG ID to family ID
                cdbg_to_family_id[cdbg_id].add(family_id)

    mphf_filename = args.output + '.mphf'
    array_filename = args.output + '.arr'
    x.save(mphf_filename)

    with open(array_filename, 'wb') as fp:
        pickle.dump(
            (mphf_to_kmer, mphf_to_cdbg, family_ids, cdbg_to_family_id), fp)
def test_construct_from_set():
    """Passing a set (not a list) of keys to PyMPHF raises TypeError."""
    keys = set(range(10))

    # CTB: could fix this.
    with pytest.raises(TypeError):
        bbhash.PyMPHF(keys, 10, 1, 1.0)
def test_lookup():
    """lookup() is non-None for each constructed key, None for key 200."""
    mphf = bbhash.PyMPHF(list(range(10)), 10, 1, 1.0)

    # Every key used to build the MPHF must resolve to some value.
    for key in range(10):
        assert mphf.lookup(key) is not None

    # A key outside the construction set must not resolve.
    assert mphf.lookup(200) is None