def test_bytes_murmur():
    """hash_murmur returns the same value for str, bytes and u'' input."""
    expected = 1731421407650554201
    # The same three letters, spelled as a plain literal, a bytes literal
    # and a unicode literal, must all hash identically.
    for seq in ("ACG", b"ACG", u"ACG"):
        assert hash_murmur(seq) == expected
def test_murmur():
    """Exercise hash_murmur: known value, required argument, second argument.

    Checks that "ACG" hashes to a fixed known value, that calling with no
    arguments raises TypeError, that passing 42 as the second positional
    argument reproduces the one-argument value, and that passing 43 gives a
    different value.
    """
    expected = 1731421407650554201
    assert hash_murmur("ACG") == expected

    # A call without arguments has to be rejected with TypeError.
    try:
        hash_murmur()
    except TypeError:
        pass
    else:
        assert 0, "hash_murmur requires an argument"

    with_42 = hash_murmur("ACG", 42)
    assert with_42 == expected

    with_43 = hash_murmur("ACG", 43)
    assert with_43 != with_42
def compute_matrix(group_info, group_ident, ksize, output):
    """Build a per-group k-mer abundance matrix and write it to disk.

    Parameters:
        group_info: mapping of node id -> object exposing
            ``get_mins(with_abundance=True)``; one matrix row is produced per
            entry, in iteration order.
        group_ident: object pickled unchanged to ``output + '.node_mh'``.
        ksize: k-mer size; the matrix has ``4**ksize`` columns.
        output: path of the matrix file; also the prefix of the two pickle
            side files.

    Files written:
        ``output``             -- the uint16 matrix, via ``numpy.save``
        ``output + '.node_ids'`` -- pickled dict of node id -> row index
        ``output + '.node_mh'``  -- pickled ``group_ident``
    """
    # Column order: the sorted, de-duplicated hashes of every k-mer that
    # make_all(ksize) produces, so columns are consistent between runs.
    column_hashes = sorted({hash_murmur(kmer) for kmer in make_all(ksize)})

    n_groups = len(group_info)
    n_columns = 4**ksize
    print('creating', n_groups, n_columns)
    V = numpy.zeros((n_groups, n_columns), dtype=numpy.uint16)

    node_id_to_group_idx = {}
    for row, node_id in enumerate(group_info):
        if row % 1000 == 0:
            print('...', row, n_groups)

        # hash -> abundance for this group; hashes that are absent count as 0.
        abundances = dict(group_info[node_id].get_mins(with_abundance=True))
        V[row] = numpy.array([abundances.get(h, 0) for h in column_hashes])
        node_id_to_group_idx[node_id] = row

    print('saving matrix of size {} to {}'.format(str(V.shape), output))
    with open(output, 'wb') as fp:
        numpy.save(fp, V)

    side_files = (
        ('.node_ids', node_id_to_group_idx),
        ('.node_mh', group_ident),
    )
    for suffix, payload in side_files:
        with open(output + suffix, 'wb') as fp:
            pickle.dump(payload, fp)
def hash_sequence(seqstr, input_type, ksize, alphabet, skipinfo=None):
    """Return the list of hashes for the k-mers of *seqstr*, in order.

    Each k-mer yielded by ``kmers(seqstr, ksize, skipinfo)`` is handled as
    follows:

    * when ``input_type == "nucleotide"``, the k-mer is compared with
      ``enc.reverse(enc.complement(kmer))`` and the forward k-mer is kept only
      if it is strictly smaller, otherwise the reverse complement is used;
    * for any other ``input_type`` the k-mer is used as is;
    * the chosen k-mer is re-encoded with
      ``reencode_sequence(kmer, input_type, alphabet)`` and hashed with
      ``hash_murmur``; a negative hash is shifted up by ``2**64``.

    Re-encoding happens per k-mer, after strand selection, so the
    reverse complement is always taken on the original sequence.

    Returns an empty list when *seqstr* is shorter than *ksize*.
    """
    # Too short to contain even one k-mer.
    if len(seqstr) < ksize:
        return []

    is_nucleotide = input_type == "nucleotide"
    hashes = []
    for fwd_kmer in kmers(seqstr, ksize, skipinfo):
        chosen = fwd_kmer
        if is_nucleotide:
            # Pick one strand consistently: the smaller of the k-mer and
            # its reverse complement.
            rev_kmer = enc.reverse(enc.complement(fwd_kmer))
            chosen = fwd_kmer if fwd_kmer < rev_kmer else rev_kmer

        # Re-encode into the target alphabet, then hash.
        hashval = hash_murmur(reencode_sequence(chosen, input_type, alphabet))
        if hashval < 0:
            hashval += 2**64
        hashes.append(hashval)
    return hashes
def main():
    """Vectorize fixed-size genome chunks into a k-mer abundance matrix.

    Command line: one or more genome files, ``-o/--output`` (required) and
    ``-k/--ksize`` (default 5).

    Every record of every genome is cut into consecutive chunks of ``SIZE``
    characters. For each chunk two MinHash objects are built: one with
    ``ksize=args.ksize`` and abundance tracking (used for the matrix row) and
    one with ``ksize=31, scaled=1000`` (stored for later identification).

    Files written:
        ``<output>``           -- uint16 matrix, one row per chunk and
                                  ``4**ksize`` columns, via ``numpy.save``
        ``<output>.labels``    -- chunk id -> 1-based index of its genome
        ``<output>.node_ids``  -- chunk id -> row index
        ``<output>.node_mh``   -- chunk id -> the ksize-31 MinHash

    Exits through ``parser.error`` (status 2) when ``-o`` is missing.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('genomes', nargs='+')
    parser.add_argument('-o', '--output')
    parser.add_argument('-k', '--ksize', default=5, type=int,
                        help='k-mer size for vectors')
    args = parser.parse_args()

    # Validate explicitly instead of with `assert`: asserts are stripped
    # under `python -O`, which would let the whole run proceed and only fail
    # when opening the output file at the very end.
    if not args.output:
        parser.error("please specify -o")

    n = 0
    group_info = {}
    group_ident = {}
    labels = {}
    node_id_to_group_idx = {}

    for genome_n, genome in enumerate(args.genomes, 1):
        print(genome)
        for record in screed.open(genome):
            for start in range(0, len(record.sequence), SIZE):
                chunk = record.sequence[start:start + SIZE]

                # Abundance-tracking sketch that becomes this chunk's row.
                # The second add_sequence argument is presumably `force` --
                # TODO confirm against the sourmash version in use.
                mh = sourmash.MinHash(n=0, ksize=args.ksize, scaled=1,
                                      track_abundance=1)
                mh.add_sequence(chunk, True)
                group_info[n] = mh

                # Separate ksize-31 sketch kept alongside the matrix.
                mh = sourmash.MinHash(n=0, ksize=31, scaled=1000)
                mh.add_sequence(chunk, True)
                group_ident[n] = mh

                labels[n] = genome_n
                node_id_to_group_idx[n] = n
                n += 1

    # Column order: the sorted, de-duplicated hashes of every k-mer that
    # make_all(args.ksize) produces, so columns are consistent between runs.
    all_kmer_hashes = sorted({hash_murmur(kmer)
                              for kmer in make_all(args.ksize)})

    # One row per chunk, one column per k-mer hash.
    # NOTE(review): abundances above 65535 do not fit uint16 -- confirm that
    # SIZE keeps per-chunk counts below that.
    V = numpy.zeros((len(group_info), 4**args.ksize), dtype=numpy.uint16)
    for i, node_id in enumerate(group_info):
        abundances = dict(group_info[node_id].get_mins(with_abundance=True))
        V[i] = numpy.array([abundances.get(hashval, 0)
                            for hashval in all_kmer_hashes])

    # save!
    print('saving matrix of size {} to {}'.format(str(V.shape), args.output))
    with open(args.output, 'wb') as fp:
        numpy.save(fp, V)

    with open(args.output + '.labels', 'wb') as fp:
        # `dump` is whatever the module imported under that name --
        # presumably pickle.dump; verify against the file's imports.
        dump(labels, fp)

    with open(args.output + '.node_ids', 'wb') as fp:
        pickle.dump(node_id_to_group_idx, fp)

    with open(args.output + '.node_mh', 'wb') as fp:
        pickle.dump(group_ident, fp)