def test_3_tables():
    """Check that a three-table Countgraph only over-counts on a full collision.

    The graph is built from the sizes in ``PRIMES_1m`` plus one extra table
    of size 1000005.  Each collider k-mer has a forward hash equal to the
    poly-G hash modulo one table size (1000003, 1009837 and 1000005 per the
    original notes; the first two presumably come from ``PRIMES_1m``), so the
    reported count of poly-G must stay at 1 until the third collider has
    been consumed.
    """
    table_sizes = list(PRIMES_1m)
    table_sizes.append(1000005)
    countgraph = khmer._Countgraph(12, table_sizes)

    poly_g = 'G' * 12
    # (k-mer, expected forward hash); the colliders follow poly-G in order.
    kmers_and_hashes = [
        (poly_g, 11184810),
        ('AAACGTATGACT', 184777),   # == 11184810 % 1000003
        ('AAATACCGAGCG', 76603),    # == 11184810 % 1009837
        ('AAACGTATCGAG', 184755),   # == 11184810 % 1000005
    ]
    for kmer, expected_hash in kmers_and_hashes:
        assert khmer.forward_hash(kmer, 12) == expected_hash

    # Count reported for poly-G after consuming each k-mer in turn.
    counts_after_consume = [1, 1, 1, 2]
    for (kmer, _), expected_count in zip(kmers_and_hashes,
                                         counts_after_consume):
        countgraph.consume(kmer)
        assert countgraph.get(poly_g) == expected_count
def test_3_tables():
    """Check that a three-table counting hash only over-counts on a full collision.

    The tables are sized by ``PRIMES_1m`` plus one extra table of size
    1000005.  Each ``collision_N`` k-mer hashes to the poly-G hash modulo
    one table size, so the count of poly-G must stay at 1 until all three
    tables have been hit.

    The expected hashes are written as plain integer literals: the ``L``
    suffix is a syntax error on Python 3, and on Python 2 an int compares
    equal to the corresponding long.
    """
    x = list(PRIMES_1m)
    x.append(1000005)

    hi = khmer._new_counting_hash(12, x)

    GG = 'G' * 12                   # forward_hash: 11184810
    assert khmer.forward_hash(GG, 12) == 11184810

    collision_1 = 'AAACGTATGACT'
    assert khmer.forward_hash(collision_1, 12) == 184777

    collision_2 = 'AAATACCGAGCG'
    assert khmer.forward_hash(collision_2, 12) == 76603

    collision_3 = 'AAACGTATCGAG'
    assert khmer.forward_hash(collision_3, 12) == 184755

    # hash(GG) % 1000003 == hash(collision_1)
    # hash(GG) % 1009837 == hash(collision_2)
    # hash(GG) % 1000005 == hash(collision_3)

    hi.consume(GG)
    assert hi.get(GG) == 1

    # Colliding in one or two tables must not inflate the count ...
    hi.consume(collision_1)
    assert hi.get(GG) == 1

    hi.consume(collision_2)
    assert hi.get(GG) == 1

    # ... but colliding in all three tables does.
    hi.consume(collision_3)
    assert hi.get(GG) == 2
def test_forward_hash():
    """Pin ``khmer.forward_hash`` for the four homopolymer 4-mers and one 32-mer.

    The expected values show each k-mer hashing identically to its reverse
    complement (AAAA/TTTT -> 0, CCCC/GGGG -> 170).
    """
    cases = (
        ('AAAA', 4, 0),
        ('TTTT', 4, 0),
        ('CCCC', 4, 170),
        ('GGGG', 4, 170),
        ('GGTTGACGGGGCTCAGGGGGCGGCTGACTCCG', 32, 13607885392109549066),
    )
    for kmer, ksize, expected in cases:
        assert khmer.forward_hash(kmer, ksize) == expected
def get_neighbors(kmer_hash, K):
    """Return the set of hashes of all k-mers overlapping ``kmer_hash`` by K-1 bases.

    The hash is turned back into a k-mer string; every element of the
    module-level ``bases`` iterable (defined elsewhere, presumably the four
    nucleotides) is then prepended to the k-mer's first K-1 characters and
    appended to its last K-1 characters, and each candidate is hashed.
    """
    kmer = khmer.reverse_hash(kmer_hash, K)
    prefix = kmer[:-1]      # k-mer without its last base
    suffix = kmer[1:]       # k-mer without its first base

    neighbor_hashes = set()
    for nucleotide in bases:
        neighbor_hashes.add(khmer.forward_hash(nucleotide + prefix, K))
        neighbor_hashes.add(khmer.forward_hash(suffix + nucleotide, K))
    return neighbor_hashes
def get_neighbors(kmer_hash, K):
    """Hash every k-mer that can precede or follow the k-mer behind ``kmer_hash``.

    For each element of the module-level ``bases`` iterable (defined
    elsewhere), two candidates are formed: the base followed by the k-mer
    minus its last character, and the k-mer minus its first character
    followed by the base.  The distinct forward hashes are returned as a set.
    """
    kmer = khmer.reverse_hash(kmer_hash, K)
    head, tail = kmer[:-1], kmer[1:]
    return {
        khmer.forward_hash(candidate, K)
        for base in bases
        for candidate in (base + head, tail + base)
    }
def test_failed_get(self):
    """``get`` must reject a float argument by raising ``ValueError``.

    The original wrote ``assert "the previous statement should fail"``,
    which asserts a non-empty (always truthy) string and therefore could
    never fail.  The failure is now raised explicitly when no exception
    occurs.
    """
    GG = 'G' * 12                   # forward_hash: 11184810
    GGhash = khmer.forward_hash(GG, 12)
    assert GGhash == 11184810

    hi = self.hi
    hi.consume(GG)

    try:
        hi.get(float(GGhash))
    except ValueError as err:
        print(str(err))
    else:
        # Not caught by the ValueError handler, so the test really fails.
        raise AssertionError("the previous statement should fail")
def explore(ht, start_kmer, K):
    """Traverse the k-mer graph reachable from ``start_kmer`` and record its edges.

    Starting from ``start_kmer``, repeatedly take a discovered k-mer hash,
    enumerate its neighbors with ``get_neighbors`` and keep those present in
    ``ht``.  Each k-mer hash gets a sequential id (starting at 1) the first
    time it is seen.

    Returns:
        ``(hash_ids, edges)`` where ``hash_ids`` maps k-mer hash -> id and
        ``edges`` is a set of sorted ``(id, id)`` tuples.  Both are empty
        when ``start_kmer`` is not present in ``ht``.

    Compared with the original, the presence lookup is done once per
    neighbor instead of up to twice, and the duplicated edge insertion of
    the ``if``/``elif`` pair is merged; the resulting ids and edges are the
    same.
    """
    edges = set()
    discovered = set()
    explored = set()
    hash_ids = {}

    start_kmer_hash = khmer.forward_hash(start_kmer, K)
    if not ht.get(khmer.reverse_hash(start_kmer_hash, K)):
        return hash_ids, edges

    discovered.add(start_kmer_hash)
    hash_ids[start_kmer_hash] = len(hash_ids) + 1

    # The 2,000,000 cap bounds the traversal on very large components.
    while discovered and len(explored) < 2000000:
        kmer_hash = discovered.pop()
        kmer_neighbors = get_neighbors(kmer_hash, K)
        explored.add(kmer_hash)

        for neigh_hash in kmer_neighbors:
            if not ht.get(khmer.reverse_hash(neigh_hash, K)):
                continue

            if neigh_hash not in explored and neigh_hash not in discovered:
                # First sighting: queue it and hand out the next id.
                discovered.add(neigh_hash)
                hash_ids[neigh_hash] = len(hash_ids) + 1

            # Present neighbors always contribute an (undirected) edge.
            edges.add(
                tuple(sorted([hash_ids[neigh_hash], hash_ids[kmer_hash]])))

    return hash_ids, edges
def explore(ht, start_kmer, K):
    """Walk the k-mer graph from ``start_kmer``; return node ids and edges.

    Every k-mer hash reached gets a sequential id (1, 2, ...) in
    ``hash_ids``; every pair of adjacent present k-mers contributes a sorted
    ``(id, id)`` tuple to ``edges``.  Neighbors come from ``get_neighbors``
    and are kept only when ``ht.get`` reports them present.  The walk stops
    when nothing is left to visit or 2,000,000 hashes have been explored.

    Returns:
        ``(hash_ids, edges)``; both empty when ``start_kmer`` is absent
        from ``ht``.

    The original evaluated ``ht.get(khmer.reverse_hash(...))`` in both the
    ``if`` and the ``elif`` and repeated the edge insertion in each branch;
    the lookup is now done once per neighbor with identical results.
    """
    edges = set()
    discovered = set()
    explored = set()
    hash_ids = {}

    start_kmer_hash = khmer.forward_hash(start_kmer, K)
    if not ht.get(khmer.reverse_hash(start_kmer_hash, K)):
        return hash_ids, edges

    discovered.add(start_kmer_hash)
    hash_ids[start_kmer_hash] = len(hash_ids) + 1

    while discovered and len(explored) < 2000000:
        kmer_hash = discovered.pop()
        kmer_neighbors = get_neighbors(kmer_hash, K)
        explored.add(kmer_hash)

        for neigh_hash in kmer_neighbors:
            present = ht.get(khmer.reverse_hash(neigh_hash, K))
            if not present:
                continue

            already_seen = neigh_hash in explored or neigh_hash in discovered
            if not already_seen:
                discovered.add(neigh_hash)
                hash_ids[neigh_hash] = len(hash_ids) + 1

            edge = tuple(sorted([hash_ids[neigh_hash], hash_ids[kmer_hash]]))
            edges.add(edge)

    return hash_ids, edges
def test_collision_2(self):
    """A collision in only one table must not inflate the count of poly-G.

    ``collision_2`` hashes to hash(GG) % 1009837 (per the original notes),
    so consuming it after ``GG`` must leave ``get(GG)`` at 1.

    Expected hashes are plain integer literals: the Python 2 ``L`` suffix
    is a syntax error on Python 3 and unnecessary for equality on Python 2.
    """
    GG = 'G' * 12                   # forward_hash: 11184810
    assert khmer.forward_hash(GG, 12) == 11184810

    collision_1 = 'AAACGTATGACT'
    assert khmer.forward_hash(collision_1, 12) == 184777

    collision_2 = 'AAATACCGAGCG'
    assert khmer.forward_hash(collision_2, 12) == 76603

    # hash(GG) % 1000003 == hash(collision_1)
    # hash(GG) % 1009837 == hash(collision_2)

    hi = self.hi
    hi.consume(GG)
    hi.consume(collision_2)

    assert hi.get(GG) == 1
def test_collision_2(self):
    """Consuming the second collider must leave the poly-G count at 1.

    Per the original notes, the first collider equals hash(poly-G) modulo
    1000003 and the second equals it modulo 1009837; only the second is
    consumed here.
    """
    poly_g = 'G' * 12
    second_collider = 'AAATACCGAGCG'

    expected_hashes = (
        (poly_g, 11184810),
        ('AAACGTATGACT', 184777),
        (second_collider, 76603),
    )
    for kmer, expected in expected_hashes:
        assert khmer.forward_hash(kmer, 12) == expected

    counter = self.hi
    for kmer in (poly_g, second_collider):
        counter.consume(kmer)

    assert counter.get(poly_g) == 1
def test_collision_1(self):
    """A collision in only one table must not inflate the count of poly-G.

    ``collision_1`` hashes to hash(GG) % 1000003 (per the original notes),
    so consuming it after ``GG`` must leave ``get(GG)`` at 1.

    Changes from the original: the Python 2 ``L`` literal suffixes (a
    syntax error on Python 3) are dropped, and the unused
    ``kt = khmer.new_ktable(12)`` allocation is removed because nothing in
    the test read it.
    """
    GG = 'G' * 12                   # forward_hash: 11184810
    assert khmer.forward_hash(GG, 12) == 11184810

    collision_1 = 'AAACGTATGACT'
    assert khmer.forward_hash(collision_1, 12) == 184777

    collision_2 = 'AAATACCGAGCG'
    assert khmer.forward_hash(collision_2, 12) == 76603

    # note, hash(GG) % 1000003 == hash(collision_1)
    # note, hash(GG) % 1009837 == hash(collision_2)

    hi = self.hi
    hi.consume(GG)
    hi.consume(collision_1)

    assert hi.get(GG) == 1
def test_n_occupied_args(self):
    """Exercise ``n_occupied`` with explicit (start, stop) bounds.

    The assertions treat the range as half-open: after consuming 'AAAA'
    the range (0, 1) holds one entry and (1, 4**4) holds none.
    """
    table = self.kh
    upper_bound = 4 ** 4

    assert table.n_occupied() == 0

    table.consume('AAAA')
    assert table.n_occupied(0, 1) == 1
    assert table.n_occupied(1, upper_bound) == 0

    aact_hash = khmer.forward_hash('AACT', 4)
    table.consume('AACT')
    just_past = aact_hash + 1
    assert table.n_occupied(0, just_past) == 2
    assert table.n_occupied(just_past, upper_bound) == 0
    assert table.n_occupied(aact_hash, just_past) == 1
def test_n_occupied_args(self):
    """Check ``n_occupied`` over sub-ranges before and after a second k-mer.

    Each check pairs the (start, stop) arguments with the number of
    occupied entries expected in that range.
    """
    kh = self.kh
    space = 4 ** 4

    assert kh.n_occupied() == 0

    kh.consume('AAAA')
    for bounds, expected in (((0, 1), 1), ((1, space), 0)):
        assert kh.n_occupied(*bounds) == expected

    hashvalue = khmer.forward_hash('AACT', 4)
    kh.consume('AACT')
    checks = (
        ((0, hashvalue + 1), 2),
        ((hashvalue + 1, space), 0),
        ((hashvalue, hashvalue + 1), 1),
    )
    for bounds, expected in checks:
        assert kh.n_occupied(*bounds) == expected
def test_kmer_neighbors():
    """``Nodegraph.neighbors`` accepts a hash or a k-mer string and agrees on both.

    The graph is loaded from 'all-A.fa'.  Every reported neighbor is hash 0,
    i.e. the hash of 'AAAA'.

    Changes from the original: the leftover debug ``print(type('AAAA'))`` is
    removed, and the comments on 'AATA' and 'TAAA', which contradicted the
    values being asserted, are corrected.
    """
    inpfile = utils.get_test_data('all-A.fa')
    nodegraph = khmer._Nodegraph(4, [3, 5])
    nodegraph.consume_fasta(inpfile)

    expectations = (
        ('AAAA', [0, 0]),   # AAAA on both sides
        ('AAAT', [0]),      # AAAA on one side
        ('AATA', []),       # no neighbors
        ('TAAA', [0]),      # AAAA on one side
    )
    for kmer, expected in expectations:
        h = khmer.forward_hash(kmer, 4)
        assert nodegraph.neighbors(h) == expected, kmer
        assert nodegraph.neighbors(kmer) == expected, kmer
def _record_and_enqueue_neighbors(kmer, ht, ht2, K, pending, degs):
    """Record ``kmer``'s degree and push its not-yet-counted neighbors onto ``pending``.

    Returns the updated ``degs`` as produced by ``add_deg``.
    """
    neighs = find_neighbors(kmer, ht)
    degs = add_deg(degs, len(neighs))
    for neigh in neighs:
        if not ht2.get(neigh):
            pending.append(khmer.forward_hash(neigh, K))
            ht2.count(neigh)
    return degs


def get_all_kmers(ht, start_kmer, K, ht2, degs):
    """Mark in ``ht2`` every k-mer reachable from ``start_kmer`` and tally degrees.

    ``ht`` is only passed to ``find_neighbors``; ``ht2`` serves as the
    "already visited" table.  Each visited k-mer's neighbor count is folded
    into ``degs`` through ``add_deg``.  Pending k-mers are kept as hashes on
    a list used as a stack (last in, first out).

    Returns:
        ``(ht2, degs)``.  The original returned this tuple only on the
        early exit (start k-mer already counted) and fell off the end,
        returning ``None``, after a real traversal; both paths now return
        the same shape.  The unused ``counter`` local is removed.
    """
    start_kmer_hash = khmer.forward_hash(start_kmer, K)
    if ht2.get(start_kmer_hash):
        return ht2, degs
    ht2.count(start_kmer)

    pending = []
    degs = _record_and_enqueue_neighbors(start_kmer, ht, ht2, K, pending, degs)

    while pending:
        kmer = khmer.reverse_hash(pending.pop(), K)
        degs = _record_and_enqueue_neighbors(kmer, ht, ht2, K, pending, degs)

    return ht2, degs
def gen_graph(filename, edges, hash_ids, chr, K):
    """Write an undirected Graphviz (DOT) graph of k-mer ids to ``filename``.

    Args:
        filename: path of the DOT file to (over)write.
        edges: iterable of pairs of node ids; each becomes ``Na -- Nb``.
        hash_ids: mapping of k-mer forward hash -> node id.
        chr: sequence whose k-mers are highlighted (black fill, white
            text).  The name shadows the builtin but is part of the
            signature and therefore kept.
        K: k-mer length.

    Raises:
        KeyError: if a k-mer of ``chr`` has no entry in ``hash_ids``.

    The file is now opened in a ``with`` block so the handle is closed even
    when a lookup or write raises; the bytes written are unchanged.
    """
    with open(filename, "w") as fd:
        fd.write("graph x {\nsize=\"16, 16\";\n")
        fd.write("node [ color = red, fontcolor = black, style = filled ];\n")

        # NOTE(review): range(len(chr) - K) stops one window short of the
        # final k-mer (a length-n sequence has n - K + 1 k-mers).  Kept
        # as-is because hash_ids may not contain that k-mer -- confirm.
        for i in range(len(chr) - K):
            kmer = chr[i:i + K]
            hash_id = hash_ids[khmer.forward_hash(kmer, K)]
            fd.write("N" + str(hash_id) +
                     " [color = black, fontcolor = white];\n")

        for edge in edges:
            fd.write("N" + str(edge[0]) + " -- " + "N" + str(edge[1]) + ";\n")

        fd.write("}")
def explore(ht, start_kmer, K):
    """Count the k-mers reachable from ``start_kmer`` through k-mers present in ``ht``.

    Neighbors are enumerated with ``get_neighbors`` and followed only when
    ``ht.get`` reports them present.  The walk stops when nothing is left
    to visit or once 2,000,000 hashes have been explored.

    Returns:
        The number of explored k-mer hashes, or 0 when ``start_kmer`` is
        not present in ``ht``.

    Fix: the presence check used the name ``kmer``, which is not defined in
    this function (it only worked if a global of that name happened to
    exist); it now checks ``start_kmer``.
    """
    discovered = set()
    explored = set()

    start_kmer_hash = khmer.forward_hash(start_kmer, K)
    if not ht.get(start_kmer):
        return 0
    discovered.add(start_kmer_hash)

    while discovered and len(explored) < 2000000:
        kmer_hash = discovered.pop()
        kmer_neighbors = get_neighbors(kmer_hash, K)
        explored.add(kmer_hash)

        for neigh_hash in kmer_neighbors:
            if neigh_hash in explored or neigh_hash in discovered:
                continue
            if ht.get(khmer.reverse_hash(neigh_hash, K)):
                discovered.add(neigh_hash)

    return len(explored)
def test_consume_fasta_and_tag_with_labels():
    """Load 'test-transcript.fa' into a GraphLabels and check reads and labels.

    Expects three reads consumed, the first 20-mer of the reference read to
    be present in the graph, and three labels overall.  The prints are
    diagnostic output only.
    """
    labels = GraphLabels(20, 1e7, 4)
    reference_read = 'ACGTAACCGGTTAAACCCGGGTTTAAAACCCCGGGGTTTT'
    path = utils.get_test_data('test-transcript.fa')

    n_reads, _ = labels.consume_fasta_and_tag_with_labels(path)

    print("doing get")
    first_kmer = reference_read[:20]
    assert labels.graph.get(first_kmer)
    assert n_reads == 3

    print("doing n_labels")
    print(labels.n_labels())

    print("doing all labels")
    print(labels.get_all_labels())

    print("get tagset")
    for tag in labels.graph.get_tagset():
        print("forward hash")
        print(tag, khmer.forward_hash(tag, 20))

    for record in screed.open(path):
        sequence = record.sequence
        print("Sweeping tags")
        print(labels.sweep_tag_neighborhood(sequence, 40))
        print("Sweeping labels...")
        print(labels.sweep_label_neighborhood(sequence, 40))

    assert labels.n_labels() == 3
def test_consume_fasta_and_tag_with_labels():
    """Load 'test-transcript.fa' into a LabelHash and check reads and labels.

    Expects three reads consumed, the first 20-mer of ``read_1`` to be
    present, and three labels overall.  The prints are diagnostic only.

    Changes from the original: Python 2 print statements (a syntax error on
    Python 3) are replaced by single-argument ``print(...)`` calls, which
    produce the same output on both major versions; the two-value print is
    formatted explicitly to keep its "tag hash" form.  The unused
    ``n_consumed`` is bound to ``_``.
    """
    lb = LabelHash(20, 1e7, 4)
    read_1 = 'ACGTAACCGGTTAAACCCGGGTTTAAAACCCCGGGGTTTT'
    filename = utils.get_test_data('test-transcript.fa')

    total_reads, _ = lb.consume_fasta_and_tag_with_labels(filename)

    print("doing get")
    assert lb.get(read_1[:20])
    assert total_reads == 3

    print("doing n_labels")
    print(lb.n_labels())

    print("doing label dict")
    print(lb.get_label_dict())

    print("get tagset")
    for tag in lb.get_tagset():
        print("forward hash")
        print("%s %s" % (tag, khmer.forward_hash(tag, 20)))

    for record in screed.open(filename):
        print("Sweeping tags")
        print(lb.sweep_tag_neighborhood(record.sequence, 40))
        print("Sweeping labels...")
        print(lb.sweep_label_neighborhood(record.sequence, 40))

    assert lb.n_labels() == 3
def test_consume_seqfile_and_tag_with_labels():
    """Load 'test-transcript.fa' via ``consume_seqfile_and_tag_with_labels``.

    Expects three reads consumed, the first 20-mer of the reference read to
    be present in the graph, and three labels overall.  The prints are
    diagnostic output only.
    """
    graph_labels = GraphLabels(20, 1e7, 4)
    reference_read = 'ACGTAACCGGTTAAACCCGGGTTTAAAACCCCGGGGTTTT'
    seqfile = utils.get_test_data('test-transcript.fa')

    read_count, _ = graph_labels.consume_seqfile_and_tag_with_labels(seqfile)

    print("doing get")
    leading_kmer = reference_read[:20]
    assert graph_labels.graph.get(leading_kmer)
    assert read_count == 3

    print("doing n_labels")
    print(graph_labels.n_labels())

    print("doing all labels")
    print(graph_labels.get_all_labels())

    print("get tagset")
    for tag in graph_labels.graph.get_tagset():
        print("forward hash")
        print(tag, khmer.forward_hash(tag, 20))

    for record in screed.open(seqfile):
        seq = record.sequence
        print("Sweeping tags")
        print(graph_labels.sweep_tag_neighborhood(seq, 40))
        print("Sweeping labels...")
        print(graph_labels.sweep_label_neighborhood(seq, 40))

    assert graph_labels.n_labels() == 3
x<<=1 return v if __name__ == '__main__': k=21 alphabet={0:'A',1:'T',2:'G',3:'C'} given_string='' for i in range(10000): given_string+=alphabet[random.randint(0,3)] H = HyperLogLog(8) d=defaultdict(int) for i in range(len(given_string)-k+1): d[given_string[i:i+k]] += 1 H.add(khmer.forward_hash(given_string[i:i+k],k)) print 'Real:',len(d) print 'HyperLogLog(murmur3):', H.cardinality()
def test_forward_hash():
    """Pin ``khmer.forward_hash`` for the four homopolymer 4-mers.

    AAAA and TTTT are expected to hash to 0, CCCC and GGGG to 170.
    """
    expected_by_kmer = (
        ('AAAA', 0),
        ('TTTT', 0),
        ('CCCC', 170),
        ('GGGG', 170),
    )
    for kmer, expected in expected_by_kmer:
        assert khmer.forward_hash(kmer, 4) == expected
def test_forward_hash():
    """Check the forward hash of each single-base 4-mer (A, T, C, G in turn).

    The A and T homopolymers must hash to 0, the C and G ones to 170.
    """
    ksize = 4
    for base, expected in zip("ATCG", (0, 0, 170, 170)):
        homopolymer = base * ksize
        assert khmer.forward_hash(homopolymer, ksize) == expected
def consume(self, sequence):
    """Feed the forward hash of every ``self.k``-length window of ``sequence`` to ``self.add``.

    Windows are taken left to right at every offset; a sequence shorter
    than ``self.k`` contributes nothing.
    """
    k = self.k
    last_start = len(sequence) - k
    start = 0
    while start <= last_start:
        window = sequence[start:start + k]
        self.add(khmer.forward_hash(window, k))
        start += 1
def add(self, kmer):
    """Route ``kmer`` to one of ``self.sketches`` and add its murmur3 hash there.

    The sketch is selected by indexing ``self.sketches`` with
    ``khmer.forward_hash(kmer, self.prefixsize)``; the value stored is
    ``khmer.hash_murmur3(kmer)``.
    """
    bucket = khmer.forward_hash(kmer, self.prefixsize)
    sketch = self.sketches[bucket]
    # Named ``digest`` rather than ``hash`` to avoid shadowing the builtin.
    digest = khmer.hash_murmur3(kmer)
    sketch.add(digest)