def test_inexact_search():
    """Check threshold (inexact) search results for every entry in CONFIGS.

    Two Bloom filters are built once using CONFIGS[0]: sample "a" from
    "ATACACAAT" and sample "b" from "ATACACAAC". For each configuration the
    storage is wiped, constructing ``BIGSI`` directly on it is expected to
    raise, and an index built from both filters is queried at threshold 0.5.
    """
    for cfg in CONFIGS:
        get_storage(cfg).delete_all()

    base = CONFIGS[0]
    bloom_a = BIGSI.bloom(base, seq_to_kmers("ATACACAAT", base["k"]))
    bloom_b = BIGSI.bloom(base, seq_to_kmers("ATACACAAC", base["k"]))

    full_hit = {
        "percent_kmers_found": 100.0,
        "num_kmers": 6,
        "num_kmers_found": 6,
        "sample_name": "a",
    }
    full_hit_json = '{"percent_kmers_found": 100.0, "num_kmers": 6, "num_kmers_found": 6, "sample_name": "a"}'
    partial_hit = {
        "percent_kmers_found": 83.33,
        "num_kmers": 6,
        "num_kmers_found": 5,
        "sample_name": "b",
    }

    for cfg in CONFIGS:
        get_storage(cfg).delete_all()
        # Nothing has been built on this storage yet, so opening it must fail.
        with pytest.raises(BaseException):
            BIGSI(cfg)
        index = BIGSI.build(cfg, [bloom_a, bloom_b], ["a", "b"])

        assert index.search("ACAGTTAAC", 0.5) == []
        assert index.lookup("AAT") == {"AAT": bitarray("10")}

        hits = index.search("ATACACAAT", 0.5)
        assert hits[0] == full_hit
        # The hit must also serialise to JSON with this exact key order.
        assert json.dumps(hits[0]) == full_hit_json
        assert hits[1] == partial_hit

        index.delete()
def test_jaccard_index2(mc, kmers1, kmers2):
    """The estimated Jaccard index of two samples is within 0.2 of the exact one.

    ``kmers1`` and ``kmers2`` are sequences; their k-mers are inserted under
    the sample ids '1234' and '1235' after ``mc`` has been emptied.
    """
    first = list(seq_to_kmers(kmers1))
    second = list(seq_to_kmers(kmers2))

    mc.delete_all()
    mc.insert(first, '1234')
    mc.insert(second, '1235')

    set_a, set_b = set(first), set(second)
    shared = float(len(set_a & set_b))
    total = float(len(set_a | set_b))
    expected = shared / total

    estimate = mc.jaccard_index('1234', '1235')
    assert float(abs(estimate - expected)) <= 0.2
def test_get_bloomfilter(sample, seq):
    """A Bloom filter read back for a sample has the index's Bloom filter size.

    The 31-mers of ``seq`` are inserted under ``sample`` into a freshly
    created index (m=100), which is emptied again at the end.
    """
    kmer_iter = seq_to_kmers(seq, 31)
    index = BIGSI.create(m=100, force=True)
    bloom = index.bloom(kmer_iter)
    index.insert(bloom, sample)

    stored = index.get_bloom_filter(sample)
    expected_size = index.graph.bloomfilter.size
    assert stored.length() == expected_size

    index.delete_all()
def test_get_bloomfilter(seq):
    """A Bloom filter read back after ``build`` has the index's Bloom filter size.

    The 31-mers of ``seq`` are built into a freshly created index (m=10)
    under the sample name "1234"; the index is emptied again at the end.
    """
    sample = "1234"
    kmer_iter = seq_to_kmers(seq, 31)
    index = BIGSI.create(m=10, force=True)
    blooms = [index.bloom(kmer_iter)]
    index.build(blooms, [sample])

    stored = index.get_bloom_filter(sample)
    expected_size = index.graph.bloomfilter.size
    assert stored.length() == expected_size

    index.delete_all()
def kmer_reader(f):
    """Yield every k-mer of every record that ``Reader(f)`` produces.

    Each record is decoded as UTF-8 and passed to ``seq_to_kmers``; the
    resulting k-mers are yielded one by one, in order.
    """
    for raw in Reader(f):
        decoded = raw.decode('utf-8')
        for kmer in seq_to_kmers(decoded):
            yield kmer
def test_merge():
    """Merging two single-sample indexes searches like one built from both.

    Sample "a" is built on CONFIGS[0], sample "b" on CONFIGS[1], and a
    reference index holding both on CONFIGS[2]. After merging the second into
    the first, a threshold-0.5 search must return the same as the reference.
    """
    for cfg in CONFIGS:
        get_storage(cfg).delete_all()

    base = CONFIGS[0]
    bloom_a = BIGSI.bloom(base, seq_to_kmers("ATACACAAT", base["k"]))
    bloom_b = BIGSI.bloom(base, seq_to_kmers("ATACACAAC", base["k"]))

    left = BIGSI.build(CONFIGS[0], [bloom_a], ["a"])
    right = BIGSI.build(CONFIGS[1], [bloom_b], ["b"])
    combined = BIGSI.build(CONFIGS[2], [bloom_a, bloom_b], ["a", "b"])

    left.merge(right)

    query = "ATACACAAT"
    assert left.search(query, 0.5) == combined.search(query, 0.5)

    for index in (left, right, combined):
        index.delete()
def test_jaccard_index3(kmers1, kmers2):
    """HyperLogLog estimates stay close to the exact set statistics.

    Checks the Jaccard index (within 0.2), the symmetric difference size
    (within 5) and the one-sided difference size (within 5) for two samples
    stored under the ids '1234' and '1235'.
    """
    first = list(seq_to_kmers(kmers1))
    second = list(seq_to_kmers(kmers2))

    mc = HyperLogLogJaccardIndex(host=REDIS_HOST, port=REDIS_PORT)
    mc.delete_all()
    mc.insert(first, '1234')
    mc.insert(second, '1235')

    set_a, set_b = set(first), set(second)
    exact_jaccard = float(len(set_a & set_b)) / float(len(set_a | set_b))
    exact_sym_diff = float(len(set_a ^ set_b))
    exact_diff = float(len(set_a - set_b))

    est_jaccard = mc.jaccard_index('1234', '1235')
    est_sym_diff = mc.symmetric_difference('1234', '1235')
    est_diff = mc.difference('1234', '1235')

    assert float(abs(est_jaccard - exact_jaccard)) <= 0.2
    assert float(abs(est_sym_diff - exact_sym_diff)) <= 5
    assert float(abs(est_diff - exact_diff)) <= 5
def kmer_reader(f):
    """Yield every k-mer of every record from ``Reader(f)``, reporting progress.

    The zero-based record index is written to stderr (followed by a newline
    and a flush) for every 100000th record, starting with record 0. Once the
    reader is exhausted, the total number of k-mers yielded is written to
    stderr with no trailing newline.
    """
    emitted = 0
    for index, raw in enumerate(Reader(f)):
        if not index % 100000:
            sys.stderr.write(str(index) + '\n')
            sys.stderr.flush()
        decoded = raw.decode('utf-8')
        for kmer in seq_to_kmers(decoded):
            emitted += 1
            yield kmer
    sys.stderr.write(str(emitted))
def test_update_contains(colour, elements, bloom_filter_size, num_hashes):
    """Every 31-mer added to the Bloom filter for ``colour`` is found again."""
    store = ProbabilisticBerkeleyDBStorage(
        filename="db",
        bloom_filter_size=bloom_filter_size,
        num_hashes=num_hashes,
    )
    kmers = list(seq_to_kmers(elements, 31))

    # The parameters are also assigned on the instance after construction.
    store.bloom_filter_size = bloom_filter_size
    store.num_hashes = num_hashes

    store.bloomfilter.update(kmers, colour)
    for kmer in kmers:
        assert store.bloomfilter.contains(kmer, colour)

    store.delete_all()
def test_exact_search():
    """Exact search returns only the sample containing all of the query's k-mers.

    Bloom filters for samples "a" ("ATACACAAT") and "b" ("ACAGAGAAC") are
    built once using CONFIGS[0]; every configuration in CONFIGS then gets a
    clean storage, an index built from both filters, and the same checks.
    """
    base = CONFIGS[0]
    bloom_a = BIGSI.bloom(base, seq_to_kmers("ATACACAAT", base["k"]))
    bloom_b = BIGSI.bloom(base, seq_to_kmers("ACAGAGAAC", base["k"]))

    # (query, sample expected as the first hit)
    cases = (("ATACACAAT", "a"), ("ACAGAGAAC", "b"))

    for cfg in CONFIGS:
        get_storage(cfg).delete_all()
        index = BIGSI.build(cfg, [bloom_a, bloom_b], ["a", "b"])

        for query, sample_name in cases:
            assert index.search(query)[0] == {
                "percent_kmers_found": 100,
                "num_kmers": 6,
                "num_kmers_found": 6,
                "sample_name": sample_name,
            }
        # A sequence stored in neither sample yields no hits.
        assert index.search("ACAGTTAAC") == []

        index.delete()
def test_insert_lookup_kmers(Graph, sample, seq, k, m, h):
    """Each inserted k-mer is found by raw, single and batch lookups.

    For every k-mer of ``seq``, bit 0 of the raw row must be set and the
    single lookup must list ``sample``; a batch lookup over all k-mers must
    contain ``[sample]`` among its values.
    """
    logger.info("Testing graph with params (k=%i,m=%i,h=%i)" % (k, m, h))
    kmer_list = list(seq_to_kmers(seq, k))
    index = Graph.create(m=m, k=k, h=h, force=True)
    index.insert(index.bloom(kmer_list), sample)

    for kmer in kmer_list:
        row = bitarray()
        row.frombytes(index.lookup_raw(kmer))
        assert row[0]
        assert sample in index.lookup(kmer)[kmer]

    batch = index.lookup(kmer_list)
    assert [sample] in batch.values()

    index.delete_all()
def test_insert_and_unique_sample_names(Graph, sample, seq, k, m, h):
    """Inserting the same sample name twice raises ``ValueError``.

    The incoming ``m`` is only logged; the index itself is always created
    with a Bloom filter size of 100.
    """
    logger.info("Testing graph with params (k=%i,m=%i,h=%i)" % (k, m, h))
    kmer_iter = seq_to_kmers(seq, k)
    filter_size = 100
    index = Graph.create(m=filter_size, k=k, h=h, force=True)
    assert index.kmer_size == k

    bloom = index.bloom(kmer_iter)
    assert len(bloom) == filter_size

    index.insert(bloom, sample)
    # A second insert under the same sample name must be rejected.
    with pytest.raises(ValueError):
        index.insert(bloom, sample)

    assert sample in index.search(seq)
    hit = index.search(seq).get(sample)
    assert hit.get('percent_kmers_found') == 100

    index.delete_all()
def test_insert_lookup_kmers():
    """Each k-mer of a built sample is found by raw, single and batch lookups.

    Uses fixed parameters (k=31, m=10, h=2) and one sample named '0'.
    """
    index_cls, sample = BIGSI, '0'
    seq = 'AAAAAAAAAAAATCAAAAAAAAAAAAAAAAA'
    m, h, k = 10, 2, 31
    logger.debug("Testing graph with params (k=%i,m=%i,h=%i)" % (k, m, h))

    kmer_list = list(seq_to_kmers(seq, k))
    index = index_cls.create(m=m, k=k, h=h, force=True)
    index.build([index.bloom(kmer_list)], [sample])

    for kmer in kmer_list:
        row = bitarray()
        row.frombytes(index.lookup_raw(kmer))
        assert row[0]
        assert sample in index.lookup(kmer)[kmer]

    batch = index.lookup(kmer_list)
    assert [sample] in batch.values()

    index.delete_all()
def test_insert_and_unique_sample_names():
    """``insert`` raises ``ValueError`` both before and after ``build``.

    Uses fixed parameters (k=11, m=10, h=1) and one sample named '0'. After
    the build, searching for the sequence must report the sample with 100
    percent of k-mers found.
    """
    index_cls, sample = BIGSI, '0'
    seq, k, h = 'AATTTTTATTTTTTTTTTTTTAATTAATATT', 11, 1
    m = 10
    logger.debug("Testing graph with params (k=%i,m=%i,h=%i)" % (k, m, h))

    kmer_iter = seq_to_kmers(seq, k)
    index = index_cls.create(m=m, k=k, h=h, force=True)
    assert index.kmer_size == k

    bloom = index.bloom(kmer_iter)
    assert len(bloom) == m

    # Rejected on the freshly created index ...
    with pytest.raises(ValueError):
        index.insert(bloom, sample)
    index.build([bloom], [sample])
    # ... and again once the sample has been built in.
    with pytest.raises(ValueError):
        index.insert(bloom, sample)

    assert sample in index.search(seq)
    hit = index.search(seq).get(sample)
    assert hit.get('percent_kmers_found') == 100

    index.delete_all()
def test_cant_write_to_read_only_index():
    """Inserting into an index opened with ``mode="r"`` on a read-only file fails.

    The index is built with one sample, its file is made read-only, and the
    index is reopened in read mode. An insert must then raise
    ``bsddb3.db.DBAccessError`` while searching still works.

    The file permissions are restored and the index is emptied in a
    ``finally`` block, so a failing assertion does not leave a read-only
    database file behind.
    """
    Graph, sample = BIGSI, "sfewe"
    seq, k, h = 'AATTTTTATTTTTTTTTTTTTAATTAATATT', 11, 1
    m = 10
    logger.debug("Testing graph with params (k=%i,m=%i,h=%i)" % (k, m, h))
    kmers = seq_to_kmers(seq, k)
    bigsi = Graph.create(m=m, k=k, h=h, force=True)
    assert bigsi.kmer_size == k
    bloom = bigsi.bloom(kmers)
    bigsi.build([bloom], [sample])
    os.chmod(bigsi.graph_filename, S_IREAD | S_IRGRP | S_IROTH)
    try:
        # Can't write to a read-only DB
        bigsi = Graph(mode="r")
        with pytest.raises(bsddb3.db.DBAccessError):
            bigsi.insert(bloom, "1234")
        # Reading must still work.
        assert sample in bigsi.search(seq)
        assert bigsi.search(seq).get(sample).get('percent_kmers_found') == 100
    finally:
        # Make the file writable again before cleaning up, whatever happened.
        os.chmod(bigsi.graph_filename, S_IWUSR | S_IREAD)
        bigsi.delete_all()
def extract_kmers_from_ctx(ctx, k):
    """Yield the k-mers of every record read from ``ctx``.

    Each record produced by ``GraphReader(ctx)`` contributes the k-mers that
    ``seq_to_kmers`` derives from its ``kmer.canonical_value`` with size ``k``.
    """
    for record in GraphReader(ctx):
        canonical = record.kmer.canonical_value
        for kmer in seq_to_kmers(canonical, k):
            yield kmer
def test_jaccard_index1(mc, kmers):
    """Two samples holding identical k-mers have a Jaccard index of exactly 1."""
    kmer_list = list(seq_to_kmers(kmers))
    mc.delete_all()
    for sample_id in ('1234', '1235'):
        mc.insert(kmer_list, sample_id)
    assert mc.jaccard_index('1234', '1235') == 1
def seq_to_kmers(self, seq):
    """Return ``seq_to_kmers(seq, k)`` with ``k`` taken from ``self.kmer_size``."""
    k = self.kmer_size
    return seq_to_kmers(seq, k)
def _search(gene_name, seq, results, threshold, graph, output_format="json", pipe=False, score=False): if pipe: if output_format == "tsv": start = time.time() result = graph.search(seq, threshold=threshold, score=score) diff = time.time() - start if result: for sample_id, percent in result.items(): print("\t".join([ gene_name, sample_id, str(percent["percent_kmers_found"]), str(diff) ])) else: print("\t".join([gene_name, "NA", str(0), str(diff)])) elif output_format == "fasta": samples = graph.sample_to_colour_lookup.keys() print(" ".join(['>', gene_name])) print(seq) result = graph.search(seq, threshold=threshold, score=score) result = sorted(result.items(), key=operator.itemgetter(1), reverse=True) for sample, percent in result: percent = round(percent * 100, 2) colour = int(graph.sample_to_colour_lookup.get(sample)) print(" ".join([ '>', gene_name, sample, "kmer-%i coverage %f" % (graph.kmer_size, percent) ])) presence = [] for kmer in seq_to_kmers(seq, graph.kmer_size): kmer_presence = graph.graph.lookup( convert_query_kmer(kmer))[colour] sys.stdout.write(str(int(kmer_presence))) sys.stdout.write('\n') else: result = {} start = time.time() result['results'] = graph.search(seq, threshold=threshold, score=score) diff = time.time() - start result['time'] = diff print(json.dumps({gene_name: result})) else: results[gene_name] = {} start = time.time() results[gene_name]['results'] = graph.search(seq, threshold=threshold, score=score) diff = time.time() - start results[gene_name]['time'] = diff return results