def test_multi_read_dataset_insert():
    """
    Inserting two reads of the same dataset in one call must leave a
    single childless root that carries the dataset id and contains the
    3-mers of both reads.
    """
    tree = BloomTree(0.5, 3, 100000, 0.03)
    reads = [
        Read('a.fastq', 'a', None, 'GCGT', 'IIII'),
        Read('a.fastq', 'b', None, 'AAAG', 'IIII'),
    ]

    tree.insert(reads)

    # Structural expectations on the lone node.
    assert tree.root is not None
    assert tree.root.parent is None
    assert tree.root.children == []
    assert tree.root.dataset_id == 'a.fastq'

    # Every 3-mer of both reads must be present in the root filter.
    for kmer in ('GCG', 'CGT', 'AAA', 'AAG'):
        assert tree.root.filter.contains(kmer)
def test_single_read():
    """
    Ensures the bloom tree can
        1. create a new node and set it as root
        2. insert all 3-mers of the read into that node's filter
        3. set the dataset_id of the resulting leaf
    """
    tree = BloomTree(0.5, 3, 100000, 0.03)
    only_read = Read('a.fastq', 'a', None, 'GCGT', 'IIII')

    tree.insert([only_read])

    assert tree.root is not None
    assert tree.root.parent is None
    assert tree.root.children == []
    assert tree.root.dataset_id == 'a.fastq'

    # 'GCGT' decomposes into exactly these two 3-mers.
    for kmer in ('GCG', 'CGT'):
        assert tree.root.filter.contains(kmer)
def test_construction():
    """
    A freshly constructed bloom tree has no root and exposes the
    constructor arguments as attributes.
    """
    tree = BloomTree(0.5, 3, 100000, 0.03)

    # Nothing has been inserted yet.
    assert tree.root is None

    # Positional arguments are, in order: theta, k, expected_num, fp_prob.
    assert tree.theta == 0.5
    assert tree.k == 3
    assert tree.fp_prob == 0.03
    assert tree.expected_num == 100000
def test_single_read_queries():
    """
    Querying a tree made of a single leaf returns the leaf's dataset id
    only when enough of the query's 3-mers hit, as governed by theta.
    """
    tree = BloomTree(0.5, 3, 100000, 0.03)
    tree.insert([Read('a.fastq', 'a', None, 'GCGT', 'IIII')])

    def check(expectations):
        # Run each query in order and compare against the expected hits.
        for sequence, expected in expectations:
            assert tree.query(sequence) == expected

    # theta = 0.5: one matching 3-mer out of two is enough.
    check([
        ('AAAA', []),
        ('GCGT', ['a.fastq']),
        ('ACGT', ['a.fastq']),
        ('GCGA', ['a.fastq']),
    ])

    # theta = 0.6: half-matching queries no longer qualify.
    tree.theta = 0.6
    check([
        ('AAAA', []),
        ('GCGT', ['a.fastq']),
        ('ACGT', []),
        ('GCGA', []),
    ])
def test_best_child_selection():
    """
    Inserting four single-read datasets one at a time must place each new
    leaf next to the existing subtree it shares the most k-mers with.
    """
    tree = BloomTree(0.5, 3, 100000, 0.03)
    reads = [
        Read('a.fastq', 'a', None, 'ABCD', 'IIII'),
        Read('b.fastq', 'b', None, 'EFGH', 'IIII'),
        Read('c.fastq', 'c', None, 'ZABC', 'IIII'),
        Read('d.fastq', 'd', None, 'ABCD', 'IIII'),
    ]
    # Each read is its own dataset, so each gets its own insert call.
    for read in reads:
        tree.insert([read])

    # b shares no kmers with other side
    assert tree.root.children[1].dataset_id == 'b.fastq'
    assert tree.root.children[1].children == []
    assert tree.root.children[1].parent == tree.root

    # dataset c has 1 different kmer than a and d
    left = tree.root.children[0]
    assert left.dataset_id is None
    assert left.num_children() == 2
    assert left.children[1].dataset_id == 'c.fastq'

    # a and d should have same parent since they have same info
    left_left = left.children[0]
    assert left_left.dataset_id is None
    assert left_left.children[0].dataset_id == 'a.fastq'
    assert left_left.children[1].dataset_id == 'd.fastq'
def test_two_internal_nodes():
    """
    Three single-read datasets must produce a root with an internal left
    child (holding two leaves) and the unrelated dataset as right leaf.
    """
    tree = BloomTree(0.5, 3, 100000, 0.03)
    reads = [
        Read('a.fastq', 'a', None, 'ABCD', 'IIII'),
        Read('b.fastq', 'b', None, 'EFGH', 'IIII'),
        Read('c.fastq', 'c', None, 'ZABC', 'IIII'),
    ]
    for read in reads:
        tree.insert([read])

    # The root is an internal node with exactly two children.
    top = tree.root
    assert top.parent is None
    assert top.num_children() == 2
    assert top.dataset_id is None

    # Left child is itself internal with two children.
    inner = top.children[0]
    assert inner.dataset_id is None
    assert inner.num_children() == 2

    # Right child is the leaf for dataset b.
    assert top.children[1].dataset_id == 'b.fastq'

    # Both children of the inner node are leaves pointing back to it.
    for leaf in (inner.children[0], inner.children[1]):
        assert leaf.parent == inner
        assert leaf.children == []
def test_create_internal_node():
    """
    Ensure that the bloom tree can
        1. create an internal node
        2. set 2 datasets as children of the internal node
        3. put kmer info of both datasets into the internal node
        4. leave the internal node without a dataset_id
    """
    tree = BloomTree(0.5, 3, 100000, 0.03)
    first_read = Read('a.fastq', 'a', None, 'ABCD', 'IIII')
    second_read = Read('b.fastq', 'b', None, 'EFGH', 'IIII')

    # After the first insert the root is the leaf for dataset a.
    tree.insert([first_read])
    leaf_a = tree.root

    # The second insert replaces the root with a new internal node.
    tree.insert([second_read])
    parent = tree.root

    # The other leaf is whichever child slot leaf_a does not occupy.
    idx = parent.children.index(leaf_a)
    leaf_b = parent.children[(idx + 1) % 2]

    assert parent.dataset_id is None
    assert parent.num_children() == 2
    assert parent.parent is None

    assert leaf_a in parent.children
    assert leaf_b in parent.children
    assert leaf_a.parent == parent
    assert leaf_b.parent == parent
    assert leaf_a.children == []
    assert leaf_b.children == []
    assert leaf_a.dataset_id == 'a.fastq'
    assert leaf_b.dataset_id == 'b.fastq'

    # The internal node holds the 3-mers of both datasets ...
    for kmer in ('ABC', 'BCD', 'EFG', 'FGH'):
        assert parent.filter.contains(kmer)

    # ... while each leaf holds none of its sibling's 3-mers.
    for kmer in ('EFG', 'FGH'):
        assert not leaf_a.filter.contains(kmer)
    for kmer in ('ABC', 'BCD'):
        assert not leaf_b.filter.contains(kmer)
def test_two_read_queries():
    """
    Queries routed through an internal node must return the matching
    datasets, and raising theta must narrow the results accordingly.
    """
    tree = BloomTree(0.3, 3, 100000, 0.03)

    # note: have 1 3mer in common
    # NOTE(review): the quality strings are 4 chars for 5-char sequences;
    # kept exactly as in the original test.
    first_read = Read('a.fastq', 'a', None, 'ABCDE', 'IIII')
    second_read = Read('b.fastq', 'b', None, 'CDEFG', 'IIII')
    tree.insert([first_read])
    tree.insert([second_read])

    def check(expectations):
        # Run each query in order and compare against the expected hits.
        for sequence, expected in expectations:
            assert tree.query(sequence) == expected

    # theta = 0.3 (constructor value): one shared 3-mer out of three suffices.
    check([
        ('AAAAA', []),
        ('BCDEF', ['a.fastq', 'b.fastq']),
        ('CDEFG', ['a.fastq', 'b.fastq']),
        ('ABCDE', ['a.fastq', 'b.fastq']),
    ])

    # theta = 0.6: two of three 3-mers are required.
    tree.theta = 0.6
    check([
        ('AAAAA', []),
        ('BCDEF', ['a.fastq', 'b.fastq']),
        ('CDEFG', ['b.fastq']),
        ('ABCDE', ['a.fastq']),
    ])

    # theta = 0.9: only exact reads still match.
    tree.theta = 0.9
    check([
        ('AAAAA', []),
        ('BCDEF', []),
        ('CDEFG', ['b.fastq']),
        ('ABCDE', ['a.fastq']),
    ])
def main(myKmer_size, myFpr, myThreshold, querySequencing):
    """
    Build a bloom tree over the bacterial species and benchmark queries.

    Folder paths are read from the command line: sys.argv[1] is the
    bacterial folder, sys.argv[2] the reference folder and sys.argv[3] the
    test folder. One BloomFilter is built per bacterial species, all are
    added to a BloomTree, and the tree is then queried with the "bad" and
    "good" read sets. Accuracy, average match counts, sizes, timings and a
    6x6 confusion matrix are printed; nothing is returned.

    Parameters:
        myKmer_size: k-mer length used to split every read.
        myFpr: false-positive rate handed to getBFsize.
        myThreshold: query threshold handed to the BloomTree constructor.
        querySequencing: when truthy the "bad" reads are drawn from the
            test folder, otherwise from the bacterial folder.
    """
    bac_folder = sys.argv[1]
    ref_folder = sys.argv[2]
    test_folder = sys.argv[3]

    # reads in fasta files
    bac_dict = dataParse(bac_folder)
    ref_dict = dataParse(ref_folder)
    test_dict = dataParse(test_folder)

    # creates datasets
    n = 500
    good_dict = goodData(ref_dict, n)
    # Plain truthiness so that bad_dict is always bound; the former
    # `== True` / `== False` pair left it undefined for any other value.
    if querySequencing:
        bad_dict = badData(test_dict, n)
    else:
        bad_dict = badData(bac_dict, n)

    kmer_size = myKmer_size

    def convertReadtoKmerList(read):
        # All overlapping substrings of length kmer_size, left to right.
        return [read[i:i + kmer_size]
                for i in range(len(read) - kmer_size + 1)]

    def speciesKmerList(species_dict):
        # Concatenated k-mers of every read of one species, in dict order.
        kmers = []
        for readKey in species_dict:
            kmers.extend(convertReadtoKmerList(species_dict[readKey]))
        return kmers

    # Only the total k-mer count is needed for sizing, so count instead of
    # growing one giant list by repeated (quadratic) concatenation.
    totalKmerCount = sum(len(speciesKmerList(bac_dict[species]))
                         for species in bac_dict)

    fpr = myFpr
    BFsize = getBFsize(totalKmerCount, fpr)
    BFHashCount = getHashFunctionCount(totalKmerCount, BFsize)

    bloomTreeConstructionStartTime = time.time()

    # Constructing species BFs: one filter per species, named str(species).
    BFList = []
    for species in bac_dict:
        kmerList = speciesKmerList(bac_dict[species])
        BFList.append(BloomFilter(str(species), kmerList, BFsize, BFHashCount))

    # Constructing Bloom Tree. The imports stay local (and inside the timed
    # span) exactly as before.
    from bloom_tree import BloomTree
    from bloom_node import BloomNode
    inverseBloomTree = BloomTree(myThreshold, BFsize, BFHashCount)
    for bloomFilter in BFList:
        inverseBloomTree.add(BloomNode(bloomFilter))

    bloomTreeConstructionEndTime = time.time()
    # NOTE: sys.getsizeof is shallow; it does not follow references into the
    # nodes and filters the tree object holds.
    bloomTreeSize = sys.getsizeof(inverseBloomTree)

    numTotal = 0
    numAccurate = 0
    totalMatches = 0
    confusionMatrix = np.zeros(shape=(6, 6))

    # Column credited when a bad read's own species is among its matches.
    # NOTE(review): this order differs from goodReadColumn below; both are
    # kept as originally hard-coded -- confirm which one is intended.
    badReadColumn = {
        "klebsiella_pneumoniae": 0,
        "p_glucanolyticus": 1,
        "bacillus_simplex": 2,
        "b_vulgatus": 3,
        "staph_lentus": 4,
    }

    # Querying. The timed span covers this loop only, including its
    # confusion-matrix updates; the good-read pass below is not timed.
    queryStartTime = time.time()
    for rowNumber, species in enumerate(bad_dict):
        speciesName = str(species)
        species_BadDict = bad_dict[species]
        for readID in species_BadDict:
            queryKmers = convertReadtoKmerList(species_BadDict[readID])
            possibleMatches = inverseBloomTree.query(queryKmers)
            possibleMatchNames = [match.bloom_filter.getName()
                                  for match in possibleMatches]
            numTotal += 1
            totalMatches += len(possibleMatchNames)
            if speciesName in possibleMatchNames:
                numAccurate += 1
                column = badReadColumn.get(speciesName)
                # Species outside the five known names touch no column.
                if column is not None:
                    confusionMatrix[rowNumber, column] += 1
            else:
                # True species was not reported: count it in the last column.
                confusionMatrix[rowNumber, 5] += 1
    queryEndTime = time.time()

    # Report 0.0 instead of raising ZeroDivisionError when nothing was queried.
    accuracy = numAccurate / numTotal if numTotal else 0.0
    avgMatches = totalMatches / numTotal if numTotal else 0.0
    print("Kmer_length: " + str(kmer_size))
    print("FPR: " + str(fpr))
    print("Query Threshold: " + str(myThreshold))
    print("Accuracy: " + str(accuracy))
    print("AvgMatches: " + str(avgMatches))

    # Column credited per species reported for a good read (row 5).
    goodReadColumn = {
        "b_vulgatus": 0,
        "bacillus_simplex": 1,
        "klebsiella_pneumoniae": 2,
        "p_glucanolyticus": 3,
        "staph_lentus": 4,
    }

    numMouseMatches = 0
    numQueries = 0
    for readKey in good_dict:
        queryKmers = convertReadtoKmerList(good_dict[readKey])
        possibleMatches = inverseBloomTree.query(queryKmers)
        numMouseMatches += len(possibleMatches)
        numQueries += 1
        # query() results are read above via match.bloom_filter.getName(),
        # so compare against the names rather than the result objects
        # themselves (testing a string against them looks like it could
        # never be true).
        matchNames = [match.bloom_filter.getName()
                      for match in possibleMatches]
        for name, column in goodReadColumn.items():
            if name in matchNames:
                confusionMatrix[5, column] += 1
        if len(possibleMatches) == 0:
            confusionMatrix[5, 5] += 1

    avgMouseMatches = numMouseMatches / numQueries if numQueries else 0.0
    print("avgMouseMatches: " + str(avgMouseMatches))
    print("Bloom Filter size: " + str(BFsize))
    print("Hash Count: " + str(BFHashCount))
    print("Bloom Tree size: " + str(bloomTreeSize))

    bloomTreeConstructionTime = (bloomTreeConstructionEndTime
                                 - bloomTreeConstructionStartTime)
    queryTime = queryEndTime - queryStartTime
    print("Construction Time: " + str(bloomTreeConstructionTime))
    print("Query Time: " + str(queryTime))
    print("Confusion Matrix: ")
    print(confusionMatrix)