def test_single_read_queries():
    """
    Query a tree into which exactly one read has been inserted.

    The same four sequences are queried twice: once with the tree as
    constructed and once after ``theta`` has been raised to 0.6.
    """
    tree = BloomTree(0.5, 3, 100000, 0.03)
    only_read = Read('a.fastq', 'a', None, 'GCGT', 'IIII')
    tree.insert([only_read])

    # (query sequence, expected file names) before theta is touched
    initial_expectations = (
        ('AAAA', []),
        ('GCGT', ['a.fastq']),
        ('ACGT', ['a.fastq']),
        ('GCGA', ['a.fastq']),
    )
    for sequence, expected in initial_expectations:
        assert tree.query(sequence) == expected

    # After raising theta only the exact sequence is still reported.
    tree.theta = 0.6
    stricter_expectations = (
        ('AAAA', []),
        ('GCGT', ['a.fastq']),
        ('ACGT', []),
        ('GCGA', []),
    )
    for sequence, expected in stricter_expectations:
        assert tree.query(sequence) == expected
def test_two_read_queries():
    """
    Query a tree built from two separately inserted reads (so that the
    lookup has to pass through an internal node) and check that the
    results follow each change of ``theta``.
    """
    tree = BloomTree(0.3, 3, 100000, 0.03)
    # The two sequences share exactly one 3-mer ('CDE').
    first_read = Read('a.fastq', 'a', None, 'ABCDE', 'IIII')
    second_read = Read('b.fastq', 'b', None, 'CDEFG', 'IIII')
    for read in (first_read, second_read):
        tree.insert([read])

    # Each stage is (theta to set, or None to keep the constructor value,
    # followed by the ordered (query, expected file names) pairs).
    stages = (
        (None, (
            ('AAAAA', []),
            ('BCDEF', ['a.fastq', 'b.fastq']),
            ('CDEFG', ['a.fastq', 'b.fastq']),
            ('ABCDE', ['a.fastq', 'b.fastq']),
        )),
        (0.6, (
            ('AAAAA', []),
            ('BCDEF', ['a.fastq', 'b.fastq']),
            ('CDEFG', ['b.fastq']),
            ('ABCDE', ['a.fastq']),
        )),
        (0.9, (
            ('AAAAA', []),
            ('BCDEF', []),
            ('CDEFG', ['b.fastq']),
            ('ABCDE', ['a.fastq']),
        )),
    )
    for new_theta, expectations in stages:
        if new_theta is not None:
            tree.theta = new_theta
        for sequence, expected in expectations:
            assert tree.query(sequence) == expected
# Example #3
# 0
def main(myKmer_size, myFpr, myThreshold, querySequencing):
    """
    Build a Bloom tree from the bacterial reads, query it and print a report.

    Three folders are read from the command line: ``sys.argv[1]`` (bacterial
    reads, one Bloom filter is built per species found there),
    ``sys.argv[2]`` (reference reads, sampled with ``goodData``) and
    ``sys.argv[3]`` (test reads).

    Parameters
    ----------
    myKmer_size : int
        Length of the k-mers every read is split into.
    myFpr :
        Value handed to ``getBFsize`` together with the total k-mer count
        (printed as "FPR").
    myThreshold :
        First argument of the ``BloomTree`` constructor (printed as
        "Query Threshold").
    querySequencing : bool
        When truthy the query reads are sampled from the test folder,
        otherwise from the bacterial folder itself.

    Returns
    -------
    None
        All results are printed to standard output.
    """
    bac_folder = sys.argv[1]
    ref_folder = sys.argv[2]
    test_folder = sys.argv[3]

    # reads in fasta files
    bac_dict = dataParse(bac_folder)
    ref_dict = dataParse(ref_folder)
    test_dict = dataParse(test_folder)

    # creates datasets
    n = 500
    good_dict = goodData(ref_dict, n)
    # A single two-way choice: the previous pair of ``== True`` / ``== False``
    # tests left bad_dict undefined for any other value.
    bad_dict = badData(test_dict if querySequencing else bac_dict, n)

    kmer_size = myKmer_size

    def convertReadtoKmerList(read):
        """Return every window of ``kmer_size`` characters of ``read``, in order."""
        return [read[i:i + kmer_size] for i in range(len(read) - kmer_size + 1)]

    # Only the number of k-mers is needed to size the filters, so count them
    # instead of concatenating every k-mer list (which was quadratic).
    total_kmer_count = sum(
        max(len(read) - kmer_size + 1, 0)
        for species_dict in bac_dict.values()
        for read in species_dict.values()
    )

    fpr = myFpr
    BFsize = getBFsize(total_kmer_count, fpr)
    BFHashCount = getHashFunctionCount(total_kmer_count, BFsize)

    bloomTreeConstructionStartTime = time.time()
    # One Bloom filter per species, holding all k-mers of that species' reads.
    BFList = []
    for species, species_dict in bac_dict.items():
        kmerList = []
        for read in species_dict.values():
            kmerList.extend(convertReadtoKmerList(read))
        BFList.append(BloomFilter(str(species), kmerList, BFsize, BFHashCount))

    from bloom_tree import BloomTree
    from bloom_node import BloomNode
    inverseBloomTree = BloomTree(myThreshold, BFsize, BFHashCount)
    for bloomFilter in BFList:
        inverseBloomTree.add(BloomNode(bloomFilter))

    bloomTreeConstructionEndTime = time.time()

    # NOTE: sys.getsizeof is shallow; it does not include objects the tree
    # refers to.
    bloomTreeSize = sys.getsizeof(inverseBloomTree)
    numTotal = 0
    numAccurate = 0
    totalMatches = 0
    # Rows 0-4: query species in bad_dict iteration order; row 5: reference
    # reads. Column 5 counts reads without the expected / any match.
    confusionMatrix = np.zeros(shape=(6, 6))

    # Column credited when a query read's own species is among its matches.
    # NOTE(review): this species order differs from the one used for the
    # reference row further down -- confirm which order is intended.
    queryHitColumns = {
        "klebsiella_pneumoniae": 0,
        "p_glucanolyticus": 1,
        "bacillus_simplex": 2,
        "b_vulgatus": 3,
        "staph_lentus": 4,
    }

    queryStartTime = time.time()

    for rowNumber, species in enumerate(bad_dict):
        speciesName = str(species)
        species_BadDict = bad_dict[species]
        for readID in species_BadDict:
            read = species_BadDict[readID]
            queryKmers = convertReadtoKmerList(read)
            possibleMatches = inverseBloomTree.query(queryKmers)
            possibleMatchNames = [
                match.bloom_filter.getName() for match in possibleMatches
            ]
            numTotal += 1
            totalMatches += len(possibleMatchNames)
            if speciesName in possibleMatchNames:
                numAccurate += 1
                column = queryHitColumns.get(speciesName)
                if column is not None:
                    confusionMatrix[rowNumber, column] += 1
            else:
                confusionMatrix[rowNumber, 5] += 1

    queryEndTime = time.time()

    # Guard the averages so an empty query set reports nan instead of
    # raising ZeroDivisionError.
    accuracy = numAccurate / numTotal if numTotal else float("nan")
    avgMatches = totalMatches / numTotal if numTotal else float("nan")
    print("Kmer_length: " + str(kmer_size))
    print("FPR: " + str(fpr))
    print("Query Threshold: " + str(myThreshold))
    print("Accuracy: " + str(accuracy))
    print("AvgMatches: " + str(avgMatches))

    # Column credited in the reference row (row 5) for each species matched
    # by a reference read.
    referenceColumns = {
        "b_vulgatus": 0,
        "bacillus_simplex": 1,
        "klebsiella_pneumoniae": 2,
        "p_glucanolyticus": 3,
        "staph_lentus": 4,
    }

    numMouseMatches = 0
    numQueries = 0
    for readKey in good_dict:
        read = good_dict[readKey]
        queryKmers = convertReadtoKmerList(read)
        possibleMatches = inverseBloomTree.query(queryKmers)
        # query() returns nodes, not names, so the species names have to be
        # read from each node's filter before they can be compared.
        matchedNames = {
            match.bloom_filter.getName() for match in possibleMatches
        }
        numMouseMatches += len(possibleMatches)
        numQueries += 1
        for name, column in referenceColumns.items():
            if name in matchedNames:
                confusionMatrix[5, column] += 1
        if len(possibleMatches) == 0:
            confusionMatrix[5, 5] += 1
    avgMouseMatches = numMouseMatches / numQueries if numQueries else float("nan")

    print("avgMouseMatches: " + str(avgMouseMatches))

    print("Bloom Filter size: " + str(BFsize))
    print("Hash Count: " + str(BFHashCount))
    print("Bloom Tree size: " + str(bloomTreeSize))
    bloomTreeConstructionTime = bloomTreeConstructionEndTime - bloomTreeConstructionStartTime
    queryTime = queryEndTime - queryStartTime
    print("Construction Time: " + str(bloomTreeConstructionTime))
    print("Query Time: " + str(queryTime))
    # Note: the query time covers the query-read loop including its
    # confusion-matrix updates; the reference-read queries are not timed.

    print("Confusion Matrix: ")
    print(confusionMatrix)