import bz2
import os

import khmer
import screed

import MinHash as MH


def make_minhash(genome, max_h, prime, ksize):
	kmers = set()
	name = os.path.basename(genome)
	MHS = MH.CountEstimator(n=max_h, max_prime=prime, ksize=ksize, save_kmers='y')
	for record in screed.open(genome):
		seq = record.sequence
		for i in range(len(seq) - ksize + 1):
			kmer = seq[i:i+ksize]
			kmer_rev = khmer.reverse_complement(kmer)
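			# keep the canonical form (the lexicographically smaller of the
			# k-mer and its reverse complement) so both strands hash identically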
			if kmer < kmer_rev:
				kmers.add(kmer)
				MHS.add(kmer)
			else:
				kmers.add(kmer_rev)
				MHS.add(kmer_rev)
	MHS._true_num_kmers = len(kmers)
	MHS.input_file_name = os.path.basename(genome)
	#genome_sketches.append(MHS)
	# export the kmers
	fid = bz2.BZ2File(os.path.abspath(os.path.join('../data/Genomes/', name + ".kmers.bz2")), 'w')
	#fid = bz2.open(os.path.abspath(os.path.join('../data/Genomes/', name + ".kmers.bz2")), 'wt')  # python3
	for kmer in kmers:
		fid.write("%s\n" % kmer)
	fid.close()
	return MHS
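
A minimal usage sketch; the genome path and parameter values below are illustrative assumptions, not from the original script:

sketch = make_minhash('../data/Genomes/example_genome.fna', max_h=500,
                      prime=9999999999971, ksize=21)
print(sketch._true_num_kmers)  # number of distinct canonical k-mers seen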
Example #2
import os

import khmer
import screed

import MinHash as MH


def make_minhash(genome, max_h, prime, ksize):
    kmers = set()
    name = os.path.basename(genome)
    MHS = MH.CountEstimator(n=max_h,
                            max_prime=prime,
                            ksize=ksize,
                            save_kmers='y')
    for record in screed.open(genome):
        seq = record.sequence
        for i in range(len(seq) - ksize + 1):
            kmer = seq[i:i + ksize]
            kmer_rev = khmer.reverse_complement(kmer)
            if kmer < kmer_rev:
                kmers.add(kmer)
                MHS.add(kmer)
            else:
                kmers.add(kmer_rev)
                MHS.add(kmer_rev)
    MHS._true_num_kmers = len(kmers)
    MHS.input_file_name = os.path.basename(genome)
    # Export the hash k-mers
    fid = open(
        os.path.abspath(
            os.path.join('../data/Viruses/', name + ".Hash21mers.fa")), 'w')
    for kmer in MHS._kmers:
        fid.write(">\n%s\n" % kmer)
    fid.close()
    return MHS
fid.create_dataset("hash_list", data=list(hash_list))
fid.close()
# If I need to read it back in
#fid = h5py.File('/nfs1/Koslicki_Lab/koslickd/MinHash/Out/N500k31_mins.h5','r')
#hash_list = set(fid["hash_list"][:])

####################################
# Form a CE for a metagenome
n = 500
fid = h5py.File(
    '/nfs1/Koslicki_Lab/koslickd/MinHash/Out/N' + str(n) + 'k31_mins.h5', 'r')
hash_list = set(fid["hash_list"][...])
fid.close()
CE = MH.CountEstimator(
    n=n,
    max_prime=9999999999971.,
    ksize=31,
    input_file_name='/nfs1/Koslicki_Lab/koslickd/MinHash/Data/SRR172902.fastq',
    save_kmers='y')
CE.export('/nfs1/Koslicki_Lab/koslickd/MinHash/Out/SRR172902.fastq.CE_N' +
          str(n) + '_k31_all.h5')
CE2 = MH.CountEstimator(
    n=n,
    max_prime=9999999999971.,
    ksize=31,
    input_file_name='/nfs1/Koslicki_Lab/koslickd/MinHash/Data/SRR172902.fastq',
    save_kmers='y',
    hash_list=hash_list)
CE2.export('/nfs1/Koslicki_Lab/koslickd/MinHash/Out/SRR172902.fastq.CE_N' +
           str(n) + '_k31_inComparison.h5')
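# CE2 presumably keeps only the k-mers whose hashes appear in hash_list, making
# it directly comparable to sketches built from the same hash set (an inference
# from how hash_list is used here, not a documented guarantee).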
n = 5000
fid = h5py.File(
    '/nfs1/Koslicki_Lab/koslickd/MinHash/Out/N' + str(n) + 'k31_mins.h5', 'r')
# (presumably the n = 500 steps above are repeated here; the example is
# truncated at this point)

Example #4
import bz2
import os

import khmer
import numpy as np
import screed
from pybloom import BloomFilter  # assuming a pybloom-style BloomFilter

import MinHash as MH

# Note: max_h (the maximum sketch size) and make_simulation are assumed to be
# defined at module scope elsewhere in the original script.


def create_relative_errors(num_genomes, num_reads, python_loc, gen_sim_loc,
                           prime, p, ksize, hash_range):
    # Make a simulation
    simulation_file, abundances_file, selected_genomes = make_simulation(
        num_genomes, num_reads, python_loc, gen_sim_loc)

    # Get simulation k-mers, use canonical k-mers
    # Simultaneously, make the min hash sketch of the simulation
    simulation_kmers = set()
    simulation_MHS = MH.CountEstimator(n=max_h,
                                       max_prime=prime,
                                       ksize=ksize,
                                       save_kmers='y')
    for record in screed.open(simulation_file):
        seq = record.sequence
        for i in range(len(seq) - ksize + 1):
            kmer = seq[i:i + ksize]
            kmer_rev = khmer.reverse_complement(kmer)
            if kmer < kmer_rev:
                simulation_kmers.add(kmer)
                simulation_MHS.add(kmer)
            else:
                simulation_kmers.add(kmer_rev)
                simulation_MHS.add(kmer_rev)

    # Use them to populate a bloom filter
    simulation_bloom = BloomFilter(capacity=1.1 * len(simulation_kmers),
                                   error_rate=p)
    # In practice this would be computed when the bloom filter is created,
    # or estimated from the bloom filter entries.
    simulation_kmers_length = len(simulation_kmers)
    for kmer in simulation_kmers:
        simulation_bloom.add(kmer)

    # Use pre-computed data to load the kmers and the sketches
    base_names = [os.path.basename(item) for item in selected_genomes]
    # Load the sketches
    genome_sketches = MH.import_multiple_from_single_hdf5(
        os.path.abspath('../data/Genomes/AllSketches.h5'), base_names)
    # Get the true number of kmers
    genome_lengths = list()
    for i in range(len(genome_sketches)):
        genome_lengths.append(genome_sketches[i]._true_num_kmers)

    # Get *all* the kmers for computation of ground truth
    genome_kmers = list()
    for i in range(len(base_names)):
        name = base_names[i]
        kmers = set()
        fid = bz2.BZ2File(
            os.path.abspath(
                os.path.join('../data/Genomes/', name + ".kmers.bz2")), 'r')
        for line in fid.readlines():
            kmers.add(line.strip())
        fid.close()
        genome_kmers.append(kmers)

    # Calculate the true Jaccard index
    true_jaccards = list()
    for kmers in genome_kmers:
        true_jaccard = len(kmers.intersection(simulation_kmers)) / float(
            len(kmers.union(simulation_kmers)))
        true_jaccards.append(true_jaccard)

    # Calculate the min hash estimate of jaccard index
    MH_relative_errors = list()
    CMH_relative_errors = list()
    for h in hash_range:
        MH_jaccards = list()
        for MHS in genome_sketches:
            # Down sample each sketch to h
            MHS.down_sample(h)
            simulation_MHS.down_sample(h)
            MH_jaccard = MHS.jaccard(simulation_MHS)
            MH_jaccards.append(MH_jaccard)

        # Classic min hash estimate computed on a fixed number h of hashes:
        # take the h smallest hashes of the union and count how many of them
        # also land in the intersection (sorted() is needed here, since
        # iterating over a set gives an arbitrary order)
        MH_jaccards_corrected = list()
        for MHS in genome_sketches:
            MHS_set = set(MHS._mins)
            sample_set = set(simulation_MHS._mins)
            union_lowest_h = set(sorted(MHS_set.union(sample_set))[0:h])
            MH_jaccard = len(union_lowest_h.intersection(
                MHS_set.intersection(sample_set))) / float(h)
            MH_jaccards_corrected.append(MH_jaccard)

        # Calculate the containment min hash estimate of the jaccard index
        CMH_jaccards = list()
        for i in range(len(genome_sketches)):
            # pre-computed when creating the "training" data
            genome_kmers_len = genome_lengths[i]
            MHS = genome_sketches[i]
            # down sample each sketch to h
            MHS.down_sample(h)
            kmers = MHS._kmers  # use only the k-mers in the min hash sketch
            int_est = 0
            for kmer in kmers:
                if kmer in simulation_bloom:  # test if the k-mers are in the simulation bloom filter
                    int_est += 1
            int_est -= p * h  # adjust for false positive rate
            containment_est = int_est / float(h)
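            # Convert containment to Jaccard: with C = |A & B| / |A| estimated
            # above, |A| = genome_kmers_len and |B| = simulation_kmers_length,
            #   J = |A & B| / |A | B| = |A|*C / (|A| + |B| - |A|*C)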
            containment_est_jaccard = (
                genome_kmers_len * containment_est /
                (genome_kmers_len + simulation_kmers_length -
                 genome_kmers_len * containment_est))
            CMH_jaccards.append(containment_est_jaccard)

        # compute the average deviation from the truth (relative error)
        true_jaccards = np.array(true_jaccards)
        MH_jaccards = np.array(MH_jaccards)
        CMH_jaccards = np.array(CMH_jaccards)
        MH_mean = np.mean(np.abs(true_jaccards - MH_jaccards) / true_jaccards)
        CMH_mean = np.mean(
            np.abs(true_jaccards - CMH_jaccards) / true_jaccards)
        #print("Classic min hash mean relative error: %f" % MH_mean)
        #print("Containment min hash mean relative error: %f" % CMH_mean)
        MH_relative_errors.append(MH_mean)
        CMH_relative_errors.append(CMH_mean)

    # remove temp files
    os.remove(simulation_file)
    os.remove(abundances_file)
    # return the relative errors
    return MH_relative_errors, CMH_relative_errors, simulation_kmers_length, np.mean(
        genome_lengths)
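
A possible invocation; every path and parameter below is an illustrative assumption. hash_range is listed in decreasing order since down_sample presumably only shrinks a sketch:

MH_errs, CMH_errs, sim_size, mean_genome_size = create_relative_errors(
    num_genomes=20, num_reads=100000,
    python_loc='/usr/bin/python', gen_sim_loc='/path/to/gen_sim.py',  # hypothetical
    prime=9999999999971, p=0.01, ksize=21,
    hash_range=[10000, 5000, 1000, 500, 100])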
Example #5
# Assumes small_string, len_large_string, h, prime, ksize, and p were defined
# earlier in the (truncated) script, along with imports of MinHash as MH and a
# pybloom-style BloomFilter.
print(small_string)
print("len(small_string)", len(small_string))
# size of smaller set, used to convert containment index to Jaccard index
size_A = len({small_string[i:i + ksize]
              for i in range(len(small_string) - ksize + 1)})
print("size_A", size_A)
#large_string = ''.join(np.random.choice(['A', 'C', 'T', 'G'], len_large_string)) + small_string  # large string to form the larger set B
large_string = "CCGCATCGACAAGCAGGATCTGGATCTATTTCTCTCTTAAATCCATGTAAGGGACGGCAGAAACCTGCTCCTTCTACTTGCTACATCTTCTAGGGTAGAACGAGACCAGAGCCGTTACTGCGATATGAAATCAGTACCGAACGTTGGAACTTATTCAGTTTTAACCCGGTCCCCGTCGCCCAAATCGGGCTATATCATACCCCCGGGCCAAGTGTACAAGTGCATCGATTAAATGCACTAACGGCGAAAGTAAATGATGGACTTTCCAAGCCTGAGGTGGTAAACGCACTTGAATAGAGTCGACAAATTATCGGCTGACGATGCCTTGTAGACCAGCTTTAACACATGACCAGTATAGACGAGGCGGAACTAAGCAATCCCAAGTTTTCGTGCGAGCTGAAGGACCCGGCTCCACGAGATAGAGCTTGTGTTAACAAGAGGCCTCCGGCTGGAAAGATTGGTGGAAACGGCTGCTGTCACGTTTGCATCTTACCGGATGTGCCCCAATGAGGAGTTGATGAACTGGCTGTGACGCAATGGCGAAGAGGAAACGTCTGTATGGCGGATGTAACGTTTTTGCAACACTCCTCCACAACTGCTCCTTTAAGATGACCATCACGAAAATGAAGCTCGTTCGAAATCTTCAAAGATCCGGGGTATAATTGCGCTTCCGGGAGAAGGCCATATGCGATAGCGGTAAGTTTCCACAGCGTATCCAAAAGCGGAGCTTTACGATCTCCCCAGTAAACTGGCTTGTGTCAAGCGGCGAACCCGAATTTCGACGAACCTAGATATTCTCTGGCGACTAACTACTATGCGGATGGGCCTATTCGGGGGATTCAGCCCGCGATACTAGAGCGTAATTAGCCTCGCAAGAATCTAGGTAGCCCCAAAATAGCTTGCTAAAGCGCTAGGGTGCACTGCAGGCAAAATCGAGGTGACTGTACCCCGAGCCATGCATATAACTGGGGGGTACCCTTCCAATAATTGTTATCATACCATCTGCATAGACATATTTAACGGCTCAGTAAATTCGTCGCCATGCGACCTCCAGCATGATCGGTGGCACTCCGTTGTGCGCGGACTGTGTAAACCGCACG" + small_string
#print large_string
print "len(large_string):", len(large_string)

# Populate min hash sketch with smaller set
A_MH = MH.CountEstimator(n=h, max_prime=prime, ksize=ksize, save_kmers='y')
A_MH.add_sequence(small_string)  # create the min hash of the small string

# Create the bloom filter and populate with the larger set
B_filt = BloomFilter(capacity=1.15 * len_large_string,
                     error_rate=p)  # Initialize the bloom filter
size_B_est = 0  # used to count the number of k-mers in B, could do much more intelligently (like with HyperLogLog)
for i in range(len(large_string) - ksize + 1):
    kmer = large_string[i:i + ksize]
    if kmer not in B_filt:
        size_B_est += 1
        B_filt.add(kmer)
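
# Note: size_B_est slightly undercounts the distinct k-mers in B, since bloom
# filter false positives make some genuinely new k-mers look already-present.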

print "size_B_est :", size_B_est

# Use the k-mers in the sketch of A and test if they are in the bloom filter of B
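A sketch of the step the comment above describes, following the same containment pattern as Example #4; this is a hedged reconstruction, since the original snippet ends here:

int_est = 0
for kmer in A_MH._kmers:  # k-mers stored in the sketch of A
    if kmer in B_filt:
        int_est += 1
int_est -= p * h  # adjust for the expected bloom filter false positives
containment_est = int_est / float(h)
# convert the containment estimate to a Jaccard estimate
jaccard_est = size_A * containment_est / (
    size_A + size_B_est - size_A * containment_est)
print("jaccard_est:", jaccard_est)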
Example #6
from importlib import reload  # needed on Python 3; reload is a builtin on Python 2

import MinHash as MH
import screed


reload(MH)
fid = open('/home/dkoslicki/Dropbox/Repositories/MinHash/data/test_files.txt', 'r')
file_names = fid.readlines()
fid.close()
file_names = [name.strip() for name in file_names]
CE = MH.CountEstimator(n=500, ksize=11, input_file_name=file_names[0], save_kmers='y')
CE2 = MH.CountEstimator(n=500, ksize=11, input_file_name=file_names[1], save_kmers='y')
CE2.jaccard_count(CE)
CE2.jaccard(CE)
CE.jaccard(CE2)

CS = MH.CompositionSketch(n=5000, ksize=11, prefixsize=1, input_file_name=file_names[0])
CS2 = MH.CompositionSketch(n=5000, ksize=11, prefixsize=1, input_file_name=file_names[1])
CS.jaccard_count(CS2)
CS.jaccard(CS2)
CS2.jaccard(CS)


i = 0
for record in screed.open(file_names[0]):
    for kmer in MH.kmers(record.sequence, 10):
        if kmer == 'TGGAATTCCA':
            i += 1

print(i)

CE = MH.CountEstimator(n=500, ksize=20, input_file_name=file_names[0], save_kmers='y')