def make_minhash(genome, max_h, prime, ksize, out_dir='../data/Genomes/'):
    """Build a MinHash sketch of a genome from its canonical k-mers.

    Streams every record of *genome*, feeds each canonical k-mer (the
    lexicographic min of a k-mer and its reverse complement, so both strands
    hash identically) into a fresh ``MH.CountEstimator``, records the exact
    number of distinct canonical k-mers on the sketch, and exports the full
    k-mer set as a bzip2-compressed side-car file for later ground-truth use.

    :param genome: path to a FASTA/FASTQ file readable by ``screed``
    :param max_h: sketch size (``n`` of ``CountEstimator``)
    :param prime: ``max_prime`` passed to the hash function
    :param ksize: k-mer length
    :param out_dir: directory for the exported ``<name>.kmers.bz2`` file
        (default keeps the original hard-coded '../data/Genomes/')
    :return: the populated ``MH.CountEstimator``
    """
    kmers = set()
    name = os.path.basename(genome)
    MHS = MH.CountEstimator(n=max_h, max_prime=prime, ksize=ksize, save_kmers='y')
    for record in screed.open(genome):
        seq = record.sequence
        for i in range(len(seq) - ksize + 1):
            kmer = seq[i:i + ksize]
            # Canonical form: min() collapses the original if/else branches.
            canonical = min(kmer, khmer.reverse_complement(kmer))
            kmers.add(canonical)
            MHS.add(canonical)
    MHS._true_num_kmers = len(kmers)  # exact distinct count, consumed downstream
    MHS.input_file_name = name  # reuse the basename computed above
    # Export all distinct canonical k-mers; try/finally so the handle is
    # closed even if a write fails (original leaked it on error).
    fid = bz2.BZ2File(os.path.abspath(os.path.join(out_dir, name + ".kmers.bz2")), 'w')
    try:
        for kmer in kmers:
            fid.write("%s\n" % kmer)
    finally:
        fid.close()
    return MHS
def make_minhash(genome, max_h, prime, ksize, out_dir='../data/Viruses/'):
    """Build a MinHash sketch of a genome and export the sketch's k-mers.

    Same sketching procedure as the genome variant (canonical k-mers via the
    lexicographic min of a k-mer and its reverse complement), but exports only
    the k-mers retained in the sketch (``MHS._kmers``) as header-less FASTA
    records named ``<name>.Hash21mers.fa``.

    :param genome: path to a FASTA/FASTQ file readable by ``screed``
    :param max_h: sketch size (``n`` of ``CountEstimator``)
    :param prime: ``max_prime`` passed to the hash function
    :param ksize: k-mer length
    :param out_dir: directory for the exported FASTA file (default keeps the
        original hard-coded '../data/Viruses/')
    :return: the populated ``MH.CountEstimator``
    """
    kmers = set()
    name = os.path.basename(genome)
    MHS = MH.CountEstimator(n=max_h, max_prime=prime, ksize=ksize, save_kmers='y')
    for record in screed.open(genome):
        seq = record.sequence
        for i in range(len(seq) - ksize + 1):
            kmer = seq[i:i + ksize]
            # Canonical form: min() collapses the original if/else branches.
            canonical = min(kmer, khmer.reverse_complement(kmer))
            kmers.add(canonical)
            MHS.add(canonical)
    MHS._true_num_kmers = len(kmers)  # exact distinct count, consumed downstream
    MHS.input_file_name = name  # reuse the basename computed above
    # Export the sketch k-mers as anonymous FASTA records; try/finally so the
    # handle is closed even if a write fails (original leaked it on error).
    fid = open(os.path.abspath(os.path.join(out_dir, name + ".Hash21mers.fa")), 'w')
    try:
        for kmer in MHS._kmers:
            fid.write(">\n%s\n" % kmer)
    finally:
        fid.close()
    return MHS
# --- Script fragment (continuation): persist a hash list to HDF5, then build
# --- two CountEstimator sketches of a metagenome — one unrestricted, one
# --- restricted to the stored hash_list. NOTE(review): this fragment starts
# --- after an h5py.File open not visible here, and is cut off mid-statement
# --- at the end.
fid.create_dataset("hash_list", data=list(hash_list))  # store the min-hash values
fid.close()
# If I need to read it back in
#fid = h5py.File('/nfs1/Koslicki_Lab/koslickd/MinHash/Out/N500k31_mins.h5','r')
#hash_list = set(fid["hash_list"][:])
####################################
# Form a CE for a metagenome
n = 500  # sketch size (number of hashes)
# Re-load the previously exported hash list for this n.
fid = h5py.File('/nfs1/Koslicki_Lab/koslickd/MinHash/Out/N' + str(n) + 'k31_mins.h5', 'r')
hash_list = set(fid["hash_list"][...])  # [...] reads the whole dataset into memory
fid.close()
# Unrestricted sketch of the metagenome reads.
CE = MH.CountEstimator(n=n, max_prime=9999999999971., ksize=31, input_file_name='/nfs1/Koslicki_Lab/koslickd/MinHash/Data/SRR172902.fastq', save_kmers='y')
CE.export('/nfs1/Koslicki_Lab/koslickd/MinHash/Out/SRR172902.fastq.CE_N' + str(n) + '_k31_all.h5')
# Same input, but restricted to the hashes in hash_list (for comparison).
CE2 = MH.CountEstimator(n=n, max_prime=9999999999971., ksize=31, input_file_name='/nfs1/Koslicki_Lab/koslickd/MinHash/Data/SRR172902.fastq', save_kmers='y', hash_list=hash_list)
CE2.export('/nfs1/Koslicki_Lab/koslickd/MinHash/Out/SRR172902.fastq.CE_N' + str(n) + '_k31_inComparison.h5')
# Repeat for a larger sketch size — statement truncated in this chunk.
n = 5000
fid = h5py.File(
def create_relative_errors(num_genomes, num_reads, python_loc, gen_sim_loc, prime, p, ksize, hash_range):
    """Run one simulation and measure Jaccard-estimation error vs. sketch size.

    Generates a simulated metagenome, computes the true Jaccard index between
    each selected genome's canonical k-mer set and the simulation's, then for
    each sketch size h in *hash_range* computes (a) the classic MinHash
    estimate and (b) the containment-MinHash estimate (via a Bloom filter of
    the simulation k-mers), returning the mean relative error of each.

    NOTE(review): ``max_h`` is not a parameter — this relies on a module-level
    global being defined elsewhere in the file; confirm before reuse.

    :param num_genomes: number of genomes to include in the simulation
    :param num_reads: number of simulated reads
    :param python_loc: path to the python interpreter for the simulator
    :param gen_sim_loc: path to the read simulator
    :param prime: max_prime for the hash function
    :param p: Bloom filter false-positive rate
    :param ksize: k-mer length
    :param hash_range: iterable of sketch sizes h to evaluate
    :return: (MH_relative_errors, CMH_relative_errors,
              simulation_kmers_length, mean genome k-mer count)
    """
    # Make a simulation
    simulation_file, abundances_file, selected_genomes = make_simulation(num_genomes, num_reads, python_loc, gen_sim_loc)
    # Get simulation k-mers, use canonical k-mers.
    # Simultaneously, make the min hash sketch of the simulation.
    simulation_kmers = set()
    simulation_MHS = MH.CountEstimator(n=max_h, max_prime=prime, ksize=ksize, save_kmers='y')
    for record in screed.open(simulation_file):
        seq = record.sequence
        for i in range(len(seq) - ksize + 1):
            kmer = seq[i:i + ksize]
            kmer_rev = khmer.reverse_complement(kmer)
            # Canonical k-mer: keep the lexicographically smaller strand.
            if kmer < kmer_rev:
                simulation_kmers.add(kmer)
                simulation_MHS.add(kmer)
            else:
                simulation_kmers.add(kmer_rev)
                simulation_MHS.add(kmer_rev)
    # Use them to populate a bloom filter (1.1x capacity headroom).
    simulation_bloom = BloomFilter(capacity=1.1 * len(simulation_kmers), error_rate=p)
    # In practice, this would be computed when the bloom filter is created,
    # or estimated from the bloom filter entries.
    simulation_kmers_length = len(simulation_kmers)
    for kmer in simulation_kmers:
        simulation_bloom.add(kmer)
    # Use pre-computed data to load the kmers and the sketches.
    base_names = [os.path.basename(item) for item in selected_genomes]
    # Load the sketches
    genome_sketches = MH.import_multiple_from_single_hdf5(os.path.abspath('../data/Genomes/AllSketches.h5'), base_names)
    # Get the true number of kmers (stored on each sketch at training time).
    genome_lengths = list()
    for i in range(len(genome_sketches)):
        genome_lengths.append(genome_sketches[i]._true_num_kmers)
    # Get *all* the kmers for computation of ground truth.
    genome_kmers = list()
    for i in range(len(base_names)):
        name = base_names[i]
        kmers = set()
        fid = bz2.BZ2File(os.path.abspath(os.path.join('../data/Genomes/', name + ".kmers.bz2")), 'r')
        for line in fid.readlines():
            kmers.add(line.strip())
        fid.close()
        genome_kmers.append(kmers)
    # Calculate the true Jaccard index for each genome vs. the simulation.
    true_jaccards = list()
    for kmers in genome_kmers:
        true_jaccard = len(kmers.intersection(simulation_kmers)) / float(len(kmers.union(simulation_kmers)))
        true_jaccards.append(true_jaccard)
    # Calculate the min hash estimate of jaccard index at each sketch size h.
    MH_relative_errors = list()
    CMH_relative_errors = list()
    for h in hash_range:
        MH_jaccards = list()
        for MHS in genome_sketches:
            # Down sample each sketch to h hashes before comparing.
            MHS.down_sample(h)
            simulation_MHS.down_sample(h)
            MH_jaccard = MHS.jaccard(simulation_MHS)
            MH_jaccards.append(MH_jaccard)
        MH_jaccards_corrected = list()
        for MHS in genome_sketches:
            MHS_set = set(MHS._mins)
            sample_set = set(simulation_MHS._mins)
            # NOTE(review): slicing list(set.union(...))[0:h] depends on set
            # iteration order, which for ints tracks value order in CPython
            # but is not a guaranteed "h smallest" — confirm intent.
            MH_jaccard = len(set(list(MHS_set.union(sample_set))[0:h]).intersection(MHS_set.intersection(sample_set))) / float(h)
            MH_jaccards_corrected.append(MH_jaccard)
        # Calculate the containment min hash estimate of the jaccard index.
        CMH_jaccards = list()
        for i in range(len(genome_sketches)):
            genome_kmers_len = genome_lengths[i]  # pre-computed when creating the "training" data
            MHS = genome_sketches[i]
            # down sample each sketch to h
            MHS.down_sample(h)
            kmers = MHS._kmers  # use only the k-mers in the min hash sketch
            int_est = 0
            for kmer in kmers:
                if kmer in simulation_bloom:  # test if the k-mers are in the simulation bloom filter
                    int_est += 1
            int_est -= p * h  # adjust for false positive rate
            containment_est = int_est / float(h)
            # Convert containment estimate to a Jaccard estimate via
            # |A∩B| / |A∪B| = |A|*C / (|A| + |B| - |A|*C).
            containment_est_jaccard = genome_kmers_len * containment_est / \
                (genome_kmers_len + simulation_kmers_length - genome_kmers_len * containment_est)
            CMH_jaccards.append(containment_est_jaccard)
        # compute the average deviation from the truth (relative error)
        true_jaccards = np.array(true_jaccards)
        MH_jaccards = np.array(MH_jaccards)
        CMH_jaccards = np.array(CMH_jaccards)
        MH_mean = np.mean(np.abs(true_jaccards - MH_jaccards) / true_jaccards)
        CMH_mean = np.mean(np.abs(true_jaccards - CMH_jaccards) / true_jaccards)
        #print("Classic min hash mean relative error: %f" % MH_mean)
        #print("Containment min hash mean relative error: %f" % CMH_mean)
        MH_relative_errors.append(MH_mean)
        CMH_relative_errors.append(CMH_mean)
    # remove temp files
    os.remove(simulation_file)
    os.remove(abundances_file)
    # return the relative errors
    return MH_relative_errors, CMH_relative_errors, simulation_kmers_length, np.mean(genome_lengths)
# --- Script fragment (Python 2 syntax): toy containment-MinHash experiment.
# --- A small string A is appended to a fixed large string B (so A is a
# --- substring of B); a MinHash sketch of A and a Bloom filter of B's k-mers
# --- are built for the containment estimate.
# --- NOTE(review): small_string, ksize, h, prime, p and len_large_string are
# --- defined before this fragment — not visible in this chunk.
print small_string
print "len(small_string)", len(small_string)
# Size of the smaller k-mer set A; used later to convert the containment
# index into a Jaccard index.
size_A = len(set([small_string[i:i + ksize] for i in range(len(small_string) - ksize + 1)]))
print "size_A", size_A
# Alternative: random large string to form the larger set B.
#large_string = ''.join(np.random.choice(['A', 'C', 'T', 'G'], len_large_string)) + small_string
large_string = "CCGCATCGACAAGCAGGATCTGGATCTATTTCTCTCTTAAATCCATGTAAGGGACGGCAGAAACCTGCTCCTTCTACTTGCTACATCTTCTAGGGTAGAACGAGACCAGAGCCGTTACTGCGATATGAAATCAGTACCGAACGTTGGAACTTATTCAGTTTTAACCCGGTCCCCGTCGCCCAAATCGGGCTATATCATACCCCCGGGCCAAGTGTACAAGTGCATCGATTAAATGCACTAACGGCGAAAGTAAATGATGGACTTTCCAAGCCTGAGGTGGTAAACGCACTTGAATAGAGTCGACAAATTATCGGCTGACGATGCCTTGTAGACCAGCTTTAACACATGACCAGTATAGACGAGGCGGAACTAAGCAATCCCAAGTTTTCGTGCGAGCTGAAGGACCCGGCTCCACGAGATAGAGCTTGTGTTAACAAGAGGCCTCCGGCTGGAAAGATTGGTGGAAACGGCTGCTGTCACGTTTGCATCTTACCGGATGTGCCCCAATGAGGAGTTGATGAACTGGCTGTGACGCAATGGCGAAGAGGAAACGTCTGTATGGCGGATGTAACGTTTTTGCAACACTCCTCCACAACTGCTCCTTTAAGATGACCATCACGAAAATGAAGCTCGTTCGAAATCTTCAAAGATCCGGGGTATAATTGCGCTTCCGGGAGAAGGCCATATGCGATAGCGGTAAGTTTCCACAGCGTATCCAAAAGCGGAGCTTTACGATCTCCCCAGTAAACTGGCTTGTGTCAAGCGGCGAACCCGAATTTCGACGAACCTAGATATTCTCTGGCGACTAACTACTATGCGGATGGGCCTATTCGGGGGATTCAGCCCGCGATACTAGAGCGTAATTAGCCTCGCAAGAATCTAGGTAGCCCCAAAATAGCTTGCTAAAGCGCTAGGGTGCACTGCAGGCAAAATCGAGGTGACTGTACCCCGAGCCATGCATATAACTGGGGGGTACCCTTCCAATAATTGTTATCATACCATCTGCATAGACATATTTAACGGCTCAGTAAATTCGTCGCCATGCGACCTCCAGCATGATCGGTGGCACTCCGTTGTGCGCGGACTGTGTAAACCGCACG" + small_string
#print large_string
print "len(large_string):", len(large_string)
# Populate min hash sketch with the smaller set A.
A_MH = MH.CountEstimator(n=h, max_prime=prime, ksize=ksize, save_kmers='y')
A_MH.add_sequence(small_string)  # create the min hash of the small string
# Create the bloom filter and populate it with the larger set B.
B_filt = BloomFilter(capacity=1.15 * len_large_string, error_rate=p)  # Initialize the bloom filter
# Rough count of distinct k-mers in B; could be done much more intelligently
# (e.g. with HyperLogLog).
size_B_est = 0
for i in range(len(large_string) - ksize + 1):
    kmer = large_string[i:i + ksize]
    if kmer not in B_filt:
        size_B_est += 1
        B_filt.add(kmer)
print "size_B_est :", size_B_est
# Next step (beyond this chunk): use the k-mers in the sketch of A and test
# if they are in the bloom filter of B.
# --- Interactive exercise script for the MinHash module (Python 2: bare
# --- ``reload``). Builds CountEstimator and CompositionSketch objects from
# --- two test files and compares them; return values are discarded, so this
# --- was presumably run in an interactive session where results are echoed.
import MinHash as MH
import screed
reload(MH)  # pick up edits to the module during interactive development
# Read the list of test-file paths, one per line.
fid = open('/home/dkoslicki/Dropbox/Repositories/MinHash/data/test_files.txt', 'r')
file_names = fid.readlines()
fid.close()
file_names = [name.strip() for name in file_names]
# Pairwise Jaccard comparisons between two CountEstimator sketches.
CE = MH.CountEstimator(n=500, ksize=11, input_file_name=file_names[0], save_kmers='y')
CE2 = MH.CountEstimator(n=500, ksize=11, input_file_name=file_names[1], save_kmers='y')
CE2.jaccard_count(CE)
CE2.jaccard(CE)
CE.jaccard(CE2)
# Same comparisons with CompositionSketch objects.
CS = MH.CompositionSketch(n=5000, ksize=11, prefixsize=1, input_file_name=file_names[0])
CS2 = MH.CompositionSketch(n=5000, ksize=11, prefixsize=1, input_file_name=file_names[1])
CS.jaccard_count(CS2)
CS.jaccard(CS2)
CS2.jaccard(CS)
# Count occurrences of one specific 10-mer in the first file by brute force.
i = 0
for record in screed.open(file_names[0]):
    for kmer in MH.kmers(record.sequence, 10):
        if kmer == 'TGGAATTCCA':
            i += 1
print(i)
# Rebuild the first sketch with a longer k-mer size.
CE = MH.CountEstimator(n=500, ksize=20, input_file_name=file_names[0], save_kmers='y')