def classify(queue, lsh, child_conn, n_sigs):
    fe = FeatureExtractor()
    count = 0
    classified_relationships = []
    print multiprocessing.current_process(), "started"
    while True:
        try:
            line = queue.get_nowait()
            count += 1
            if count % 1000 == 0:
                print multiprocessing.current_process(), count, " processed, remaining ", queue.qsize()
            relationships = fe.process_classify(line)
            for r in relationships:
                rel = r[0]
                shingles = r[1]
                # compute signatures
                sigs = MinHash.signature(shingles.getvalue().split(), n_sigs)
                # find closest neighbours
                types = lsh.classify(sigs)
                if types is not None:
                    classified_r = (rel.e1, rel.e2, rel.sentence, types.encode("utf8"))
                else:
                    classified_r = (rel.e1, rel.e2, rel.sentence, "None")
                classified_relationships.append(classified_r)
        except Queue.Empty:
            print multiprocessing.current_process(), "Queue is Empty"
            child_conn.send(classified_relationships)
            break
def load_shingles(shingles_file):
    """
    Parses already extracted shingles from a file.
    File format is:
    relationship_type \t shingle1 shingle2 shingle3 ... shingle_n
    """
    relationships = []
    rel_identifier = 0
    f_shingles = codecs.open(shingles_file, encoding='utf-8')
    for line in f_shingles:
        sys.stdout.write('.')
        rel_type, shingles_string = line.split('\t')
        shingles = shingles_string.strip().split(' ')
        # calculate min-hash sigs
        sigs = MinHash.signature(shingles, N_SIGS)
        rel = Relationship(None, None, None, None, None, None, None, None, rel_type, rel_identifier)
        rel.sigs = sigs
        rel.identifier = rel_identifier
        relationships.append(rel)
        rel_identifier += 1
    f_shingles.close()
    return relationships
def index_shingles(shingles_file, n_bands, n_sigs, knn):
    """
    Parses already extracted shingles from a file.
    File format is:
    rel_id \t relationship_type \t shingle1 shingle2 shingle3 ... shingle_n
    """
    f_shingles = codecs.open(shingles_file, encoding='utf-8')
    relationships = []
    print "Reading features file"
    for line in f_shingles:
        rel_id, rel_type, shingles = line.split('\t')
        shingles = shingles.strip().split(' ')
        relationships.append((rel_type, rel_id, shingles))
    f_shingles.close()
    print "SIGS :", n_sigs
    print "BANDS :", n_bands
    lsh = LocalitySensitiveHashing(n_bands, n_sigs, knn)
    lsh.create()
    count = 0
    elapsed_time = 0
    for r in relationships:
        start_time = time.time()
        sigs = MinHash.signature(r[2], n_sigs)
        lsh.index(r[0], r[1], sigs)
        elapsed_time += time.time() - start_time
        count += 1
        if count % 100 == 0:
            sys.stdout.write("Processed " + str(count) + " in %.2f seconds" % elapsed_time + "\n")
    sys.stdout.write("Total indexing time: %.2f seconds" % elapsed_time + "\n")
def classify2(queue, n_sigs, lsh, child_conn):
    count = 0
    classified_relationships = []
    print multiprocessing.current_process(), "started"
    while True:
        try:
            r = queue.get_nowait()
            count += 1
            if count % 1000 == 0:
                print multiprocessing.current_process(), count, " processed, remaining ", queue.qsize()
            e1 = r[0]
            e2 = r[1]
            sentence = r[2]
            shingles = r[3]
            # compute signatures
            sigs = MinHash.signature(shingles.split(), n_sigs)
            # find closest neighbours
            types = lsh.classify(sigs)
            if types is not None:
                classified_r = (e1, e2, sentence, types.encode("utf8"))
            else:
                classified_r = (e1, e2, sentence, "None")
            classified_relationships.append(classified_r)
        except Queue.Empty:
            print multiprocessing.current_process(), "Queue is Empty"
            child_conn.send(classified_relationships)
            break
def extract_features(queue, lsh, child_conn, n_sigs):
    fe = FeatureExtractor()
    relationships = []
    count = 0
    while True:
        try:
            line = queue.get_nowait()
            count += 1
            if count % 1000 == 0:
                print count, " processed, remaining ", queue.qsize()
            rel_id, rel_type, e1, e2, sentence = line.split('\t')
            rel_id = int(rel_id.split(":")[1])
            shingles = fe.process_index(sentence, e1, e2)
            try:
                shingles = shingles.getvalue().strip().split(' ')
            except AttributeError, e:
                print line
                print shingles
                sys.exit(-1)
            sigs = MinHash.signature(shingles, n_sigs)
            lsh.index(rel_type, rel_id, sigs)
            relationships.append((rel_type, rel_id, sigs, shingles))
        except Queue.Empty:
            print multiprocessing.current_process(), "Queue is Empty"
            child_conn.send(relationships)
            break
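# A minimal, hypothetical sketch of how workers such as classify()/extract_features()
# above could be driven: pre-fill a multiprocessing.Queue with input lines, start one
# Process per core, and collect each worker's result list through its own one-way Pipe.
# The helper name run_classify_workers and the arguments input_lines/num_cpus are
# illustrative assumptions, not part of the original code.
def run_classify_workers(input_lines, lsh, n_sigs, num_cpus=4):
    import multiprocessing
    queue = multiprocessing.Queue()
    for line in input_lines:
        queue.put(line)
    # Pipe(False) -> (receive-only end, send-only end); the send end goes to the worker
    pipes = [multiprocessing.Pipe(False) for _ in range(num_cpus)]
    processes = [multiprocessing.Process(target=classify, args=(queue, lsh, child, n_sigs))
                 for (_, child) in pipes]
    for proc in processes:
        proc.start()
    results = []
    for (parent, _) in pipes:
        results.extend(parent.recv())  # each worker sends its list before exiting
    for proc in processes:
        proc.join()
    return results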
def make_minhash(genome, max_h, prime, ksize):
    kmers = set()
    name = os.path.basename(genome)
    MHS = MH.CountEstimator(n=max_h, max_prime=prime, ksize=ksize, save_kmers='y')
    for record in screed.open(genome):
        seq = record.sequence
        for i in range(len(seq) - ksize + 1):
            kmer = seq[i:i + ksize]
            kmer_rev = khmer.reverse_complement(kmer)
            if kmer < kmer_rev:
                kmers.add(kmer)
                MHS.add(kmer)
            else:
                kmers.add(kmer_rev)
                MHS.add(kmer_rev)
    MHS._true_num_kmers = len(kmers)
    MHS.input_file_name = os.path.basename(genome)
    #genome_sketches.append(MHS)
    # export the kmers
    fid = bz2.BZ2File(os.path.abspath(os.path.join('../data/Genomes/', name + ".kmers.bz2")), 'w')
    #fid = bz2.open(os.path.abspath(os.path.join('../data/Genomes/', name + ".kmers.bz2")), 'wt')  # python3
    for kmer in kmers:
        fid.write("%s\n" % kmer)
    fid.close()
    return MHS
def make_minhash(genome, max_h, prime, ksize):
    kmers = set()
    name = os.path.basename(genome)
    MHS = MH.CountEstimator(n=max_h, max_prime=prime, ksize=ksize, save_kmers='y')
    for record in screed.open(genome):
        seq = record.sequence
        for i in range(len(seq) - ksize + 1):
            kmer = seq[i:i + ksize]
            kmer_rev = khmer.reverse_complement(kmer)
            if kmer < kmer_rev:
                kmers.add(kmer)
                MHS.add(kmer)
            else:
                kmers.add(kmer_rev)
                MHS.add(kmer_rev)
    MHS._true_num_kmers = len(kmers)
    MHS.input_file_name = os.path.basename(genome)
    # Export the hash k-mers
    fid = open(os.path.abspath(os.path.join('../data/Viruses/', name + ".Hash21mers.fa")), 'w')
    for kmer in MHS._kmers:
        fid.write(">\n%s\n" % kmer)
    fid.close()
    return MHS
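# Why the min(kmer, reverse_complement) choice in the two make_minhash() variants
# above: it makes the sketch strand-insensitive by always hashing the lexicographically
# smaller ("canonical") form of each k-mer. A tiny illustration, assuming khmer behaves
# as used above:
#   kmer = "ACGTT"; khmer.reverse_complement(kmer) == "AACGT"
#   -> "AACGT" < "ACGTT", so "AACGT" is the canonical form added to the sketch.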
def classify_sentences(data_file, extractor, lsh):
    """
    Receives a file with sentences where the entities are already identified and,
    for each sentence, classifies the relationships between those entities.

    :param data_file:
    :param extractor:
    :param lsh:
    :return:
    """
    f_sentences = codecs.open(data_file, encoding='utf-8')
    for line in f_sentences:
        if len(line) > 1:
            sentence = line.strip()
            sentence = Sentence(sentence)
            for rel in sentence.relationships:
                if rel.arg1type is None and rel.arg2type is None:
                    continue
                else:
                    # extract features/shingles
                    features = extractor.extract_features(rel)
                    shingles = features.getvalue().strip().split(' ')
                    # calculate min-hash sigs
                    sigs = MinHash.signature(shingles, N_SIGS)
                    rel.sigs = sigs
                    # find closest neighbours
                    types = lsh.classify(rel)
                    print rel.sentence.encode("utf8") + '\targ1:' + rel.ent1.encode("utf8") + \
                        '\targ2:' + rel.ent2.encode("utf8") + '\t' + types.encode("utf8")
    f_sentences.close()
def load_training_relationships(data_file, extractor):
    """
    :param data_file:
    :param extractor:
    :return:
    """
    relationships = []
    sentence = None
    rel_id = 0
    f_sentences = codecs.open(data_file, encoding='utf-8')
    f_features = open('features.txt', 'w')
    for line in f_sentences:
        #sys.stdout.write('.')
        if not re.match('^relation', line):
            sentence = line.strip()
        else:
            rel_type = line.strip().split(':')[1]
            rel = Relationship(sentence, None, None, None, None, None, None, None, rel_type, rel_id)
            # extract features/shingles
            features = extractor.extract_features(rel)
            shingles = features.getvalue().strip().split(' ')
            # write the shingles to the features file
            f_features.write(rel_type + '\t')
            for shingle in shingles:
                f_features.write(shingle.encode("utf8") + ' ')
            f_features.write('\n')
            # calculate min-hash sigs
            sigs = MinHash.signature(shingles, N_SIGS)
            rel.sigs = sigs
            rel.identifier = rel_id
            rel_id += 1
            relationships.append(rel)
    f_sentences.close()
    f_features.close()
    return relationships
def main():
    import timeit
    import MinHash as MH
    small_database_file = "/home/dkoslicki/Desktop/CMash/tests/TempData/cmash_db_n5000_k60_1000.h5"
    TST_export_file_new = "/home/dkoslicki/Desktop/CMash/tests/TempData/cmash_db_n5000_k60_new.tst"
    TST_export_file_old = "/home/dkoslicki/Desktop/CMash/tests/TempData/cmash_db_n5000_k60_old.tst"
    # new way
    t0 = timeit.default_timer()
    M = MakeTSTNew(small_database_file, TST_export_file_new)
    M.make_TST()
    t1 = timeit.default_timer()
    print(f"New timing: {t1 - t0}")
    # old way
    t0 = timeit.default_timer()
    CEs = MH.import_multiple_from_single_hdf5(small_database_file)
    M = MakeTSTOld(CEs, TST_export_file_old)
    M.make_TST()
    t1 = timeit.default_timer()
    print(f"Old timing: {t1 - t0}")
def create_relative_errors(num_genomes, num_reads, python_loc, gen_sim_loc, prime, p, ksize, hash_range):
    # Make a simulation
    simulation_file, abundances_file, selected_genomes = make_simulation(num_genomes, num_reads, python_loc, gen_sim_loc)

    # Get simulation k-mers, use canonical k-mers
    # Simultaneously, make the min hash sketch of the simulation
    simulation_kmers = set()
    simulation_MHS = MH.CountEstimator(n=max_h, max_prime=prime, ksize=ksize, save_kmers='y')
    for record in screed.open(simulation_file):
        seq = record.sequence
        for i in range(len(seq) - ksize + 1):
            kmer = seq[i:i + ksize]
            kmer_rev = khmer.reverse_complement(kmer)
            if kmer < kmer_rev:
                simulation_kmers.add(kmer)
                simulation_MHS.add(kmer)
            else:
                simulation_kmers.add(kmer_rev)
                simulation_MHS.add(kmer_rev)

    # Use them to populate a bloom filter
    simulation_bloom = BloomFilter(capacity=1.1 * len(simulation_kmers), error_rate=p)
    simulation_kmers_length = len(simulation_kmers)  # in practice, this would be computed when the bloom filter is created
    # or can use an estimate based on the bloom filter entries
    for kmer in simulation_kmers:
        simulation_bloom.add(kmer)

    # Use pre-computed data to load the kmers and the sketches
    base_names = [os.path.basename(item) for item in selected_genomes]

    # Load the sketches
    genome_sketches = MH.import_multiple_from_single_hdf5(os.path.abspath('../data/Genomes/AllSketches.h5'), base_names)

    # Get the true number of kmers
    genome_lengths = list()
    for i in range(len(genome_sketches)):
        genome_lengths.append(genome_sketches[i]._true_num_kmers)

    # Get *all* the kmers for computation of ground truth
    genome_kmers = list()
    for i in range(len(base_names)):
        name = base_names[i]
        kmers = set()
        fid = bz2.BZ2File(os.path.abspath(os.path.join('../data/Genomes/', name + ".kmers.bz2")), 'r')
        for line in fid.readlines():
            kmers.add(line.strip())
        fid.close()
        genome_kmers.append(kmers)

    # Calculate the true Jaccard index
    true_jaccards = list()
    for kmers in genome_kmers:
        true_jaccard = len(kmers.intersection(simulation_kmers)) / float(len(kmers.union(simulation_kmers)))
        true_jaccards.append(true_jaccard)

    # Calculate the min hash estimate of jaccard index
    MH_relative_errors = list()
    CMH_relative_errors = list()
    for h in hash_range:
        MH_jaccards = list()
        for MHS in genome_sketches:
            # Down sample each sketch to h
            MHS.down_sample(h)
            simulation_MHS.down_sample(h)
            MH_jaccard = MHS.jaccard(simulation_MHS)
            MH_jaccards.append(MH_jaccard)

        MH_jaccards_corrected = list()
        for MHS in genome_sketches:
            MHS_set = set(MHS._mins)
            sample_set = set(simulation_MHS._mins)
            MH_jaccard = len(set(list(MHS_set.union(sample_set))[0:h]).intersection(MHS_set.intersection(sample_set))) / float(h)
            MH_jaccards_corrected.append(MH_jaccard)

        # Calculate the containment min hash estimate of the jaccard index
        CMH_jaccards = list()
        for i in range(len(genome_sketches)):
            genome_kmers_len = genome_lengths[i]  # pre-computed when creating the "training" data
            MHS = genome_sketches[i]
            # down sample each sketch to h
            MHS.down_sample(h)
            kmers = MHS._kmers  # use only the k-mers in the min hash sketch
            int_est = 0
            for kmer in kmers:
                if kmer in simulation_bloom:  # test if the k-mers are in the simulation bloom filter
                    int_est += 1
            int_est -= p * h  # adjust for false positive rate
            containment_est = int_est / float(h)
            containment_est_jaccard = genome_kmers_len * containment_est / \
                (genome_kmers_len + simulation_kmers_length - genome_kmers_len * containment_est)
            CMH_jaccards.append(containment_est_jaccard)

        # compute the average deviation from the truth (relative error)
        true_jaccards = np.array(true_jaccards)
        MH_jaccards = np.array(MH_jaccards)
        CMH_jaccards = np.array(CMH_jaccards)
        MH_mean = np.mean(np.abs(true_jaccards - MH_jaccards) / true_jaccards)
        CMH_mean = np.mean(np.abs(true_jaccards - CMH_jaccards) / true_jaccards)
        #print("Classic min hash mean relative error: %f" % MH_mean)
        #print("Containment min hash mean relative error: %f" % CMH_mean)
        MH_relative_errors.append(MH_mean)
        CMH_relative_errors.append(CMH_mean)

    # remove temp files
    os.remove(simulation_file)
    os.remove(abundances_file)

    # return the relative errors
    return MH_relative_errors, CMH_relative_errors, simulation_kmers_length, np.mean(genome_lengths)
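# The containment-to-Jaccard conversion used above, isolated as a small helper purely
# for clarity (a sketch, not part of the original code): with C an estimate of
# |A intersect B| / |A|, the Jaccard index is |A intersect B| / |A union B|
# = |A|*C / (|A| + |B| - |A|*C).
def containment_to_jaccard(size_a, size_b, containment_est):
    return size_a * containment_est / float(size_a + size_b - size_a * containment_est)

# e.g. with |A| = 1000 genome k-mers, |B| = 100000 sample k-mers and C = 0.9,
# the Jaccard estimate is 1000*0.9 / (1000 + 100000 - 900) ~= 0.009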
import sys
sys.path.append('/nfs1/Koslicki_Lab/koslickd/Repositories/MinHashMetagenomics/src/')
import os, timeit, h5py
import MinHash as MH
import numpy as np

fid = open('/nfs1/Koslicki_Lab/koslickd/MinHash/Data/FileNames.txt', 'r')
file_names = fid.readlines()
fid.close()
file_names = [name.strip() for name in file_names]

###############################
# Compute the hashes for all the training genomes
n = 500
CEs = MH.compute_multiple(n=n, max_prime=9999999999971., ksize=31, input_files_list=file_names, save_kmers='y', num_threads=48)
# Export
MH.export_multiple_hdf5(CEs, '/nfs1/Koslicki_Lab/koslickd/MinHash/Out/N' + str(n) + 'k31/')
# Save the hashes in the training genomes
hash_list = set()
for CE in CEs:
    hash_list.update(CE._mins)
fid = h5py.File('/nfs1/Koslicki_Lab/koslickd/MinHash/Out/N' + str(n) + 'k31_mins.h5', 'w')
fid.create_dataset("hash_list", data=list(hash_list))
fid.close()
# If I need to read it back in
#fid = h5py.File('/nfs1/Koslicki_Lab/koslickd/MinHash/Out/N500k31_mins.h5', 'r')
#hash_list = set(fid["hash_list"][:])

n = 5000
CEs = MH.compute_multiple(n=n, max_prime=9999999999971., ksize=31, input_files_list=file_names, save_kmers='y', num_threads=48)
            if kmer < kmer_rev:
                kmers.add(kmer)
                MHS.add(kmer)
            else:
                kmers.add(kmer_rev)
                MHS.add(kmer_rev)
    MHS._true_num_kmers = len(kmers)
    MHS.input_file_name = os.path.basename(genome)
    #genome_sketches.append(MHS)
    # export the kmers
    fid = bz2.BZ2File(os.path.abspath(os.path.join('../data/Genomes/', name + ".kmers.bz2")), 'w')
    #fid = bz2.open(os.path.abspath(os.path.join('../data/Genomes/', name + ".kmers.bz2")), 'wt')  # python3
    for kmer in kmers:
        fid.write("%s\n" % kmer)
    fid.close()
    return MHS


def make_minhash_star(arg):
    return make_minhash(*arg)


pool = Pool(processes=num_threads)
genome_sketches = pool.map(make_minhash_star, zip(file_names, repeat(max_h), repeat(prime), repeat(ksize)))
pool.close()
pool.join()
dummy = [len(item._kmers) for item in genome_sketches]  # to get it to actually do the work
# Export all the sketches
base_names = [os.path.basename(item) for item in file_names]
MH.export_multiple_to_single_hdf5(genome_sketches, os.path.abspath('../data/Genomes/AllSketches.h5'))
import MinHash as MH
import screed
reload(MH)

fid = open('/home/dkoslicki/Dropbox/Repositories/MinHash/data/test_files.txt', 'r')
file_names = fid.readlines()
fid.close()
file_names = [name.strip() for name in file_names]

CE = MH.CountEstimator(n=500, ksize=11, input_file_name=file_names[0], save_kmers='y')
CE2 = MH.CountEstimator(n=500, ksize=11, input_file_name=file_names[1], save_kmers='y')
CE2.jaccard_count(CE)
CE2.jaccard(CE)
CE.jaccard(CE2)

CS = MH.CompositionSketch(n=5000, ksize=11, prefixsize=1, input_file_name=file_names[0])
CS2 = MH.CompositionSketch(n=5000, ksize=11, prefixsize=1, input_file_name=file_names[1])
CS.jaccard_count(CS2)
CS.jaccard(CS2)
CS2.jaccard(CS)

i = 0
for record in screed.open(file_names[0]):
    for kmer in MH.kmers(record.sequence, 10):
        if kmer == 'TGGAATTCCA':
            i += 1
print(i)

CE = MH.CountEstimator(n=500, ksize=20, input_file_name=file_names[0], save_kmers='y')
#metagenome_kmers_total = 916485607  # via jellyfish stats on a count bloom filter (need to streamline this)
# Note that this number changed since I am restricting myself to the paired reads (no orphaned guys)
fid = open(os.path.abspath("../Paper/Data/MetagenomeTotalKmers.txt"), 'r')
metagenome_kmers_total = int(fid.readlines()[0].strip())
fid.close()

# Get virus names
file_names = list()
fid = open(os.path.abspath('../data/Viruses/FileNames.txt'), 'r')
for line in fid.readlines():
    file_names.append(os.path.abspath(os.path.join('../data/Viruses/', os.path.basename(line.strip()))))
fid.close()

# Get all the hashes so we know the size
base_names = [os.path.basename(item) for item in file_names]
genome_sketches = MH.import_multiple_from_single_hdf5(os.path.abspath('../data/Viruses/AllSketches.h5'), base_names)


# query the bloom filter
def count_jaccard(genome_sketch, file_name, ksize, p, max_h, metagenome_kmers_total):
    name = os.path.basename(file_name)
    CMH = genome_sketch
    genome_kmers_len = CMH._true_num_kmers
    cmd = query_per_sequence_loc + " " + metagenome_bloom_filter + " " + \
        os.path.abspath(os.path.join('../data/Viruses/', name + ".Hash" + str(ksize) + "mers.fa"))
    int_est = int(subprocess.check_output(cmd, shell=True))
    int_est -= p * int_est
    containment_est = int_est / float(max_h)
    containment_est_jaccard = genome_kmers_len * containment_est / \
        (genome_kmers_len + metagenome_kmers_total - genome_kmers_len * containment_est)
    return containment_est_jaccard
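# Hypothetical driver for count_jaccard() above; ksize, p, max_h, the bloom filter path
# and the query binary are assumed to be defined earlier in the full script:
#
#   jaccards = [count_jaccard(genome_sketches[i], file_names[i], ksize, p, max_h, metagenome_kmers_total)
#               for i in range(len(file_names))]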
print small_string
print "len(small_string)", len(small_string)
# size of smaller set, used to convert containment index to Jaccard index
size_A = len(set([small_string[i:i + ksize] for i in range(len(small_string) - ksize + 1)]))
print "size_A", size_A

#large_string = ''.join(np.random.choice(['A', 'C', 'T', 'G'], len_large_string)) + small_string  # large string to form the larger set B
large_string = "CCGCATCGACAAGCAGGATCTGGATCTATTTCTCTCTTAAATCCATGTAAGGGACGGCAGAAACCTGCTCCTTCTACTTGCTACATCTTCTAGGGTAGAACGAGACCAGAGCCGTTACTGCGATATGAAATCAGTACCGAACGTTGGAACTTATTCAGTTTTAACCCGGTCCCCGTCGCCCAAATCGGGCTATATCATACCCCCGGGCCAAGTGTACAAGTGCATCGATTAAATGCACTAACGGCGAAAGTAAATGATGGACTTTCCAAGCCTGAGGTGGTAAACGCACTTGAATAGAGTCGACAAATTATCGGCTGACGATGCCTTGTAGACCAGCTTTAACACATGACCAGTATAGACGAGGCGGAACTAAGCAATCCCAAGTTTTCGTGCGAGCTGAAGGACCCGGCTCCACGAGATAGAGCTTGTGTTAACAAGAGGCCTCCGGCTGGAAAGATTGGTGGAAACGGCTGCTGTCACGTTTGCATCTTACCGGATGTGCCCCAATGAGGAGTTGATGAACTGGCTGTGACGCAATGGCGAAGAGGAAACGTCTGTATGGCGGATGTAACGTTTTTGCAACACTCCTCCACAACTGCTCCTTTAAGATGACCATCACGAAAATGAAGCTCGTTCGAAATCTTCAAAGATCCGGGGTATAATTGCGCTTCCGGGAGAAGGCCATATGCGATAGCGGTAAGTTTCCACAGCGTATCCAAAAGCGGAGCTTTACGATCTCCCCAGTAAACTGGCTTGTGTCAAGCGGCGAACCCGAATTTCGACGAACCTAGATATTCTCTGGCGACTAACTACTATGCGGATGGGCCTATTCGGGGGATTCAGCCCGCGATACTAGAGCGTAATTAGCCTCGCAAGAATCTAGGTAGCCCCAAAATAGCTTGCTAAAGCGCTAGGGTGCACTGCAGGCAAAATCGAGGTGACTGTACCCCGAGCCATGCATATAACTGGGGGGTACCCTTCCAATAATTGTTATCATACCATCTGCATAGACATATTTAACGGCTCAGTAAATTCGTCGCCATGCGACCTCCAGCATGATCGGTGGCACTCCGTTGTGCGCGGACTGTGTAAACCGCACG" + small_string
#print large_string
print "len(large_string):", len(large_string)

# Populate min hash sketch with smaller set
A_MH = MH.CountEstimator(n=h, max_prime=prime, ksize=ksize, save_kmers='y')
A_MH.add_sequence(small_string)  # create the min hash of the small string

# Create the bloom filter and populate with the larger set
B_filt = BloomFilter(capacity=1.15 * len_large_string, error_rate=p)  # Initialize the bloom filter
size_B_est = 0  # used to count the number of k-mers in B, could do much more intelligently (like with HyperLogLog)
for i in range(len(large_string) - ksize + 1):
    kmer = large_string[i:i + ksize]
    if kmer not in B_filt:
        size_B_est += 1
        B_filt.add(kmer)
print "size_B_est :", size_B_est

# Use the k-mers in the sketch of A and test if they are in the bloom filter of B
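# The snippet above ends just before the containment test. A hedged sketch of the
# remaining step, mirroring create_relative_errors() and count_jaccard() earlier in
# this file (not the original continuation): count how many of A's sketch k-mers hit
# the bloom filter of B, correct for the expected false positives, then convert the
# containment estimate into a Jaccard estimate.
int_est = 0
for kmer in A_MH._kmers:   # k-mers retained in the sketch of the small set A
    if kmer in B_filt:     # membership test against the bloom filter of B
        int_est += 1
int_est -= p * h           # adjust for the bloom filter false positive rate
containment_est = int_est / float(h)
jaccard_est = size_A * containment_est / float(size_A + size_B_est - size_A * containment_est)
print "containment estimate:", containment_est
print "Jaccard estimate    :", jaccard_est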
import sys, os
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))) + "/CMash/CMash")
import MinHash as MH
import itertools

training_database = sys.argv[1]  # first input is the training file name
dump_file = sys.argv[2]  # second input is the desired output dump file

CEs = MH.import_multiple_from_single_hdf5(training_database)
fid = open(dump_file, 'w')
i = 0
for CE in CEs:
    for kmer in CE._kmers:
        fid.write('>seq%d\n' % i)
        fid.write('%s\n' % kmer)
        i += 1
fid.close()
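# The dump file written above is a simple FASTA listing of every sketch k-mer, e.g.:
#   >seq0
#   ACGT...   (one k-mer per record)
#   >seq1
#   ...
# Hypothetical invocation (script name is illustrative):
#   python dump_training_kmers.py cmash_db_n5000_k60.h5 training_kmers.fa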