import sys

sys.path.append('/nfs1/Koslicki_Lab/koslickd/Repositories/MinHashMetagenomics/src/')

import os
import timeit

import h5py
import numpy as np

import MinHash as MH

# Read the list of training genome file paths (one per line).
# Context manager guarantees the handle is closed even if reading fails.
with open('/nfs1/Koslicki_Lab/koslickd/MinHash/Data/FileNames.txt', 'r') as fid:
    file_names = [name.strip() for name in fid.readlines()]

###############################
# Compute the hashes for all the training genomes
n = 500
CEs = MH.compute_multiple(n=n, max_prime=9999999999971., ksize=31,
                          input_files_list=file_names, save_kmers='y',
                          num_threads=48)
# Export one HDF5 sketch file per genome into the N<n>k31/ directory.
MH.export_multiple_hdf5(CEs, '/nfs1/Koslicki_Lab/koslickd/MinHash/Out/N' + str(n) + 'k31/')

# Save the union of all min-hashes seen across the training genomes.
# NOTE(review): reaches into the private _mins attribute of each sketch.
hash_list = set()
for CE in CEs:
    hash_list.update(CE._mins)
with h5py.File('/nfs1/Koslicki_Lab/koslickd/MinHash/Out/N' + str(n) + 'k31_mins.h5', 'w') as fid:
    fid.create_dataset("hash_list", data=list(hash_list))

# If I need to read it back in
#fid = h5py.File('/nfs1/Koslicki_Lab/koslickd/MinHash/Out/N500k31_mins.h5','r')
#hash_list = set(fid["hash_list"][:])

# Repeat the sketch computation at a larger sketch size.
n = 5000
CEs = MH.compute_multiple(n=n, max_prime=9999999999971., ksize=31,
                          input_files_list=file_names, save_kmers='y',
                          num_threads=48)
# NOTE(review): this section originally began mid-statement with a dangling
# path-string argument; reconstructed as the sys.path.append(...) call that
# mirrors the other copy of this script in the file — confirm against history.
import sys

sys.path.append('/nfs1/Koslicki_Lab/koslickd/Repositories/MinHashMetagenomics/src/')

import os
import timeit

import h5py
import numpy as np

import MinHash as MH

# Read the training genome file paths (one per line); the context manager
# closes the handle even on error.
with open('/nfs1/Koslicki_Lab/koslickd/MinHash/Data/FileNames.txt', 'r') as fid:
    file_names = [name.strip() for name in fid.readlines()]

###############################
# Compute the hashes for all the training genomes
n = 500
CEs = MH.compute_multiple(n=n, max_prime=9999999999971., ksize=31,
                          input_files_list=file_names, save_kmers='y',
                          num_threads=48)
# Export one HDF5 sketch file per genome.
MH.export_multiple_hdf5(CEs, '/nfs1/Koslicki_Lab/koslickd/MinHash/Out/N' + str(n) + 'k31/')

# Save the union of all min-hashes seen in the training genomes.
hash_list = set()
for CE in CEs:
    hash_list.update(CE._mins)
with h5py.File('/nfs1/Koslicki_Lab/koslickd/MinHash/Out/N' + str(n) + 'k31_mins.h5', 'w') as fid:
    fid.create_dataset("hash_list", data=list(hash_list))

# If I need to read it back in
# NOTE(review): CS and CS2 are not defined anywhere in this chunk — presumably
# CountEstimator sketches built earlier in the session; confirm before running.
CS.jaccard(CS2)
CS2.jaccard(CS)

# Count occurrences of the 10-mer 'TGGAATTCCA' in the first training genome.
i = sum(
    1
    for record in screed.open(file_names[0])
    for kmer in MH.kmers(record.sequence, 10)
    if kmer == 'TGGAATTCCA'
)
print(i)

# Build a single sketch of the first genome, retaining its k-mers.
CE = MH.CountEstimator(n=500, ksize=20, input_file_name=file_names[0], save_kmers='y')

# Compute a bunch of them and save to a single, big HDF5 file
CEs = MH.compute_multiple(n=500, ksize=11, input_files_list=file_names, save_kmers='y')
MH.export_multiple_to_single_hdf5(CEs, 'test_big.h5')

# Load them back in
CEs = MH.import_multiple_from_single_hdf5('test_big.h5')
# load just a few in
CEs = MH.import_multiple_from_single_hdf5('test_big.h5', file_names[0:2])

# Let's look at forming the Y vector
# NOTE(review): out_file_names is not defined in this chunk — presumably the
# per-genome HDF5 paths produced by an earlier export step; verify.
CEs = MH.import_multiple_hdf5(out_file_names)
MCE = MH.import_single_hdf5('/nfs1/Koslicki_Lab/koslickd/MinHash/Out/SRR172902.fastq.CE_N500_k31_inComparison.h5')
Y = np.zeros(len(CEs))
for i in range(len(CEs)):
    # Second element of jaccard_count's return is the value kept for Y.
    Y[i] = CEs[i].jaccard_count(MCE)[1]