fid = open('/nfs1/Koslicki_Lab/koslickd/MinHash/Data/FileNames.txt', 'r') file_names = fid.readlines() fid.close() file_names = [name.strip() for name in file_names] ############################### # Compute the hashes for all the training genomes n = 500 CEs = MH.compute_multiple(n=n, max_prime=9999999999971., ksize=31, input_files_list=file_names, save_kmers='y', num_threads=48) # Export MH.export_multiple_hdf5( CEs, '/nfs1/Koslicki_Lab/koslickd/MinHash/Out/N' + str(n) + 'k31/') # Save the hashes in the training genomes hash_list = set() for CE in CEs: hash_list.update(CE._mins) fid = h5py.File( '/nfs1/Koslicki_Lab/koslickd/MinHash/Out/N' + str(n) + 'k31_mins.h5', 'w') fid.create_dataset("hash_list", data=list(hash_list)) fid.close() # If I need to read it back in #fid = h5py.File('/nfs1/Koslicki_Lab/koslickd/MinHash/Out/N500k31_mins.h5','r') #hash_list = set(fid["hash_list"][:]) n = 5000 CEs = MH.compute_multiple(n=n,
# Extend the module search path before importing MinHash.
# NOTE(review): presumably this src/ directory is where MinHash lives; confirm.
sys.path.append('/nfs1/Koslicki_Lab/koslickd/Repositories/MinHashMetagenomics/src/')
import os
import timeit
import h5py
import MinHash as MH
import numpy as np

# Read the list of input file names, one per line.
# A context manager guarantees the handle is closed even if reading raises.
with open('/nfs1/Koslicki_Lab/koslickd/MinHash/Data/FileNames.txt', 'r') as fid:
    # Strip the trailing newline (and any surrounding whitespace) of each entry.
    file_names = [name.strip() for name in fid]

###############################
# Compute the hashes for all the training genomes
# NOTE(review): n is presumably the sketch size and ksize the k-mer length;
# confirm against MH.compute_multiple.
n = 500
CEs = MH.compute_multiple(n=n, max_prime=9999999999971., ksize=31,
                          input_files_list=file_names, save_kmers='y',
                          num_threads=48)
# Export
MH.export_multiple_hdf5(CEs, '/nfs1/Koslicki_Lab/koslickd/MinHash/Out/N'+str(n)+'k31/')

# Save the hashes in the training genomes
# Union of every CE._mins, so each value is stored once.
hash_list = set()
for CE in CEs:
    hash_list.update(CE._mins)
# The with-block closes the HDF5 file even if create_dataset fails.
with h5py.File('/nfs1/Koslicki_Lab/koslickd/MinHash/Out/N'+str(n)+'k31_mins.h5', 'w') as fid:
    fid.create_dataset("hash_list", data=list(hash_list))
# If I need to read it back in
#fid = h5py.File('/nfs1/Koslicki_Lab/koslickd/MinHash/Out/N500k31_mins.h5','r')
#hash_list = set(fid["hash_list"][:])

# Repeat the computation with n = 5000.
n = 5000
CEs = MH.compute_multiple(n=n, max_prime=9999999999971., ksize=31,
                          input_files_list=file_names, save_kmers='y',
                          num_threads=48)
# Export