import sys
sys.path.append('/nfs1/Koslicki_Lab/koslickd/Repositories/MinHashMetagenomics/src/')
import os
import timeit
import h5py
import MinHash as MH
import numpy as np

# Read the list of training genome file paths, one per line
with open('/nfs1/Koslicki_Lab/koslickd/MinHash/Data/FileNames.txt', 'r') as fid:
    file_names = [name.strip() for name in fid.readlines()]
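
# Optional sanity check (an added sketch, not part of the original pipeline):
# fail fast if any path listed in FileNames.txt does not exist, before the
# long sketching run below starts.
missing = [f for f in file_names if not os.path.isfile(f)]
if missing:
    raise IOError('Missing input files:\n' + '\n'.join(missing))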

###############################
# Compute the hashes for all the training genomes
n = 500  # number of min-hash values kept per sketch
CEs = MH.compute_multiple(n=n,
                          max_prime=9999999999971.,  # prime modulus for the hash function
                          ksize=31,                  # k-mer size
                          input_files_list=file_names,
                          save_kmers='y',
                          num_threads=48)
# Export the sketches to HDF5 in the output directory
MH.export_multiple_hdf5(CEs, '/nfs1/Koslicki_Lab/koslickd/MinHash/Out/N' + str(n) + 'k31/')
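# To reload the exported sketches later, the module's matching import routine
# can be used; 'import_multiple_hdf5' is assumed here as the counterpart to
# export_multiple_hdf5 and its exact name/signature may differ in MinHash.py.
#out_dir = '/nfs1/Koslicki_Lab/koslickd/MinHash/Out/N' + str(n) + 'k31/'
#CEs = MH.import_multiple_hdf5([os.path.join(out_dir, f) for f in os.listdir(out_dir)])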
# Collect the union of all min-hash values seen across the training genomes
hash_list = set()
for CE in CEs:
    hash_list.update(CE._mins)
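
# Illustrative report (an added sketch, not in the original script): the union
# is typically smaller than n * len(CEs), since related genomes share k-mers
# and hence share min-hash values.
print('Collected %d distinct hashes from %d sketches (at most %d)'
      % (len(hash_list), len(CEs), n * len(CEs)))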

with h5py.File('/nfs1/Koslicki_Lab/koslickd/MinHash/Out/N' + str(n) + 'k31_mins.h5', 'w') as fid:
    fid.create_dataset("hash_list", data=list(hash_list))
# If I need to read it back in:
#with h5py.File('/nfs1/Koslicki_Lab/koslickd/MinHash/Out/N500k31_mins.h5', 'r') as fid:
#    hash_list = set(fid["hash_list"][:])
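
# Round-trip check (an added sketch, assuming the file written above is still
# on disk): the HDF5 copy should reproduce the in-memory hash set exactly.
with h5py.File('/nfs1/Koslicki_Lab/koslickd/MinHash/Out/N' + str(n) + 'k31_mins.h5', 'r') as chk:
    assert set(chk["hash_list"][:]) == hash_list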

###############################
# Repeat with a larger sketch size
n = 5000
CEs = MH.compute_multiple(n=n,
                          max_prime=9999999999971.,
                          ksize=31,
                          input_files_list=file_names,
                          save_kmers='y',
                          num_threads=48)
# Export