# Paths to the training count estimator files (n=5000 hashes, k=31 per the directory name)
training_n = 5000
out_file_names = ["/nfs1/Koslicki_Lab/koslickd/MinHash/Out/N" + str(training_n) + "k31/"
                  + os.path.basename(item) + ".CE.h5" for item in file_names]
CEs = MH.import_multiple_hdf5(out_file_names)
# Count vector of the sample's count estimator (MCE_in_comparison) against the training CEs
Y_count_in_comparison = MCE_in_comparison.count_vector(CEs)
# Nonnegative least squares (lsqnonneg) reconstruction of the sample from the Jaccard counts
eps = .001
(reconstruction, A_eps, A_indicies) = MH.jaccard_count_lsqnonneg(CEs, Y_count_in_comparison, eps)
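
# MH.jaccard_count_lsqnonneg is not shown here; its name indicates a nonnegative
# least-squares ("lsqnonneg") fit of the observed count vector against a matrix built
# from the Jaccard counts. A minimal, self-contained sketch of that kind of fit with
# scipy (illustrative stand-in data only, not the pipeline's matrices or the internals
# of MH.jaccard_count_lsqnonneg):
import numpy as np
from scipy.optimize import nnls
A_demo = np.array([[1.0, 0.2], [0.2, 1.0], [0.1, 0.3]])  # toy similarity-style matrix
y_demo = np.array([0.9, 0.5, 0.2])                        # toy observed count vector
x_demo, residual_demo = nnls(A_demo, y_demo)              # x_demo >= 0 componentwise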

# Read in the taxonomy to see which organisms the largest entries of the Y vectors correspond to
with open('/nfs1/Koslicki_Lab/koslickd/MinHash/Data/Taxonomy.txt', 'r') as fid:
    taxonomy = [item.strip() for item in fid.readlines()]
taxonomy_names = [item.split('\t')[0] for item in taxonomy]
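
# As a quick look at which organisms the largest entries of the reconstruction
# correspond to (not part of the original pipeline, and assuming reconstruction is
# indexed in the same order as the taxonomy rows), one could do:
top_hits = sorted(range(len(reconstruction)), key=lambda i: reconstruction[i], reverse=True)[:10]
for i in top_hits:
    print("%s\t%f" % (taxonomy_names[i], reconstruction[i]))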

# Cluster the training genomes on the thresholded similarity matrix, write one FASTA
# per cluster (labeled by its LCA), and build an alignment (SNAP) index for each cluster
out_dir = '/scratch/temp/SNAP/training/'
(clusters, LCAs) = MH.cluster_matrix(A_eps, A_indicies, taxonomy, cluster_eps=.01)
training_file_names = MH.make_cluster_fastas(out_dir, LCAs, clusters, CEs)
index_dirs = MH.build_references(training_file_names, out_dir, large_index=True)

# Sort the clusters, LCAs, index directories, and training FASTAs by total reconstructed
# abundance so the most abundant clusters come first
cluster_abundances = [sum(reconstruction[i] for i in clusters[j]) for j in range(len(clusters))]
sorted_clusters = [item[1] for item in sorted(zip(cluster_abundances, clusters), reverse=True)]
sorted_LCAs = [item[1] for item in sorted(zip(cluster_abundances, LCAs), reverse=True)]
sorted_index_dirs = [item[1] for item in sorted(zip(cluster_abundances, index_dirs), reverse=True)]
sorted_training_file_names = [item[1] for item in sorted(zip(cluster_abundances, training_file_names), reverse=True)]

# Stream the sample reads through the sorted indices (most abundant clusters first) and time the alignment
out_file = os.path.join(out_dir, os.path.basename(soil_sample_file) + "_unaligned.sam")
t0 = timeit.default_timer()
MH.stream_align_single(sorted_index_dirs, soil_sample_file, out_file, format="sam", filt="all")
#MH.stream_align_single(index_dirs, soil_sample_file, out_file, format="bam", filt="all")
t1 = timeit.default_timer()
print("Alignment time: %f" % (t1-t0))  # 8752.601766s == 2.43 hours