# Call CS2.jaccard on CS; the returned value is not stored.
CS2.jaccard(CS)

# Tally how many k-mers yielded by MH.kmers(sequence, 10) for the first input
# file are equal to one fixed probe string, then report the tally.
i = 0
for record in screed.open(file_names[0]):
    for kmer in MH.kmers(record.sequence, 10):
        if kmer != 'TGGAATTCCA':
            continue
        i += 1
print(i)

# Build a single estimator from the first input file, asking it to keep k-mers.
CE = MH.CountEstimator(
    n=500,
    ksize=20,
    input_file_name=file_names[0],
    save_kmers='y',
)

# Compute an estimator for every input file and store them all in one HDF5 file.
CEs = MH.compute_multiple(
    n=500,
    ksize=11,
    input_files_list=file_names,
    save_kmers='y',
)
MH.export_multiple_to_single_hdf5(CEs, 'test_big.h5')

# Read the whole collection back from that file.
CEs = MH.import_multiple_from_single_hdf5('test_big.h5')

# Read back only the entries named by the first two input files.
CEs = MH.import_multiple_from_single_hdf5('test_big.h5', file_names[:2])

# Form the Y vector: entry i is element [1] of what jaccard_count returns when
# the i-th imported estimator is compared against the single estimator MCE.
CEs = MH.import_multiple_hdf5(out_file_names)
MCE = MH.import_single_hdf5(
    '/nfs1/Koslicki_Lab/koslickd/MinHash/Out/SRR172902.fastq.CE_N500_k31_inComparison.h5'
)
Y = np.zeros(len(CEs))
for i, estimator in enumerate(CEs):
    Y[i] = estimator.jaccard_count(MCE)[1]
# NOTE(review): the original line breaks and indentation of the code below have
# been lost, and it starts partway through a function body whose `def` header
# is not visible here (presumably `make_minhash`, judging by the
# `make_minhash_star` wrapper further on -- confirm).
#
# What the visible statements do, in order:
#   * Tail of the function: whichever of `kmer` / `kmer_rev` compares smaller
#     (ties go to `kmer_rev`) is added to both `kmers` and `MHS`. Afterwards
#     `MHS._true_num_kmers` is set to len(kmers) and `MHS.input_file_name` to
#     the basename of `genome`. Every entry of `kmers` is then written, one
#     per line, to '../data/Genomes/<name>.kmers.bz2', and `MHS` is returned.
#   * `make_minhash_star(arg)` unpacks its single argument into positional
#     arguments for `make_minhash`, so it can be handed to `pool.map`, which
#     passes one item per call.
#   * A Pool of `num_threads` processes maps `make_minhash_star` over tuples of
#     (file name, max_h, prime, ksize); the pool is then closed and joined.
#   * `dummy` reads len(item._kmers) for every returned sketch; `base_names` is
#     computed but not used in the visible code.
#   * All sketches are exported to '../data/Genomes/AllSketches.h5'.
#
# NOTE(review): bz2.BZ2File(..., 'w') is a binary-mode file, so writing the
# str produced by "%s\n" % kmer is expected to fail on Python 3; the
# commented-out bz2.open(..., 'wt') call is the text-mode alternative.
# NOTE(review): `fid` is closed explicitly rather than via `with`, so it would
# stay open if a write raised.
if kmer < kmer_rev: kmers.add(kmer) MHS.add(kmer) else: kmers.add(kmer_rev) MHS.add(kmer_rev) MHS._true_num_kmers = len(kmers) MHS.input_file_name = os.path.basename(genome) #genome_sketches.append(MHS) # export the kmers fid = bz2.BZ2File(os.path.abspath(os.path.join('../data/Genomes/', name + ".kmers.bz2")), 'w') #fid = bz2.open(os.path.abspath(os.path.join('../data/Genomes/', name + ".kmers.bz2")), 'wt') # python3 for kmer in kmers: fid.write("%s\n" % kmer) fid.close() return MHS def make_minhash_star(arg): return make_minhash(*arg) pool = Pool(processes=num_threads) genome_sketches = pool.map(make_minhash_star, zip(file_names, repeat(max_h), repeat(prime), repeat(ksize))) pool.close() pool.join() dummy = [len(item._kmers) for item in genome_sketches] # to get it to actually do the work # Export all the sketches base_names = [os.path.basename(item) for item in file_names] MH.export_multiple_to_single_hdf5(genome_sketches, os.path.abspath('../data/Genomes/AllSketches.h5'))