示例#1
0
# Call CS2's jaccard method with CS as the argument; the return value is not
# bound to a name here.
CS2.jaccard(CS)


# Tally how many items produced by MH.kmers (called with size 10) over every
# record of the first input file are equal to a fixed target string, then
# print the tally.
target_kmer = 'TGGAATTCCA'
kmer_length = 10
i = 0
for record in screed.open(file_names[0]):
    for kmer in MH.kmers(record.sequence, kmer_length):
        if kmer != target_kmer:
            continue  # not the k-mer being counted
        i += 1

print(i)

# Build a single count estimator from the first input file
# (n=500, ksize=20, save_kmers='y').
CE = MH.CountEstimator(
    n=500,
    ksize=20,
    input_file_name=file_names[0],
    save_kmers='y'
)

# Compute estimators for the whole list of input files (this time ksize=11)
# and write them all into one HDF5 file.
hdf5_path = 'test_big.h5'
CEs = MH.compute_multiple(
    n=500,
    ksize=11,
    input_files_list=file_names,
    save_kmers='y'
)
MH.export_multiple_to_single_hdf5(CEs, hdf5_path)
# Read them back from that same file
CEs = MH.import_multiple_from_single_hdf5(hdf5_path)
# Read back only a few: pass the first two file names as the second argument
CEs = MH.import_multiple_from_single_hdf5(hdf5_path, file_names[0:2])


# Let's look at forming the Y vector.
# Y has one entry per imported estimator; entry i is element [1] of what
# jaccard_count returns when the i-th estimator is compared against MCE.
CEs = MH.import_multiple_hdf5(out_file_names)
MCE = MH.import_single_hdf5('/nfs1/Koslicki_Lab/koslickd/MinHash/Out/SRR172902.fastq.CE_N500_k31_inComparison.h5')
Y = np.zeros(len(CEs))
for i, estimator in enumerate(CEs):
    Y[i] = estimator.jaccard_count(MCE)[1]


			if kmer < kmer_rev:
				kmers.add(kmer)
				MHS.add(kmer)
			else:
				kmers.add(kmer_rev)
				MHS.add(kmer_rev)
	MHS._true_num_kmers = len(kmers)
	MHS.input_file_name = os.path.basename(genome)
	#genome_sketches.append(MHS)
	# export the kmers
	fid = bz2.BZ2File(os.path.abspath(os.path.join('../data/Genomes/', name + ".kmers.bz2")), 'w')
	#fid = bz2.open(os.path.abspath(os.path.join('../data/Genomes/', name + ".kmers.bz2")), 'wt')  # python3
	for kmer in kmers:
		fid.write("%s\n" % kmer)
	fid.close()
	return MHS


def make_minhash_star(arg):
	"""Call make_minhash with the elements of ``arg`` as positional arguments.

	This adapter lets a mapper that passes one argument per task (it is used
	with ``pool.map`` in this file) drive the multi-argument make_minhash.

	:param arg: iterable whose elements are unpacked into make_minhash
	:return: whatever make_minhash returns for those arguments
	"""
	result = make_minhash(*arg)
	return result

# Sketch every input file in parallel using num_threads worker processes.
pool = Pool(processes=num_threads)
# One task per file: (file_name, max_h, prime, ksize), unpacked by make_minhash_star.
sketch_args = [(file_name, max_h, prime, ksize) for file_name in file_names]
genome_sketches = pool.map(make_minhash_star, sketch_args)
pool.close()
pool.join()
# Read the length of each sketch's _kmers (original note: "to get it to actually do the work")
dummy = []
for sketch in genome_sketches:
	dummy.append(len(sketch._kmers))

# Write every sketch into a single HDF5 file under ../data/Genomes/
base_names = list(map(os.path.basename, file_names))
all_sketches_path = os.path.abspath('../data/Genomes/AllSketches.h5')
MH.export_multiple_to_single_hdf5(genome_sketches, all_sketches_path)