def classify(queue, lsh, child_conn, n_sigs):
    fe = FeatureExtractor()
    count = 0
    classified_relationships = []
    print multiprocessing.current_process(), "started"
    while True:
        try:
            line = queue.get_nowait()
            count += 1
            if count % 1000 == 0:
                print multiprocessing.current_process(), count, " processed, remaining ", queue.qsize()

            relationships = fe.process_classify(line)

            for r in relationships:
                rel = r[0]
                shingles = r[1]

                # compute signatures
                sigs = MinHash.signature(shingles.getvalue().split(), n_sigs)

                # find closest neighbours
                types = lsh.classify(sigs)
                if types is not None:
                    classified_r = (rel.e1, rel.e2, rel.sentence, types.encode("utf8"))
                else:
                    classified_r = (rel.e1, rel.e2, rel.sentence, "None")
                classified_relationships.append(classified_r)

        except Queue.Empty:
            print multiprocessing.current_process(), "Queue is Empty"
            child_conn.send(classified_relationships)
            break
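# A minimal driver sketch for the worker above (illustrative only: the helper name
# run_classify_workers, the worker count and the `lines` iterable are assumptions,
# not part of the original module). Each worker drains the shared Queue with
# get_nowait() and sends its classified tuples back through its end of a Pipe.
def run_classify_workers(lines, lsh, n_sigs, num_workers=4):
    queue = multiprocessing.Queue()
    for line in lines:
        queue.put(line)

    pipes = [multiprocessing.Pipe(duplex=False) for _ in range(num_workers)]
    workers = [multiprocessing.Process(target=classify,
                                       args=(queue, lsh, child_end, n_sigs))
               for _, child_end in pipes]
    for w in workers:
        w.start()

    results = []
    for parent_end, _ in pipes:
        results.extend(parent_end.recv())  # one list of classified tuples per worker
    for w in workers:
        w.join()
    return results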
def load_shingles(shingles_file):
    """
    Parses already extracted shingles from a file.
    File format is: relationship_type \t shingle1 shingle2 shingle3 ... shingle_n
    """
    relationships = []
    rel_identifier = 0
    f_shingles = codecs.open(shingles_file, encoding='utf-8')

    for line in f_shingles:
        sys.stdout.write('.')
        rel_type, shingles_string = line.split('\t')
        shingles = shingles_string.strip().split(' ')

        # calculate min-hash sigs
        sigs = MinHash.signature(shingles, N_SIGS)

        rel = Relationship(None, None, None, None, None, None, None, None, rel_type, rel_identifier)
        rel.sigs = sigs
        rel.identifier = rel_identifier
        relationships.append(rel)
        rel_identifier += 1
    
    f_shingles.close()

    return relationships
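# Minimal usage sketch (the file name is illustrative; N_SIGS is the module-level
# constant the function already relies on). Each returned Relationship carries the
# min-hash signature of its shingles and a running integer identifier.
#
# rels = load_shingles('training_shingles.txt')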
Example #3
def index_shingles(shingles_file, n_bands, n_sigs, knn):
    """
    Parses already extracted shingles from a file.
    File format is: relationship_type \t shingle1 shingle2 shingle3 ... shingle_n
    """
    f_shingles = codecs.open(shingles_file, encoding='utf-8')
    relationships = []
    print "Reading features file"
    for line in f_shingles:
        rel_id, rel_type, shingles = line.split('\t')
        shingles = shingles.strip().split(' ')
        relationships.append((rel_type, rel_id, shingles))
    f_shingles.close()

    print "SIGS  :", n_sigs
    print "BANDS :", n_bands

    lsh = LocalitySensitiveHashing(n_bands, n_sigs, knn)
    lsh.create()
    count = 0
    elapsed_time = 0

    for r in relationships:
        start_time = time.time()
        sigs = MinHash.signature(r[2], n_sigs)
        lsh.index(r[0], r[1], sigs)
        elapsed_time += time.time() - start_time
        count += 1
        if count % 100 == 0:
            sys.stdout.write("Processed " + str(count) +
                             " in %.2f seconds" % elapsed_time + "\n")

    sys.stdout.write("Total Indexing time: %.2f seconds" % elapsed_time + "\n")
Example #4
def classify2(queue, n_sigs, lsh, child_conn):
    count = 0
    classified_relationships = []
    print multiprocessing.current_process(), "started"
    while True:
        try:
            r = queue.get_nowait()
            count += 1
            if count % 1000 == 0:
                print multiprocessing.current_process(), count, " processed, remaining ", queue.qsize()

            e1 = r[0]
            e2 = r[1]
            sentence = r[2]
            shingles = r[3]

            # compute signatures
            sigs = MinHash.signature(shingles.split(), n_sigs)

            # find closest neighbours
            types = lsh.classify(sigs)
            if types is not None:
                classified_r = (e1, e2, sentence, types.encode("utf8"))
            else:
                classified_r = (e1, e2, sentence, "None")

            classified_relationships.append(classified_r)

        except Queue.Empty:
            print multiprocessing.current_process(), "Queue is Empty"
            child_conn.send(classified_relationships)
            break
Example #5
def classify(queue, lsh, child_conn, n_sigs):
    fe = FeatureExtractor()
    count = 0
    classified_relationships = []
    print multiprocessing.current_process(), "started"
    while True:
        try:
            line = queue.get_nowait()
            count += 1
            if count % 1000 == 0:
                print multiprocessing.current_process(), count, " processed, remaining ", queue.qsize()

            relationships = fe.process_classify(line)

            for r in relationships:
                rel = r[0]
                shingles = r[1]

                # compute signatures
                sigs = MinHash.signature(shingles.getvalue().split(), n_sigs)

                # find closest neighbours
                types = lsh.classify(sigs)
                if types is not None:
                    classified_r = (rel.e1, rel.e2, rel.sentence,
                                    types.encode("utf8"))
                else:
                    classified_r = (rel.e1, rel.e2, rel.sentence, "None")
                classified_relationships.append(classified_r)

        except Queue.Empty:
            print multiprocessing.current_process(), "Queue is Empty"
            child_conn.send(classified_relationships)
            break
Example #6
def extract_features(queue, lsh, child_conn, n_sigs):
    fe = FeatureExtractor()
    relationships = []
    count = 0
    while True:
        try:
            line = queue.get_nowait()
            count += 1
            if count % 1000 == 0:
                print count, " processed, remaining ", queue.qsize()

            rel_id, rel_type, e1, e2, sentence = line.split('\t')
            rel_id = int(rel_id.split(":")[1])
            shingles = fe.process_index(sentence, e1, e2)

            try:
                shingles = shingles.getvalue().strip().split(' ')
            except AttributeError, e:
                print line
                print shingles
                sys.exit(-1)

            sigs = MinHash.signature(shingles, n_sigs)
            lsh.index(rel_type, rel_id, sigs)
            relationships.append((rel_type, rel_id, sigs, shingles))

        except Queue.Empty:
            print multiprocessing.current_process(), "Queue is Empty"
            child_conn.send(relationships)
            break
def make_minhash(genome, max_h, prime, ksize):
	kmers = set()
	name = os.path.basename(genome)
	MHS = MH.CountEstimator(n=max_h, max_prime=prime, ksize=ksize, save_kmers='y')
	for record in screed.open(genome):
		seq = record.sequence
		for i in range(len(seq) - ksize + 1):
			kmer = seq[i:i+ksize]
			kmer_rev = khmer.reverse_complement(kmer)
			if kmer < kmer_rev:
				kmers.add(kmer)
				MHS.add(kmer)
			else:
				kmers.add(kmer_rev)
				MHS.add(kmer_rev)
	MHS._true_num_kmers = len(kmers)
	MHS.input_file_name = os.path.basename(genome)
	#genome_sketches.append(MHS)
	# export the kmers
	fid = bz2.BZ2File(os.path.abspath(os.path.join('../data/Genomes/', name + ".kmers.bz2")), 'w')
	#fid = bz2.open(os.path.abspath(os.path.join('../data/Genomes/', name + ".kmers.bz2")), 'wt')  # python3
	for kmer in kmers:
		fid.write("%s\n" % kmer)
	fid.close()
	return MHS
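# Illustrative, self-contained check of the canonical k-mer rule used in the loop above:
# the lexicographically smaller of a k-mer and its reverse complement is kept, so both
# strands of a site contribute the same k-mer to the sketch. The helper _revcomp is a
# stand-in for khmer.reverse_complement, added only for this sketch.
_comp = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A'}
def _revcomp(s):
    return ''.join(_comp[c] for c in reversed(s))
assert min("AAAC", _revcomp("AAAC")) == "AAAC"  # reverse complement is "GTTT"
assert _revcomp("ACGT") == "ACGT"               # palindromic k-mer: identical on both strands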
def extract_features(queue, lsh, child_conn, n_sigs):
    fe = FeatureExtractor()
    relationships = []
    count = 0
    while True:
        try:
            line = queue.get_nowait()
            count += 1
            if count % 1000 == 0:
                print count, " processed, remaining ", queue.qsize()

            rel_id, rel_type, e1, e2, sentence = line.split('\t')
            rel_id = int(rel_id.split(":")[1])
            shingles = fe.process_index(sentence, e1, e2)

            try:
                shingles = shingles.getvalue().strip().split(' ')
            except AttributeError, e:
                print line
                print shingles
                sys.exit(-1)

            sigs = MinHash.signature(shingles, n_sigs)
            lsh.index(rel_type, rel_id, sigs)
            relationships.append((rel_type, rel_id, sigs, shingles))

        except Queue.Empty:
            print multiprocessing.current_process(), "Queue is Empty"
            child_conn.send(relationships)
            break
def index_shingles(shingles_file, n_bands, n_sigs, knn):
    """
    Parses already extracted shingles from a file.
    File format is: relationship_type \t shingle1 shingle2 shingle3 ... shingle_n
    """
    f_shingles = codecs.open(shingles_file, encoding='utf-8')
    relationships = []
    print "Reading features file"
    for line in f_shingles:
        rel_id, rel_type, shingles = line.split('\t')
        shingles = shingles.strip().split(' ')
        relationships.append((rel_type, rel_id, shingles))
    f_shingles.close()

    print "SIGS  :", n_sigs
    print "BANDS :", n_bands

    lsh = LocalitySensitiveHashing(n_bands, n_sigs, knn)
    lsh.create()
    count = 0
    elapsed_time = 0

    for r in relationships:
        start_time = time.time()
        sigs = MinHash.signature(r[2], n_sigs)
        lsh.index(r[0], r[1], sigs)
        elapsed_time += time.time() - start_time
        count += 1
        if count % 100 == 0:
            sys.stdout.write("Processed " + str(count) + " in %.2f seconds" % elapsed_time + "\n")

    sys.stdout.write("Total Indexing time: %.2f seconds" % elapsed_time + "\n")
Example #10
def make_minhash(genome, max_h, prime, ksize):
    kmers = set()
    name = os.path.basename(genome)
    MHS = MH.CountEstimator(n=max_h,
                            max_prime=prime,
                            ksize=ksize,
                            save_kmers='y')
    for record in screed.open(genome):
        seq = record.sequence
        for i in range(len(seq) - ksize + 1):
            kmer = seq[i:i + ksize]
            kmer_rev = khmer.reverse_complement(kmer)
            if kmer < kmer_rev:
                kmers.add(kmer)
                MHS.add(kmer)
            else:
                kmers.add(kmer_rev)
                MHS.add(kmer_rev)
    MHS._true_num_kmers = len(kmers)
    MHS.input_file_name = os.path.basename(genome)
    # Export the hash k-mers
    fid = open(
        os.path.abspath(
            os.path.join('../data/Viruses/', name + ".Hash21mers.fa")), 'w')
    for kmer in MHS._kmers:
        fid.write(">\n%s\n" % kmer)
    fid.close()
    return MHS
Example #11
def classify2(queue, n_sigs, lsh, child_conn):
    count = 0
    classified_relationships = []
    print multiprocessing.current_process(), "started"
    while True:
        try:
            r = queue.get_nowait()
            count += 1
            if count % 1000 == 0:
                print multiprocessing.current_process(), count, " processed, remaining ", queue.qsize()

            e1 = r[0]
            e2 = r[1]
            sentence = r[2]
            shingles = r[3]

            # compute signatures
            sigs = MinHash.signature(shingles.split(), n_sigs)

            # find closest neighbours
            types = lsh.classify(sigs)
            if types is not None:
                classified_r = (e1, e2, sentence, types.encode("utf8"))
            else:
                classified_r = (e1, e2, sentence, "None")

            classified_relationships.append(classified_r)

        except Queue.Empty:
            print multiprocessing.current_process(), "Queue is Empty"
            child_conn.send(classified_relationships)
            break
def classify_sentences(data_file, extractor, lsh):
    """
    receives a file with sentences where entities are identified
    and for each classifies the existing relationships between the entities

    :param data_file:
    :param extractor:
    :param lsh:
    :return:
    """
    f_sentences = codecs.open(data_file, encoding='utf-8')
    for line in f_sentences:
        if len(line) > 1:
            sentence = line.strip()
            sentence = Sentence(sentence)
            for rel in sentence.relationships:
                if rel.arg1type is None and rel.arg2type is None:
                    continue
                else:
                    # extract features/shingles
                    features = extractor.extract_features(rel)
                    shingles = features.getvalue().strip().split(' ')

                    # calculate min-hash sigs
                    sigs = MinHash.signature(shingles, N_SIGS)
                    rel.sigs = sigs

                    # find closest neighbours
                    types = lsh.classify(rel)
                    print rel.sentence.encode("utf8") + '\targ1:' + rel.ent1.encode("utf8") + '\targ2:' + rel.ent2.encode("utf8") + '\t' + types.encode("utf8")

    f_sentences.close()
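# Minimal usage sketch (the file name is illustrative; `extractor` and `lsh` are the
# FeatureExtractor and LocalitySensitiveHashing objects built elsewhere in this module):
#
# classify_sentences('sentences_with_entities.txt', extractor, lsh)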
def load_training_relationships(data_file, extractor):
    """

    :param data_file:
    :param extractor:
    :return:
    """
    relationships = []
    sentence = None
    rel_id = 0
    f_sentences = codecs.open(data_file, encoding='utf-8')
    f_features = open('features.txt', 'w')

    for line in f_sentences:
        #sys.stdout.write('.')
        if not re.match('^relation', line):
            sentence = line.strip()
        else:
            rel_type = line.strip().split(':')[1]
            rel = Relationship(sentence, None, None, None, None, None, None, None, rel_type, rel_id)

            # extract features/shingles
            features = extractor.extract_features(rel)
            shingles = features.getvalue().strip().split(' ')

            # write shingles to StringIO
            f_features.write(rel_type + '\t')
            for shingle in shingles:
                f_features.write(shingle.encode("utf8") + ' ')
            f_features.write('\n')

            # calculate min-hash sigs
            sigs = MinHash.signature(shingles, N_SIGS)
            rel.sigs = sigs
            rel.identifier = rel_id

            rel_id += 1
            relationships.append(rel)

    f_sentences.close()
    f_features.close()

    return relationships
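# Hypothetical two-line block in the training format the loop above expects (sentence
# text and relation type are invented for illustration): any line that does not start
# with "relation" becomes the current sentence, and the next "relation:<type>" line
# labels it.
#
#   The company announced it had agreed to buy the startup for an undisclosed sum.
#   relation:acquired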
Example #14
def main():
    import timeit
    import MinHash as MH
    small_database_file = "/home/dkoslicki/Desktop/CMash/tests/TempData/cmash_db_n5000_k60_1000.h5"
    TST_export_file_new = "/home/dkoslicki/Desktop/CMash/tests/TempData/cmash_db_n5000_k60_new.tst"
    TST_export_file_old = "/home/dkoslicki/Desktop/CMash/tests/TempData/cmash_db_n5000_k60_old.tst"

    # new way
    t0 = timeit.default_timer()
    M = MakeTSTNew(small_database_file, TST_export_file_new)
    M.make_TST()
    t1 = timeit.default_timer()
    print(f"New timing: {t1 - t0}")

    # old way
    t0 = timeit.default_timer()
    CEs = MH.import_multiple_from_single_hdf5(small_database_file)
    M = MakeTSTOld(CEs, TST_export_file_old)
    M.make_TST()
    t1 = timeit.default_timer()
    print(f"Old timing: {t1 - t0}")
import sys
sys.path.append('/nfs1/Koslicki_Lab/koslickd/Repositories/MinHashMetagenomics/src/')
import os, timeit, h5py
import MinHash as MH
import numpy as np

fid = open('/nfs1/Koslicki_Lab/koslickd/MinHash/Data/FileNames.txt', 'r')
file_names = fid.readlines()
fid.close()
file_names = [name.strip() for name in file_names]

###############################
# Compute the hashes for all the training genomes
n = 500
CEs = MH.compute_multiple(n=n,
                          max_prime=9999999999971.,
                          ksize=31,
                          input_files_list=file_names,
                          save_kmers='y',
                          num_threads=48)
# Export
MH.export_multiple_hdf5(
    CEs, '/nfs1/Koslicki_Lab/koslickd/MinHash/Out/N' + str(n) + 'k31/')
# Save the hashes in the training genomes
hash_list = set()
for CE in CEs:
    hash_list.update(CE._mins)

fid = h5py.File(
    '/nfs1/Koslicki_Lab/koslickd/MinHash/Out/N' + str(n) + 'k31_mins.h5', 'w')
fid.create_dataset("hash_list", data=list(hash_list))
fid.close()
# If I need to read it back in
#fid = h5py.File('/nfs1/Koslicki_Lab/koslickd/MinHash/Out/N500k31_mins.h5', 'r')
#hash_list = set(fid["hash_list"][:])
def create_relative_errors(num_genomes, num_reads, python_loc, gen_sim_loc, prime, p, ksize, hash_range):
	# Make a simulation
	simulation_file, abundances_file, selected_genomes = make_simulation(num_genomes, num_reads, python_loc, gen_sim_loc)

	# Get simulation k-mers, use canonical k-mers
	# Simultaneously, make the min hash sketch of the simulation
	simulation_kmers = set()
	simulation_MHS = MH.CountEstimator(n=max_h, max_prime=prime, ksize=ksize, save_kmers='y')
	for record in screed.open(simulation_file):
		seq = record.sequence
		for i in range(len(seq) - ksize + 1):
			kmer = seq[i:i+ksize]
			kmer_rev = khmer.reverse_complement(kmer)
			if kmer < kmer_rev:
				simulation_kmers.add(kmer)
				simulation_MHS.add(kmer)
			else:
				simulation_kmers.add(kmer_rev)
				simulation_MHS.add(kmer_rev)

	# Use them to populate a bloom filter
	simulation_bloom = BloomFilter(capacity=1.1*len(simulation_kmers), error_rate=p)
	simulation_kmers_length = len(simulation_kmers)  # in practice, this would be computed when the bloom filter is created
	# or can use an estimate based on the bloom filter entries
	for kmer in simulation_kmers:
		simulation_bloom.add(kmer)

	# Use pre-computed data to load the kmers and the sketches
	base_names = [os.path.basename(item) for item in selected_genomes]
	# Load the sketches
	genome_sketches = MH.import_multiple_from_single_hdf5(os.path.abspath('../data/Genomes/AllSketches.h5'), base_names)
	# Get the true number of kmers
	genome_lengths = list()
	for i in range(len(genome_sketches)):
		genome_lengths.append(genome_sketches[i]._true_num_kmers)

	# Get *all* the kmers for computation of ground truth
	genome_kmers = list()
	for i in range(len(base_names)):
		name = base_names[i]
		kmers = set()
		fid = bz2.BZ2File(os.path.abspath(os.path.join('../data/Genomes/', name + ".kmers.bz2")), 'r')
		for line in fid.readlines():
			kmers.add(line.strip())
		fid.close()
		genome_kmers.append(kmers)

	# Calculate the true Jaccard index
	true_jaccards = list()
	for kmers in genome_kmers:
		true_jaccard = len(kmers.intersection(simulation_kmers)) / float(len(kmers.union(simulation_kmers)))
		true_jaccards.append(true_jaccard)

	# Calculate the min hash estimate of jaccard index
	MH_relative_errors = list()
	CMH_relative_errors = list()
	for h in hash_range:
		MH_jaccards = list()
		for MHS in genome_sketches:
			# Down sample each sketch to h
			MHS.down_sample(h)
			simulation_MHS.down_sample(h)
			MH_jaccard = MHS.jaccard(simulation_MHS)
			MH_jaccards.append(MH_jaccard)

		MH_jaccards_corrected = list()
		for MHS in genome_sketches:
			MHS_set = set(MHS._mins)
			sample_set = set(simulation_MHS._mins)
			MH_jaccard = len(set(list(MHS_set.union(sample_set))[0:h]).intersection(MHS_set.intersection(sample_set))) / float(h)
			MH_jaccards_corrected.append(MH_jaccard)

		# Calculate the containment min hash estimate of the jaccard index
		CMH_jaccards = list()
		for i in range(len(genome_sketches)):
			genome_kmers_len = genome_lengths[i]  # pre-computed when creating the "training" data
			MHS = genome_sketches[i]
			# down sample each sketch to h
			MHS.down_sample(h)
			kmers = MHS._kmers  # use only the k-mers in the min hash sketch
			int_est = 0
			for kmer in kmers:
				if kmer in simulation_bloom:  # test if the k-mers are in the simulation bloom filter
					int_est += 1
			int_est -= p*h  # adjust for false positive rate
			containment_est = int_est / float(h)
			containment_est_jaccard = genome_kmers_len * containment_est / \
				(genome_kmers_len + simulation_kmers_length - genome_kmers_len * containment_est)
			CMH_jaccards.append(containment_est_jaccard)

		# compute the average deviation from the truth (relative error)
		true_jaccards = np.array(true_jaccards)
		MH_jaccards = np.array(MH_jaccards)
		CMH_jaccards = np.array(CMH_jaccards)
		MH_mean = np.mean(np.abs(true_jaccards - MH_jaccards)/true_jaccards)
		CMH_mean = np.mean(np.abs(true_jaccards - CMH_jaccards)/true_jaccards)
		#print("Classic min hash mean relative error: %f" % MH_mean)
		#print("Containment min hash mean relative error: %f" % CMH_mean)
		MH_relative_errors.append(MH_mean)
		CMH_relative_errors.append(CMH_mean)

	# remove temp files
	os.remove(simulation_file)
	os.remove(abundances_file)
	# return the relative errors
	return MH_relative_errors, CMH_relative_errors, simulation_kmers_length, np.mean(genome_lengths)
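# Worked check of the containment-to-Jaccard conversion used above (all numbers are
# illustrative): with |A| = 1000 genome k-mers, |B| = 10000 sample k-mers and an
# estimated containment C = intersection/|A| = 0.5, the identity
# J = |A|*C / (|A| + |B| - |A|*C) recovers intersection/union.
A_len, B_len, C = 1000, 10000, 0.5
intersection = A_len * C                            # 500 shared k-mers
union = A_len + B_len - intersection                # 10500 k-mers in the union
jaccard = A_len * C / (A_len + B_len - A_len * C)   # same formula as containment_est_jaccard
assert abs(jaccard - intersection / union) < 1e-12  # 500 / 10500 ~= 0.0476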
import sys
sys.path.append('/nfs1/Koslicki_Lab/koslickd/Repositories/MinHashMetagenomics/src/')
import os, timeit, h5py
import MinHash as MH
import numpy as np

fid = open('/nfs1/Koslicki_Lab/koslickd/MinHash/Data/FileNames.txt', 'r')
file_names = fid.readlines()
fid.close()
file_names = [name.strip() for name in file_names]

###############################
# Compute the hashes for all the training genomes
n = 500
CEs = MH.compute_multiple(n=n, max_prime=9999999999971., ksize=31, input_files_list=file_names, save_kmers='y', num_threads=48)
# Export
MH.export_multiple_hdf5(CEs, '/nfs1/Koslicki_Lab/koslickd/MinHash/Out/N'+str(n)+'k31/')
# Save the hashes in the training genomes
hash_list = set()
for CE in CEs:
    hash_list.update(CE._mins)

fid = h5py.File('/nfs1/Koslicki_Lab/koslickd/MinHash/Out/N'+str(n)+'k31_mins.h5', 'w')
fid.create_dataset("hash_list", data=list(hash_list))
fid.close()
# If I need to read it back in
#fid = h5py.File('/nfs1/Koslicki_Lab/koslickd/MinHash/Out/N500k31_mins.h5','r')
#hash_list = set(fid["hash_list"][:])

n = 5000
CEs = MH.compute_multiple(n=n, max_prime=9999999999971., ksize=31, input_files_list=file_names, save_kmers='y', num_threads=48)
Example #18
fid = open(os.path.abspath("../Paper/Data/MetagenomeTotalKmers.txt"), 'r')
metagenome_kmers_total = int(fid.readlines()[0].strip())
fid.close()

# Get virus names
file_names = list()
fid = open(os.path.abspath('../data/Viruses/FileNames.txt'), 'r')
for line in fid.readlines():
    file_names.append(
        os.path.abspath(
            os.path.join('../data/Viruses/', os.path.basename(line.strip()))))
fid.close()

# Get all the hashes so we know the size
base_names = [os.path.basename(item) for item in file_names]
genome_sketches = MH.import_multiple_from_single_hdf5(
    os.path.abspath('../data/Viruses/AllSketches.h5'), base_names)


# query the bloom filter
def count_jaccard(genome_sketch, file_name, ksize, p, max_h,
                  metagenome_kmers_total):
    name = os.path.basename(file_name)
    CMH = genome_sketch
    genome_kmers_len = CMH._true_num_kmers
    cmd = query_per_sequence_loc + " " + metagenome_bloom_filter + " " + \
     os.path.abspath(os.path.join('../data/Viruses/', name + ".Hash" + str(ksize) + "mers.fa"))
    int_est = int(subprocess.check_output(cmd, shell=True))
    int_est -= p * int_est
    containment_est = int_est / float(max_h)
    containment_est_jaccard = genome_kmers_len * containment_est / \
          (genome_kmers_len + metagenome_kmers_total - genome_kmers_len * containment_est)
    return containment_est_jaccard
def create_relative_errors(num_genomes, num_reads, python_loc, gen_sim_loc,
                           prime, p, ksize, hash_range):
    # Make a simulation
    simulation_file, abundances_file, selected_genomes = make_simulation(
        num_genomes, num_reads, python_loc, gen_sim_loc)

    # Get simulation k-mers, use canonical k-mers
    # Simultaneously, make the min hash sketch of the simulation
    simulation_kmers = set()
    simulation_MHS = MH.CountEstimator(n=max_h,
                                       max_prime=prime,
                                       ksize=ksize,
                                       save_kmers='y')
    for record in screed.open(simulation_file):
        seq = record.sequence
        for i in range(len(seq) - ksize + 1):
            kmer = seq[i:i + ksize]
            kmer_rev = khmer.reverse_complement(kmer)
            if kmer < kmer_rev:
                simulation_kmers.add(kmer)
                simulation_MHS.add(kmer)
            else:
                simulation_kmers.add(kmer_rev)
                simulation_MHS.add(kmer_rev)

    # Use them to populate a bloom filter
    simulation_bloom = BloomFilter(capacity=1.1 * len(simulation_kmers),
                                   error_rate=p)
    simulation_kmers_length = len(
        simulation_kmers
    )  # in practice, this would be computed when the bloom filter is created
    # or can use an estimate based on the bloom filter entries
    for kmer in simulation_kmers:
        simulation_bloom.add(kmer)

    # Use pre-computed data to load the kmers and the sketches
    base_names = [os.path.basename(item) for item in selected_genomes]
    # Load the sketches
    genome_sketches = MH.import_multiple_from_single_hdf5(
        os.path.abspath('../data/Genomes/AllSketches.h5'), base_names)
    # Get the true number of kmers
    genome_lengths = list()
    for i in range(len(genome_sketches)):
        genome_lengths.append(genome_sketches[i]._true_num_kmers)

    # Get *all* the kmers for computation of ground truth
    genome_kmers = list()
    for i in range(len(base_names)):
        name = base_names[i]
        kmers = set()
        fid = bz2.BZ2File(
            os.path.abspath(
                os.path.join('../data/Genomes/', name + ".kmers.bz2")), 'r')
        for line in fid.readlines():
            kmers.add(line.strip())
        fid.close()
        genome_kmers.append(kmers)

    # Calculate the true Jaccard index
    true_jaccards = list()
    for kmers in genome_kmers:
        true_jaccard = len(kmers.intersection(simulation_kmers)) / float(
            len(kmers.union(simulation_kmers)))
        true_jaccards.append(true_jaccard)

    # Calculate the min hash estimate of jaccard index
    MH_relative_errors = list()
    CMH_relative_errors = list()
    for h in hash_range:
        MH_jaccards = list()
        for MHS in genome_sketches:
            # Down sample each sketch to h
            MHS.down_sample(h)
            simulation_MHS.down_sample(h)
            MH_jaccard = MHS.jaccard(simulation_MHS)
            MH_jaccards.append(MH_jaccard)

        MH_jaccards_corrected = list()
        for MHS in genome_sketches:
            MHS_set = set(MHS._mins)
            sample_set = set(simulation_MHS._mins)
            MH_jaccard = len(
                set(list(MHS_set.union(sample_set))[0:h]).intersection(
                    MHS_set.intersection(sample_set))) / float(h)
            MH_jaccards_corrected.append(MH_jaccard)

        # Calculate the containment min hash estimate of the jaccard index
        CMH_jaccards = list()
        for i in range(len(genome_sketches)):
            genome_kmers_len = genome_lengths[
                i]  # pre-computed when creating the "training" data
            MHS = genome_sketches[i]
            # down sample each sketch to h
            MHS.down_sample(h)
            kmers = MHS._kmers  # use only the k-mers in the min hash sketch
            int_est = 0
            for kmer in kmers:
                if kmer in simulation_bloom:  # test if the k-mers are in the simulation bloom filter
                    int_est += 1
            int_est -= p * h  # adjust for false positive rate
            containment_est = int_est / float(h)
            containment_est_jaccard = genome_kmers_len * containment_est / \
             (genome_kmers_len + simulation_kmers_length - genome_kmers_len * containment_est)
            CMH_jaccards.append(containment_est_jaccard)

        # compute the average deviation from the truth (relative error)
        true_jaccards = np.array(true_jaccards)
        MH_jaccards = np.array(MH_jaccards)
        CMH_jaccards = np.array(CMH_jaccards)
        MH_mean = np.mean(np.abs(true_jaccards - MH_jaccards) / true_jaccards)
        CMH_mean = np.mean(
            np.abs(true_jaccards - CMH_jaccards) / true_jaccards)
        #print("Classic min hash mean relative error: %f" % MH_mean)
        #print("Containment min hash mean relative error: %f" % CMH_mean)
        MH_relative_errors.append(MH_mean)
        CMH_relative_errors.append(CMH_mean)

    # remove temp files
    os.remove(simulation_file)
    os.remove(abundances_file)
    # return the relative errors
    return MH_relative_errors, CMH_relative_errors, simulation_kmers_length, np.mean(
        genome_lengths)
def make_minhash(genome, max_h, prime, ksize):
	kmers = set()
	name = os.path.basename(genome)
	MHS = MH.CountEstimator(n=max_h, max_prime=prime, ksize=ksize, save_kmers='y')
	for record in screed.open(genome):
		seq = record.sequence
		for i in range(len(seq) - ksize + 1):
			kmer = seq[i:i+ksize]
			kmer_rev = khmer.reverse_complement(kmer)
			if kmer < kmer_rev:
				kmers.add(kmer)
				MHS.add(kmer)
			else:
				kmers.add(kmer_rev)
				MHS.add(kmer_rev)
	MHS._true_num_kmers = len(kmers)
	MHS.input_file_name = os.path.basename(genome)
	#genome_sketches.append(MHS)
	# export the kmers
	fid = bz2.BZ2File(os.path.abspath(os.path.join('../data/Genomes/', name + ".kmers.bz2")), 'w')
	#fid = bz2.open(os.path.abspath(os.path.join('../data/Genomes/', name + ".kmers.bz2")), 'wt')  # python3
	for kmer in kmers:
		fid.write("%s\n" % kmer)
	fid.close()
	return MHS


def make_minhash_star(arg):
	return make_minhash(*arg)

pool = Pool(processes=num_threads)
genome_sketches = pool.map(make_minhash_star, zip(file_names, repeat(max_h), repeat(prime), repeat(ksize)))
pool.close()
pool.join()
dummy = [len(item._kmers) for item in genome_sketches]  # to get it to actually do the work

# Export all the sketches
base_names = [os.path.basename(item) for item in file_names]
MH.export_multiple_to_single_hdf5(genome_sketches, os.path.abspath('../data/Genomes/AllSketches.h5'))
Example #21
import MinHash as MH
import screed


reload(MH)
fid = open('/home/dkoslicki/Dropbox/Repositories/MinHash/data/test_files.txt', 'r')
file_names = fid.readlines()
fid.close()
file_names = [name.strip() for name in file_names]
CE = MH.CountEstimator(n=500, ksize=11, input_file_name=file_names[0], save_kmers='y')
CE2 = MH.CountEstimator(n=500, ksize=11, input_file_name=file_names[1], save_kmers='y')
CE2.jaccard_count(CE)
CE2.jaccard(CE)
CE.jaccard(CE2)

CS = MH.CompositionSketch(n=5000, ksize=11, prefixsize=1, input_file_name=file_names[0])
CS2 = MH.CompositionSketch(n=5000, ksize=11, prefixsize=1, input_file_name=file_names[1])
CS.jaccard_count(CS2)
CS.jaccard(CS2)
CS2.jaccard(CS)


i = 0
for record in screed.open(file_names[0]):
    for kmer in MH.kmers(record.sequence,10):
        if kmer == 'TGGAATTCCA':
            i += 1

print(i)

CE = MH.CountEstimator(n=500, ksize=20, input_file_name=file_names[0], save_kmers='y')
#metagenome_kmers_total = 916485607  # via jellyfish stats on a count bloom filter (need to streamline this)
# Note that this number changed since I am restricting myself to the paired reads (no orphaned guys)
fid = open(os.path.abspath("../Paper/Data/MetagenomeTotalKmers.txt"), 'r')
metagenome_kmers_total = int(fid.readlines()[0].strip())
fid.close()

# Get virus names
file_names = list()
fid = open(os.path.abspath('../data/Viruses/FileNames.txt'), 'r')
for line in fid.readlines():
	file_names.append(os.path.abspath(os.path.join('../data/Viruses/', os.path.basename(line.strip()))))
fid.close()

# Get all the hashes so we know the size
base_names = [os.path.basename(item) for item in file_names]
genome_sketches = MH.import_multiple_from_single_hdf5(os.path.abspath('../data/Viruses/AllSketches.h5'), base_names)


# query the bloom filter
def count_jaccard(genome_sketch, file_name, ksize, p, max_h, metagenome_kmers_total):
	name = os.path.basename(file_name)
	CMH = genome_sketch
	genome_kmers_len = CMH._true_num_kmers
	cmd = query_per_sequence_loc + " " + metagenome_bloom_filter + " " + \
		os.path.abspath(os.path.join('../data/Viruses/', name + ".Hash" + str(ksize) + "mers.fa"))
	int_est = int(subprocess.check_output(cmd, shell=True))
	int_est -= p*int_est
	containment_est = int_est / float(max_h)
	containment_est_jaccard = genome_kmers_len * containment_est / \
							(genome_kmers_len + metagenome_kmers_total - genome_kmers_len * containment_est)
	return containment_est_jaccard
Example #23
print small_string
print "len(small_string)", len(small_string)
size_A = len(
    set([
        small_string[i:i + ksize]
        for i in range(len(small_string) - ksize + 1)
    ])
)  # size of smaller set, used to convert containment index to Jaccard index
print "size_A", size_A
#large_string = ''.join(np.random.choice(['A', 'C', 'T', 'G'], len_large_string)) + small_string  # large string to form the larger set B
large_string = "CCGCATCGACAAGCAGGATCTGGATCTATTTCTCTCTTAAATCCATGTAAGGGACGGCAGAAACCTGCTCCTTCTACTTGCTACATCTTCTAGGGTAGAACGAGACCAGAGCCGTTACTGCGATATGAAATCAGTACCGAACGTTGGAACTTATTCAGTTTTAACCCGGTCCCCGTCGCCCAAATCGGGCTATATCATACCCCCGGGCCAAGTGTACAAGTGCATCGATTAAATGCACTAACGGCGAAAGTAAATGATGGACTTTCCAAGCCTGAGGTGGTAAACGCACTTGAATAGAGTCGACAAATTATCGGCTGACGATGCCTTGTAGACCAGCTTTAACACATGACCAGTATAGACGAGGCGGAACTAAGCAATCCCAAGTTTTCGTGCGAGCTGAAGGACCCGGCTCCACGAGATAGAGCTTGTGTTAACAAGAGGCCTCCGGCTGGAAAGATTGGTGGAAACGGCTGCTGTCACGTTTGCATCTTACCGGATGTGCCCCAATGAGGAGTTGATGAACTGGCTGTGACGCAATGGCGAAGAGGAAACGTCTGTATGGCGGATGTAACGTTTTTGCAACACTCCTCCACAACTGCTCCTTTAAGATGACCATCACGAAAATGAAGCTCGTTCGAAATCTTCAAAGATCCGGGGTATAATTGCGCTTCCGGGAGAAGGCCATATGCGATAGCGGTAAGTTTCCACAGCGTATCCAAAAGCGGAGCTTTACGATCTCCCCAGTAAACTGGCTTGTGTCAAGCGGCGAACCCGAATTTCGACGAACCTAGATATTCTCTGGCGACTAACTACTATGCGGATGGGCCTATTCGGGGGATTCAGCCCGCGATACTAGAGCGTAATTAGCCTCGCAAGAATCTAGGTAGCCCCAAAATAGCTTGCTAAAGCGCTAGGGTGCACTGCAGGCAAAATCGAGGTGACTGTACCCCGAGCCATGCATATAACTGGGGGGTACCCTTCCAATAATTGTTATCATACCATCTGCATAGACATATTTAACGGCTCAGTAAATTCGTCGCCATGCGACCTCCAGCATGATCGGTGGCACTCCGTTGTGCGCGGACTGTGTAAACCGCACG" + small_string
#print large_string
print "len(large_string):", len(large_string)

# Populate min hash sketch with smaller set
A_MH = MH.CountEstimator(n=h, max_prime=prime, ksize=ksize, save_kmers='y')
A_MH.add_sequence(small_string)  # create the min hash of the small string

# Create the bloom filter and populate with the larger set
B_filt = BloomFilter(capacity=1.15 * len_large_string,
                     error_rate=p)  # Initialize the bloom filter
size_B_est = 0  # used to count the number of k-mers in B, could do much more intelligently (like with HyperLogLog)
for i in range(len(large_string) - ksize + 1):
    kmer = large_string[i:i + ksize]
    if kmer not in B_filt:
        size_B_est += 1
        B_filt.add(kmer)

print "size_B_est :", size_B_est

# Use the k-mers in the sketch of A and test if they are in the bloom filter of B
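# A sketch of the step the comment above announces, which is cut off in this excerpt;
# it follows the same containment estimate as create_relative_errors (h, p, size_A,
# size_B_est, A_MH and B_filt are the objects defined earlier in this snippet).
int_est = 0
for kmer in A_MH._kmers:
    if kmer in B_filt:           # membership test against the bloom filter of B
        int_est += 1
int_est -= p * h                 # correct for the expected bloom filter false positives
containment_est = int_est / float(h)
jaccard_est = size_A * containment_est / \
    (size_A + size_B_est - size_A * containment_est)
print "containment estimate:", containment_est
print "Jaccard estimate    :", jaccard_est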
Example #24
import sys, os
sys.path.insert(
    0,
    os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +
    "/CMash/CMash")
import MinHash as MH
import itertools
training_database = sys.argv[1]  # first input is the training file name
dump_file = sys.argv[2]  # second input is the desired output dump file
CEs = MH.import_multiple_from_single_hdf5(training_database)
fid = open(dump_file, 'w')
i = 0
for CE in CEs:
    for kmer in CE._kmers:
        fid.write('>seq%d\n' % i)
        fid.write('%s\n' % kmer)
        i += 1

fid.close()