def stream_cmash_for_ji(self, other):
    if self.ksize != other.ksize:
        raise Exception("different k-mer sizes - cannot compare")
    if self.p != other.p:
        raise Exception("different primes - cannot compare")
    A_kmers = set([x[0:self.ksize] for x in self._kmers])  # unnecessary; just a safety check
    A_matches = dict()
    for kmer in A_kmers:
        if self.rev_comp:
            kmer = min(kmer, khmer.reverse_complement(kmer))
        A_matches[kmer] = 0  # for counting purposes
    # stream all k-mers of the other object for the containment index (can't compute the JI directly)
    for record in screed.open(other.input_file_name):
        seq = record.sequence.upper()
        seq_split_onlyACTG = re.compile('[^ACTG]').split(seq)
        for sub_seq in seq_split_onlyACTG:
            for i in range(len(sub_seq) - self.ksize + 1):  # enumerate all k-mers
                kmer = sub_seq[i:i + self.ksize]
                if self.rev_comp:
                    kmer = min(kmer, khmer.reverse_complement(kmer))
                if kmer in A_matches:
                    A_matches[kmer] = 1
    # return results
    C_est = np.sum(list(A_matches.values())) / len(A_kmers)
    J_est = containment_to_jaccard(C_est, self, other)
    print(C_est)
    print(J_est)
    return J_est
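# `containment_to_jaccard` is not defined in these snippets. A minimal sketch of what it is
# assumed to compute, using the standard identity
#   J(A, B) = C(A, B) * |A| / (|A| + |B| - C(A, B) * |A|),
# where |A| and |B| are the k-mer cardinalities of the two objects (the `cardinality`
# attribute is an assumption, borrowed from the brute_force_truncation snippet below):
def containment_to_jaccard_sketch(c_est, A, B):
    size_A = A.cardinality
    size_B = B.cardinality
    return c_est * size_A / (size_A + size_B - c_est * size_A)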
def add(self, kmer, weight, rev_comp):
    _mins = self._mins
    _counts = self._counts
    _kmers = self._kmers
    # use rev_comp if needed
    if rev_comp:
        h1 = khmer.hash_no_rc_murmur3(kmer)
        h2 = khmer.hash_no_rc_murmur3(khmer.reverse_complement(kmer))
        h = min(h1, h2)
        if h == h2:
            kmer = khmer.reverse_complement(kmer)
    else:
        h = khmer.hash_no_rc_murmur3(kmer)
    # take the remainder modulo the max_prime we use
    h = h % self.p
    # early stop if n sketch values have already been found
    if h >= _mins[-1]:
        return
    # insert kmer into the sketch
    i = bisect.bisect_left(_mins, h)  # find index to insert h
    if _mins[i] == h:  # already in sketch
        _counts[i] += weight
    else:  # h not in sketch, insert
        _mins.insert(i, h)
        _counts.insert(i, weight)
        _kmers.insert(i, kmer)
        _mins.pop()
        _counts.pop()
        _kmers.pop()
    return
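# A self-contained toy run of the bottom-k update that these add() methods implement:
# the sketch keeps the n smallest distinct hash values in sorted order, padded with the
# prime p, and every insertion preserves both invariants.
import bisect

p = 97  # stand-in for the max prime used as padding
mins = [5, 11, p, p]  # a bottom-4 sketch
for h in [20, 3, 11]:
    if h >= mins[-1]:  # early stop: h can't enter the sketch
        continue
    i = bisect.bisect_left(mins, h)
    if mins[i] != h:  # only insert new hash values
        mins.insert(i, h)
        mins.pop()
print(mins)  # [3, 5, 11, 20]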
def canonical_kmer(kmer):
    """
    Transform an input k-mer into its canonical form
    :param kmer: an input k-mer string
    :return: the canonical form of the k-mer
    """
    h1 = khmer.hash_no_rc_murmur3(kmer)
    h2 = khmer.hash_no_rc_murmur3(khmer.reverse_complement(kmer))
    if h1 > h2:
        kmer = khmer.reverse_complement(kmer)
    return kmer
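# A small usage sketch for canonical_kmer(). Note that this canonical form is defined by
# murmur3 hash order, so it need not agree with the lexicographic
# min(kmer, reverse_complement(kmer)) convention used in other snippets here.
import khmer

kmer = 'ACGGT'
canon = canonical_kmer(kmer)
# a k-mer and its reverse complement always map to the same canonical form
assert canon == canonical_kmer(khmer.reverse_complement(kmer))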
def test_reverse_complement():
    s = 'AATTCCGG'
    assert khmer.reverse_complement(s) == 'CCGGAATT'
    s = 'A'
    assert khmer.reverse_complement(s) == 'T'
    s = 'T'
    assert khmer.reverse_complement(s) == 'A'
    s = 'C'
    assert khmer.reverse_complement(s) == 'G'
    s = 'G'
    assert khmer.reverse_complement(s) == 'C'
def add(self, kmer, update_full=False):
    _mins = self._mins
    _kmers = self._kmers
    # use rev_comp if needed
    if self.rev_comp:
        kmer = min(kmer, khmer.reverse_complement(kmer))
    h = khmer.hash_no_rc_murmur3(kmer)
    # insert into the full k-mer set
    if update_full:
        _full = self._all_kmer
        if kmer not in _full:
            _full[kmer] = 1
        else:
            _full[kmer] += 1
    # insert into the MH sketch: take the remainder modulo the max_prime we use
    h = h % self.p
    # early stop if n sketch values have already been found
    if h >= _mins[-1]:
        return
    # insert kmer into the sketch
    i = bisect.bisect_left(_mins, h)  # find index to insert h
    if _mins[i] == h:  # already in sketch
        return
    else:  # h not in sketch, insert
        _mins.insert(i, h)
        _kmers.insert(i, kmer)
        _mins.pop()
        _kmers.pop()
    return
def brute_force_truncation(self, new_ksize):
    if not isinstance(new_ksize, int):
        raise Exception("Input number is not an integer")
    if new_ksize > self.ksize:
        raise Exception("New size must be smaller than %d." % self.ksize)
    elif new_ksize == self.ksize:
        return
    elif new_ksize < self.ksize:
        # data to be updated after the truncation:
        self.ksize = new_ksize
        self.cardinality = estimate_genome_size(self.input_file_name, new_ksize)
        # remove unused cells; otherwise an empty cell (though very rare) has hash value 0
        while self._mins[-1] == self.p:
            self._mins.pop()
            self._kmers.pop()
        new_kmers = list(set([x[0:new_ksize] for x in self._kmers]))
        sketch_size = len(new_kmers)
        self._mins = [self.p] * sketch_size
        self._kmers = [''] * sketch_size
        # update (for the MH sketch only)
        for i in range(sketch_size):
            self.add(new_kmers[i])
        # clean trailing empty cells in the sketch
        while self._mins[-1] == self.p:
            self._mins.pop()
            self._kmers.pop()
        # conditional: truncate the full k-mer set to the current ksize
        if self.full_kmer:
            old_kmers = [x[0:new_ksize] for x in self._all_kmer]
            if self.rev_comp:
                old_kmers = [min(x, khmer.reverse_complement(x)) for x in old_kmers]
            self._truncated_all_kmer = list(set(old_kmers))
        return
def define_canonical_kmers(cg, nkmers):
    """
    Define canonical k-mers, i.e. exclude reverse-complement duplicates
    (palindromic k-mers are kept once)

    Parameters
    ----------
    cg : khmer.Countgraph
        a k-mer countgraph
    nkmers : int
        number of all possible k-mers

    Returns
    -------
    set
        a set of canonical k-mer hashes
    """
    # TODO: consider a sorting step to guarantee the order of canonical kmers/kmer-hashes
    canonical_kmers = set()
    for i in range(nkmers):
        kmer = cg.reverse_hash(i)
        kmer_rev_comp = khmer.reverse_complement(kmer)
        # store only the lexicographically smaller k-mer, or the palindromic k-mer
        if kmer <= kmer_rev_comp:
            canonical_kmers.add(i)
    return canonical_kmers
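# A hedged usage sketch for define_canonical_kmers(): enumerate all 4**k hash bins of a
# small khmer.Countgraph and keep the canonical ones. For odd k there are no palindromes,
# so exactly half of all k-mers (here 512 of 1024) should be canonical.
import khmer

k = 5
cg = khmer.Countgraph(k, 4 ** k, 1)
canonical = define_canonical_kmers(cg, 4 ** k)
print(len(canonical))  # expected: 512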
def make_minhash(genome, max_h, prime, ksize):
    kmers = set()
    name = os.path.basename(genome)
    MHS = MH.CountEstimator(n=max_h, max_prime=prime, ksize=ksize, save_kmers='y')
    for record in screed.open(genome):
        seq = record.sequence
        for i in range(len(seq) - ksize + 1):
            kmer = seq[i:i + ksize]
            kmer_rev = khmer.reverse_complement(kmer)
            if kmer < kmer_rev:
                kmers.add(kmer)
                MHS.add(kmer)
            else:
                kmers.add(kmer_rev)
                MHS.add(kmer_rev)
    MHS._true_num_kmers = len(kmers)
    MHS.input_file_name = os.path.basename(genome)
    # export the k-mers
    fid = bz2.BZ2File(os.path.abspath(os.path.join('../data/Genomes/', name + ".kmers.bz2")), 'w')
    # fid = bz2.open(os.path.abspath(os.path.join('../data/Genomes/', name + ".kmers.bz2")), 'wt')  # python3
    for kmer in kmers:
        fid.write("%s\n" % kmer)
    fid.close()
    return MHS
def return_matches(self, input_kmer: str, k_size_loc: int) -> tuple:
    """
    Get all the matches in the TST with the k-mer as prefix

    :param input_kmer: an input k-mer
    :type input_kmer: str
    :param k_size_loc: where in self.k_range this k-mer (via its length) belongs
    :type k_size_loc: int
    :return: a tuple: the first element is a list of tuples (all the matches in the TST),
        the second is a Boolean indicating whether a match was seen
    :rtype: tuple
    """
    match_info = set()
    to_return = []
    saw_match = False
    tree = self.tree
    # look for matches to both the k-mer and its reverse complement in the TST, as we can't assume
    # directionality of reads (and the training database is constructed without reverse complements)
    for kmer in [input_kmer, khmer.reverse_complement(input_kmer)]:
        prefix_matches = tree.keys(kmer)  # get all the k-mers whose prefix matches
        # get the location of the found k-mers in the counters
        for item in prefix_matches:
            split_string = item.split('x')  # first is the k-mer, second is the hash location, third is which k-mer
            hash_loc = int(split_string[1])
            kmer_loc = int(split_string[2])
            match_info.add((hash_loc, k_size_loc, kmer_loc))
        saw_match = False
        if match_info:
            saw_match = True
            for tup in match_info:
                to_return.append(tup)
        if saw_match:
            # only need to see a match to the original k-mer or the reverse complement;
            # don't return both, otherwise you over-count
            break
    return to_return, saw_match
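# A toy illustration of the prefix lookup that return_matches() relies on, using the
# kmer + 'x' + hash_index + 'x' + kmer_index key format from the TST-building snippets below.
import marisa_trie as mt

tree = mt.Trie([u'ACGTACGTx0x3', u'ACGTTTTTx1x7', u'TTTTTTTTx2x0'])
# keys(prefix) returns every stored key beginning with that prefix, so a truncated
# k-mer finds all database k-mers that extend it
print(tree.keys(u'ACGT'))  # ['ACGTACGTx0x3', 'ACGTTTTTx1x7'] (order may vary)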
def make_minhash(genome, max_h, prime, ksize):
    kmers = set()
    name = os.path.basename(genome)
    MHS = MH.CountEstimator(n=max_h, max_prime=prime, ksize=ksize, save_kmers='y')
    for record in screed.open(genome):
        seq = record.sequence
        for i in range(len(seq) - ksize + 1):
            kmer = seq[i:i + ksize]
            kmer_rev = khmer.reverse_complement(kmer)
            if kmer < kmer_rev:
                kmers.add(kmer)
                MHS.add(kmer)
            else:
                kmers.add(kmer_rev)
                MHS.add(kmer_rev)
    MHS._true_num_kmers = len(kmers)
    MHS.input_file_name = os.path.basename(genome)
    # export the sketch k-mers
    fid = open(os.path.abspath(os.path.join('../data/Viruses/', name + ".Hash21mers.fa")), 'w')
    for kmer in MHS._kmers:
        fid.write(">\n%s\n" % kmer)
    fid.close()
    return MHS
def add(self, kmer, rev_comp=False):
    """
    Add a k-mer into the sketch, keeping the sketch sorted, and update the counts accordingly
    """
    _mins = self._mins
    _counts = self._counts
    _kmers = self._kmers
    if rev_comp:
        h1 = khmer.hash_murmur3(kmer)
        h2 = khmer.hash_murmur3(khmer.reverse_complement(kmer))
        #h1 = hash(kmer)
        #h2 = hash(khmer.reverse_complement(kmer))
        h = min(h1, h2)
        if h == h2:
            kmer = khmer.reverse_complement(kmer)
    else:
        h = khmer.hash_murmur3(kmer)
        #h = hash(kmer)

    h = h % self.p
    if self.hash_list:  # if we only want to include hashes that occur in hash_list
        if h not in self.hash_list:  # if the hash isn't in hash_list, bail out
            return

    if h >= _mins[-1]:
        return

    i = bisect.bisect_left(_mins, h)  # find index to insert h
    if _mins[i] == h:  # if h is already in mins, increment its count
        _counts[i] += 1
        return
    else:  # otherwise insert h, initialize its count to 1, and insert the k-mer if necessary
        _mins.insert(i, h)
        _mins.pop()
        _counts.insert(i, 1)
        _counts.pop()
        if _kmers:
            _kmers.insert(i, np.string_(kmer))
            _kmers.pop()
        return

    assert 0, "should never reach this"
def test_Counters_return_matches():
    C = Create(training_database_file=temp_database_file, bloom_filter_file="", TST_file=temp_TST_file, k_range=k_range)
    C.import_TST()
    C.create_BF_prefilter()
    counters = Counters(tree=C.tree, k_range=k_range, all_kmers_bf=C.all_kmers_bf)
    # test return_matches on known k-mers:
    # each sketch k-mer (or its reverse complement) should match in the TST
    # TODO: big note here: the proper way to check this is to take the reverse complement, THEN truncate
    # (which effectively takes the suffix, as the suffix of a rev-comp is the prefix of the original).
    # But this calls into question how create_BF_prefilter is working, since it truncates, THEN takes the rev-comp;
    # however, this is the only way I could get all these tests to pass successfully
    for CE in CEs:
        for k_size in k_range:
            for kmer in CE._kmers:
                kmer = kmer[0:k_size]
                if kmer:
                    k_size_loc = k_range.index(len(kmer))
                    to_return, saw_match = counters.return_matches(input_kmer=kmer, k_size_loc=k_size_loc)
                    assert saw_match
                    for to_return_elem in to_return:
                        truncated_sketches = list(map(lambda x: x[0:k_size], CEs[to_return_elem[0]]._kmers))
                        # add the reverse complements as well, since return_matches matches rev-comps too
                        truncated_sketches_revcomp = list(map(lambda x: khmer.reverse_complement(x)[0:k_size], CEs[to_return_elem[0]]._kmers))
                        # make sure the k-mer really is in the sketch indicated by to_return;
                        # it could be in the truncated sketch or the rev-comp one
                        assert (kmer in truncated_sketches) or (kmer in truncated_sketches_revcomp)
                        # make sure the k_size_loc is correct
                        assert to_return_elem[1] == k_size_loc
                        # make sure it returned the correct location in the sketch;
                        # note that at smaller k-mer sizes it may appear in multiple locations,
                        # so just make sure it appears somewhere in the list
                        indices = [i for i, x in enumerate(truncated_sketches) if x == kmer]
                        indices_revcomp = [i for i, x in enumerate(truncated_sketches_revcomp) if x == kmer]
                        assert (to_return_elem[2] in indices) or (to_return_elem[2] in indices_revcomp)
def get_all_kmers(input_file, temp_k, use_rev_comp=True):
    temp_dict = dict()
    for record in screed.open(input_file):
        for kmer in kmers(record.sequence, temp_k):
            if use_rev_comp:
                kmer = min(kmer, khmer.reverse_complement(kmer))
            if kmer in temp_dict:
                temp_dict[kmer] += 1
            else:
                temp_dict[kmer] = 1
    return temp_dict
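# get_all_kmers() relies on a kmers() helper that isn't shown in these snippets.
# A minimal sketch of the assumed behavior: yield every k-length substring of a sequence.
def kmers(seq, ksize):
    for i in range(len(seq) - ksize + 1):
        yield seq[i:i + ksize]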
def test_Create_BF_prefilter():
    C = Create(training_database_file=temp_database_file, bloom_filter_file="", TST_file=temp_TST_file, k_range=k_range)
    C.import_TST()
    C.create_BF_prefilter()
    # make sure each TST k-mer has been inserted into the bloom filter
    for kmer_with_info in C.tree.keys():
        kmer = kmer_with_info.split('x')[0]
        assert kmer in C.all_kmers_bf
    # make sure all the reverse complements are in there too
    for kmer_with_info in C.tree.keys():
        kmer = kmer_with_info.split('x')[0]
        kmer = khmer.reverse_complement(kmer)
        assert kmer in C.all_kmers_bf
    # go through each individual sequence in the sketches and make sure it and its rev-comp are in the BF
    for CE in CEs:
        for kmer in CE._kmers:
            if kmer:
                for k_size in k_range:
                    assert kmer[0:k_size] in C.all_kmers_bf
                    assert khmer.reverse_complement(kmer[0:k_size]) in C.all_kmers_bf
    # check that the BF is case insensitive
    for CE in CEs:
        for kmer in CE._kmers:
            if kmer:
                for k_size in k_range:
                    trunc_kmer = kmer[0:k_size].lower()
                    assert trunc_kmer in C.all_kmers_bf
                    # khmer doesn't properly handle rev-comps of lower-case characters;
                    # see https://github.com/dib-lab/khmer/issues/1904
                    assert khmer.reverse_complement(trunc_kmer.upper()).lower() in C.all_kmers_bf
def process_seq(self, seq: str) -> list:
    """
    Takes an input sequence, breaks it into its k-mers (one set for every size in self.k_range),
    and after some filtering and checking, sends them to return_matches to query the TST

    :param seq: an input DNA sequence
    :type seq: string
    :return: a list of keys indicating all the TST hits for all the k-mers in seq
    :rtype: list
    """
    k_range = self.k_range
    seen_kmers = self.seen_kmers
    all_kmers_bf = self.all_kmers_bf
    # start with the smallest k-mer size; if we see a match, keep looking at longer k-mer sizes, otherwise move on
    small_k_size = k_range[0]
    to_return = []
    seq = seq.upper()
    # TODO: could, for efficiency, also remove non-ACTG characters, but those won't match anyway since they aren't in the TST;
    # it might not actually be more efficient to search for non-ACTG too
    for i in range(len(seq) - small_k_size + 1):  # look at all k-mers
        kmer = seq[i:i + small_k_size]
        possible_match = False
        if kmer not in seen_kmers:  # if we haven't processed this k-mer yet
            if kmer in all_kmers_bf:  # if the pre-filter says it might be in the TST
                match_list, saw_match = self.return_matches(kmer, 0)
                if saw_match:
                    seen_kmers.add(kmer)
                    seen_kmers.add(khmer.reverse_complement(kmer))
                    to_return.extend(match_list)
                    possible_match = True
                # TODO: note: since it'd only be for a single k-mer size, I could keep a set of
                # *all* small k-mers I've tried and use it as another pre-filter
        else:
            # FIXME: bug introduced here in cf64b7aace5eadf738b920109d6419c9d930a1dc, make sure it didn't happen again
            possible_match = True
        # start looking at the other k-sizes, taking care not to overhang len(seq)
        if possible_match:
            for other_k_size in [x for x in k_range[1:] if i + x <= len(seq)]:
                kmer = seq[i:i + other_k_size]
                if kmer in all_kmers_bf:
                    k_size_loc = k_range.index(other_k_size)
                    match_list, saw_match = self.return_matches(kmer, k_size_loc)
                    if saw_match:
                        to_return.extend(match_list)
                else:
                    break  # if you didn't see a match at a smaller k-length, you won't at a larger one
    return to_return
def create_BF_prefilter(self, result_file=None) -> None:
    """
    Imports or creates the pre-filter Bloom filter

    :param result_file: (optional) if you'd like to export the bloom filter, populate this with a file name
    :type result_file: str
    """
    tree = self.tree
    k_range = self.k_range
    if not self.bloom_filter_file:  # create one
        try:
            # get all the k-mers in the TST and put them in a bloom filter
            # all_kmers_bf = WritingBloomFilter(len(sketches) * len(k_range) * num_hashes * 20, 0.01)
            # the fudge factor of 5 will make the BF larger, but also slightly faster
            if result_file:  # save it to the file
                self.all_kmers_bf = WritingBloomFilter(len(tree.keys()) * len(k_range) * 5, 0.01, ignore_case=True, filename=result_file)
            else:  # keep it in memory
                self.all_kmers_bf = WritingBloomFilter(len(tree.keys()) * len(k_range) * 5, 0.01, ignore_case=True)
            for kmer_info in tree.keys():
                kmer = kmer_info.split('x')[0]  # remove the location information and keep just the k-mer
                for ksize in k_range:
                    self.all_kmers_bf.add(kmer[0:ksize])
                    self.all_kmers_bf.add(khmer.reverse_complement(kmer[0:ksize]))
        except IOError:
            print("No such file or directory/error opening file: %s" % self.bloom_filter_file)
            sys.exit(1)
    else:  # otherwise read it in
        try:
            self.all_kmers_bf = ReadingBloomFilter(self.bloom_filter_file)
        except IOError:
            print("No such file or directory/error opening file: %s" % self.bloom_filter_file)
            sys.exit(1)
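# A minimal usage sketch of the hydra bloom filter as used above, assuming the same
# constructor arguments shown in create_BF_prefilter().
from hydra import WritingBloomFilter

bf = WritingBloomFilter(1000, 0.01, ignore_case=True)
bf.add('ACGT')
assert 'ACGT' in bf
assert 'acgt' in bf  # ignore_case=True makes membership tests case insensitive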
def calculate_bias_factor(self, other):
    """
    Calculate the bias factor from 2 JI_CE objects, which need to be truncated first.
    Uses: the 2 truncated full k-mer sets, the 2 full k-mer sets, maxk, and the current ksize.
    """
    if self.ksize != other.ksize:
        raise Exception("different k-mer sizes - cannot compare")
    if self.p != other.p:
        raise Exception("different primes - cannot compare")
    if self.maxk != other.maxk:
        raise Exception("different maxk - cannot compare")
    if not self.full_kmer or not other.full_kmer:
        raise Exception("full kmer not enabled for the CE object")
    # use dicts to count prefixes
    ksmall_intersect = dict()
    for kmer in set(self._truncated_all_kmer).intersection(other._truncated_all_kmer):
        ksmall_intersect[kmer] = 0  # for counting purposes
    ksmall_union = dict()
    for kmer in set(self._truncated_all_kmer).union(other._truncated_all_kmer):
        ksmall_union[kmer] = 0
    # count prefix matches
    for kmer in set(self._all_kmer.keys()).union(other._all_kmer.keys()):
        kmer = kmer[0:self.ksize]  # prefix
        if self.rev_comp:
            kmer = min(kmer, khmer.reverse_complement(kmer))
        if kmer in ksmall_intersect:
            ksmall_intersect[kmer] += 1
            ksmall_union[kmer] += 1
        elif kmer in ksmall_union:
            ksmall_union[kmer] += 1
    # bias factor
    if len(ksmall_intersect) == 0:
        numerator = 0
    else:
        numerator = sum(ksmall_intersect.values()) * 1.0 / len(ksmall_intersect)
    denominator = sum(ksmall_union.values()) * 1.0 / len(ksmall_union)
    bias_factor = numerator / denominator
    print(numerator)
    print(denominator)
    return bias_factor
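# For reference, a standalone sketch of the same bias-factor computation on plain dicts of
# maxk-mers (rev_comp handling omitted). It makes the formula explicit: the bias factor is
# the mean number of maxk-mers per truncated k-mer in the intersection, divided by the same
# mean taken over the union.
def bias_factor_sketch(A_full, B_full, ksize):
    A_trunc = set(k[0:ksize] for k in A_full)
    B_trunc = set(k[0:ksize] for k in B_full)
    inter = {k: 0 for k in A_trunc & B_trunc}
    union = {k: 0 for k in A_trunc | B_trunc}
    for kmer in set(A_full) | set(B_full):
        prefix = kmer[0:ksize]
        if prefix in inter:
            inter[prefix] += 1
        union[prefix] += 1  # every prefix is in the union by construction
    numerator = sum(inter.values()) / len(inter) if inter else 0
    denominator = sum(union.values()) / len(union)
    return numerator / denominator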
def yield_trie_items_to_insert_no_import(file_name):
    fid = h5py.File(file_name, 'r')
    if "CountEstimators" not in fid:
        fid.close()
        raise Exception("This function imports a single HDF5 file containing multiple sketches."
                        " It appears you've used it on a file containing a single sketch."
                        " Try using import_single_hdf5 instead")
    grp = fid["CountEstimators"]
    iterator = grp.keys()
    iterator = sorted(iterator, key=os.path.basename)  # sort so that we know the order of the input
    for (i, key) in enumerate(iterator):
        if key not in grp:
            fid.close()
            raise Exception("The key " + key + " is not in " + file_name)
        subgrp = grp[key]
        if "kmers" not in subgrp:
            raise Exception("Kmers were not saved when creating the count estimators. Please make sure save_kmers='y' "
                            "when creating the count estimators.")
        else:
            temp_kmers = subgrp["kmers"][...]
            kmers = [kmer.decode('utf-8') for kmer in temp_kmers]
            for (kmer_index, kmer) in enumerate(kmers):
                # add both the original k-mer and its reverse complement,
                # as the MinHashes were created without reverse complements
                if kmer:
                    # format here is kmer + 'x' + hash_index + 'x' + kmer_index
                    yield kmer + 'x' + str(i) + 'x' + str(kmer_index)
                    # rev-comp k-mer
                    kmer_rc = khmer.reverse_complement(kmer)
                    yield kmer_rc + 'x' + str(i) + 'x' + str(kmer_index)
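# A usage sketch for the generator above: feed it straight into a marisa_trie and save it
# (the file names here are hypothetical).
import marisa_trie as mt

tree = mt.Trie(yield_trie_items_to_insert_no_import('training_sketches.h5'))
tree.save('training_database.tst')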
def make_TST(self):
    genome_sketches = self.genome_sketches
    to_insert = set()
    # add both the original k-mer and its reverse complement, as the MinHashes were created without reverse complements
    for i in range(len(genome_sketches)):
        for kmer_index in range(len(genome_sketches[i]._kmers)):
            # normal k-mer; only insert it if it's actually non-empty
            kmer = genome_sketches[i]._kmers[kmer_index]
            if kmer:
                # format here is kmer + 'x' + hash_index + 'x' + kmer_index
                to_insert.add(kmer + 'x' + str(i) + 'x' + str(kmer_index))
                # rev-comp k-mer
                kmer = khmer.reverse_complement(genome_sketches[i]._kmers[kmer_index])
                to_insert.add(kmer + 'x' + str(i) + 'x' + str(kmer_index))
    # export the TST
    tree = mt.Trie(to_insert)
    tree.save(self.TST_export_file_name)
def return_matches(self, input_kmer, k_size_loc):
    """Get all the matches in the trie with the k-mer as prefix"""
    tree = self.tree  # fix: `tree` was referenced without being defined
    match_info = set()
    to_return = []
    for kmer in [input_kmer, khmer.reverse_complement(input_kmer)]:
        prefix_matches = tree.keys(kmer)  # get all the k-mers whose prefix matches
        # get the location of the found k-mers in the counters
        for item in prefix_matches:
            split_string = item.split('x')  # first is the k-mer, second is the hash location, third is which k-mer
            hash_loc = int(split_string[1])
            kmer_loc = int(split_string[2])
            match_info.add((hash_loc, k_size_loc, kmer_loc))
    saw_match = False
    if match_info:
        saw_match = True
        for tup in match_info:
            to_return.append(tup)
    return to_return, saw_match
                to_insert.add(kmer + 'x' + str(i) + 'x' + str(kmer_index))  # format here is kmer + 'x' + hash_index + 'x' + kmer_index
    tree = mt.Trie(to_insert)
    tree.save(streaming_database_file)
else:
    tree = mt.Trie()
    tree.load(streaming_database_file)

# all the k-mers of interest in a set (as a pre-filter)
if not hydra_file:  # create one
    try:
        all_kmers_bf = WritingBloomFilter(len(sketches) * len(k_range) * num_hashes * 2, 0.01)
        for sketch in sketches:
            for kmer in sketch._kmers:
                for ksize in k_range:
                    all_kmers_bf.add(kmer[0:ksize])  # put all the k-mers and the appropriate truncations in
                    all_kmers_bf.add(khmer.reverse_complement(kmer[0:ksize]))  # also add the reverse complement
    except IOError:
        print("No such file or directory/error opening file: %s" % hydra_file)
        sys.exit(1)
else:  # otherwise read it in
    try:
        all_kmers_bf = ReadingBloomFilter(hydra_file)
    except IOError:
        print("No such file or directory/error opening file: %s" % hydra_file)
        sys.exit(1)

if verbose:
    print("Finished reading in/creating ternary search tree")
    t1 = timeit.default_timer()
    print("Time: %f" % (t1 - t0))

# seen k-mers (set of k-mers that already hit the trie, so we don't need to check them again)
seen_kmers = set()
def test_reverse_complement_exception():
    # deal with DNA characters, ignore the rest: 'F' passes through unchanged, 'G' -> 'C'
    assert khmer.reverse_complement('FGF') == 'FCF'
def create_relative_errors(num_genomes, num_reads, python_loc, gen_sim_loc, prime, p, ksize, hash_range):
    # make a simulation
    simulation_file, abundances_file, selected_genomes = make_simulation(num_genomes, num_reads, python_loc, gen_sim_loc)

    # get the simulation k-mers (using canonical k-mers) and simultaneously make the MinHash sketch of the simulation
    simulation_kmers = set()
    simulation_MHS = MH.CountEstimator(n=max_h, max_prime=prime, ksize=ksize, save_kmers='y')
    for record in screed.open(simulation_file):
        seq = record.sequence
        for i in range(len(seq) - ksize + 1):
            kmer = seq[i:i + ksize]
            kmer_rev = khmer.reverse_complement(kmer)
            if kmer < kmer_rev:
                simulation_kmers.add(kmer)
                simulation_MHS.add(kmer)
            else:
                simulation_kmers.add(kmer_rev)
                simulation_MHS.add(kmer_rev)

    # use them to populate a bloom filter
    simulation_bloom = BloomFilter(capacity=1.1 * len(simulation_kmers), error_rate=p)
    # in practice, this would be computed when the bloom filter is created,
    # or can be estimated from the bloom filter entries
    simulation_kmers_length = len(simulation_kmers)
    for kmer in simulation_kmers:
        simulation_bloom.add(kmer)

    # use pre-computed data to load the k-mers and the sketches
    base_names = [os.path.basename(item) for item in selected_genomes]
    # load the sketches
    genome_sketches = MH.import_multiple_from_single_hdf5(os.path.abspath('../data/Genomes/AllSketches.h5'), base_names)
    # get the true number of k-mers
    genome_lengths = list()
    for i in range(len(genome_sketches)):
        genome_lengths.append(genome_sketches[i]._true_num_kmers)
    # get *all* the k-mers for computation of the ground truth
    genome_kmers = list()
    for i in range(len(base_names)):
        name = base_names[i]
        kmers = set()
        fid = bz2.BZ2File(os.path.abspath(os.path.join('../data/Genomes/', name + ".kmers.bz2")), 'r')
        for line in fid.readlines():
            kmers.add(line.strip())
        fid.close()
        genome_kmers.append(kmers)

    # calculate the true Jaccard index
    true_jaccards = list()
    for kmers in genome_kmers:
        true_jaccard = len(kmers.intersection(simulation_kmers)) / float(len(kmers.union(simulation_kmers)))
        true_jaccards.append(true_jaccard)

    # calculate the MinHash estimate of the Jaccard index
    MH_relative_errors = list()
    CMH_relative_errors = list()
    for h in hash_range:
        MH_jaccards = list()
        for MHS in genome_sketches:
            # down-sample each sketch to h
            MHS.down_sample(h)
            simulation_MHS.down_sample(h)
            MH_jaccard = MHS.jaccard(simulation_MHS)
            MH_jaccards.append(MH_jaccard)

        MH_jaccards_corrected = list()
        for MHS in genome_sketches:
            MHS_set = set(MHS._mins)
            sample_set = set(simulation_MHS._mins)
            MH_jaccard = len(set(list(MHS_set.union(sample_set))[0:h]).intersection(MHS_set.intersection(sample_set))) / float(h)
            MH_jaccards_corrected.append(MH_jaccard)

        # calculate the containment MinHash estimate of the Jaccard index
        CMH_jaccards = list()
        for i in range(len(genome_sketches)):
            genome_kmers_len = genome_lengths[i]  # pre-computed when creating the "training" data
            MHS = genome_sketches[i]
            # down-sample each sketch to h
            MHS.down_sample(h)
            kmers = MHS._kmers  # use only the k-mers in the MinHash sketch
            int_est = 0
            for kmer in kmers:
                if kmer in simulation_bloom:  # test if the k-mers are in the simulation bloom filter
                    int_est += 1
            int_est -= p * h  # adjust for the false positive rate
            containment_est = int_est / float(h)
            containment_est_jaccard = genome_kmers_len * containment_est / \
                (genome_kmers_len + simulation_kmers_length - genome_kmers_len * containment_est)
            CMH_jaccards.append(containment_est_jaccard)

        # compute the average deviation from the truth (relative error)
        true_jaccards = np.array(true_jaccards)
        MH_jaccards = np.array(MH_jaccards)
        CMH_jaccards = np.array(CMH_jaccards)
        MH_mean = np.mean(np.abs(true_jaccards - MH_jaccards) / true_jaccards)
        CMH_mean = np.mean(np.abs(true_jaccards - CMH_jaccards) / true_jaccards)
        #print("Classic min hash mean relative error: %f" % MH_mean)
        #print("Containment min hash mean relative error: %f" % CMH_mean)
        MH_relative_errors.append(MH_mean)
        CMH_relative_errors.append(CMH_mean)

    # remove temp files
    os.remove(simulation_file)
    os.remove(abundances_file)
    # return the relative errors
    return MH_relative_errors, CMH_relative_errors, simulation_kmers_length, np.mean(genome_lengths)
def test_kmer_revcom_hash(kmer):
    a = khmer.Counttable(21, 1e4, 3)
    assert a.hash(kmer) == a.hash(khmer.reverse_complement(kmer))
def test_reverse_complement_exception():
    with pytest.raises(RuntimeError):
        khmer.reverse_complement('FGF')
temp_database_file = tempfile.mktemp()
MH.export_multiple_to_single_hdf5(CEs, temp_database_file)
# and create the TST
to_insert = set()
# add both the original k-mer and its reverse complement, as the MinHashes were created without reverse complements
for i in range(len(CEs)):
    for kmer_index in range(len(CEs[i]._kmers)):
        # normal k-mer
        kmer = CEs[i]._kmers[kmer_index]
        if kmer:
            # format here is kmer + 'x' + hash_index + 'x' + kmer_index
            to_insert.add(kmer + 'x' + str(i) + 'x' + str(kmer_index))
            # rev-comp k-mer
            kmer = khmer.reverse_complement(CEs[i]._kmers[kmer_index])
            to_insert.add(kmer + 'x' + str(i) + 'x' + str(kmer_index))
# export the TST
tree = mt.Trie(to_insert)
temp_TST_file = tempfile.mktemp()
tree.save(temp_TST_file)
# TODO: marisa_trie has an issue with single-character prefix lookups
# TODO: see https://github.com/pytries/marisa-trie/issues/55
# TODO: so set the k-range above that
k_range = [2, 3, 5]