def shared_kmers(k, dna1, dna2):
    '''Returns the set of positions for shared k-mers (up to reverse complement) in dna1 and dna2.

    Each element of the result is a pair (i, j) such that the k-mer starting at
    index i of dna1 equals either the k-mer starting at index j of dna2 or
    rev_comp() of that k-mer.  The result is an empty set when either sequence
    is shorter than k.
    '''
    # A sequence shorter than k contains no k-mers, so nothing can be shared
    # and there is no point in building the index.
    if len(dna1) < k or len(dna2) < k:
        return set()

    # Store the starting index of all k-mers from dna1 in a dictionary keyed to the k-mer.
    dna1_dict = defaultdict(list)
    for i in xrange(len(dna1) - k + 1):
        dna1_dict[dna1[i:i + k]].append(i)

    # Check k-mers in dna2 against those in dna1, collecting matching index
    # pairs in a set to remove possible duplicate entries.
    # Lookups use .get() rather than [] so that k-mers missing from dna1 are
    # not inserted into the defaultdict as empty lists while we search.
    no_match = ()
    shared = set()
    for j in xrange(len(dna2) - k + 1):
        kmer = dna2[j:j + k]
        for key in (kmer, rev_comp(kmer)):
            shared.update((i, j) for i in dna1_dict.get(key, no_match))
    return shared
def shared_kmers(dna1, dna2, k):
    '''Returns the set of (i, j) index pairs at which dna1 and dna2 share a kmer (up to reverse complement).

    i is a starting index into dna1 and j a starting index into dna2.
    '''
    from scripts import ReverseComplementDNA as rev_comp

    # Map every kmer of dna1, and the rev_comp of that kmer, to the list of
    # starting indices in dna1 at which it occurs.
    positions = {}
    for start in xrange(len(dna1) - k + 1):
        window = dna1[start:start + k]
        positions.setdefault(window, []).append(start)
        positions.setdefault(rev_comp(window), []).append(start)

    # Look up each kmer of dna2, and its rev_comp, in the index.  A set is used
    # so that a pair reached through more than one route is only kept once.
    matches = set()
    for j in xrange(len(dna2) - k + 1):
        window = dna2[j:j + k]
        for candidate in (window, rev_comp(window)):
            for start in positions.get(candidate, ()):
                matches.add((start, j))

    return matches
def shared_kmers(dna1, dna2, k):
    '''Returns the set of positions (i, j) where dna1 and dna2 share a kmer (up to reverse complement).

    i indexes the start of the kmer in dna1, j the start of the kmer in dna2.
    '''
    from scripts import ReverseComplementDNA as rev_comp

    # Index dna1: each kmer and the rev_comp of each kmer is keyed to the list
    # of starting positions that produced it.
    index_by_kmer = {}
    for pos in xrange(len(dna1) - k + 1):
        forward = dna1[pos:pos+k]
        for key in (forward, rev_comp(forward)):
            try:
                index_by_kmer[key].append(pos)
            except KeyError:
                index_by_kmer[key] = [pos]

    # Scan dna2: gather the dna1 positions recorded for the jth kmer and for
    # its rev_comp, and record them as pairs.  The set drops duplicate pairs.
    pairs = set()
    for j in xrange(len(dna2) - k + 1):
        forward = dna2[j:j+k]
        hits = index_by_kmer.get(forward, []) + index_by_kmer.get(rev_comp(forward), [])
        pairs.update((pos, j) for pos in hits)

    return pairs
def shared_kmers(k, dna1, dna2):
    '''Returns the set of positions for shared k-mers (up to reverse complement) in dna1 and dna2.

    The result holds pairs (x, j): x is the starting index of a k-mer in dna1
    and j the starting index of a k-mer in dna2 that is either identical to it
    or whose rev_comp() is identical to it.  An empty set is returned when
    either sequence is shorter than k.
    '''
    # Without at least one k-mer on each side there is nothing to compare.
    if min(len(dna1), len(dna2)) < k:
        return set()

    # Store the starting index of all k-mers from dna1 in a dictionary keyed to the k-mer.
    dna1_dict = defaultdict(list)
    for i in xrange(len(dna1) - k + 1):
        dna1_dict[dna1[i:i+k]].append(i)

    # Check k-mers in dna2 against those in dna1, add matching index pairs to a
    # set to remove possible duplicate entries.
    shared_kmer_indices = set()
    for j in xrange(len(dna2) - k + 1):
        kmer = dna2[j:j+k]
        # .get() is used instead of indexing: indexing a defaultdict with a
        # k-mer that dna1 does not contain would add an empty list for it.
        for x in dna1_dict.get(kmer, []):
            shared_kmer_indices.add((x, j))
        for x in dna1_dict.get(rev_comp(kmer), []):
            shared_kmer_indices.add((x, j))

    return shared_kmer_indices
def freq_words_with_mm_and_rev_comp(seq, k, d):
    """Returns all most frequent k-mers with up to d mismatches in the dna sequence seq.

    Every k-mer of seq and its rev_comp() are counted; each count is then
    credited to every string produced by kmer_mismatches(kmer, d).  The strings
    with the highest total are returned as a sorted list.  If seq is shorter
    than k there are no k-mers to count and an empty list is returned.
    """
    # With no k-mers nothing gets counted, and max() below would raise
    # ValueError on the empty collection.
    if len(seq) < k:
        return []

    # Frequency analysis so we don't generate mismatches for the same k-mer more than once.
    kmer_freq = defaultdict(int)
    for i in xrange(len(seq) - k + 1):
        kmer = seq[i:i + k]
        kmer_freq[kmer] += 1
        kmer_freq[rev_comp(kmer)] += 1

    # Get all of the mismatches for each unique k-mer in the sequence, appearing freq times.
    mismatch_count = defaultdict(int)
    for kmer, freq in kmer_freq.iteritems():
        for mismatch in kmer_mismatches(kmer, d):
            mismatch_count[mismatch] += freq

    # Computing the maximum value is somewhat time consuming to repeat, so only do it once!
    max_count = max(mismatch_count.values())
    return sorted(kmer for kmer, count in mismatch_count.iteritems() if count == max_count)
def freq_words_with_mm_and_rev_comp(seq, k, d):
    """Returns all most frequent k-mers with up to d mismatches in the dna sequence seq.

    The occurrences of each k-mer in seq, and of the rev_comp() of each k-mer,
    are tallied first.  Each distinct k-mer then contributes its tally to every
    string yielded by kmer_mismatches(kmer, d).  The result is the sorted list
    of strings whose accumulated tally is maximal, or an empty list when seq is
    too short to contain a k-mer.
    """
    # A sequence shorter than k has no k-mers; return early because max() on
    # the resulting empty tally would raise ValueError.
    if len(seq) < k:
        return []

    # Frequency analysis so we don't generate mismatches for the same k-mer more than once.
    kmer_freq = defaultdict(int)
    for i in xrange(len(seq) - k + 1):
        kmer = seq[i:i + k]
        for key in (kmer, rev_comp(kmer)):
            kmer_freq[key] += 1

    # Get all of the mismatches for each unique k-mer in the sequence, appearing freq times.
    mismatch_count = defaultdict(int)
    for kmer, freq in kmer_freq.iteritems():
        for mismatch in kmer_mismatches(kmer, d):
            mismatch_count[mismatch] += freq

    # Computing the maximum value is somewhat time consuming to repeat, so only do it once!
    max_count = max(mismatch_count.values())
    most_frequent = [
        kmer
        for kmer, count in mismatch_count.iteritems()
        if count == max_count
    ]
    most_frequent.sort()
    return most_frequent