def find_kmer_locations(sequence: str, kmer: str, options: Options = Options()) -> List[int]:
    """Locate the windows of ``sequence`` that match ``kmer`` or an accepted variant.

    The set of accepted kmers is ``kmer`` itself plus everything returned by
    ``find_all_dna_kmers_within_hamming_distance(kmer, options.hamming_distance)``.
    When ``options.reverse_complement`` is truthy, the variants of
    ``reverse_complement(kmer)`` are accepted as well.

    Args:
        sequence: Text that is scanned with ``slide_window`` using windows of
            ``len(kmer)`` characters.
        kmer: Pattern to look for.
        options: Supplies ``hamming_distance`` and ``reverse_complement``. The
            default instance is created once, when the function is defined;
            it is only read here, never modified.

    Returns:
        The second element of every ``(window, value)`` pair yielded by
        ``slide_window`` whose window is an accepted kmer, in iteration order
        (presumably the window's position in ``sequence`` -- confirm against
        ``slide_window``).
    """
    # Construct test kmers
    test_kmers = {kmer}
    # set.update consumes the helper's result directly; no throwaway list of
    # None values is built just to call add() repeatedly.
    test_kmers.update(
        find_all_dna_kmers_within_hamming_distance(kmer, options.hamming_distance))
    if options.reverse_complement:
        rc_kmer = reverse_complement(kmer)
        test_kmers.update(
            find_all_dna_kmers_within_hamming_distance(rc_kmer, options.hamming_distance))

    # Slide over the sequence's kmers and check for matches against test kmers
    k = len(kmer)
    return [i for seq_kmer, i in slide_window(sequence, k) if seq_kmer in test_kmers]
def neighborhood(kmer: str) -> Set[str]:
    """Return the kmers considered neighbours of ``kmer``.

    The result always contains what
    ``find_all_dna_kmers_within_hamming_distance(kmer, options.hamming_distance)``
    returns. When ``options.reverse_complement`` is truthy, the neighbours of
    ``reverse_complement(kmer)`` are added to that set.

    NOTE(review): ``options`` is not a parameter; it is resolved from the
    enclosing/global scope at call time -- confirm where it is bound.

    Args:
        kmer: Kmer whose neighbourhood is requested.

    Returns:
        The neighbouring kmers as a set.
    """
    neighbourhood = find_all_dna_kmers_within_hamming_distance(
        kmer, options.hamming_distance)
    if options.reverse_complement:
        kmer_rc = reverse_complement(kmer)
        # Combine both strands. Rebinding the name here would silently drop
        # the forward-strand neighbours computed above. union() builds a new
        # set, so the helper's return value is left untouched.
        neighbourhood = set(neighbourhood).union(
            find_all_dna_kmers_within_hamming_distance(
                kmer_rc, options.hamming_distance))
    return neighbourhood
def kmer_frequency_with_mismatches_and_reverse_complements(
        data: str,
        k: int,
        min_hamming_dist: int) -> Counter[str]:
    """Tally neighbouring kmers for every length-``k`` window of ``data``.

    For each window, every kmer returned by
    ``find_all_dna_kmers_within_hamming_distance(window, min_hamming_dist)``
    gets one count, and so does every kmer returned by the same helper for
    ``reverse_complement(window)``.

    Args:
        data: Text to scan.
        k: Window length.
        min_hamming_dist: Distance value handed unchanged to
            ``find_all_dna_kmers_within_hamming_distance``.

    Returns:
        A ``Counter`` mapping each neighbouring kmer to its accumulated count.
        It is empty when ``data`` has fewer than ``k`` characters.
    """
    tally = Counter()
    window_count = len(data) - k + 1
    for start in range(window_count):
        window = data[start:start + k]
        # Forward strand first, then the reverse complement of the same window.
        tally.update(
            find_all_dna_kmers_within_hamming_distance(window, min_hamming_dist))
        tally.update(
            find_all_dna_kmers_within_hamming_distance(
                reverse_complement(window), min_hamming_dist))
    return tally
def motif_enumeration(
        dnas: List[str],  # dna strings to search in for motif
        k: int,  # k-mer length
        max_mismatches: int  # max num of mismatches for motif (hamming dist)
) -> Set[str]:
    """Collect the kmers that appear, within a mismatch budget, in every string.

    Candidates are gathered by taking each window that ``slide_window(dna, k)``
    yields for each string and adding everything
    ``find_all_dna_kmers_within_hamming_distance(window, max_mismatches)``
    returns. A candidate is kept when every string in ``dnas`` has at least
    one window whose ``hamming_distance`` to the candidate is at most
    ``max_mismatches``.

    Args:
        dnas: Strings to search.
        k: Window length passed to ``slide_window``.
        max_mismatches: Largest accepted ``hamming_distance`` between a
            candidate and a window.

    Returns:
        The set of candidates found in all strings; empty when ``dnas`` is
        empty.
    """
    # Phase 1: every neighbour of every window is a candidate motif.
    candidates = set()
    for dna in dnas:
        for window, _ in slide_window(dna, k):
            candidates |= find_all_dna_kmers_within_hamming_distance(
                window, max_mismatches)

    def occurs_in(candidate, dna):
        # True as soon as one window of this string is close enough.
        return any(
            hamming_distance(candidate, window) <= max_mismatches
            for window, _ in slide_window(dna, k))

    # Phase 2: keep the candidates that occur in each and every string.
    required_hits = len(dnas)
    return {
        candidate
        for candidate in candidates
        if sum(1 for dna in dnas if occurs_in(candidate, dna)) == required_hits
    }
def count_kmers(data: str, k: int, options: Options = Options()) -> Counter[str]:
    """Tally neighbouring kmers for each window ``slide_window(data, k)`` yields.

    Every kmer returned by
    ``find_all_dna_kmers_within_hamming_distance(window, options.hamming_distance)``
    receives one count per window. When ``options.reverse_complement`` is
    truthy, the same is done for ``reverse_complement(window)``.

    Args:
        data: Text to scan.
        k: Window length passed to ``slide_window``.
        options: Supplies ``hamming_distance`` and ``reverse_complement``. The
            default instance is created once, when the function is defined.

    Returns:
        A ``Counter`` mapping each neighbouring kmer to its accumulated count.
    """
    tally = Counter()
    for window, _ in slide_window(data, k):
        tally.update(
            find_all_dna_kmers_within_hamming_distance(
                window, options.hamming_distance))
        if not options.reverse_complement:
            continue
        # Also credit the neighbours of the opposite strand's kmer.
        tally.update(
            find_all_dna_kmers_within_hamming_distance(
                reverse_complement(window), options.hamming_distance))
    return tally
import hashlib
import textwrap

import matplotlib.pyplot as plt

from CountASequencesKmersWithMismatchesAndReverseComplement import \
    kmer_frequency_with_mismatches_and_reverse_complements
from FindAllDnaKmersWithinHammingDistance import find_all_dna_kmers_within_hamming_distance
from GCSkew import gc_skew

# Script: read a kmer and a hamming distance from the dataset file, then print
# every kmer the helper returns for them, separated by single spaces.
with open('/home/user/Downloads/dataset_240229_4.txt', mode='r', encoding='utf-8') as f:
    data = f.read()

# The first line of the file holds the kmer, the second the distance.
lines = data.split('\n')
kmer, hamming_dist = lines[0], int(lines[1])

kmer_variations = find_all_dna_kmers_within_hamming_distance(kmer, hamming_dist)
print(" ".join(kmer_variations))