Пример #1
0
def motif_enumeration(
    dnas: List[str],  # dna strings to search in for motif
    k: int,  # k-mer length
    max_mismatches: int  # max num of mismatches for motif (hamming dist)
) -> Set[str]:
    """Return every k-mer that appears, with mismatches, in all of ``dnas``.

    Candidates are the neighbours (as produced by
    ``find_all_dna_kmers_within_hamming_distance``) of every k-mer window
    in every DNA string. A candidate is kept when each DNA string contains
    at least one window whose hamming distance to the candidate is at most
    ``max_mismatches``.
    """
    def occurs_in(candidate: str, dna: str) -> bool:
        # True as soon as one window of `dna` is close enough to the candidate.
        return any(
            hamming_distance(candidate, window) <= max_mismatches
            for window, _ in slide_window(dna, k)
        )

    # Gather the candidate pool: neighbours of every window in every string.
    candidates = set()
    for dna in dnas:
        for window, _ in slide_window(dna, k):
            candidates.update(
                find_all_dna_kmers_within_hamming_distance(
                    window, max_mismatches))

    # Keep only the candidates that are present in every string.
    return {
        candidate
        for candidate in candidates
        if all(occurs_in(candidate, dna) for dna in dnas)
    }
Пример #2
0
def scan_for_repeating_kmers_in_clusters(
    sequence: str,
    k: int,
    min_occurrence_in_cluster: int,
    cluster_window_size: int,
    options: Options = Options()) -> Set[KmerCluster]:
    """Find k-mers that repeat often enough inside a sliding cluster window.

    A window of ``cluster_window_size`` characters is slid over ``sequence``.
    For the current window, every k-mer in it (expanded to its neighbourhood,
    see below) is mapped to the set of sequence positions where it occurs.
    Whenever a k-mer has at least ``min_occurrence_in_cluster`` positions in
    the current window, a ``KmerCluster(kmer, first_position, count)`` is
    recorded.

    The neighbourhood of a k-mer is everything returned by
    ``find_all_dna_kmers_within_hamming_distance`` for
    ``options.hamming_distance``; when ``options.reverse_complement`` is set,
    the neighbourhood of the reverse complement is included as well.

    :param sequence: text to scan
    :param k: k-mer length
    :param min_occurrence_in_cluster: minimum number of positions a k-mer
        must have inside one window to be reported
    :param cluster_window_size: length of the sliding window
    :param options: supplies ``hamming_distance`` and ``reverse_complement``
    :return: set of all clusters recorded over all windows
    """
    def neighborhood(kmer: str) -> Set[str]:
        # Copy into a fresh set so the helper's return value is never mutated.
        neighbourhood = set(find_all_dna_kmers_within_hamming_distance(
            kmer, options.hamming_distance))
        if options.reverse_complement:
            # Merge (not replace) so both strands contribute.
            neighbourhood.update(find_all_dna_kmers_within_hamming_distance(
                reverse_complement(kmer), options.hamming_distance))
        return neighbourhood

    # k-mer -> set of sequence positions at which it occurs in the current window
    kmer_locations = {}

    def add_kmer(kmer: str, loc: int) -> None:
        kmer_locations.setdefault(kmer, set()).add(loc)

    def remove_kmer(kmer: str, loc: int) -> None:
        # remove() (not discard()) so a broken add/remove pairing surfaces as KeyError.
        kmer_locations[kmer].remove(loc)
        if not kmer_locations[kmer]:
            del kmer_locations[kmer]

    clustered_kmers = set()

    old_first_kmer = None
    for window, window_idx in slide_window(sequence, cluster_window_size):
        first_kmer = window[0:k]
        last_kmer = window[-k:]

        if window_idx == 0:
            # First window: register every k-mer it contains.
            for kmer, kmer_idx in slide_window(window, k):
                for alt_kmer in neighborhood(kmer):
                    add_kmer(alt_kmer, window_idx + kmer_idx)
        else:
            # Window advanced by one: register the k-mer that was walked in to...
            for alt_kmer in neighborhood(last_kmer):
                add_kmer(alt_kmer, window_idx + cluster_window_size - k)
            # ...and drop the k-mer that was walked out of (it started at the
            # previous window's first position).
            if old_first_kmer is not None:
                for alt_kmer in neighborhood(old_first_kmer):
                    remove_kmer(alt_kmer, window_idx - 1)

        old_first_kmer = first_kmer

        # Record clusters within this window: k-mer, start_idx, occurrence_count
        for candidate, locations in kmer_locations.items():
            if len(locations) >= min_occurrence_in_cluster:
                clustered_kmers.add(
                    KmerCluster(candidate, min(locations), len(locations)))

    return clustered_kmers
Пример #3
0
 def shatter(self: ReadPair, k: int) -> List[ReadPair]:
     """Break this read-pair into smaller read-pairs whose halves have length ``k``.

     The head and tail are windowed in lockstep; each pair of windows becomes
     a new ``ReadPair`` whose source records that it came from shattering
     this one.
     """
     # Each half shrinks by (self.k - k), and that amount is added to the gap.
     gap = (self.k - k) + self.d
     head_windows = slide_window(self.data.head, k)
     tail_windows = slide_window(self.data.tail, k)
     return [
         ReadPair(Kdmer(head_kmer, tail_kmer, gap), source=('shatter', [self]))
         for (head_kmer, _), (tail_kmer, _) in zip(head_windows, tail_windows)
     ]
Пример #4
0
def find_kmer_locations(sequence: str, kmer: str,
                        options: Options = Options()) -> List[int]:
    """Return the indices in ``sequence`` at which ``kmer`` (or a variant) occurs.

    A window matches when it equals ``kmer`` itself, any k-mer returned by
    ``find_all_dna_kmers_within_hamming_distance`` for
    ``options.hamming_distance``, or -- when ``options.reverse_complement``
    is set -- any such neighbour of the reverse complement of ``kmer``.

    :param sequence: text to search
    :param kmer: pattern to look for
    :param options: supplies ``hamming_distance`` and ``reverse_complement``
    :return: indices reported by ``slide_window`` for the matching windows,
        in the order they are produced
    """
    # Construct test kmers
    test_kmers = {kmer}
    test_kmers.update(
        find_all_dna_kmers_within_hamming_distance(
            kmer, options.hamming_distance))
    if options.reverse_complement:
        test_kmers.update(
            find_all_dna_kmers_within_hamming_distance(
                reverse_complement(kmer), options.hamming_distance))

    # Slide over the sequence's kmers and check for matches against test kmers
    k = len(kmer)
    return [
        i for seq_kmer, i in slide_window(sequence, k)
        if seq_kmer in test_kmers
    ]
Пример #5
0
 def from_string(text: str, k: int, instantize: bool = False):
     """Build one ``Read`` per k-mer window of ``text``.

     When ``instantize`` is true, each read's ``instance`` is the number of
     times the same k-mer was already seen earlier in ``text`` (0 for the
     first occurrence); otherwise every read gets ``instance=0``.
     """
     seen = Counter()
     reads = []
     for kmer, _ in slide_window(text, k):
         nth = seen[kmer]  # occurrences of this k-mer before the current one
         seen[kmer] = nth + 1
         reads.append(Read(kmer, instance=nth if instantize else 0))
     return reads
def determine_probabilities_of_all_kmers_in_dna(
        profile_matrix: Dict[str, List[float]], dna: str, k: int) -> List[float]:
    """Score every k-mer window of ``dna`` against ``profile_matrix``.

    :param profile_matrix: profile passed through unchanged to
        ``determine_probability_of_match_using_profile_matrix``
    :param dna: text whose windows are scored
    :param k: window length
    :return: one probability per window, in the order ``slide_window``
        produces the windows
    """
    return [
        determine_probability_of_match_using_profile_matrix(
            profile_matrix, kmer)
        for kmer, _ in slide_window(dna, k)
    ]
def find_most_probable_kmer_using_profile_matrix(profile: Dict[str, List[float]], dna: str):
    """Return the ``(kmer, probability)`` window of ``dna`` that best matches ``profile``.

    The window length is taken from the length of the first list in
    ``profile``. On ties the earliest window wins. ``None`` is returned when
    ``slide_window`` produces no windows.
    """
    rows = list(profile.values())
    k = len(rows[0])

    scored_kmers = (
        (kmer, determine_probability_of_match_using_profile_matrix(profile, kmer))
        for kmer, _ in slide_window(dna, k)
    )
    # max() keeps the first of several equal maxima, i.e. a strict ">" comparison.
    return max(scored_kmers, key=lambda scored: scored[1], default=None)
Пример #8
0
def distance_between_pattern_and_strings(pattern: str, dnas: List[str]) -> int:
    """Sum, over all ``dnas``, the smallest hamming distance to ``pattern``.

    For each DNA string, ``pattern`` is compared against every window of
    ``len(pattern)`` characters and the minimum distance is taken; the
    per-string minimums are then added together.
    """
    k = len(pattern)

    def closest_distance(dna: str):
        # None when the string yields no windows at all.
        return min(
            (hamming_distance(pattern, window)
             for window, _ in slide_window(dna, k)),
            default=None,
        )

    return sum([closest_distance(dna) for dna in dnas])
Пример #9
0
def find_peptide_encodings_in_dna(dna: str, amino_acid_seq: str) -> List[str]:
    """Return the windows of ``dna`` that encode ``amino_acid_seq``.

    Each window is three characters per amino acid. A window is kept when
    either its RNA transcription or the RNA transcription of its reverse
    complement translates, codon by codon, to exactly ``amino_acid_seq``.
    """
    def translates_to_target(rna: str) -> bool:
        amino_acids = [
            codon_to_amino_acid(codon) for codon in split_to_size(rna, 3)
        ]
        # A None entry means some codon produced no amino acid; such a
        # translation can never match.
        return None not in amino_acids and ''.join(amino_acids) == amino_acid_seq

    encodings = []
    window_len = len(amino_acid_seq) * 3
    for kmer, _ in slide_window(dna, window_len):
        forward_rna = dna_to_rna(kmer)
        reverse_rna = dna_to_rna(dna_reverse_complement(kmer))
        if translates_to_target(forward_rna) or translates_to_target(reverse_rna):
            encodings.append(kmer)
    return encodings
Пример #10
0
def count_kmers(data: str, k: int,
                options: Options = Options()) -> Counter[str]:
    """Count k-mers in ``data``, crediting each window's whole neighbourhood.

    Every window adds 1 to each k-mer returned by
    ``find_all_dna_kmers_within_hamming_distance`` for
    ``options.hamming_distance``. When ``options.reverse_complement`` is set,
    the neighbourhood of the window's reverse complement is credited too.
    """
    def neighbours_of(kmer: str):
        return find_all_dna_kmers_within_hamming_distance(
            kmer, options.hamming_distance)

    tally = Counter()
    for kmer, _ in slide_window(data, k):
        tally.update(neighbours_of(kmer))
        if options.reverse_complement:
            tally.update(neighbours_of(reverse_complement(kmer)))
    return tally
Пример #11
0
def greedy_motif_search(k: int, dnas: List[str]):
    """Greedily build a motif matrix with one k-mer per DNA string.

    Starts from the matrix made of the first ``k`` characters of every
    string. Then, for every window of the first string, a candidate matrix
    is grown one string at a time by picking the k-mer most probable under
    the profile of the motifs chosen so far. The candidate replaces the best
    matrix when ``score_motif`` rates it strictly lower.
    """
    def profile_of(matrix):
        return motif_matrix_profile(motif_matrix_count(matrix))

    best_motif_matrix = [dna[:k] for dna in dnas]
    first_dna, remaining_dnas = dnas[0], dnas[1:]

    for seed, _ in slide_window(first_dna, k):
        candidate = [seed]
        profile = profile_of(candidate)

        for dna in remaining_dnas:
            next_motif, _ = find_most_probable_kmer_using_profile_matrix(
                profile, dna)
            # The chosen k-mer joins the matrix, and the profile is rebuilt
            # so the next string is matched against the updated motifs.
            candidate.append(next_motif)
            profile = profile_of(candidate)

        if score_motif(candidate) < score_motif(best_motif_matrix):
            best_motif_matrix = candidate

    return best_motif_matrix
Пример #12
0
# GGGGACTTCTGTCCCTAGCC
# TGGGACTTTCGGCCCTGTCC
# GGGGACCAACGCCCCTGGGA
# GGGGACCGAAGTCCCCGGGC
# 11
# consensus_kmer = 'CGGGACCTACGTCCCTAGCC'  # this is the consensus string for the matrix it finds

# Build a profile from the best motif matrix; every count is bumped by 1
# (pseudocounts) before the profile is computed.
best_motif_matrix_counts = motif_matrix_count(best_motif_matrix)
for elem, counts in best_motif_matrix_counts.items():
    best_motif_matrix_counts[elem] = [count + 1 for count in counts]
best_motif_matrix_profile = motif_matrix_profile(best_motif_matrix_counts)

with open(
        '/home/user/Downloads/GCF_000195955.2_ASM19595v2_genomic.fna',
        mode='r',
        encoding='utf-8') as f:
    data = f.read()

# Strip whitespace from every line and blank out the lines starting with '>'
# (treated as comments), then glue what is left into one DNA string.
lines = [
    '' if stripped.startswith('>') else stripped
    for stripped in map(str.strip, data.split('\n'))
]
dna = ''.join(lines)

# Report every window that matches the profile with probability >= 1%.
for kmer, _ in slide_window(dna, k):
    prob = determine_probability_of_match_using_profile_matrix(
        best_motif_matrix_profile, kmer)
    if prob >= 0.01:
        print(f'{kmer} {prob}')

# Nothing is found...
#
# The strings in DosR.txt aren't matching up to the genome at the link I posted (even though the name of the organism
# matches). I'm guessing it's a different variant of the organism that was studied in the original 2003 paper. Maybe
# this variant uses a different motif or doesn't have it (doesn't have the ability to lie dormant like the organism that
# the original paper studied).

Пример #13
0
from Read import Read
from ToDeBruijnGraph import to_debruijn_graph
from Utils import slide_window

with open('/home/user/Downloads/dataset_240257_6(1).txt', mode='r', encoding='utf-8') as f:
    data = f.read()

# Input layout: first line is k, second line is the DNA string.
lines = data.split('\n')
k, dna = int(lines[0].strip()), lines[1].strip()

# One read per k-mer window of the DNA string.
reads = [Read(window) for window, _ in slide_window(dna, k)]

graph = to_debruijn_graph(reads)
for node, other_nodes in graph.get_all_outputs():
    other_nodes = list(other_nodes)
    if other_nodes:  # nodes without any outputs are not printed
        print(f'{node} -> {",".join([str(x) for x in other_nodes])}')
Пример #14
0
 def shatter(self: Read, k: int) -> List[Read]:
     """Break this read into one smaller read per window of length ``k``.

     Each new ``Read`` records in its source that it came from shattering
     this read.
     """
     return [
         Read(kmer, source=('shatter', [self]))
         for kmer, _ in slide_window(self.data, k)
     ]