Пример #1
0
def k_mer_generator(k):
    """Return the set of all 4**k k-mers of length ``k``.

    Every index in ``range(4**k)`` is converted to a pattern with
    ``ntp.number_to_pattern`` and collected into a set.

    Args:
        k: Length of the k-mers to generate.

    Returns:
        A set with one pattern per index; empty when ``k`` is less than 1.
    """
    if k < 1:
        return set()

    # range(4**k) covers indices 0 .. 4**k - 1. Stopping at 4**k - 1 would
    # silently drop the pattern with the highest index.
    return {ntp.number_to_pattern(index, k) for index in range(4**k)}
Пример #2
0
def freq_seq_mismatch_sort(genome, k, d):
    """Return the most frequent k-mer neighbours of ``genome`` (sort-based).

    For every length-``k`` window of ``genome`` the neighbourhood returned by
    ``neighbors.recursive_neighbors(window, d)`` is collected. All neighbours
    are converted to numbers with ``ptn.pattern_to_number``, sorted, and the
    length of each run of equal numbers is used as that pattern's count.

    Args:
        genome: Sequence to scan.
        k: Window (k-mer) length.
        d: Forwarded to ``neighbors.recursive_neighbors``; presumably the
            maximum number of mismatches allowed -- confirm against that helper.

    Returns:
        A set with the patterns whose count equals the maximum count; an
        empty set when ``genome`` contains no window of length ``k``.
    """
    # "+ 1" so the final window genome[len(genome) - k:] is included.
    neighborhoods = [
        neighbors.recursive_neighbors(genome[start:start + k], d)
        for start in range(len(genome) - k + 1)
    ]

    sorted_index = sorted(
        ptn.pattern_to_number(pattern)
        for pattern in apc.iter_flatten(neighborhoods)
    )
    if not sorted_index:
        return set()

    # count[i] is the position of sorted_index[i] inside its run of equal
    # values, so the last element of a run carries that pattern's total.
    count = [1] * len(sorted_index)
    for i in range(1, len(sorted_index)):
        if sorted_index[i] == sorted_index[i - 1]:
            count[i] = count[i - 1] + 1

    max_count = max(count)

    # Every slot is inspected, including the last one, so a winner with the
    # largest pattern number is not lost.
    return {
        ntp.number_to_pattern(index, k)
        for index, run_length in zip(sorted_index, count)
        if run_length == max_count
    }
Пример #3
0
def all_kmer_generator(KMER_LENGTH):
    """Generate the set of all possible k-mers of length ``KMER_LENGTH``.

    Each index in ``range(4 ** KMER_LENGTH)`` is turned into a pattern via
    ``ntp.number_to_pattern``.

    Args:
        KMER_LENGTH: Length of the k-mers to generate.

    Returns:
        A set with one pattern per index; empty when ``KMER_LENGTH`` is
        less than 1.
    """
    if KMER_LENGTH < 1:
        return set()

    # The upper bound is 4 ** KMER_LENGTH (exclusive), not one less, so that
    # the pattern with the highest index is generated as well.
    total = 4 ** KMER_LENGTH
    return {ntp.number_to_pattern(index, KMER_LENGTH) for index in range(total)}
Пример #4
0
def freq_words_w_mismatches(text, k, d):
    """Return the k-mers with the highest approximate count in ``text``.

    Candidate patterns are the neighbours (as produced by
    ``neighbors.recursive_neighbors(window, d)``) of every length-``k`` window
    of ``text``. Each candidate is scored with
    ``apc.approx_pattern_count(text, candidate, d)`` and the candidates that
    reach the maximum score are returned.

    Args:
        text: Sequence to scan.
        k: Window (k-mer) length.
        d: Forwarded to ``neighbors.recursive_neighbors`` and
            ``apc.approx_pattern_count``; presumably the maximum number of
            mismatches allowed -- confirm against those helpers.

    Returns:
        A set of the top-scoring patterns; an empty set when ``text`` has no
        window of length ``k``.
    """
    # "+ 1" so the final window text[len(text) - k:] is included.
    neighborhood = [
        neighbors.recursive_neighbors(text[start:start + k], d)
        for start in range(len(text) - k + 1)
    ]

    # A set of candidate indices replaces the 4**k-sized flag array. It never
    # skips the highest index and does not need memory for every k-mer.
    close_indices = {
        ptn.pattern_to_number(pattern)
        for pattern in apc.iter_flatten(neighborhood)
    }
    if not close_indices:
        return set()

    # Round-trip through number_to_pattern so returned patterns have the
    # same form the index-based lookup would produce.
    scores = {}
    for index in close_indices:
        candidate = ntp.number_to_pattern(index, k)
        scores[candidate] = apc.approx_pattern_count(text, candidate, d)

    the_max_count = max(scores.values())
    return {
        candidate
        for candidate, score in scores.items()
        if score == the_max_count
    }
Пример #5
0
def median_string(dna_list, k):
    """Return every k-mer that minimises ``distance_pattern_string``.

    All ``4**k`` patterns produced by ``ntp.number_to_pattern`` are scored
    with ``distance_pattern_string(k_mer, dna_list)``; the patterns sharing
    the smallest score are returned.

    Args:
        dna_list: Collection of sequences handed to
            ``distance_pattern_string``.
        k: Length of the candidate k-mers.

    Returns:
        A list of the minimising k-mers in increasing index order; an empty
        list when ``dna_list`` is empty or ``k`` is less than 1.
    """
    if k < 1 or not dna_list:
        return []

    median_list = []
    # Track the best score seen so far instead of seeding with a guessed
    # upper bound, so a result is produced whatever range the scores have.
    best_distance = None

    # range(4**k) visits every index, including the highest one.
    for index in range(4**k):
        k_mer = ntp.number_to_pattern(index, k)
        distance_pattern = distance_pattern_string(k_mer, dna_list)

        if best_distance is None or distance_pattern < best_distance:
            # Strictly better: discard the previous ties.
            best_distance = distance_pattern
            median_list = [k_mer]
        elif distance_pattern == best_distance:
            median_list.append(k_mer)

    return median_list