import common


def get_superstrings(adj_list):
    # The head of this function was truncated in the source; the set-up below
    # is a reconstruction (an assumption) consistent with the surviving body:
    # seed one path per overlap edge, then keep extending paths through the
    # overlap graph until one of them visits every read.
    superstrings = [list(edge) for edge in adj_list]
    while True:
        for item in adj_list:
            for inner_item in superstrings:
                if inner_item[-1] == item[0] and inner_item + [
                        item[1]] not in superstrings:
                    superstrings.append(inner_item + [item[1]])
                    if len(inner_item) == len(fasta_dict) - 1:
                        return superstrings


def build_superstring(fasta_dict, keys_list):
    # Reads in a LONG dataset overlap by more than half their length, so the
    # previous read's last `overlap_len` characters locate the join point.
    overlap_len = len(fasta_dict[keys_list[0]]) // 2
    superstring = fasta_dict[keys_list[0]]
    for index in range(1, len(keys_list)):
        new_string = fasta_dict[keys_list[index]]
        overlap_start = common.substring_positions(
            new_string, superstring[-overlap_len:])[0]
        superstring += new_string[overlap_start + overlap_len:]
    return superstring


fasta_dict = common.fasta_parser('LONG.txt')
adj_list = list()
for key in fasta_dict.keys():
    for inner_key in fasta_dict.keys():
        if common.overlapping(fasta_dict[key],
                              fasta_dict[inner_key]) and key != inner_key:
            adj_list.append((key, inner_key))
print(len(adj_list))
longest_path = sorted(get_superstrings(adj_list), key=len)[-1]
print(longest_path)
print(build_superstring(fasta_dict, longest_path))
import common
import math
'''
Assuming the sequence contains equally many A's and U's and equally many G's
and C's, the number of perfect matchings is simply AU! * GC!:
the factorial counts the ways to pair each A with a distinct U (a permutation
of the U's), and the product reflects that the A-U and G-C pairings are
independent of each other.
'''
fasta_dict = common.fasta_parser('PMCH.txt')
sequence = fasta_dict[list(fasta_dict.keys())[0]]
# Counting A's gives the number of A-U pairs, counting G's the number of G-C pairs.
AU, GC = (sequence.count(item) for item in ('A', 'G'))
matchings = math.factorial(AU) * math.factorial(GC)
print(matchings)
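# A quick sanity check of the counting argument above (a sketch, not part of
# the PMCH solution itself): the toy RNA string below has 3 A-U pairs and
# 2 G-C pairs, so the formula predicts 3! * 2! = 12 perfect matchings.
import math

toy_rna = 'AGCUAGUCAU'         # hypothetical example string
au_pairs = toy_rna.count('A')  # equals toy_rna.count('U') for this string
gc_pairs = toy_rna.count('G')  # equals toy_rna.count('C') for this string
assert math.factorial(au_pairs) * math.factorial(gc_pairs) == 12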
import common

fdict = common.fasta_parser('GC.txt')
gc_count = dict()
max_gc = 0
max_key = ''
for key in fdict.keys():
    gc_count[key] = round(
        (fdict[key].count('G') + fdict[key].count('C')) * 100 / len(fdict[key]),
        6)
    if gc_count[key] > max_gc:
        max_gc = gc_count[key]
        max_key = key
print(max_key)
print(max_gc)
import common

fasta_dict = common.fasta_parser('ORF.txt')
sequence = fasta_dict[list(fasta_dict.keys())[0]]
forward_orf = [
    common.translate_dna(orf) for orf in common.find_orf_dna(sequence)
]
reverse_orf = [
    common.translate_dna(orf)
    for orf in common.find_orf_dna(common.reverse_comp(sequence))
]
print(forward_orf)
print(reverse_orf)
with open('ORF_result.txt', 'w') as f:
    for orf in set(forward_orf) | set(reverse_orf):
        print(orf, file=f)
import common

adj_len = 3
fasta_dict = common.fasta_parser('GRPH.txt')
adj_list = []
for key in fasta_dict.keys():
    for inner_key in fasta_dict.keys():
        if fasta_dict[key][-adj_len:] == fasta_dict[
                inner_key][:adj_len] and key != inner_key:
            adj_list.append((key, inner_key))
with open('GRPH_result.txt', 'w') as output_file:
    for adj in adj_list:
        print(' '.join(map(str, adj)), file=output_file)
import common

fasta_dict = common.fasta_parser('SSEQ.txt')
ref_seq = fasta_dict[list(fasta_dict.keys())[0]]
search_seq = fasta_dict[list(fasta_dict.keys())[1]]
print(ref_seq, search_seq)
# For every letter of the motif, collect all of its positions in the reference.
indices_list = [
    common.substring_positions(ref_seq, letter) for letter in search_seq
]
#print(indices_list)
# Greedily pick, for each motif letter, the first occurrence that lies after
# the previously chosen position.
spliced_motif = list()
for item in indices_list:
    if spliced_motif == []:
        spliced_motif.append(item[0])
        continue
    for index in item:
        if index > spliced_motif[-1]:
            spliced_motif.append(index)
            break
print(' '.join(map(lambda x: str(x + 1), spliced_motif)))
import common


def long_substr(data):
    # Brute-force longest common substring: try every start i and length j in
    # the first sequence and keep the longest substring shared by all of them.
    substr = ''
    if len(data) > 1 and len(data[0]) > 0:
        for i in range(len(data[0])):
            for j in range(len(data[0]) - i + 1):
                if j > len(substr) and all(data[0][i:i + j] in x for x in data):
                    substr = data[0][i:i + j]
    return substr


fasta_dict = common.fasta_parser('LCSM.txt')
data = [fasta_dict[key] for key in fasta_dict.keys()]
print(long_substr(data))
import numpy as np

from level4.HAMM import hamming_distance
import common


def get_dist_table(fasta_dict):
    # p-distance matrix: Hamming distance divided by the sequence length.
    fasta_list = [fasta_dict[item] for item in fasta_dict.keys()]
    d = np.zeros((len(fasta_list), len(fasta_list)))
    for i in range(len(fasta_list)):
        for j in range(len(fasta_list)):
            d[i][j] = round(
                hamming_distance(fasta_list[i], fasta_list[j]) /
                len(fasta_list[i]), 4)
    return d


if __name__ == '__main__':
    input_fasta_dict = common.fasta_parser('PDST.txt')
    dist_matrix = get_dist_table(input_fasta_dict)
    with open('PDST_result.txt', 'w') as outfile:
        for i in dist_matrix:
            print(' '.join(map(str, i)), file=outfile)
import common
import numpy as np

fasta_dict = common.fasta_parser('CONS.txt')
letter_dict = dict()
base_list = ['A', 'C', 'G', 'T']
seq_len = len(fasta_dict[list(fasta_dict.keys())[0]])
for base in base_list:
    letter_dict[base] = [0] * seq_len
# Build the profile matrix: per-position counts of each base.
for key in fasta_dict.keys():
    count = 0
    for letter in fasta_dict[key]:
        letter_dict[letter][count] += 1
        count += 1
# The consensus takes the most frequent base at each position; indexing into
# base_list keeps the argmax aligned with the order the counts were gathered in.
cons_seq = ''
for counter in range(seq_len):
    value_list = [letter_dict[base][counter] for base in base_list]
    cons_seq += base_list[np.argmax(value_list)]
print(cons_seq)
with open('CONS_result.txt', 'w') as f:
    print(cons_seq, file=f)
    for key in letter_dict.keys():
        print('{}: {}'.format(key, ' '.join(map(str, letter_dict[key]))),
              file=f)
import common

fasta_dict = common.fasta_parser('SPLC.txt')
raw_seq = fasta_dict[list(fasta_dict.keys())[0]]
# Every record other than the pre-mRNA itself is an intron: cut each one out
# of the raw sequence before transcribing and translating.
for key in fasta_dict.keys():
    if fasta_dict[key] != raw_seq:
        start_position = raw_seq.index(fasta_dict[key])
        raw_seq = raw_seq[:start_position] + raw_seq[start_position +
                                                     len(fasta_dict[key]):]
print(common.translate_rna(raw_seq.replace('T', 'U')))
import common

fasta_dict = common.fasta_parser('TRAN.txt')
seq1, seq2 = (fasta_dict[key] for key in fasta_dict.keys())
# A substitution is a transition if it stays within the purines (A <-> G) or
# the pyrimidines (C <-> T); every other substitution is a transversion.
transitions = [set(['A', 'G']), set(['C', 'T'])]
transitions_num, transversions_num = 0, 0
for index in range(len(seq1)):
    if seq1[index] != seq2[index]:
        if set([seq1[index], seq2[index]]) in transitions:
            transitions_num += 1
        else:
            transversions_num += 1
print(round(transitions_num / transversions_num, 3))
import common

fasta_dict = common.fasta_parser('REVP.txt')
sequence = fasta_dict[list(fasta_dict.keys())[0]]
with open('REVP_result.txt', 'w') as output_file:
    # Look for reverse palindromes of total length 4 to 12: the first half of
    # the window must equal the reverse complement of the second half.
    for half_pal_len in [2, 3, 4, 5, 6]:
        for position in range(len(sequence) - 2 * half_pal_len + 1):
            if sequence[position:position +
                        half_pal_len] == common.reverse_comp(
                            sequence[position + half_pal_len:position +
                                     2 * half_pal_len]):
                print(position + 1, 2 * half_pal_len, file=output_file)
import common

kmer_len = 4


def get_kmer_count(string, kmer):
    # Count (possibly overlapping) occurrences of a k-mer in the sequence.
    return len(common.substring_positions(string, kmer))


fasta_dict = common.fasta_parser('KMER.txt')
sequence = fasta_dict[list(fasta_dict.keys())[0]]
# The 4-mer composition must be reported in lexicographic order, so pass the
# alphabet already sorted (assuming common.get_kmers keeps the given order).
all_kmers = common.get_kmers(["A", "C", "G", "T"], kmer_len)
print(' '.join(
    map(str, [get_kmer_count(sequence, kmer) for kmer in all_kmers])))