def get_score_diff(matrix):
    """
    Print how far the optimized matrix shifts scores on the starting alignments.

    The alignments are read from "BLOSUM50_pos_aligns.txt" and
    "BLOSUM50_neg_aligns.txt" and scored twice: once with the matrix named
    ``matrix`` and once with the matrix named ``"optimized_" + matrix``.
    Three mean (optimized - initial) differences are printed, in this order:
    over all alignments, over the positives only, over the negatives only.
    Nothing is returned.
    """
    positives = read_existing_aligns("BLOSUM50_pos_aligns.txt")
    negatives = read_existing_aligns('BLOSUM50_neg_aligns.txt')

    baseline = algs.get_scoring_matrix(matrix)
    optimized = algs.get_scoring_matrix("optimized_" + matrix)

    def rescore(aligns, scoring):
        # each stored alignment is unpacked straight into the scorer
        return [algs.score_existing_align(*entry, scoring) for entry in aligns]

    def mean_shift(after, before):
        # average per-alignment change, normalised by the baseline count
        return sum(new - old for new, old in zip(after, before)) / len(before)

    #initial scores
    pos_before = rescore(positives, baseline)
    neg_before = rescore(negatives, baseline)
    #optimized scores
    pos_after = rescore(positives, optimized)
    neg_after = rescore(negatives, optimized)

    print(
        mean_shift(pos_after + neg_after, pos_before + neg_before),
        mean_shift(pos_after, pos_before),
        mean_shift(neg_after, neg_before),
    )
def test_score_align():
    """
    Check the score of pre-aligned sequence pairs under BLOSUM50.
    """
    matrix = algs.get_scoring_matrix("BLOSUM50")
    reference = "AAAAAAA"
    # (first aligned sequence, expected score against the reference)
    expected_by_query = (
        ("AAAAAAA", 35),
        ("AA-AAAA", 19),
        ("AA--AAA", 13),
    )
    for query, expected in expected_by_query:
        assert algs.score_existing_align(query, reference, matrix) == expected
def optimize_score_matrix(pos, neg, starting_matrix, goal=None, max_gen=100):
    """
    Optimize a scoring matrix with a genetic algorithm.

    Approach adapted from
    https://towardsdatascience.com/introduction-to-genetic-algorithms-including-example-code-e396e98d8bf3

    Args:
        pos: existing positive alignments; each item is unpacked into
            ``algs.score_existing_align``.
        neg: existing negative alignments, same layout as ``pos``.
        starting_matrix: name passed to ``algs.get_scoring_matrix``; the
            result is written under ``"optimized_" + starting_matrix``.
        goal: loss value handed to ``genetic_loop``. When None, the loss of
            the starting matrix is used, i.e. "beat the starting matrix".
        max_gen: generation index at (or beyond) which the search stops and
            keeps the best-scoring individual of the current population.

    Returns:
        Tuple ``(best_matrix, matrix_filename)``.
    """
    starting_mat = algs.get_scoring_matrix(starting_matrix)

    #calculate the starting score
    true_scores = [algs.score_existing_align(*(p), starting_mat) for p in pos]
    false_scores = [algs.score_existing_align(*(n), starting_mat) for n in neg]
    scores = [loss_function(true_scores, false_scores)]

    #if we dont have a specific goal, beat the starting matrix
    if goal is None:
        goal = scores[0]
    print("Starting score:", scores[0])

    #generate 3 mutated matrices for a total population of 4; each one is
    #mutated from the previous member, and -1 marks "not scored yet"
    population = [starting_mat]
    for i in range(3):
        population.append(mutate_matrix(population[i], 0.5))
        scores.append(-1)

    matrix_filename = "optimized_" + starting_matrix
    generation = 0
    best_matrix = None
    while best_matrix is None:
        #get current generation and scores
        population, scores = genetic_loop(pos, neg, population, scores,
                                          generation, goal)
        if len(population) == 1:
            #a one-member population is treated as "goal reached"; in that
            #case scores is unpacked as an (index, score) pair
            i, score = scores
            print("Optimized: {0}_{1}, score: {2}".format(
                generation, i, score))
            best_matrix = population[0]
        elif generation >= max_gen:
            #hit max without hitting goal: keep the best individual.
            #max() always yields a member, so the loop is guaranteed to
            #stop here even if every score is <= -1, and `>=` also covers
            #a max_gen below the starting generation.
            best_matrix, _ = max(zip(population, scores),
                                 key=lambda pair: pair[1])
        generation += 1

    #save matrix once, whichever way the search ended
    create_matrix_file(best_matrix, matrix_filename)
    return best_matrix, matrix_filename
def test_read_matrix():
    """
    Spot-check entries of the BLOSUM50, BLOSUM62 and MATIO scoring matrices.

    For each matrix the A/A entry and both orientations of the A/F entry
    are compared against known values.
    """
    def check(table, a_vs_a, a_vs_f):
        assert table['A']['A'] == a_vs_a
        assert table['F']['A'] == a_vs_f
        assert table['A']['F'] == a_vs_f

    check(algs.get_scoring_matrix("BLOSUM50"), 5, -3)
    # NOTE(review): the next two loads go through `sw.algs` while the first
    # uses the bare `algs` name - confirm both refer to the same module.
    check(sw.algs.get_scoring_matrix("BLOSUM62"), 4, -2)
    check(sw.algs.get_scoring_matrix("MATIO"), 0, 2)
def calc_all_scores(pairs, matrix, gap_start, gap_extend):
    """
    Score every pair in ``pairs`` with ``algs.score``.

    ``matrix`` may be a matrix name (str) or an already-loaded matrix; a
    name is resolved once up front so the matrix is not re-read per pair.
    Returns a list of ``(pair, score)`` tuples in input order.
    """
    if isinstance(matrix, str):
        scoring = algs.get_scoring_matrix(matrix)
    else:
        scoring = matrix

    #score each pair with the specified open/extend penalties
    return [
        (
            pair,
            algs.score(*pair,
                       score_matrix=scoring,
                       gap_start=gap_start,
                       gap_extend=gap_extend),
        )
        for pair in pairs
    ]
def calc_all_aligns(pairs, matrix, gap_start, gap_extend, filename):
    """
    Align every pair in ``pairs`` and save the alignments to ``filename``.

    ``matrix`` may be a matrix name (str) or an already-loaded matrix; a
    name is resolved once up front. All pairs are aligned before the file
    is opened. Each record in the file is four lines: element 1 of the
    ``algs.align`` result, element 2, element 0 (written via ``str`` as the
    score), and a blank separator line. Nothing is returned.
    """
    if isinstance(matrix, str):
        scoring = algs.get_scoring_matrix(matrix)
    else:
        scoring = matrix

    #align each pair with the specified open/extend penalties
    outcomes = [
        algs.align(*pair,
                   score_matrix=scoring,
                   gap_start=gap_start,
                   gap_extend=gap_extend)
        for pair in pairs
    ]

    with open(filename, 'w') as handle:
        for outcome in outcomes:
            top, bottom, score = outcome[1], outcome[2], outcome[0]
            handle.write(top + '\n' + bottom + '\n' + str(score) + '\n' + '\n')
def full_optimization_run(starting_pos, starting_neg, starting_matrix,
                          goal=4, max_gen=5000,
                          pos_pairs_file="Pospairs.txt",
                          neg_pairs_file="Negpairs.txt",
                          gap_start=11, gap_extend=1):
    """
    Optimize a scoring matrix, realign with it, and plot the ROC comparison.

    Three score sets are produced and handed to ``algs.roc``:
      1. the starting alignments scored with the starting matrix,
      2. the starting alignments scored with the optimized matrix,
      3. fresh alignments made with the optimized matrix, scored with it.

    Args:
        starting_pos: file of existing positive alignments.
        starting_neg: file of existing negative alignments.
        starting_matrix: name of the starting matrix; also the prefix of
            every output file written here.
        goal: forwarded to ``optimize_score_matrix``.
        max_gen: forwarded to ``optimize_score_matrix``.
        pos_pairs_file: positive pairs to realign (default "Pospairs.txt").
        neg_pairs_file: negative pairs to realign (default "Negpairs.txt").
        gap_start: gap-open value used for the realignment (default 11).
        gap_extend: gap-extend value used for the realignment (default 1).

    Side effects:
        Writes ``<starting_matrix>_optimized_pos_aligns.txt`` and
        ``<starting_matrix>_optimized_neg_aligns.txt``, and asks
        ``algs.roc`` to save ``<starting_matrix>_optimization.png``.
    """
    #optimize starting
    pos_aligns = read_existing_aligns(starting_pos)
    neg_aligns = read_existing_aligns(starting_neg)

    #get best matrix (the returned filename is not needed here)
    new_matrix, _ = optimize_score_matrix(pos_aligns, neg_aligns,
                                          starting_matrix,
                                          goal=goal, max_gen=max_gen)

    #scores of the initial alignments under the starting matrix
    starting_mat = algs.get_scoring_matrix(starting_matrix)
    true_scores = [
        algs.score_existing_align(*(p), starting_mat) for p in pos_aligns
    ]
    false_scores = [
        algs.score_existing_align(*(n), starting_mat) for n in neg_aligns
    ]

    #scores of the initial alignments under the optimized matrix
    precal_pos = [
        algs.score_existing_align(*(p), new_matrix) for p in pos_aligns
    ]
    precal_neg = [
        algs.score_existing_align(*(n), new_matrix) for n in neg_aligns
    ]

    #read in initial pairs for realign
    pos = read_pairs(pos_pairs_file)
    neg = read_pairs(neg_pairs_file)

    #save new aligns
    new_pos_file = starting_matrix + "_optimized_pos_aligns.txt"
    new_neg_file = starting_matrix + "_optimized_neg_aligns.txt"
    calc_all_aligns(pos, new_matrix, gap_start, gap_extend, new_pos_file)
    calc_all_aligns(neg, new_matrix, gap_start, gap_extend, new_neg_file)

    #read in and score new aligns for consistency in scoring
    pos_aligns_2 = read_existing_aligns(new_pos_file)
    neg_aligns_2 = read_existing_aligns(new_neg_file)
    precal_pos_2 = [
        algs.score_existing_align(*(p), new_matrix) for p in pos_aligns_2
    ]
    precal_neg_2 = [
        algs.score_existing_align(*(n), new_matrix) for n in neg_aligns_2
    ]

    TSs = [true_scores, precal_pos, precal_pos_2]
    FSs = [false_scores, precal_neg, precal_neg_2]

    #plot all new scores
    algs.roc(TSs, FSs, [
        starting_matrix,
        starting_matrix + "_optimized",
        starting_matrix + "_optimized_realign"
    ], save=True, filename=starting_matrix + "_optimization.png")