예제 #1
0
def get_score_diff(matrix):
    """
    Print the mean score change between a matrix and its optimized version.

    The alignments stored in "BLOSUM50_pos_aligns.txt" and
    "BLOSUM50_neg_aligns.txt" are each scored twice: once with the scoring
    matrix named ``matrix`` and once with the one named
    ``"optimized_" + matrix``. The mean of (optimized score - initial score)
    is then printed for all alignments, for the positive alignments only and
    for the negative alignments only, in that order. Nothing is returned.
    """

    def mean_delta(new_scores, old_scores):
        # average per-alignment change, divided by the number of old scores
        return sum(new - old
                   for new, old in zip(new_scores, old_scores)) / len(old_scores)

    positives = read_existing_aligns("BLOSUM50_pos_aligns.txt")
    negatives = read_existing_aligns('BLOSUM50_neg_aligns.txt')

    # the two matrices being compared
    baseline = algs.get_scoring_matrix(matrix)
    optimized = algs.get_scoring_matrix("optimized_" + matrix)

    # scores under the initial matrix
    pos_before = [
        algs.score_existing_align(*aln, baseline) for aln in positives
    ]
    neg_before = [
        algs.score_existing_align(*aln, baseline) for aln in negatives
    ]

    # scores under the optimized matrix
    pos_after = [
        algs.score_existing_align(*aln, optimized) for aln in positives
    ]
    neg_after = [
        algs.score_existing_align(*aln, optimized) for aln in negatives
    ]

    # overall change first, then positives and negatives separately
    overall = mean_delta(pos_after + neg_after, pos_before + neg_before)
    pos_only = mean_delta(pos_after, pos_before)
    neg_only = mean_delta(neg_after, neg_before)

    print(overall, pos_only, neg_only)
예제 #2
0
def test_score_align():
    """
    Check the scores of pre-built alignments under BLOSUM50.
    """
    matrix = algs.get_scoring_matrix("BLOSUM50")
    reference = "AAAAAAA"

    # ungapped, then a single gap, then two adjacent gaps in the first sequence
    assert algs.score_existing_align("AAAAAAA", reference, matrix) == 35
    assert algs.score_existing_align("AA-AAAA", reference, matrix) == 19
    assert algs.score_existing_align("AA--AAA", reference, matrix) == 13
예제 #3
0
def optimize_score_matrix(pos, neg, starting_matrix, goal=None, max_gen=100):
    """
    Optimize a scoring matrix with a small genetic algorithm.

    Approach adapted from
    https://towardsdatascience.com/introduction-to-genetic-algorithms-including-example-code-e396e98d8bf3

    The population holds four matrices: the starting matrix plus three
    mutants, each produced by ``mutate_matrix`` from the previous member.
    ``genetic_loop`` is called once per generation until it either reports a
    winner or the generation limit is reached.

    Args:
        pos: iterable of argument tuples; each is unpacked into
            ``algs.score_existing_align`` as a positive alignment.
        neg: same as ``pos`` for the negative alignments.
        starting_matrix: name of the matrix to load with
            ``algs.get_scoring_matrix``; also used to build the output name.
        goal: target score handed to ``genetic_loop``. When ``None`` the
            score of the starting matrix is used, i.e. "beat the start".
        max_gen: generation limit; once ``generation`` reaches it the best
            individual of the current population is returned.

    Returns:
        ``(best_matrix, matrix_filename)`` where ``matrix_filename`` is
        ``"optimized_" + starting_matrix``, the name the matrix was saved
        under via ``create_matrix_file``.
    """
    starting_mat = algs.get_scoring_matrix(starting_matrix)
    matrix_filename = "optimized_" + starting_matrix

    # calculate the starting score
    true_scores = [algs.score_existing_align(*p, starting_mat) for p in pos]
    false_scores = [algs.score_existing_align(*n, starting_mat) for n in neg]
    scores = [loss_function(true_scores, false_scores)]

    # if we don't have a specific goal, beat the starting matrix
    if goal is None:
        goal = scores[0]
    print("Starting score:", scores[0])

    # generate 3 mutated matrices for a total population of 4; -1 is the
    # placeholder score of a not-yet-evaluated individual
    population = [starting_mat]
    for i in range(3):
        population.append(mutate_matrix(population[i], 0.5))
        scores.append(-1)

    generation = 0
    best_matrix = None
    while best_matrix is None:
        # get current generation and scores
        population, scores = genetic_loop(pos, neg, population, scores,
                                          generation, goal)
        if len(population) == 1:
            # a single survivor means the goal was met; scores is then an
            # (index, score) pair rather than a per-individual list
            i, score = scores
            print("Optimized: {0}_{1}, score: {2}".format(
                generation, i, score))
            best_matrix = population[0]
        elif generation >= max_gen:
            # hit the limit without reaching the goal: keep the highest
            # scoring individual. max() returns the first maximum, and the
            # key keeps tied scores from falling back to comparing matrices.
            _, best_matrix = max(zip(scores, population),
                                 key=lambda scored: scored[0])
        generation += 1

    # save exactly once, whichever way the search ended
    create_matrix_file(best_matrix, matrix_filename)
    return best_matrix, matrix_filename
예제 #4
0
def test_read_matrix():
    """
    Make sure scoring matrices are being read correctly.

    For each matrix the A/A entry is checked, and the A/F entry is checked
    in both lookup orders. All matrices are loaded through the same ``algs``
    module reference.
    """
    blosum50 = algs.get_scoring_matrix("BLOSUM50")
    assert blosum50['A']['A'] == 5
    assert blosum50['F']['A'] == -3
    assert blosum50['A']['F'] == -3

    blosum62 = algs.get_scoring_matrix("BLOSUM62")
    assert blosum62['A']['A'] == 4
    assert blosum62['F']['A'] == -2
    assert blosum62['A']['F'] == -2

    matio = algs.get_scoring_matrix("MATIO")
    assert matio['A']['A'] == 0
    assert matio['F']['A'] == 2
    assert matio['A']['F'] == 2
예제 #5
0
def calc_all_scores(pairs, matrix, gap_start, gap_extend):
    """
    Score every pair in ``pairs`` and return ``(pair, score)`` tuples.

    ``matrix`` may be an already loaded scoring matrix or a string; a string
    is resolved once through ``algs.get_scoring_matrix`` before scoring.
    Each pair is unpacked into ``algs.score`` together with the matrix and
    the two gap penalties. The result list follows the order of ``pairs``.
    """
    # speedup - resolve a matrix name once instead of once per pair
    score_matrix = (algs.get_scoring_matrix(matrix)
                    if isinstance(matrix, str) else matrix)

    return [(pair,
             algs.score(*pair,
                        score_matrix=score_matrix,
                        gap_start=gap_start,
                        gap_extend=gap_extend)) for pair in pairs]
예제 #6
0
def calc_all_aligns(pairs, matrix, gap_start, gap_extend, filename):
    """
    Align every pair in ``pairs`` and write the results to ``filename``.

    ``matrix`` may be an already loaded scoring matrix or a string; a string
    is resolved once through ``algs.get_scoring_matrix``. Each pair is
    unpacked into ``algs.align`` with the matrix and the two gap penalties.
    All alignments are computed before the file is opened. For each result
    four lines are written: item 1, item 2, ``str()`` of item 0 (the score),
    and an empty line. Nothing is returned.
    """
    # speedup - resolve a matrix name once instead of once per pair
    score_matrix = matrix
    if isinstance(score_matrix, str):
        score_matrix = algs.get_scoring_matrix(score_matrix)

    # reorder each result from (score, seq, seq) to (seq, seq, score)
    records = []
    for pair in pairs:
        outcome = algs.align(*pair,
                             score_matrix=score_matrix,
                             gap_start=gap_start,
                             gap_extend=gap_extend)
        records.append((outcome[1], outcome[2], outcome[0]))

    with open(filename, 'w') as out:
        for first, second, score in records:
            # the trailing empty string yields the blank separator line
            for line in (first, second, str(score), ''):
                out.write(line + '\n')
예제 #7
0
def full_optimization_run(starting_pos,
                          starting_neg,
                          starting_matrix,
                          goal=4,
                          max_gen=5000):
    """
    Optimize a scoring matrix, realign with it and plot the ROC comparison.

    Steps, in order:
      1. read the existing positive/negative alignments and optimize the
         matrix against them with ``optimize_score_matrix``;
      2. score those alignments with the starting and the optimized matrix;
      3. realign the pairs from "Pospairs.txt" and "Negpairs.txt" with the
         optimized matrix (gap start 11, gap extend 1) and save them to
         ``starting_matrix + "_optimized_pos_aligns.txt"`` and
         ``starting_matrix + "_optimized_neg_aligns.txt"``;
      4. read the saved alignments back and score them, so all three score
         sets come from ``algs.score_existing_align``;
      5. pass the three score sets to ``algs.roc`` with ``save=True`` and
         ``filename=starting_matrix + "_optimization.png"``.

    Args:
        starting_pos: passed to ``read_existing_aligns`` for the positives.
        starting_neg: passed to ``read_existing_aligns`` for the negatives.
        starting_matrix: name of the starting matrix; also the prefix of
            every output filename.
        goal: forwarded to ``optimize_score_matrix``.
        max_gen: forwarded to ``optimize_score_matrix``.

    Returns:
        None.
    """

    def score_all(aligns, mat):
        # score every stored alignment with the given matrix
        return [algs.score_existing_align(*aln, mat) for aln in aligns]

    # optimize starting
    pos_aligns = read_existing_aligns(starting_pos)
    neg_aligns = read_existing_aligns(starting_neg)

    # get best matrix; the returned filename is not needed here
    new_matrix, _ = optimize_score_matrix(pos_aligns,
                                          neg_aligns,
                                          starting_matrix,
                                          goal=goal,
                                          max_gen=max_gen)

    # scores of the initial alignments under the starting matrix
    starting_mat = algs.get_scoring_matrix(starting_matrix)
    true_scores = score_all(pos_aligns, starting_mat)
    false_scores = score_all(neg_aligns, starting_mat)

    # scores of the same alignments under the optimized matrix
    precal_pos = score_all(pos_aligns, new_matrix)
    precal_neg = score_all(neg_aligns, new_matrix)

    # read in initial pairs for realign
    pos = read_pairs("Pospairs.txt")
    neg = read_pairs("Negpairs.txt")

    # save new aligns
    pos_out = starting_matrix + "_optimized_pos_aligns.txt"
    neg_out = starting_matrix + "_optimized_neg_aligns.txt"
    calc_all_aligns(pos, new_matrix, 11, 1, pos_out)
    calc_all_aligns(neg, new_matrix, 11, 1, neg_out)

    # read in and score new aligns for consistency in scoring
    precal_pos_2 = score_all(read_existing_aligns(pos_out), new_matrix)
    precal_neg_2 = score_all(read_existing_aligns(neg_out), new_matrix)

    TSs = [true_scores, precal_pos, precal_pos_2]
    FSs = [false_scores, precal_neg, precal_neg_2]

    # plot all new scores
    algs.roc(TSs,
             FSs, [
                 starting_matrix, starting_matrix + "_optimized",
                 starting_matrix + "_optimized_realign"
             ],
             save=True,
             filename=starting_matrix + "_optimization.png")