def _align(sequenceA, sequenceB, match_fn, gap_A_fn, gap_B_fn, penalize_extend_when_opening, penalize_end_gaps, align_globally, gap_char, force_generic, score_only, one_alignment_only): if not sequenceA or not sequenceB: return [] if (not force_generic) and isinstance(gap_A_fn, affine_penalty) \ and isinstance(gap_B_fn, affine_penalty): open_A, extend_A = gap_A_fn.open, gap_A_fn.extend open_B, extend_B = gap_B_fn.open, gap_B_fn.extend x = _make_score_matrix_fast( sequenceA, sequenceB, match_fn, open_A, extend_A, open_B, extend_B, penalize_extend_when_opening, penalize_end_gaps, align_globally, score_only) else: x = _make_score_matrix_generic( sequenceA, sequenceB, match_fn, gap_A_fn, gap_B_fn, penalize_extend_when_opening, penalize_end_gaps, align_globally, score_only) score_matrix, trace_matrix = x #print "SCORE"; print_matrix(score_matrix) #print "TRACEBACK"; print_matrix(trace_matrix) # Look for the proper starting point. Get a list of all possible # starting points. starts = _find_start( score_matrix, sequenceA, sequenceB, gap_A_fn, gap_B_fn, penalize_end_gaps, align_globally) # Find the highest score. best_score = max([x[0] for x in starts]) # If they only want the score, then return it. if score_only: return best_score tolerance = 0 # XXX do anything with this? # Now find all the positions within some tolerance of the best # score. i = 0 while i < len(starts): score, pos = starts[i] if rint(abs(score-best_score)) > rint(tolerance): del starts[i] else: i += 1 # Recover the alignments and return them. x = _recover_alignments( sequenceA, sequenceB, starts, score_matrix, trace_matrix, align_globally, penalize_end_gaps, gap_char, one_alignment_only) return x
def _make_score_matrix_fast(
    sequenceA, sequenceB, match_fn, open_A, extend_A, open_B, extend_B,
    penalize_extend_when_opening, penalize_end_gaps,
    align_globally, score_only):
    """Build the score and traceback matrices using affine gap penalties.

    Because the gap penalties are affine, the best gap-opening score for
    each row and column is cached and updated incrementally instead of
    rescanning all previous rows and columns for every cell.

    Arguments:
     - sequenceA, sequenceB - indexable sequences; sequenceA runs down
       the matrix and sequenceB runs across it.  Both are expected to
       be non-empty.
     - match_fn - called as match_fn(a, b) to score a pair of elements.
     - open_A, extend_A - gap open/extend penalties for sequenceA.
     - open_B, extend_B - gap open/extend penalties for sequenceB.
     - penalize_extend_when_opening - passed through to
       calc_affine_penalty.
     - penalize_end_gaps - if true, the border cells include the
       penalty for the leading gap.
     - align_globally - if false, negative interior scores are reset
       to 0.
     - score_only - accepted but not used by this function.

    Returns a (score_matrix, trace_matrix) tuple of lenA x lenB nested
    lists.  Each interior trace_matrix cell is a list of (row, col)
    predecessor positions; border cells keep their initial [None].
    """
    first_A_gap = calc_affine_penalty(1, open_A, extend_A,
                                      penalize_extend_when_opening)
    first_B_gap = calc_affine_penalty(1, open_B, extend_B,
                                      penalize_extend_when_opening)

    # Create the score and traceback matrices.  These should be in the
    # shape:
    # sequenceA (down) x sequenceB (across)
    lenA, lenB = len(sequenceA), len(sequenceB)
    score_matrix = [[None] * lenB for _ in range(lenA)]
    # Every traceback cell gets its own [None] list.  Building the row
    # with ``[[None]] * lenB`` would make all cells of a row share one
    # list object, so an in-place change to one cell would leak into
    # the others.
    trace_matrix = [[[None] for _ in range(lenB)] for _ in range(lenA)]

    # The top and left borders of the matrices are special cases
    # because there are no previously aligned characters.  To simplify
    # the main loop, handle these separately.
    for i in range(lenA):
        # Align the first residue in sequenceB to the ith residue in
        # sequence A.  This is like opening up i gaps at the beginning
        # of sequence B.
        score = match_fn(sequenceA[i], sequenceB[0])
        if penalize_end_gaps:
            score += calc_affine_penalty(
                i, open_B, extend_B, penalize_extend_when_opening)
        score_matrix[i][0] = score
    for i in range(1, lenB):
        # Same for the top border: i gaps at the beginning of
        # sequence A.
        score = match_fn(sequenceA[0], sequenceB[i])
        if penalize_end_gaps:
            score += calc_affine_penalty(
                i, open_A, extend_A, penalize_extend_when_opening)
        score_matrix[0][i] = score

    # In the generic algorithm, at each row and column in the score
    # matrix, we had to scan all previous rows and columns to see
    # whether opening a gap might yield a higher score.  Here, since
    # we know the penalties are affine, we can cache just the best
    # score in the previous rows and columns.  Instead of scanning
    # through all the previous rows and cols, we can just look at the
    # cache for the best one.  Whenever the row or col increments, the
    # best cached score just decreases by extending the gap longer.

    # The best score and indexes for each row (goes down all columns).
    # The last row is not stored because it is the end of the
    # sequence.
    row_cache_score, row_cache_index = [None]*(lenA-1), [None]*(lenA-1)
    # The best score and indexes for each column (goes across rows).
    col_cache_score, col_cache_index = [None]*(lenB-1), [None]*(lenB-1)

    for i in range(lenA-1):
        # Initialize each row to be the alignment of sequenceA[i] to
        # sequenceB[0], plus opening a gap in sequenceA.
        row_cache_score[i] = score_matrix[i][0] + first_A_gap
        row_cache_index[i] = [(i, 0)]
    for i in range(lenB-1):
        # Likewise for each column, plus opening a gap in sequenceB.
        col_cache_score[i] = score_matrix[0][i] + first_B_gap
        col_cache_index[i] = [(0, i)]

    # Fill in the score_matrix.
    for row in range(1, lenA):
        for col in range(1, lenB):
            # Calculate the score that would occur by extending the
            # alignment without gaps.
            nogap_score = score_matrix[row-1][col-1]

            # Check the score that would occur if there were a gap in
            # sequence A.
            if col > 1:
                row_score = row_cache_score[row-1]
            else:
                row_score = nogap_score - 1   # Make sure it's not the best.
            # Check the score that would occur if there were a gap in
            # sequence B.
            if row > 1:
                col_score = col_cache_score[col-1]
            else:
                col_score = nogap_score - 1   # Make sure it's not the best.

            # Every candidate whose rounded score ties with the best
            # one contributes its predecessor positions.
            best_score = max(nogap_score, row_score, col_score)
            best_score_rint = rint(best_score)
            best_index = []
            if best_score_rint == rint(nogap_score):
                best_index.append((row-1, col-1))
            if best_score_rint == rint(row_score):
                best_index.extend(row_cache_index[row-1])
            if best_score_rint == rint(col_score):
                best_index.extend(col_cache_index[col-1])

            # Set the score and traceback matrices.
            score = best_score + match_fn(sequenceA[row], sequenceB[col])
            if not align_globally and score < 0:
                score_matrix[row][col] = 0
            else:
                score_matrix[row][col] = score
            trace_matrix[row][col] = best_index

            # Update the cached column scores.  The best score for
            # this can come from either extending the gap in the
            # previous cached score, or opening a new gap from the
            # most previously seen character.  Compare the two scores
            # and keep the best one.
            open_score = score_matrix[row-1][col-1] + first_B_gap
            extend_score = col_cache_score[col-1] + extend_B
            open_score_rint, extend_score_rint = \
                rint(open_score), rint(extend_score)
            if open_score_rint > extend_score_rint:
                col_cache_score[col-1] = open_score
                col_cache_index[col-1] = [(row-1, col-1)]
            elif extend_score_rint > open_score_rint:
                col_cache_score[col-1] = extend_score
            else:
                # Tie: remember both ways of reaching this score.  A
                # new list is built rather than appending in place.
                col_cache_score[col-1] = open_score
                if (row-1, col-1) not in col_cache_index[col-1]:
                    col_cache_index[col-1] = col_cache_index[col-1] + \
                        [(row-1, col-1)]

            # Update the cached row scores.
            open_score = score_matrix[row-1][col-1] + first_A_gap
            extend_score = row_cache_score[row-1] + extend_A
            open_score_rint, extend_score_rint = \
                rint(open_score), rint(extend_score)
            if open_score_rint > extend_score_rint:
                row_cache_score[row-1] = open_score
                row_cache_index[row-1] = [(row-1, col-1)]
            elif extend_score_rint > open_score_rint:
                row_cache_score[row-1] = extend_score
            else:
                row_cache_score[row-1] = open_score
                if (row-1, col-1) not in row_cache_index[row-1]:
                    row_cache_index[row-1] = row_cache_index[row-1] + \
                        [(row-1, col-1)]

    return score_matrix, trace_matrix
def _make_score_matrix_generic( sequenceA, sequenceB, match_fn, gap_A_fn, gap_B_fn, penalize_extend_when_opening, penalize_end_gaps, align_globally, score_only): # This is an implementation of the Needleman-Wunsch dynamic # programming algorithm for aligning sequences. # Create the score and traceback matrices. These should be in the # shape: # sequenceA (down) x sequenceB (across) lenA, lenB = len(sequenceA), len(sequenceB) score_matrix, trace_matrix = [], [] for i in range(lenA): score_matrix.append([None] * lenB) trace_matrix.append([[None]] * lenB) # The top and left borders of the matrices are special cases # because there are no previously aligned characters. To simplify # the main loop, handle these separately. for i in range(lenA): # Align the first residue in sequenceB to the ith residue in # sequence A. This is like opening up i gaps at the beginning # of sequence B. score = match_fn(sequenceA[i], sequenceB[0]) if penalize_end_gaps: score += gap_B_fn(0, i) score_matrix[i][0] = score for i in range(1, lenB): score = match_fn(sequenceA[0], sequenceB[i]) if penalize_end_gaps: score += gap_A_fn(0, i) score_matrix[0][i] = score # Fill in the score matrix. Each position in the matrix # represents an alignment between a character from sequenceA to # one in sequence B. As I iterate through the matrix, find the # alignment by choose the best of: # 1) extending a previous alignment without gaps # 2) adding a gap in sequenceA # 3) adding a gap in sequenceB for row in range(1, lenA): for col in range(1, lenB): # First, calculate the score that would occur by extending # the alignment without gaps. best_score = score_matrix[row-1][col-1] best_score_rint = rint(best_score) best_indexes = [(row-1, col-1)] # Try to find a better score by opening gaps in sequenceA. # Do this by checking alignments from each column in the # previous row. Each column represents a different # character to align from, and thus a different length # gap. 
for i in range(0, col-1): score = score_matrix[row-1][i] + gap_A_fn(i, col-1-i) score_rint = rint(score) if score_rint == best_score_rint: best_score, best_score_rint = score, score_rint best_indexes.append((row-1, i)) elif score_rint > best_score_rint: best_score, best_score_rint = score, score_rint best_indexes = [(row-1, i)] # Try to find a better score by opening gaps in sequenceB. for i in range(0, row-1): score = score_matrix[i][col-1] + gap_B_fn(i, row-1-i) score_rint = rint(score) if score_rint == best_score_rint: best_score, best_score_rint = score, score_rint best_indexes.append((i, col-1)) elif score_rint > best_score_rint: best_score, best_score_rint = score, score_rint best_indexes = [(i, col-1)] score_matrix[row][col] = best_score + \ match_fn(sequenceA[row], sequenceB[col]) if not align_globally and score_matrix[row][col] < 0: score_matrix[row][col] = 0 trace_matrix[row][col] = best_indexes return score_matrix, trace_matrix