Exemplo n.º 1
0
def _align(sequenceA, sequenceB, match_fn, gap_A_fn, gap_B_fn,
           penalize_extend_when_opening, penalize_end_gaps,
           align_globally, gap_char, force_generic, score_only,
           one_alignment_only):
    if not sequenceA or not sequenceB:
        return []

    if (not force_generic) and isinstance(gap_A_fn, affine_penalty) \
    and isinstance(gap_B_fn, affine_penalty):
        open_A, extend_A = gap_A_fn.open, gap_A_fn.extend
        open_B, extend_B = gap_B_fn.open, gap_B_fn.extend
        x = _make_score_matrix_fast(
            sequenceA, sequenceB, match_fn, open_A, extend_A, open_B, extend_B,
            penalize_extend_when_opening, penalize_end_gaps, align_globally,
            score_only)
    else:
        x = _make_score_matrix_generic(
            sequenceA, sequenceB, match_fn, gap_A_fn, gap_B_fn,
            penalize_extend_when_opening, penalize_end_gaps, align_globally,
            score_only)
    score_matrix, trace_matrix = x

    #print "SCORE"; print_matrix(score_matrix)
    #print "TRACEBACK"; print_matrix(trace_matrix)
         
    # Look for the proper starting point.  Get a list of all possible
    # starting points.
    starts = _find_start(
        score_matrix, sequenceA, sequenceB,
        gap_A_fn, gap_B_fn, penalize_end_gaps, align_globally)
    # Find the highest score.
    best_score = max([x[0] for x in starts])

    # If they only want the score, then return it.
    if score_only:
        return best_score
    
    tolerance = 0  # XXX do anything with this?
    # Now find all the positions within some tolerance of the best
    # score.
    i = 0
    while i < len(starts):
        score, pos = starts[i]
        if rint(abs(score-best_score)) > rint(tolerance):
            del starts[i]
        else:
            i += 1
    
    # Recover the alignments and return them.
    x = _recover_alignments(
        sequenceA, sequenceB, starts, score_matrix, trace_matrix,
        align_globally, penalize_end_gaps, gap_char, one_alignment_only)
    return x
Exemplo n.º 2
0
def _make_score_matrix_fast(
    sequenceA, sequenceB, match_fn, open_A, extend_A, open_B, extend_B,
    penalize_extend_when_opening, penalize_end_gaps,
    align_globally, score_only):
    """Build the score and traceback matrices for affine gap penalties.

    Cell (row, col) of the score matrix holds the best score found for
    an alignment that ends by pairing sequenceA[row] with
    sequenceB[col].  The same cell of the traceback matrix lists the
    (row, col) positions of the previously aligned pair(s) that produce
    that score.

    Arguments:
     - sequenceA, sequenceB - the sequences to align.  They are expected
       to be non-empty: the border setup reads element 0 of each.
     - match_fn - callable(residueA, residueB) returning the score for
       pairing the two residues.
     - open_A, extend_A - values used for opening/extending a gap in
       sequenceA; they are added to scores (presumably negative
       penalties -- confirm against callers).
     - open_B, extend_B - the same for gaps in sequenceB.
     - penalize_extend_when_opening - forwarded to calc_affine_penalty.
     - penalize_end_gaps - if true, border cells are charged for the gap
       that precedes them.
     - align_globally - if false, a negative cell score is stored as 0.
     - score_only - accepted but not referenced in this function.

    Returns the tuple (score_matrix, trace_matrix); each is a list of
    len(sequenceA) rows holding len(sequenceB) entries.
    """
    # Cost of a gap of length 1 in each sequence.  calc_affine_penalty is
    # defined elsewhere; presumably it combines open/extend for the given
    # length -- TODO confirm.
    first_A_gap = calc_affine_penalty(1, open_A, extend_A,
                                      penalize_extend_when_opening)
    first_B_gap = calc_affine_penalty(1, open_B, extend_B,
                                      penalize_extend_when_opening)

    # Create the score and traceback matrices.  These should be in the
    # shape:
    # sequenceA (down) x sequenceB (across)
    lenA, lenB = len(sequenceA), len(sequenceB)
    score_matrix, trace_matrix = [], []
    for i in range(lenA):
        score_matrix.append([None] * lenB)
        # NOTE: every cell in this row starts out referring to the same
        # [None] list.  The code below only rebinds cells, it never
        # mutates that shared list in place.
        trace_matrix.append([[None]] * lenB)

    # The top and left borders of the matrices are special cases
    # because there are no previously aligned characters.  To simplify
    # the main loop, handle these separately.  Their traceback cells
    # keep the initial [None] value.
    for i in range(lenA):
        # Align the first residue in sequenceB to the ith residue in
        # sequence A.  This is like opening up i gaps at the beginning
        # of sequence B.
        score = match_fn(sequenceA[i], sequenceB[0])
        if penalize_end_gaps:
            score += calc_affine_penalty(
                i, open_B, extend_B, penalize_extend_when_opening)
        score_matrix[i][0] = score
    for i in range(1, lenB):
        # Same for the top border: align sequenceA[0] to sequenceB[i],
        # charging a gap of length i in sequence A when requested.
        score = match_fn(sequenceA[0], sequenceB[i])
        if penalize_end_gaps:
            score += calc_affine_penalty(
                i, open_A, extend_A, penalize_extend_when_opening)
        score_matrix[0][i] = score

    # In the generic algorithm, at each row and column in the score
    # matrix, we had to scan all previous rows and columns to see
    # whether opening a gap might yield a higher score.  Here, since
    # we know the penalties are affine, we can cache just the best
    # score in the previous rows and columns.  Instead of scanning
    # through all the previous rows and cols, we can just look at the
    # cache for the best one.  Whenever the row or col increments, the
    # best cached score just decreases by extending the gap longer.

    # The best score and indexes for each row (goes down all columns).
    # I don't need to store the last row because it's the end of the
    # sequence.
    row_cache_score, row_cache_index = [None]*(lenA-1), [None]*(lenA-1)
    # The best score and indexes for each column (goes across rows).
    col_cache_score, col_cache_index = [None]*(lenB-1), [None]*(lenB-1)

    for i in range(lenA-1):
        # Initialize each row to be the alignment of sequenceA[i] to
        # sequenceB[0], plus opening a gap in sequenceA.
        row_cache_score[i] = score_matrix[i][0] + first_A_gap
        row_cache_index[i] = [(i, 0)]
    for i in range(lenB-1):
        # Likewise for each column: sequenceA[0] aligned to
        # sequenceB[i], plus opening a gap in sequenceB.
        col_cache_score[i] = score_matrix[0][i] + first_B_gap
        col_cache_index[i] = [(0, i)]

    # Fill in the score_matrix.
    for row in range(1, lenA):
        for col in range(1, lenB):
            # Calculate the score that would occur by extending the
            # alignment without gaps.
            nogap_score = score_matrix[row-1][col-1]

            # Check the score that would occur if there were a gap in
            # sequence A.
            if col > 1:
                row_score = row_cache_score[row-1]
            else:
                row_score = nogap_score - 1   # Make sure it's not the best.
            # Check the score that would occur if there were a gap in
            # sequence B.
            if row > 1:
                col_score = col_cache_score[col-1]
            else:
                col_score = nogap_score - 1

            # Pick the best of the three options.  Ties are detected on
            # rint() values (rint is defined elsewhere; presumably it
            # rounds to a fixed precision so float noise does not break
            # ties -- TODO confirm), and every tied origin is recorded.
            best_score = max(nogap_score, row_score, col_score)
            best_score_rint = rint(best_score)
            best_index = []
            if best_score_rint == rint(nogap_score):
                best_index.append((row-1, col-1))
            if best_score_rint == rint(row_score):
                best_index.extend(row_cache_index[row-1])
            if best_score_rint == rint(col_score):
                best_index.extend(col_cache_index[col-1])

            # Set the score and traceback matrices.  When not aligning
            # globally, a negative score is stored as 0.
            score = best_score + match_fn(sequenceA[row], sequenceB[col])
            if not align_globally and score < 0:
                score_matrix[row][col] = 0
            else:
                score_matrix[row][col] = score
            trace_matrix[row][col] = best_index

            # Update the cached column scores.  The best score for
            # this can come from either extending the gap in the
            # previous cached score, or opening a new gap from the
            # most previously seen character.  Compare the two scores
            # and keep the best one.  This must happen after the cell
            # above was computed, because that computation read the
            # cache value from before this update; the value written
            # here is what the next row reads for this column.
            open_score = score_matrix[row-1][col-1] + first_B_gap
            extend_score = col_cache_score[col-1] + extend_B
            open_score_rint, extend_score_rint = \
                             rint(open_score), rint(extend_score)
            if open_score_rint > extend_score_rint:
                col_cache_score[col-1] = open_score
                col_cache_index[col-1] = [(row-1, col-1)]
            elif extend_score_rint > open_score_rint:
                # Extending wins: the origins stay the same.
                col_cache_score[col-1] = extend_score
            else:
                # Equal after rint(): store the open score and remember
                # the new origin alongside the existing ones.  A new
                # list is built rather than appending in place.
                col_cache_score[col-1] = open_score
                if (row-1, col-1) not in col_cache_index[col-1]:
                    col_cache_index[col-1] = col_cache_index[col-1] + \
                                             [(row-1, col-1)]

            # Update the cached row scores.  Same scheme as for the
            # columns, using the sequence A gap values; the value
            # written here is what the next column in this row reads.
            open_score = score_matrix[row-1][col-1] + first_A_gap
            extend_score = row_cache_score[row-1] + extend_A
            open_score_rint, extend_score_rint = \
                             rint(open_score), rint(extend_score)
            if open_score_rint > extend_score_rint:
                row_cache_score[row-1] = open_score
                row_cache_index[row-1] = [(row-1, col-1)]
            elif extend_score_rint > open_score_rint:
                row_cache_score[row-1] = extend_score
            else:
                row_cache_score[row-1] = open_score
                if (row-1, col-1) not in row_cache_index[row-1]:
                    row_cache_index[row-1] = row_cache_index[row-1] + \
                                             [(row-1, col-1)]

    return score_matrix, trace_matrix
Exemplo n.º 3
0
def _make_score_matrix_generic(
    sequenceA, sequenceB, match_fn, gap_A_fn, gap_B_fn, 
    penalize_extend_when_opening, penalize_end_gaps, align_globally,
    score_only):
    # This is an implementation of the Needleman-Wunsch dynamic
    # programming algorithm for aligning sequences.
    
    # Create the score and traceback matrices.  These should be in the
    # shape:
    # sequenceA (down) x sequenceB (across)
    lenA, lenB = len(sequenceA), len(sequenceB)
    score_matrix, trace_matrix = [], []
    for i in range(lenA):
        score_matrix.append([None] * lenB)
        trace_matrix.append([[None]] * lenB)

    # The top and left borders of the matrices are special cases
    # because there are no previously aligned characters.  To simplify
    # the main loop, handle these separately.
    for i in range(lenA):
        # Align the first residue in sequenceB to the ith residue in
        # sequence A.  This is like opening up i gaps at the beginning
        # of sequence B.
        score = match_fn(sequenceA[i], sequenceB[0])
        if penalize_end_gaps:
            score += gap_B_fn(0, i)
        score_matrix[i][0] = score
    for i in range(1, lenB):
        score = match_fn(sequenceA[0], sequenceB[i])
        if penalize_end_gaps:
            score += gap_A_fn(0, i)
        score_matrix[0][i] = score

    # Fill in the score matrix.  Each position in the matrix
    # represents an alignment between a character from sequenceA to
    # one in sequence B.  As I iterate through the matrix, find the
    # alignment by choose the best of:
    #    1) extending a previous alignment without gaps
    #    2) adding a gap in sequenceA
    #    3) adding a gap in sequenceB
    for row in range(1, lenA):
        for col in range(1, lenB):
            # First, calculate the score that would occur by extending
            # the alignment without gaps.
            best_score = score_matrix[row-1][col-1]
            best_score_rint = rint(best_score)
            best_indexes = [(row-1, col-1)]

            # Try to find a better score by opening gaps in sequenceA.
            # Do this by checking alignments from each column in the
            # previous row.  Each column represents a different
            # character to align from, and thus a different length
            # gap.
            for i in range(0, col-1):
                score = score_matrix[row-1][i] + gap_A_fn(i, col-1-i)
                score_rint = rint(score)
                if score_rint == best_score_rint:
                    best_score, best_score_rint = score, score_rint
                    best_indexes.append((row-1, i))
                elif score_rint > best_score_rint:
                    best_score, best_score_rint = score, score_rint
                    best_indexes = [(row-1, i)]
            
            # Try to find a better score by opening gaps in sequenceB.
            for i in range(0, row-1):
                score = score_matrix[i][col-1] + gap_B_fn(i, row-1-i)
                score_rint = rint(score)
                if score_rint == best_score_rint:
                    best_score, best_score_rint = score, score_rint
                    best_indexes.append((i, col-1))
                elif score_rint > best_score_rint:
                    best_score, best_score_rint = score, score_rint
                    best_indexes = [(i, col-1)]

            score_matrix[row][col] = best_score + \
                                     match_fn(sequenceA[row], sequenceB[col])
            if not align_globally and score_matrix[row][col] < 0:
                score_matrix[row][col] = 0
            trace_matrix[row][col] = best_indexes
    return score_matrix, trace_matrix