def sumOfPairs(self):
    """Return the sum-of-pairs score of the multiple sequence alignment.

    Every unordered pair of rows in self.MSA is compared column by column.
    The character 'X' is treated as the gap symbol:

    * both rows have 'X'      -> the column contributes nothing,
    * exactly one row has 'X' -> self.gapOpenCost is added,
    * neither row has 'X'     -> the FriendClass.getSubsMatScore value for
      the two characters is added.
    """
    scorer = FriendClass()
    total = 0
    for first, rowA in enumerate(self.MSA):
        for second, rowB in enumerate(self.MSA):
            # Visit each unordered pair once and never pair a row with itself.
            if second <= first:
                continue
            # The rows are expected to have equal length; rowB sets the range.
            for col in range(len(rowB)):
                gapA = rowA[col] == 'X'
                gapB = rowB[col] == 'X'
                if gapA and gapB:
                    continue
                if gapA or gapB:
                    # Exactly one gap, since the double-gap case was skipped.
                    total += self.gapOpenCost
                else:
                    total += scorer.getSubsMatScore(rowA[col], rowB[col],
                                                    self.subsMat,
                                                    self.gapOpenCost)
    return total
示例#2
0
    def buildMatrices(self, s1, s2, subst_matrix_fn, gap_open_cost,
                      gap_extend_cost, g):
        """Fill the D, P and Q score matrices for an alignment with affine gaps.

        Sequence 1 runs down the rows and sequence 2 across the columns, so
        cell [i, j] describes the prefixes s1[:i] and s2[:j]. No traceback
        matrix is produced here; the three score matrices are returned so a
        later step can reconstruct the alignments from them.

        Args:
            s1: First sequence (may be empty).
            s2: Second sequence (may be empty).
            subst_matrix_fn: Forwarded unchanged to
                FriendClass.getSubsMatScore.
            gap_open_cost: Not read by this method; kept so existing callers
                keep working.
            gap_extend_cost: Score added for every further position of a gap
                that is already open. Also forwarded to getSubsMatScore.
            g: Score of the first position of a new gap.
                NOTE(review): presumably gap_open_cost + gap_extend_cost --
                confirm against the caller.

        Returns:
            Tuple (D, P, Q, optimalScore):
                D: best score per cell.
                P: best score per cell when s1[i - 1] is aligned to a gap
                   (the gap lies in sequence 2).
                Q: best score per cell when s2[j - 1] is aligned to a gap
                   (the gap lies in sequence 1).
                optimalScore: D[len(s1)][len(s2)].
        """
        s1_length = len(s1)
        s2_length = len(s2)
        # Large negative stand-in for minus infinity so that the forbidden
        # border cells of P and Q never win a max().
        inf = -60000
        shape = (s1_length + 1, s2_length + 1)
        D = np.zeros(shape, dtype=int)
        P = np.zeros(shape, dtype=int)
        Q = np.zeros(shape, dtype=int)
        fr = FriendClass()

        # First column: s1[:i] aligned against nothing. The row-1 cell only
        # exists when s1 is non-empty, so it is guarded to avoid an IndexError.
        if s1_length > 0:
            D[1, 0] = g
            Q[1, 0] = inf
        for i in range(2, s1_length + 1):
            D[i, 0] = D[i - 1, 0] + gap_extend_cost
            # P is left at 0 in the first column; those cells are never read.
            Q[i, 0] = inf

        # First row: s2[:j] aligned against nothing, guarded the same way.
        if s2_length > 0:
            D[0, 1] = g
            P[0, 1] = inf
        for j in range(2, s2_length + 1):
            D[0, j] = D[0, j - 1] + gap_extend_cost
            # Q is left at 0 in the first row; those cells are never read.
            P[0, j] = inf

        for i in range(1, s1_length + 1):
            for j in range(1, s2_length + 1):
                # Gap in sequence 2: open it from D one row up, or extend the
                # gap already tracked by P one row up.
                P[i, j] = max(D[i - 1, j] + g, P[i - 1, j] + gap_extend_cost)
                # Gap in sequence 1: open it from D one column to the left, or
                # extend the gap already tracked by Q one column to the left.
                Q[i, j] = max(D[i, j - 1] + g, Q[i, j - 1] + gap_extend_cost)
                # Match/mismatch continues from the diagonal neighbour.
                substitution = D[i - 1, j - 1] + fr.getSubsMatScore(
                    s1[i - 1], s2[j - 1], subst_matrix_fn, gap_extend_cost)
                D[i, j] = max(substitution, P[i, j], Q[i, j])

        optimalScore = D[s1_length][s2_length]
        return D, P, Q, optimalScore
    def buildMatrices(self, s1, s2, subst_matrix_fn, gap_cost):
        """Build the Needleman-Wunsch score matrix with a linear gap cost.

        Sequence 1 runs down the rows and sequence 2 across the columns.
        Alongside the scores a traceback matrix of shape
        (len(s1), len(s2)) is filled; each entry is a sum of direction flags:

            1 -> the best score came from a gap in sequence 1 (left cell),
            2 -> the best score came from a gap in sequence 2 (cell above),
            4 -> the best score came from a substitution (diagonal cell).

        Ties set several flags at once, so values such as 3, 5, 6 and 7
        mean that more than one direction is optimal.

        Returns:
            Tuple (traceback, optimalScore), where optimalScore is the
            bottom-right cell of the score matrix.
        """
        rows = len(s1)
        cols = len(s2)
        scores = np.zeros((rows + 1, cols + 1), dtype=int)
        traceback = np.zeros((rows, cols), dtype=int)
        scorer = FriendClass()

        # Border cells: a prefix aligned against nothing but gaps.
        for r in range(1, rows + 1):
            scores[r, 0] = scores[r - 1, 0] + gap_cost
        for c in range(1, cols + 1):
            scores[0, c] = scores[0, c - 1] + gap_cost

        for r in range(1, rows + 1):
            for c in range(1, cols + 1):
                # Gap inserted into sequence 1 (move in from the left).
                from_left = scores[r, c - 1] + gap_cost
                # Gap inserted into sequence 2 (move in from above).
                from_above = scores[r - 1, c] + gap_cost
                # Match/mismatch; the strings are 0-based, hence r - 1, c - 1.
                diagonal = scores[r - 1, c - 1] + scorer.getSubsMatScore(
                    s1[r - 1], s2[c - 1], subst_matrix_fn, gap_cost)

                best = max(from_left, from_above, diagonal)
                scores[r, c] = best

                # Independent checks (not elif) so that ties record every
                # optimal direction. traceback has no border row/column, so
                # it is indexed with r - 1, c - 1.
                flags = 0
                if from_left == best:
                    flags += 1
                if from_above == best:
                    flags += 2
                if diagonal == best:
                    flags += 4
                traceback[r - 1, c - 1] += flags

        optimalScore = scores[rows][cols]
        return traceback, optimalScore
示例#4
0
    def similarityToDistance(self, s_ab, a, b, nw, alignment, subsMat,
                             gapOpenCost):
        """Convert a pairwise similarity score into a Feng-Doolittle distance.

        The distance is -log(S_eff) with

            S_eff  = (s_ab - S_rand) / (S_max - S_rand)
            S_max  = (S(a, a) + S(b, b)) / 2
            S_rand = (1 / L) * sum_{x, y} s(x, y) * N_a(x) * N_b(y)
                     + N_g * gapOpenCost

        where x and y range over the distinct characters of a and b, N_a(x)
        and N_b(y) are their occurrence counts, L is the alignment length and
        N_g the number of '-' characters in the two aligned sequences.

        Args:
            s_ab: Similarity score of the alignment of a and b.
            a: First (unaligned) sequence.
            b: Second (unaligned) sequence.
            nw: Object whose buildMatrices(seq1, seq2, subsMat, gapOpenCost)
                returns a (traceback, score) pair.
            alignment: Indexable whose items 0 and 2 are the two aligned
                sequences. NOTE(review): item 1 is not read here; it is
                presumably the match-marker line -- confirm with the caller.
            subsMat: Forwarded to getSubsMatScore and nw.buildMatrices.
            gapOpenCost: Linear gap cost, used for S_rand and forwarded to
                the scoring helpers.

        Returns:
            The distance -log(S_eff).

        Raises:
            ValueError: If S_eff is not positive, so its logarithm is
                undefined.
        """
        fr = FriendClass()

        # 1. S_rand, the expected score of two random sequences with the same
        #    composition and length.
        L = len(alignment[0])
        N_g = alignment[0].count('-') + alignment[2].count('-')

        # Count each residue type once; plain dicts keep first-seen order so
        # the summation order is deterministic.
        counts_a = {}
        for x in a:
            counts_a[x] = counts_a.get(x, 0) + 1
        counts_b = {}
        for y in b:
            counts_b[y] = counts_b.get(y, 0) + 1

        sum_xy = 0
        for x, Na_x in counts_a.items():
            for y, Nb_y in counts_b.items():
                s_xy = fr.getSubsMatScore(x, y, subsMat, gapOpenCost)
                sum_xy += Na_x * Nb_y * s_xy
        s_ab_rand = (sum_xy / L) + (N_g * gapOpenCost)

        # 2. S_max, the mean of the two self-alignment scores. Only the score
        #    half of each result is needed.
        _, s_aa = nw.buildMatrices(a, a, subsMat, gapOpenCost)
        _, s_bb = nw.buildMatrices(b, b, subsMat, gapOpenCost)
        s_ab_max = (s_aa + s_bb) / 2

        # 3. Normalised similarity and its negative logarithm.
        s_ab_eff = (s_ab - s_ab_rand) / (s_ab_max - s_ab_rand)
        if s_ab_eff <= 0:
            raise ValueError(
                "normalized similarity must be positive to take its "
                "logarithm, got %r" % (s_ab_eff,))
        return -math.log(s_ab_eff)
示例#5
0
    def getAlignmentsFromTracebacks(self, s1, s2, subst_matrix_fn, D, P, Q,
                                    alpha, beta, g):
        """Enumerate the optimal alignments by walking D, P and Q backwards.

        The walk starts in the bottom-right cell of D and ends in cell
        (0, 0). Whenever a cell of D can be explained by more than one
        predecessor (P, Q and/or the diagonal) the partial alignment is
        duplicated and every possibility is followed.

        Args:
            s1: Sequence laid out along the rows of the matrices.
            s2: Sequence laid out along the columns of the matrices.
            subst_matrix_fn: Forwarded to FriendClass.getSubsMatScore.
            D, P, Q: Score matrices indexed as M[i, j]; cell (i, j) refers to
                the prefixes s1[:i] and s2[:j].
            alpha: Not read by this method; kept for the existing signature.
            beta: Forwarded to getSubsMatScore, tracebackInP and tracebackInQ.
            g: Forwarded to tracebackInP and tracebackInQ.

        Returns:
            A list with one entry per alignment found. Each entry is a list
            of three equally long strings: s1 with '-' for gaps, s2 with '-'
            for gaps, and a marker line holding '*' for identical characters,
            ':' for a substitution and ' ' next to a gap.

        Raises:
            ValueError: If an interior cell of D equals none of P, Q or the
                diagonal score, so the walk could never advance.
        """
        fr = FriendClass()
        # positions[k] is the [row, column] cell reached by partial alignment
        # k; traces[k] holds its three strings, built from the end backwards.
        positions = [[D.shape[0] - 1, D.shape[1] - 1]]
        traces = [["", "", ""]]

        while True:
            completed = 0
            # range() is fixed when the pass starts, so alignments appended
            # during this pass are first advanced in the next pass.
            for index in range(len(positions)):
                i, j = positions[index]
                if i == 0 and j == 0:
                    completed += 1
                    continue

                if i == 0:
                    # s1 is used up: the rest of s2 faces gaps. Matrix column
                    # j corresponds to the character s2[j - 1].
                    moves = [([i, j - 1], ('-', s2[j - 1], ' '))]
                elif j == 0:
                    # s2 is used up: the rest of s1 faces gaps.
                    moves = [([i - 1, j], (s1[i - 1], '-', ' '))]
                else:
                    via_p = D[i, j] == P[i, j]
                    via_q = D[i, j] == Q[i, j]
                    via_d = D[i, j] == D[i - 1, j - 1] + fr.getSubsMatScore(
                        s1[i - 1], s2[j - 1], subst_matrix_fn, beta)
                    diagonal = ([i - 1, j - 1],
                                (s1[i - 1], s2[j - 1],
                                 '*' if s1[i - 1] == s2[j - 1] else ':'))
                    # moves[0] continues the current alignment in place; the
                    # others are appended as new alignments, in this order.
                    moves = []
                    if via_p:
                        moves.append(
                            self._stepThroughP(s1, D, P, i, j, beta, g))
                        if via_d:
                            moves.append(diagonal)
                        if via_q:
                            moves.append(
                                self._stepThroughQ(s2, D, Q, i, j, beta, g))
                    else:
                        if via_q:
                            moves.append(
                                self._stepThroughQ(s2, D, Q, i, j, beta, g))
                        if via_d:
                            moves.append(diagonal)
                    if not moves:
                        raise ValueError(
                            "cell (%d, %d) of D matches neither P, Q nor the "
                            "diagonal score" % (i, j))

                # Every move grows a copy of the same starting strings.
                base = list(traces[index])
                for number, (cell, pieces) in enumerate(moves):
                    grown = [text + piece
                             for text, piece in zip(base, pieces)]
                    if number == 0:
                        traces[index] = grown
                        positions[index] = cell
                    else:
                        traces.append(grown)
                        positions.append(cell)

            # Finished once every alignment, including any added during this
            # pass, was already at cell (0, 0) when the pass visited it.
            if completed == len(positions):
                break

        # The strings were assembled back to front, so flip each of them.
        return [[text[::-1] for text in trace] for trace in traces]

    def _stepThroughP(self, s1, D, P, i, j, beta, g):
        """Return the move for a gap run in sequence 2 that ends at (i, j)."""
        # tracebackInP yields the row at which the walk re-enters D.
        i2 = self.tracebackInP(P, D, i, j, beta, g)
        run = i - i2
        # Matrix rows i2+1..i are s1[i2:i]; reversed because the trace
        # strings grow backwards.
        return [i2, j], (s1[i2:i][::-1], '-' * run, ' ' * run)

    def _stepThroughQ(self, s2, D, Q, i, j, beta, g):
        """Return the move for a gap run in sequence 1 that ends at (i, j)."""
        # tracebackInQ yields the column at which the walk re-enters D.
        j2 = self.tracebackInQ(Q, D, i, j, beta, g)
        run = j - j2
        # Matrix columns j2+1..j are s2[j2:j]; reversed because the trace
        # strings grow backwards.
        return [i, j2], ('-' * run, s2[j2:j][::-1], ' ' * run)