예제 #1
0
 def sumOfPairs(self):
     """Return the sum-of-pairs score of the multiple sequence alignment.

     Every unordered pair of rows in ``self.MSA`` is compared column by
     column, and the per-column contributions are accumulated:

     * 'X' against 'X' contributes nothing;
     * 'X' against any other character adds ``self.gapOpenCost``;
     * two non-'X' characters add the value returned by
       ``FriendClass.getSubsMatScore`` for that pair.

     Returns:
         The accumulated score over all row pairs.
     """
     scorer = FriendClass()
     total = 0
     for row_a, first in enumerate(self.MSA):
         for row_b, second in enumerate(self.MSA):
             # Visit each unordered pair exactly once (row_a < row_b).
             if row_a >= row_b:
                 continue
             # Rows are expected to share one length; the second row
             # drives the column index.
             for col, char_b in enumerate(second):
                 char_a = first[col]
                 a_is_x = char_a == 'X'
                 b_is_x = char_b == 'X'
                 if a_is_x and b_is_x:
                     continue
                 if a_is_x or b_is_x:
                     # Exactly one side is 'X' at this column.
                     total += self.gapOpenCost
                 else:
                     # Both sides carry a residue character.
                     total += scorer.getSubsMatScore(
                         char_a, char_b, self.subsMat, self.gapOpenCost)
     return total
예제 #2
0
    def run(self, seq_fasta_file, subst_matrix_fn, cost_gap_open, clustering):
        """Parse a multi-sequence FASTA file and build a UPGMA/WPGMA tree.

        Args:
            seq_fasta_file: FASTA file handed to
                ``FriendClass.parseMultSequenceFastaFile``.
            subst_matrix_fn: substitution matrix argument forwarded to
                ``self.UandWpgma``.
            cost_gap_open: gap cost; a positive value is negated (with a
                console notice) before it is used.
            clustering: clustering selector forwarded to ``self.UandWpgma``
                and ``self.printTree``.

        Returns:
            ``(newick, newickNoDistance, newickIds)`` as produced by
            ``self.UandWpgma``.

        Exits the process with status 1 when no record could be parsed and
        with status 11 when a sequence fails ``validateAminoSequence``.
        """
        helper = FriendClass()

        # An empty record list signals a FASTA file that could not be parsed.
        records = helper.parseMultSequenceFastaFile(seq_fasta_file)
        num_sequences = len(records)
        if num_sequences == 0:
            print(
                "You have a problem with your FASTA file.  Hint: check if the first character is '>'"
            )
            sys.exit(1)  # error code 1

        # Gap costs are used as negative numbers; flip a positive one.
        if cost_gap_open > 0:
            print(
                "Your gap cost is positive. I assume you want it to be negative, I have added a minus"
            )
            cost_gap_open = -cost_gap_open

        # Collect ids and plain-string sequences, validating as we go.
        ids = []
        s = []
        for entry in records:
            ids.append(entry.id)
            sequence = str(entry.seq)  # convert the record's seq to str
            s.append(sequence)
            if helper.validateAminoSequence(sequence) == 0:
                print("You have invalid character(s) in your FASTA file")
                sys.exit(11)  # error code 11

        # Build the tree; the distance matrix it returns is not needed here.
        newick, newickNoDistance, _distance_matrix, newickIds = self.UandWpgma(
            ids, s, num_sequences, subst_matrix_fn, cost_gap_open, clustering)

        # Show the result on the console.
        self.printTree(newickIds, clustering)
        return newick, newickNoDistance, newickIds
예제 #3
0
    def buildMatrices(self, s1, s2, subst_matrix_fn, gap_open_cost,
                      gap_extend_cost, g):
        """Fill the three dynamic-programming matrices for affine gap costs.

        ``D`` holds the best score per cell, ``P`` tracks scores of paths
        that end with a vertical move (a character of ``s1`` against a gap)
        and ``Q`` tracks paths that end with a horizontal move (a character
        of ``s2`` against a gap). Sequence 1 indexes the rows and sequence 2
        the columns.

        Args:
            s1: first sequence.
            s2: second sequence.
            subst_matrix_fn: substitution matrix argument forwarded to
                ``FriendClass.getSubsMatScore``.
            gap_open_cost: accepted for interface compatibility; this
                method does not read it.
            gap_extend_cost: cost added for every extension of an existing
                gap; also forwarded as the last argument of
                ``getSubsMatScore``.
            g: cost charged for the first position of a gap.

        Returns:
            ``(D, P, Q, optimalScore)`` where ``optimalScore`` is the
            bottom-right entry of ``D``. Empty sequences are supported: the
            matrices then consist of the border row/column only.
        """
        s1_length = len(s1)
        s2_length = len(s2)
        # Large negative stand-in for "minus infinity"; marks border cells
        # of P and Q that must never win a max().
        inf = -60000
        shape = (s1_length + 1, s2_length + 1)
        D = np.zeros(shape, dtype=int)
        P = np.zeros(shape, dtype=int)
        Q = np.zeros(shape, dtype=int)
        fr = FriendClass()

        # Border initialisation. The first gap position costs g, every
        # further one gap_extend_cost. Starting both loops at 1 (instead of
        # writing D[0, 1] / D[1, 0] unconditionally) keeps the method from
        # indexing out of bounds when a sequence is empty.
        for i in range(1, s1_length + 1):
            D[i, 0] = g if i == 1 else D[i - 1, 0] + gap_extend_cost
            # P is not initialised in the first column: the recurrence
            # below never reads those cells.
            Q[i, 0] = inf
        for j in range(1, s2_length + 1):
            D[0, j] = g if j == 1 else D[0, j - 1] + gap_extend_cost
            # Q is not initialised in the first row: the recurrence below
            # never reads those cells.
            P[0, j] = inf

        for i in range(1, s1_length + 1):
            for j in range(1, s2_length + 1):
                # Vertical move: start a new gap from D one row up, or
                # extend the gap already tracked in P.
                P[i, j] = max(D[i - 1, j] + g, P[i - 1, j] + gap_extend_cost)
                # Horizontal move: start a new gap from D one column to
                # the left, or extend the gap already tracked in Q.
                Q[i, j] = max(D[i, j - 1] + g, Q[i, j - 1] + gap_extend_cost)
                # Diagonal move: match/mismatch of s1[i-1] with s2[j-1].
                substitution = D[i - 1, j - 1] + fr.getSubsMatScore(
                    s1[i - 1], s2[j - 1], subst_matrix_fn, gap_extend_cost)
                D[i, j] = max(substitution, P[i, j], Q[i, j])

        optimalScore = D[s1_length, s2_length]
        return D, P, Q, optimalScore
    def buildMatrices(self, s1, s2, subst_matrix_fn, gap_cost):
        """Fill the Needleman-Wunsch score matrix and a traceback matrix.

        Sequence 1 indexes the rows and sequence 2 the columns. Each
        traceback cell is a bit mask recording every predecessor that
        attains the cell's maximum:

        * 1 - came from the left cell (gap in sequence 1),
        * 2 - came from the cell above (gap in sequence 2),
        * 4 - came from the diagonal cell (match/mismatch).

        Sums such as 3, 5, 6 or 7 therefore denote ties between several
        directions.

        Args:
            s1: first sequence.
            s2: second sequence.
            subst_matrix_fn: substitution matrix argument forwarded to
                ``FriendClass.getSubsMatScore``.
            gap_cost: linear cost added for every gap position.

        Returns:
            ``(traceback, optimalScore)``; ``traceback`` has shape
            ``(len(s1), len(s2))`` and ``optimalScore`` is the bottom-right
            entry of the score matrix.
        """
        rows = len(s1)
        cols = len(s2)
        nw_matrix = np.zeros((rows + 1, cols + 1), dtype=int)
        traceback = np.zeros((rows, cols), dtype=int)
        fr = FriendClass()

        # Border cells: a prefix aligned purely against gaps.
        for i in range(1, rows + 1):
            nw_matrix[i, 0] = nw_matrix[i - 1, 0] + gap_cost
        for j in range(1, cols + 1):
            nw_matrix[0, j] = nw_matrix[0, j - 1] + gap_cost

        for i in range(1, rows + 1):
            for j in range(1, cols + 1):
                # Candidate scores for the three possible predecessors.
                from_left = nw_matrix[i, j - 1] + gap_cost
                from_above = nw_matrix[i - 1, j] + gap_cost
                # The strings are 0-based while i and j start at 1.
                diagonal = nw_matrix[i - 1, j - 1] + fr.getSubsMatScore(
                    s1[i - 1], s2[j - 1], subst_matrix_fn, gap_cost)

                best = max(from_left, from_above, diagonal)
                nw_matrix[i, j] = best

                # Independent checks (not elif): every direction that ties
                # for the maximum is recorded in the mask.
                directions = 0
                if from_left == best:
                    directions += 1
                if from_above == best:
                    directions += 2
                if diagonal == best:
                    directions += 4
                # The traceback matrix has no border row/column, hence
                # the shift by one.
                traceback[i - 1, j - 1] = directions

        optimalScore = nw_matrix[rows, cols]
        return traceback, optimalScore
예제 #5
0
    def run(self, seq_fasta_fn):
        """Fold the first RNA sequence of a FASTA file with Nussinov.

        Args:
            seq_fasta_fn: path to the FASTA file containing the sequence.

        Returns:
            ``(N, dot_bracket)``: the matrix returned by
            ``self.build_matrix`` and the value returned by
            ``self.printer``.

        Exits the process with status 1 when no record could be parsed and
        with status 11 when the sequence fails ``validateRNASequence``.
        """
        helper = FriendClass()

        # An empty record list signals a FASTA file that could not be parsed.
        records = helper.parseMultSequenceFastaFile(seq_fasta_fn)
        if len(records) == 0:
            print("You have a problem with your FASTA file.  Hint: check if the first character is '>'")
            sys.exit(1)  # error code 1

        # Only the first record of the file is folded.
        first = records[0]
        id1 = first.id
        s1 = str(first.seq)  # convert the record's seq to str
        if helper.validateRNASequence(s1) == 0:
            print("You have invalid character(s) in your file")
            sys.exit(11)  # error code 11

        # Build the Nussinov matrix, then trace back over the whole
        # sequence range. tracebackInN is called for its side effects;
        # its return value is not used here.
        N = self.build_matrix(s1)
        self.tracebackInN(N, s1, 0, len(s1))

        dot_bracket = self.printer(s1, id1)
        return N, dot_bracket
    def run(self, seq1_fasta_file, seq2_fasta_file, subst_matrix_fn,
            cost_gap_open, complete_traceback):
        """Align the first sequences of two FASTA files (Needleman-Wunsch).

        Args:
            seq1_fasta_file: FASTA file of the first sequence.
            seq2_fasta_file: FASTA file of the second sequence.
            subst_matrix_fn: substitution matrix argument forwarded to
                ``self.buildMatrices`` and ``self.printer``.
            cost_gap_open: gap cost; a positive value is negated (with a
                console notice) before it is used.
            complete_traceback: when equal to ``False`` a single randomly
                chosen alignment is kept, otherwise all are kept.

        Returns:
            ``(id1, s1, id2, s2, optimalScore, alignment_strings,
            num_alignments)``; ``num_alignments`` always counts all
            alignments found, even when only one is kept.

        Exits the process with status 1 when a file yields no record, 11
        when the first sequence is invalid and 12 when the second is.
        """
        fasta_helper = FriendClass()

        # Each returned list is empty when its file could not be parsed.
        record1, record2 = fasta_helper.parseFastaFiles(
            seq1_fasta_file, seq2_fasta_file)
        if len(record1) == 0 or len(record2) == 0:
            print(
                "You have a problem with one of your FASTA files.  Hint: check if the first character is '>'"
            )
            sys.exit(1)  # error code 1

        # First sequence: only the first record of the file is used.
        id1 = record1[0].id
        s1 = str(record1[0].seq)  # convert the record's seq to str
        # Fresh helper instance for the validation step.
        validator = FriendClass()
        if validator.validateAminoSequence(s1) == 0:
            print("You have invalid character(s) in your 1st file")
            sys.exit(11)  # error code 11

        # Second sequence, handled the same way.
        id2 = record2[0].id
        s2 = str(record2[0].seq)  # convert the record's seq to str
        if validator.validateAminoSequence(s2) == 0:
            print("You have invalid character(s) in your 2nd file")
            sys.exit(12)  # error code 12

        # Gap costs are used as negative numbers; flip a positive one.
        if cost_gap_open > 0:
            print(
                "Your gap cost is positive. I assume you want it to be negative, I have added a minus"
            )
            cost_gap_open = -cost_gap_open

        traceback, optimalScore = self.buildMatrices(
            s1, s2, subst_matrix_fn, cost_gap_open)
        alignment_strings = self.getAlignmentsFromTracebacks(s1, s2, traceback)
        num_alignments = len(alignment_strings)

        # Equality (not truthiness) is kept on purpose so that only values
        # comparing equal to False trigger the random pick.
        if complete_traceback == False:
            pick = random.randint(0, num_alignments - 1)
            alignment_strings = [alignment_strings[pick]]

        # Show the result on the console.
        self.printer(alignment_strings, num_alignments, optimalScore,
                     complete_traceback, id1, id2, s1, s2, subst_matrix_fn)
        return (id1, s1, id2, s2, optimalScore, alignment_strings,
                num_alignments)
예제 #7
0
    def similarityToDistance(self, s_ab, a, b, nw, alignment, subsMat,
                             gapOpenCost):
        """Convert a pairwise similarity score into a Feng-Doolittle distance.

        The distance is ``d = -ln(S_eff)`` with

            S_eff  = (S_ab - S_rand) / (S_max - S_rand)
            S_rand = (1 / L) * sum_{x, y} s(x, y) * N_a(x) * N_b(y)
                     + N_g * gapOpenCost
            S_max  = (S_aa + S_bb) / 2

        where the sum runs once over every distinct residue pair ``(x, y)``,
        ``N_a(x)`` / ``N_b(y)`` are residue counts in ``a`` / ``b``, ``L`` is
        the alignment length and ``N_g`` the number of gap characters.

        The result is deterministic: S_rand uses the closed-form
        approximation above, so no shuffling of the sequences is involved.

        Args:
            s_ab: similarity score of the alignment of ``a`` and ``b``.
            a: first (ungapped) sequence.
            b: second (ungapped) sequence.
            nw: object whose ``buildMatrices(s1, s2, subsMat, gap)`` returns
                ``(traceback, score)``; used for the self-alignment scores.
            alignment: alignment of ``a`` and ``b``; entries 0 and 2 are the
                ones searched for '-' characters.
            subsMat: substitution matrix argument forwarded to
                ``FriendClass.getSubsMatScore`` and ``nw.buildMatrices``.
            gapOpenCost: linear gap cost.

        Returns:
            The distance ``-ln(S_eff)``.

        Raises:
            ValueError: from ``math.log`` when ``S_eff`` is not positive.
        """
        from collections import Counter

        # 1. S_rand, using linear gap costs.
        L = len(alignment[0])
        N_g = alignment[0].count('-') + alignment[2].count('-')
        fr = FriendClass()
        # Count every residue once up front instead of calling str.count
        # inside the double loop.
        counts_a = Counter(a)
        counts_b = Counter(b)
        sum_xy = 0
        for x, Na_x in counts_a.items():
            for y, Nb_y in counts_b.items():
                s_xy = fr.getSubsMatScore(x, y, subsMat, gapOpenCost)
                sum_xy += Na_x * Nb_y * s_xy
        s_ab_rand = (sum_xy / L) + (N_g * gapOpenCost)

        # 2. S_max: mean of the two self-alignment scores.
        (_, s_aa) = nw.buildMatrices(a, a, subsMat, gapOpenCost)
        (_, s_bb) = nw.buildMatrices(b, b, subsMat, gapOpenCost)
        s_ab_max = (s_aa + s_bb) / 2

        # 3. Normalised similarity and its negative logarithm.
        s_ab_eff = (s_ab - s_ab_rand) / (s_ab_max - s_ab_rand)
        d = -math.log(s_ab_eff)
        return d
예제 #8
0
    def run(self, seq_fasta_file, subst_matrix_fn, cost_gap_open, clustering):
        """Build a guide tree and a multiple sequence alignment from a FASTA file.

        The sequences are parsed and validated, a UPGMA/WPGMA guide tree is
        obtained from ``Xpgma.UandWpgma``, the Newick string without
        distances is handed to ``self.processNewickString``, and finally
        the sum-of-pairs score is computed and printed.

        Args:
            seq_fasta_file: FASTA file handed to
                ``FriendClass.parseMultSequenceFastaFile``.
            subst_matrix_fn: substitution matrix argument; stored as
                ``self.subsMat``.
            cost_gap_open: gap cost; a positive value is negated (with a
                console notice). The normalised value is stored as
                ``self.gapOpenCost``.
            clustering: clustering selector forwarded to
                ``Xpgma.UandWpgma``.

        Returns:
            ``(SOP, newick, newickNoDistance)``: the sum-of-pairs score and
            the Newick strings with and without distances.

        Exits the process with status 1 when no record could be parsed and
        with status 11 when a sequence fails ``validateAminoSequence``.
        """
        self.subsMat = subst_matrix_fn
        fr = FriendClass()

        # An empty record list signals a FASTA file that could not be parsed.
        record = fr.parseMultSequenceFastaFile(seq_fasta_file)
        if len(record) == 0:
            print(
                "You have a problem with your FASTA file.  Hint: check if the first character is '>'"
            )
            sys.exit(1)  # error code 1

        # Gap costs are used as negative numbers; flip a positive one.
        if cost_gap_open > 0:
            print(
                "Your gap cost is positive. I assume you want it to be negative, I have added a minus"
            )
            cost_gap_open = -cost_gap_open
        # Store the gap cost only after normalisation, so that methods
        # reading self.gapOpenCost (e.g. sumOfPairs) use the same sign as
        # the guide-tree construction below.
        self.gapOpenCost = cost_gap_open

        num_sequences = len(record)
        ids = []
        s = []
        for i in range(0, num_sequences):
            ids.append(record[i].id)
            s.append(str(record[i].seq))  # convert the record's seq to str
            if fr.validateAminoSequence(s[i]) == 0:
                print("You have invalid character(s) in your FASTA file")
                sys.exit(11)  # error code 11

        # Guide tree. Of the returned values, the distance matrix is not
        # needed here; newickIds is only used for display.
        gma = Xpgma()
        newick, newickNoDistance, _distance_matrix, newickIds = gma.UandWpgma(
            ids, s, num_sequences, subst_matrix_fn, cost_gap_open, clustering)

        # Map cluster names 'C0', 'C1', ... to the sequences in input order.
        seqClusterMap = {'C' + str(cl): seq for cl, seq in enumerate(s)}

        # Parse the Newick string and run the follow-up alignment steps.
        self.processNewickString(newickNoDistance, seqClusterMap)
        SOP = self.sumOfPairs()
        self.printer(newickIds, SOP)
        return SOP, newick, newickNoDistance
예제 #9
0
    def getAlignmentsFromTracebacks(self, s1, s2, subst_matrix_fn, D, P, Q,
                                    alpha, beta, g):
        """This function takes as input the matrices D, P and Q created in an earlier function. It
        computes the traceback by essentially reversing the process of building the matrices.
         It returns a list of lists containing the alignment."""

        indices_list = [[]]
        trace_list = [[]]
        fr = FriendClass()
        # Set i and j to the index of the last row and column of the ndarray D respectively
        i = D.shape[0] - 1
        j = D.shape[1] - 1
        indices_list[0] = [i, j]
        trace_list[0] = ["", "", ""]
        indices_duplicate = copy.deepcopy(
            indices_list
        )  # A copy of indices list is needed for going through the for loop below
        while True:
            completed_counter = 0  #This counter will be set to the number of tracebacks found.
            for index, [i, j] in enumerate(indices_duplicate):

                if i == 0 and j == 0:
                    # We reach here only when we have got the complete sequence
                    completed_counter += 1  #increment indicates that we have got 1 more complete traceback
                    continue

                if i == 0 and j >= 0:
                    # We reach here only when s1 has reached the beginning of the sequence
                    trace_list[index][0] += '-'
                    trace_list[index][1] += s2[j]
                    trace_list[index][2] += ' '
                    indices_list[index][1] -= 1
                    continue

                if i >= 0 and j == 0:
                    # We reach here only when s2 has reached the beginning of the sequence
                    trace_list[index][0] += s1[i]
                    trace_list[index][1] += '-'
                    trace_list[index][2] += ' '
                    indices_list[index][0] -= 1
                    continue
                # indicates that the value in D[i,j] came from P, Q and D
                if D[i, j] == P[i, j] and D[i, j] == Q[i, j] and D[
                        i, j] == D[i - 1, j - 1] + fr.getSubsMatScore(
                            s1[i - 1], s2[j - 1], subst_matrix_fn, beta):
                    trace_list.append(
                        copy.deepcopy(trace_list[index])
                    )  # we need to split the trace_list sublist into 3 lists
                    trace_list.append(copy.deepcopy(trace_list[index]))
                    indices_list.append(copy.deepcopy(
                        indices_list[index]))  #first copy
                    indices_list.append(copy.deepcopy(
                        indices_list[index]))  #second copy
                    # treat traceback[index] as the list where the traceback has come from P[i,j]
                    # i2 is the index after calling tracebackInP: after traversing through the
                    # P matrix and returning to the D matrix. This corresponds to creating a gap, extending
                    # it to a certain length, and closing the gap.
                    i2 = self.tracebackInP(P, D, indices_list[index][0],
                                           indices_list[index][1], beta, g)
                    # diff_i gives the number of the gap we have to insert in sequence 2, and in
                    # trace_list[index][1] . It is also used to decrement the corresponding indices_list row index.
                    diff_i = i - i2
                    indices_list[index][0] -= diff_i
                    trace_list[index][0] += s1[i2:i]
                    trace_list[index][1] += '-' * diff_i
                    trace_list[index][2] += ' ' * diff_i

                    # treat traceback[second] as the list where the traceback has come from Q[i,j]
                    # second will store the index of the newly duplicated list
                    # (it will always be at the end because that's how append works)
                    second = len(trace_list) - 1
                    # j2 is the index after calling tracebackInQ: after traversing through the
                    # Q matrix and returning to the D matrix. This corresponds to creating a gap, extending
                    # it to a certain length and closing the gap.
                    j2 = self.tracebackInQ(Q, D, indices_list[index][0],
                                           indices_list[index][1], beta, g)
                    # diff_j gives the number of gaps we have to insert in sequence 1, and in
                    # trace_list[second][1] . It is also used to decrement the corresponding indices_list col index.
                    diff_j = j - j2
                    indices_list[second][1] -= diff_j
                    trace_list[second][0] += '-' * diff_j
                    # Note that the indices for s2/s1 are one less than that for the D, P and Q matrices
                    # Also, while adding to the trace_list, we need to reverse the string between j2 and j as we always add to
                    # trace_list in the reverse order
                    trace_list[second][1] += s2[j2:j][::-1]
                    trace_list[second][2] += ' ' * diff_j

                    # treat traceback[third] as the list where the traceback has come from D[i-1,j-1] (decrease i and j)
                    third = len(trace_list) - 2
                    trace_list[third][0] += s1[i - 1]
                    trace_list[third][1] += s2[j - 1]
                    if s1[i - 1] == s2[j - 1]:
                        trace_list[third][2] += '*'
                    else:
                        trace_list[third][2] += ':'
                    indices_list[third][0] -= 1
                    indices_list[third][1] -= 1

                # The value in D[i,j] came from P and Q, not from D.
                elif D[i, j] == P[i, j] and D[i, j] == Q[i, j]:
                    trace_list.append(
                        copy.deepcopy(trace_list[index])
                    )  # we need to split the trace_list sublist into 3 lists
                    indices_list.append(copy.deepcopy(
                        indices_list[index]))  #copy
                    # treat traceback[index] as the list where the traceback has come from P[i,j]
                    # i2 is the index after calling tracebackInP: after traversing through the
                    # P matrix and returning to the D matrix. This corresponds to creating a gap, extending
                    # it to a certain length, and closing the gap.
                    i2 = self.tracebackInP(P, D, indices_list[index][0],
                                           indices_list[index][1], beta, g)
                    # diff_i gives the number of the gap we have to insert in sequence 2, and in
                    # trace_list[index][1] . It is also used to decrement the corresponding indices_list row index.
                    diff_i = i - i2
                    indices_list[index][0] -= diff_i
                    # Note that the indices for s2/s1 are one less than that for the D, P and Q matrices
                    # Also, while adding to the trace_list, we need to reverse the string between i2 and i as we always add to
                    # trace_list in the reverse order
                    trace_list[index][0] += s1[i2:i][::-1]
                    trace_list[index][1] += '-' * diff_i
                    trace_list[index][2] += ' ' * diff_i

                    # treat traceback[second] as the list where the traceback has come from Q[i,j]
                    # second will store the index of the newly duplicated list
                    # (it will always be at the end because that's how append works)
                    second = len(trace_list) - 1
                    # j2 is the index after calling tracebackInQ: after traversing through the
                    # Q matrix and returning to the D matrix. This corresponds to creating a gap, extending
                    # it to a certain length and closing the gap.
                    j2 = self.tracebackInQ(Q, D, indices_list[index][0],
                                           indices_list[index][1], beta, g)
                    # diff_j gives the number of gaps we have to insert in sequence 1, and in
                    # trace_list[second][1] . It is also used to decrement the corresponding indices_list col index.
                    diff_j = j - j2
                    indices_list[second][1] -= diff_j
                    trace_list[second][0] += '-' * diff_j
                    # Note that the indices for s2/s1 are one less than that for the D, P and Q matrices
                    # Also, while adding to the trace_list, we need to reverse the string between j2 and j as we always add to
                    # trace_list in the reverse order
                    trace_list[second][1] += s2[j2:j][::-1]
                    trace_list[second][2] += ' ' * diff_j

                # D[i,j] came from P and D
                elif D[i, j] == P[i, j] and D[
                        i, j] == D[i - 1, j - 1] + fr.getSubsMatScore(
                            s1[i - 1], s2[j - 1], subst_matrix_fn, beta):
                    trace_list.append(
                        copy.deepcopy(trace_list[index])
                    )  # we need to split the trace_list sublist into 3 lists
                    indices_list.append(copy.deepcopy(
                        indices_list[index]))  #copy
                    # treat traceback[index] as the list where the traceback has come from P[i,j]
                    # i2 is the index after calling tracebackInP: after traversing through the
                    # P matrix and returning to the D matrix. This corresponds to creating a gap, extending
                    # it to a certain length, and closing the gap.
                    i2 = self.tracebackInP(P, D, indices_list[index][0],
                                           indices_list[index][1], beta, g)
                    # diff_i gives the number of the gap we have to insert in sequence 2, and in
                    # trace_list[index][1] . It is also used to decrement the corresponding indices_list row index.
                    diff_i = i - i2
                    indices_list[index][0] -= diff_i
                    # Note that the indices for s2/s1 are one less than that for the D, P and Q matrices
                    # Also, while adding to the trace_list, we need to reverse the string between i2 and i as we always add to
                    # trace_list in the reverse order
                    trace_list[index][0] += s1[i2:i][::-1]
                    trace_list[index][1] += '-' * diff_i
                    trace_list[index][2] += ' ' * diff_i

                    # treat traceback[second] as the list where the traceback has come from D[i-1,j-1] (decrease i and j)
                    second = len(trace_list) - 1
                    trace_list[second][0] += s1[i - 1]
                    trace_list[second][1] += s2[j - 1]
                    if s1[i - 1] == s2[j - 1]:
                        trace_list[second][2] += '*'
                    else:
                        trace_list[second][2] += ':'
                    indices_list[second][0] -= 1
                    indices_list[second][1] -= 1

                # D[i,j] came from D and Q.
                elif D[i, j] == Q[i, j] and D[
                        i, j] == D[i - 1, j - 1] + fr.getSubsMatScore(
                            s1[i - 1], s2[j - 1], subst_matrix_fn, beta):
                    trace_list.append(
                        copy.deepcopy(trace_list[index])
                    )  # we need to split the trace_list sublist into 3 lists
                    indices_list.append(copy.deepcopy(
                        indices_list[index]))  #copy
                    # treat traceback[index] as the list where the traceback has come from P[i,j]
                    # j2 is the index after calling tracebackInQ: after traversing through the
                    # Q matrix and returning to the D matrix. This corresponds to creating a gap, extending
                    # it to a certain length and closing the gap.
                    j2 = self.tracebackInQ(Q, D, indices_list[index][0],
                                           indices_list[index][1], beta, g)
                    # diff_j gives the number of gaps we have to insert in sequence 1, and in
                    # trace_list[index][1] . It is also used to decrement the corresponding indices_list col index.
                    diff_j = j - j2
                    indices_list[index][1] -= diff_j
                    trace_list[index][0] += '-' * diff_j
                    # Note that the indices for s2/s1 are one less than that for the D, P and Q matrices
                    # Also, while adding to the trace_list, we need to reverse the string between j2 and j as we always add to
                    # trace_list in the reverse order
                    trace_list[index][1] += s2[j2:j][::-1]
                    trace_list[index][2] += ' ' * diff_j

                    # treat traceback[second] as the list where the traceback has come from D[i-1,j-1] (decrease i and j)
                    second = len(trace_list) - 1
                    trace_list[second][0] += s1[i - 1]
                    trace_list[second][1] += s2[j - 1]
                    if s1[i - 1] == s2[j - 1]:
                        trace_list[second][2] += '*'
                    else:
                        trace_list[second][2] += ':'
                    indices_list[second][0] -= 1
                    indices_list[second][1] -= 1

                # D[i,j] came from only P
                # Indicates that a gap has been added in Sequence 2, so we have to decrement i

                elif D[i, j] == P[i, j]:
                    # i2 is the index after calling tracebackInP: after traversing through the
                    # P matrix and returning to the D matrix. This corresponds to creating a gap, extending
                    # it to a certain length, and closing the gap.
                    i2 = self.tracebackInP(P, D, indices_list[index][0],
                                           indices_list[index][1], beta, g)
                    # diff_i gives the number of the gap we have to insert in sequence 2, and in
                    # trace_list[index][1] . It is also used to decrement the corresponding indices_list row index.
                    diff_i = i - i2
                    indices_list[index][0] -= diff_i
                    # Note that the indices for s2/s1 are one less than that for the D, P and Q matrices
                    # Also, while adding to the trace_list, we need to reverse the string between i2 and i as we always add to
                    # trace_list in the reverse order
                    trace_list[index][0] += s1[i2:i][::-1]
                    trace_list[index][1] += '-' * diff_i
                    trace_list[index][2] += ' ' * diff_i

                # D[i,j] came from only Q.
                # Indicates that a gap has been added in Sequence 1, so we have to decrement j
                elif D[i, j] == Q[i, j]:
                    # j2 is the index after calling tracebackInQ: after traversing through the
                    # Q matrix and returning to the D matrix. This corresponds to creating a gap, extending
                    # it to a certain length and closing the gap.
                    j2 = self.tracebackInQ(Q, D, indices_list[index][0],
                                           indices_list[index][1], beta, g)
                    # diff_j gives the number of gaps we have to insert in sequence 1, i.e. in
                    # trace_list[index][0]. It is also used to decrement the corresponding indices_list col index.
                    diff_j = j - j2
                    indices_list[index][1] -= diff_j
                    trace_list[index][0] += '-' * diff_j
                    # Note that the indices for s2/s1 are one less than that for the D, P and Q matrices
                    # Also, while adding to the trace_list, we need to reverse the string between j2 and j as we always add to
                    # trace_list in the reverse order
                    trace_list[index][1] += s2[j2:j][::-1]
                    trace_list[index][2] += ' ' * diff_j

                # D[i,j] came from D[i-1,j-1]
                # Indicates a substitution
                elif D[i, j] == D[i - 1, j - 1] + fr.getSubsMatScore(
                        s1[i - 1], s2[j - 1], subst_matrix_fn, beta):
                    trace_list[index][0] += s1[i - 1]
                    trace_list[index][1] += s2[j - 1]
                    if s1[i - 1] == s2[j - 1]:
                        trace_list[index][2] += '*'
                    else:
                        trace_list[index][2] += ':'
                    indices_list[index][0] -= 1
                    indices_list[index][1] -= 1

            # indices_duplicate, the for loop variable, needs to store the updated value of indices_list before the next loop starts
            indices_duplicate = copy.deepcopy(indices_list)
            # when the number of indices (same as no. of tracebacks) is equal to the 'done counter', which is incremented once for
            # each traceback, we can break out of the while(True) infinite loop
            if completed_counter == len(indices_duplicate):
                break
        # As trace_list contains all the strings (S1, S2 and connect) in the opposite order, they need to be reversed.
        alignment_strings = [[string[::-1] for string in trace]
                             for trace in trace_list]
        return alignment_strings
예제 #10
0
                               and alignment2[index] != 'X')
                              or (alignment1[index] != 'X'
                                  and alignment2[index] == 'X')):
                            sumOfPairs += gapOpenCost
                        # if both of the alignments have amino-acid characters
                        else:
                            sumOfPairs += fr.getSubsMatScore(
                                alignment1[index], alignment2[index], subsMat,
                                gapOpenCost)
        return sumOfPairs


if __name__ == '__main__':

    sop = sumOfPairs()
    fr = FriendClass()
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "subsMatrixType",
        choices=["pam250", "blosum62"],
        help="Choose if you want to use a PAM250 or BLOSUM62 substitution"
        " matrix for calculating match/mismatch score")
    parser.add_argument("gapOpenCost",
                        type=int,
                        help="Specify the cost of opening a gap")
    args = parser.parse_args()

    # Hard code the alignments for testing, as Needleman Wunsch otherwise
    # gives randomized alignments.
    MSA_PAM = [
        "---MEEPQSDPSVEPPLSQETFSDLWKLLPENNVLSPLPSQAMDDLMLSPDDIEQWFTEDPGPDEAPRMPEAA",