def run(self, seq1_fasta_file, seq2_fasta_file, subst_matrix_fn,
            cost_gap_open, complete_traceback):
        """Align the first sequence of each FASTA input and print the result.

        Parses both inputs through ``FriendClass.parseFastaFiles``, validates
        the first sequence of each, builds the traceback via
        ``self.buildMatrices``, derives the alignments with
        ``self.getAlignmentsFromTracebacks`` and hands them to
        ``self.printer``.

        Args:
            seq1_fasta_file: first FASTA input, forwarded to
                ``parseFastaFiles``.
            seq2_fasta_file: second FASTA input, forwarded to
                ``parseFastaFiles``.
            subst_matrix_fn: substitution matrix argument, forwarded
                unchanged to ``buildMatrices`` and ``printer``.
            cost_gap_open: gap cost. A positive value is negated (with a
                notice printed to stdout) before it is used.
            complete_traceback: when falsy, a single randomly chosen
                alignment is kept; otherwise every alignment is kept.

        Returns:
            The tuple ``(id1, s1, id2, s2, optimalScore, alignment_strings,
            num_alignments)``. ``num_alignments`` is the total number of
            alignments found, even when only one of them is kept.

        Exits the process with status 1 if either parsed record list is
        empty, 11 if the first sequence fails validation and 12 if the
        second one does.
        """
        # One helper instance serves both parsing and validation.
        fr = FriendClass()

        # record1 and record2 hold the parsed records of file 1 and file 2.
        record1, record2 = fr.parseFastaFiles(seq1_fasta_file, seq2_fasta_file)
        # An empty record list signals a FASTA file that could not be parsed.
        if not record1 or not record2:
            print(
                "You have a problem with one of your FASTA files.  Hint: check if the first character is '>'"
            )
            sys.exit(1)  # error code 1

        # Only the first record of each file takes part in the alignment.
        id1 = record1[0].id
        s1 = str(record1[0].seq)  # convert the sequence object to str
        if fr.validateAminoSequence(s1) == 0:
            print("You have invalid character(s) in your 1st file")
            sys.exit(11)  # error code 11

        id2 = record2[0].id
        s2 = str(record2[0].seq)  # convert the sequence object to str
        if fr.validateAminoSequence(s2) == 0:
            print("You have invalid character(s) in your 2nd file")
            sys.exit(12)  # error code 12

        # The gap cost is used as a negative value downstream; flip the sign
        # of a positive one and tell the user about it.
        if cost_gap_open > 0:
            print(
                "Your gap cost is positive. I assume you want it to be negative, I have added a minus"
            )
            cost_gap_open = -cost_gap_open

        traceback, optimalScore = self.buildMatrices(s1, s2, subst_matrix_fn,
                                                     cost_gap_open)
        alignment_strings = self.getAlignmentsFromTracebacks(s1, s2, traceback)
        # Remember the full count before the list is possibly reduced to one.
        num_alignments = len(alignment_strings)

        # Keep one random alignment unless the complete traceback was asked
        # for. The emptiness guard avoids an obscure "empty range" error from
        # the random module when no alignment was produced.
        if not complete_traceback and alignment_strings:
            alignment_strings = [random.choice(alignment_strings)]

        # Print the result on the console.
        self.printer(alignment_strings, num_alignments, optimalScore,
                     complete_traceback, id1, id2, s1, s2, subst_matrix_fn)
        return (id1, s1, id2, s2, optimalScore, alignment_strings,
                num_alignments)
示例#2
0
    def run(self, seq_fasta_file, subst_matrix_fn, cost_gap_open, clustering):
        """Build a tree from the sequences of one FASTA input and print it.

        The records are parsed with ``FriendClass.parseMultSequenceFastaFile``,
        every sequence is validated, and ``self.UandWpgma`` is asked to build
        the tree selected by ``clustering``. The result is shown through
        ``self.printTree``.

        Args:
            seq_fasta_file: FASTA input, forwarded to the parser.
            subst_matrix_fn: substitution matrix argument, forwarded
                unchanged to ``UandWpgma``.
            cost_gap_open: gap cost. A positive value is negated (with a
                notice printed to stdout) before it is used.
            clustering: clustering selector, forwarded to ``UandWpgma`` and
                ``printTree``.

        Returns:
            The tuple ``(newick, newickNoDistance, newickIds)`` taken from
            the values returned by ``UandWpgma``.

        Exits the process with status 1 if no record was parsed and with
        status 11 as soon as a sequence fails validation.
        """
        helper = FriendClass()

        # 'records' holds every parsed entry (id and sequence) of the file.
        records = helper.parseMultSequenceFastaFile(seq_fasta_file)
        num_sequences = len(records)
        # No records at all means the FASTA file could not be parsed.
        if num_sequences == 0:
            print(
                "You have a problem with your FASTA file.  Hint: check if the first character is '>'"
            )
            sys.exit(1)  # error code 1

        # Flip the sign of a positive gap cost and tell the user about it.
        if cost_gap_open > 0:
            print(
                "Your gap cost is positive. I assume you want it to be negative, I have added a minus"
            )
            cost_gap_open = -cost_gap_open

        # Collect ids and plain-string sequences, validating each sequence
        # right after it has been collected.
        ids = []
        sequences = []
        for entry in records:
            ids.append(entry.id)
            residues = str(entry.seq)  # convert the sequence object to str
            sequences.append(residues)
            if helper.validateAminoSequence(residues) == 0:
                print("You have invalid character(s) in your FASTA file")
                sys.exit(11)  # error code 11

        # Build the tree; the third returned value (the distance matrix) is
        # not needed here.
        tree_result = self.UandWpgma(ids, sequences, num_sequences,
                                     subst_matrix_fn, cost_gap_open,
                                     clustering)
        newick, newickNoDistance, _distance_matrix, newickIds = tree_result

        # Show the tree that carries the original FASTA ids.
        self.printTree(newickIds, clustering)
        # Hand back the Newick output with distances, the one without
        # distances, and the one carrying the original ids.
        return newick, newickNoDistance, newickIds
    def run(self, seq_fasta_file, subst_matrix_fn, cost_gap_open, clustering):
        """Run the guide-tree based multiple alignment of one FASTA input.

        The records are parsed with ``FriendClass.parseMultSequenceFastaFile``,
        every sequence is validated, a tree is built with
        ``Xpgma.UandWpgma``, and the Newick string without distances is
        handed to ``self.processNewickString`` together with a map from
        cluster names to sequences. Finally ``self.sumOfPairs`` is evaluated
        and the outcome is shown through ``self.printer``.

        Args:
            seq_fasta_file: FASTA input, forwarded to the parser.
            subst_matrix_fn: substitution matrix argument; stored in
                ``self.subsMat`` and forwarded to ``UandWpgma``.
            cost_gap_open: gap cost. A positive value is negated (with a
                notice printed to stdout); the normalised value is stored in
                ``self.gapOpenCost`` and forwarded to ``UandWpgma``.
            clustering: clustering selector, forwarded to ``UandWpgma``.

        Returns:
            The tuple ``(SOP, newick, newickNoDistance)`` where ``SOP`` is
            the value returned by ``self.sumOfPairs()``.

        Exits the process with status 1 if no record was parsed and with
        status 11 as soon as a sequence fails validation.
        """
        self.subsMat = subst_matrix_fn
        self.gapOpenCost = cost_gap_open
        fr = FriendClass()

        # 'record' holds every parsed entry (id and sequence) of the file.
        record = fr.parseMultSequenceFastaFile(seq_fasta_file)
        # No records at all means the FASTA file could not be parsed.
        if not record:
            print(
                "You have a problem with your FASTA file.  Hint: check if the first character is '>'"
            )
            sys.exit(1)  # error code 1

        # Flip the sign of a positive gap cost and tell the user about it.
        if cost_gap_open > 0:
            print(
                "Your gap cost is positive. I assume you want it to be negative, I have added a minus"
            )
            cost_gap_open = -cost_gap_open
            # Keep the stored cost in sync with the value handed to Xpgma;
            # otherwise the instance would keep the un-negated cost.
            # NOTE(review): presumably read by processNewickString /
            # sumOfPairs, which are not visible here - confirm.
            self.gapOpenCost = cost_gap_open

        num_sequences = len(record)

        # Collect ids and plain-string sequences, validating each sequence
        # right after it has been collected.
        ids = []
        s = []
        for entry in record:
            ids.append(entry.id)
            sequence = str(entry.seq)  # convert the sequence object to str
            s.append(sequence)
            if fr.validateAminoSequence(sequence) == 0:
                print("You have invalid character(s) in your FASTA file")
                sys.exit(11)  # error code 11

        # Build the tree. Of the returned values, the distance matrix is not
        # used here, newickNoDistance drives the alignment, and newickIds
        # (carrying the original FASTA ids) is only needed for display.
        gma = Xpgma()
        newick, newickNoDistance, _distance_matrix, newickIds = gma.UandWpgma(
            ids, s, num_sequences, subst_matrix_fn, cost_gap_open, clustering)

        # Map the cluster names 'C0', 'C1', ... to the sequences in file order.
        seqClusterMap = {
            'C' + str(index): sequence for index, sequence in enumerate(s)
        }

        # Parse the Newick string and build the multiple sequence alignment.
        self.processNewickString(newickNoDistance, seqClusterMap)
        SOP = self.sumOfPairs()
        self.printer(newickIds, SOP)
        return SOP, newick, newickNoDistance