示例#1
0
    def run(self, seq_fasta_file, subst_matrix_fn, cost_gap_open, clustering):
        """Build a UPGMA/WPGMA tree from the sequences of a FASTA file.

        Every record of ``seq_fasta_file`` is read and validated, tree
        construction is delegated to ``self.UandWpgma`` and the result is
        shown through ``self.printTree``.

        Args:
            seq_fasta_file: FASTA input handed to
                ``FriendClass.parseMultSequenceFastaFile``.
            subst_matrix_fn: substitution-matrix argument, forwarded
                unchanged to ``self.UandWpgma``.
            cost_gap_open: gap cost; a positive value is negated before use.
            clustering: clustering selector, forwarded to
                ``self.UandWpgma`` and ``self.printTree``.

        Returns:
            The tuple ``(newick, newickNoDistance, newickIds)`` taken from
            the result of ``self.UandWpgma``.

        The process exits with status 1 when no record could be parsed and
        with status 11 when a sequence fails ``validateAminoSequence``.
        """
        helper = FriendClass()

        # The parser hands back a list of records (ids + sequences); an empty
        # list means the FASTA file could not be read properly.
        records = helper.parseMultSequenceFastaFile(seq_fasta_file)
        if len(records) == 0:
            print(
                "You have a problem with your FASTA file.  Hint: check if the first character is '>'"
            )
            sys.exit(1)  # error code 1

        # Gap costs are handled as non-positive numbers: flip a positive one.
        if cost_gap_open > 0:
            print(
                "Your gap cost is positive. I assume you want it to be negative, I have added a minus"
            )
            cost_gap_open = -cost_gap_open

        record_count = len(records)

        # Collect ids and plain-string sequences, validating as we go so the
        # run stops at the first offending sequence.
        seq_ids = []
        sequences = []
        for entry in records:
            seq_ids.append(entry.id)
            residues = str(entry.seq)  # Bio.Seq.Seq -> str
            sequences.append(residues)
            if helper.validateAminoSequence(residues) == 0:
                print("You have invalid character(s) in your FASTA file")
                sys.exit(11)  # error code 11

        # UandWpgma picks UPGMA or WPGMA according to 'clustering'. The
        # distance matrix it returns is not needed here.
        tree = self.UandWpgma(
            seq_ids, sequences, record_count, subst_matrix_fn, cost_gap_open,
            clustering)
        newick, newick_plain, _distance_matrix, newick_ids = tree

        # Show the tree labelled with the original FASTA ids.
        self.printTree(newick_ids, clustering)

        # Hand back the Newick string with distances, the one without
        # distances and the one carrying the original ids.
        return newick, newick_plain, newick_ids
示例#2
0
    def run(self, seq_fasta_fn):
        """
        Fold RNA with Nussinov algorithm.

        Only the first record of the FASTA file is folded; any further
        records are ignored.

        Args:
            seq_fasta_fn: path to fasta file containing sequence

        Returns:
            tuple of
            (N: the matrix returned by ``self.build_matrix`` for the sequence,
             dot_bracket: the value returned by ``self.printer`` --
                 presumably the dot-bracket string of the optimal folding;
                 confirm against ``printer``)

        Exits the process with status 1 when no record could be parsed and
        with status 11 when the sequence fails ``validateRNASequence``.
        """
        fr = FriendClass()
        # record is a list containing the sequences and ids in the fasta
        # file; an empty list means the file could not be parsed.
        record = fr.parseMultSequenceFastaFile(seq_fasta_fn)
        if len(record) == 0:
            print("You have a problem with your FASTA file.  "
                  "Hint: check if the first character is '>'")
            sys.exit(1)  # error code 1
        id1 = record[0].id
        s1 = str(record[0].seq)  # convert from Bio.Seq.Seq to str
        # Make sure s1 doesn't contain non-RNA characters
        if fr.validateRNASequence(s1) == 0:
            print("You have invalid character(s) in your file")
            sys.exit(11)  # error code 11

        # Build the Nussinov matrix
        N = self.build_matrix(s1)
        # The traceback is started with i = 0 and j = len(s1).
        # NOTE(review): this assumes tracebackInN accepts len(s1) as the upper
        # index -- confirm against build_matrix / tracebackInN.
        i = 0
        j = len(s1)
        # tracebackInN returns nothing that is used here; according to the
        # original note its result is kept in trace_list and indices_list.
        self.tracebackInN(N, s1, i, j)
        # printer receives the sequence and its id; whatever it returns is
        # handed back to the caller together with the matrix.
        dot_bracket = self.printer(s1, id1)
        return N, dot_bracket
    def run(self, seq_fasta_file, subst_matrix_fn, cost_gap_open, clustering):
        """Compute a multiple sequence alignment guided by a UPGMA/WPGMA tree.

        The sequences of ``seq_fasta_file`` are parsed and validated, a guide
        tree is built with ``Xpgma.UandWpgma``, the distance-free Newick
        string is processed by ``self.processNewickString`` and the result is
        scored with ``self.sumOfPairs`` and shown through ``self.printer``.

        Args:
            seq_fasta_file: FASTA input handed to
                ``FriendClass.parseMultSequenceFastaFile``.
            subst_matrix_fn: substitution-matrix argument; stored in
                ``self.subsMat`` and forwarded to ``Xpgma.UandWpgma``.
            cost_gap_open: gap cost; a positive value is negated. The value
                actually used is stored in ``self.gapOpenCost``.
            clustering: clustering selector forwarded to ``Xpgma.UandWpgma``.

        Returns:
            The tuple ``(SOP, newick, newickNoDistance)`` where ``SOP`` is the
            result of ``self.sumOfPairs()`` and the two Newick strings come
            from ``Xpgma.UandWpgma``.

        Exits the process with status 1 when no record could be parsed and
        with status 11 when a sequence fails ``validateAminoSequence``.
        """

        self.subsMat = subst_matrix_fn
        self.gapOpenCost = cost_gap_open
        fr = FriendClass()
        # record is a list containing the sequences and ids in
        # the fasta file.
        record = fr.parseMultSequenceFastaFile(seq_fasta_file)
        # if there is a problem with the fasta file, list(SeqIO.parse) returns an empty list
        if len(record) == 0:
            print(
                "You have a problem with your FASTA file.  Hint: check if the first character is '>'"
            )
            sys.exit(1)  # error code 1

        # If gap cost is positive, take the additive inverse.
        if cost_gap_open > 0:
            print(
                "Your gap cost is positive. I assume you want it to be negative, I have added a minus"
            )
            cost_gap_open = -cost_gap_open
            # Keep the instance attribute in sync with the corrected value;
            # otherwise the object would retain the positive cost while the
            # guide tree below is built with the negative one.
            self.gapOpenCost = cost_gap_open

        # The number of sequences is obtained from the length of the list 'record'
        num_sequences = len(record)

        ids = []
        s = []
        for rec in record:
            ids.append(rec.id)
            seq_str = str(rec.seq)  # convert from Bio.Seq.Seq to str
            s.append(seq_str)
            # Make sure the sequence doesn't contain non-amino acid characters
            if fr.validateAminoSequence(seq_str) == 0:
                print("You have invalid character(s) in your FASTA file")
                sys.exit(11)  # error code 11

        # Call the function in Xpgma which will call other functions to create
        # a UPGMA/WPGMA tree based on the 'clustering' argument that is sent.
        # 2 Newick format output strings are returned: with and without
        # distances. Only the one without distances is processed further. The
        # 3rd Newick format contains the original IDs in the fasta file, and
        # is only needed for display. The distance matrix is not used here.
        gma = Xpgma()
        newick, newickNoDistance, _distanceMatrix, newickIds = gma.UandWpgma(
            ids, s, num_sequences, subst_matrix_fn, cost_gap_open, clustering)
        # seqClusterMap is a dict with cluster names ('C0', 'C1', ...) as keys
        # and the corresponding sequences, in input order, as values.
        seqClusterMap = {'C' + str(cl): seq for cl, seq in enumerate(s)}
        # Call a function which will read and parse the Newick string, and
        # will internally call other functions to create groups and get
        # the final multiple sequence alignment.
        self.processNewickString(newickNoDistance, seqClusterMap)
        SOP = self.sumOfPairs()
        self.printer(newickIds, SOP)
        return SOP, newick, newickNoDistance