def run(self, seq_fasta_file, subst_matrix_fn, cost_gap_open, clustering):
    """Build a UPGMA/WPGMA tree for the sequences of a FASTA file.

    Every record of ``seq_fasta_file`` is read and validated, tree
    construction is delegated to ``self.UandWpgma`` and the result is
    displayed through ``self.printTree``.

    Args:
        seq_fasta_file: FASTA file handed to the parser.
        subst_matrix_fn: substitution matrix argument, forwarded unchanged
            to ``self.UandWpgma``.
        cost_gap_open: gap cost; a positive value is negated before use.
        clustering: clustering selector forwarded to ``self.UandWpgma``
            and ``self.printTree``.

    Returns:
        Tuple ``(newick, newickNoDistance, newickIds)`` taken from the
        result of ``self.UandWpgma``; the distance matrix it also returns
        is not passed on.

    Note:
        Terminates the process with ``sys.exit(1)`` when the parser yields
        no records and with ``sys.exit(11)`` when a sequence fails
        ``validateAminoSequence``.
    """
    helper = FriendClass()

    # All records (ids + sequences) found in the FASTA file.
    records = helper.parseMultSequenceFastaFile(seq_fasta_file)
    num_sequences = len(records)

    # An empty result means the file could not be parsed as FASTA.
    if num_sequences == 0:
        print(
            "You have a problem with your FASTA file. Hint: check if the first character is '>'"
        )
        sys.exit(1)  # error code 1

    # Gap costs are used as penalties, so flip the sign of positive input.
    if cost_gap_open > 0:
        print(
            "Your gap cost is positive. I assume you want it to be negative, I have added a minus"
        )
        cost_gap_open = -cost_gap_open

    ids = []
    sequences = []
    for entry in records:
        ids.append(entry.id)
        residues = str(entry.seq)  # convert from Bio.Seq.Seq to str
        sequences.append(residues)
        # Stop at the first sequence the validator rejects.
        if helper.validateAminoSequence(residues) == 0:
            print("You have invalid character(s) in your FASTA file")
            sys.exit(11)  # error code 11

    # Build the tree; which variant is built depends on 'clustering'.
    newick, newickNoDistance, _distance_matrix, newickIds = self.UandWpgma(
        ids, sequences, num_sequences, subst_matrix_fn, cost_gap_open, clustering
    )

    # Show the tree labelled with the original FASTA ids.
    self.printTree(newickIds, clustering)

    return newick, newickNoDistance, newickIds
def run(self, seq_fasta_fn):
    """Fold the first RNA sequence of a FASTA file with the Nussinov algorithm.

    The file is parsed, the first record is validated as RNA, the Nussinov
    matrix is built with ``self.build_matrix``, the traceback is run with
    ``self.tracebackInN`` and the result is rendered by ``self.printer``.
    Only the first record of the file is used; further records are ignored.

    Args:
        seq_fasta_fn: path to the FASTA file containing the sequence.

    Returns:
        Tuple ``(N, dot_bracket)`` where ``N`` is the matrix returned by
        ``self.build_matrix`` and ``dot_bracket`` is the value returned by
        ``self.printer`` (presumably the dot-bracket string of the optimal
        folding -- confirm against ``printer``). The sequence id and the
        sequence itself are not part of the return value.

    Note:
        Terminates the process with ``sys.exit(1)`` when the parser yields
        no records and with ``sys.exit(11)`` when the sequence fails
        ``validateRNASequence``.
    """
    fr = FriendClass()

    # All records (ids + sequences) found in the FASTA file.
    record = fr.parseMultSequenceFastaFile(seq_fasta_fn)

    # An empty result means the file could not be parsed as FASTA.
    if not record:
        print("You have a problem with your FASTA file. H"
              "int: check if the first character is '>'")
        sys.exit(1)  # error code 1

    id1 = record[0].id
    s1 = str(record[0].seq)  # convert from Bio.Seq.Seq to str

    # Make sure s1 doesn't contain non-RNA characters
    if fr.validateRNASequence(s1) == 0:
        print("You have invalid character(s) in your file")
        sys.exit(11)  # error code 11

    # Build the Nussinov matrix
    N = self.build_matrix(s1)

    # Run the traceback over the whole sequence (start 0, end len(s1)).
    # NOTE(review): the traceback result is not returned here; it is
    # presumably kept on the instance (trace_list / indices_list) and read
    # by self.printer -- confirm against tracebackInN.
    self.tracebackInN(N, s1, 0, len(s1))

    # Render the folding for the sequence and its id.
    dot_bracket = self.printer(s1, id1)

    return N, dot_bracket
def run(self, seq_fasta_file, subst_matrix_fn, cost_gap_open, clustering):
    """Compute a multiple sequence alignment guided by a UPGMA/WPGMA tree.

    The FASTA file is parsed and validated, a guide tree is requested from
    ``Xpgma.UandWpgma``, the distance-free Newick string is processed by
    ``self.processNewickString`` and the result is scored with
    ``self.sumOfPairs`` and displayed through ``self.printer``.

    Args:
        seq_fasta_file: FASTA file handed to the parser.
        subst_matrix_fn: substitution matrix argument; stored on
            ``self.subsMat`` and forwarded to ``Xpgma.UandWpgma``.
        cost_gap_open: gap cost; a positive value is negated before use.
            The normalised value is stored on ``self.gapOpenCost``.
        clustering: clustering selector forwarded to ``Xpgma.UandWpgma``.

    Returns:
        Tuple ``(SOP, newick, newickNoDistance)``: the value returned by
        ``self.sumOfPairs`` followed by the Newick strings with and without
        distances as returned by ``Xpgma.UandWpgma``.

    Note:
        Terminates the process with ``sys.exit(1)`` when the parser yields
        no records and with ``sys.exit(11)`` when a sequence fails
        ``validateAminoSequence``.
    """
    self.subsMat = subst_matrix_fn
    self.gapOpenCost = cost_gap_open

    fr = FriendClass()

    # All records (ids + sequences) found in the FASTA file.
    record = fr.parseMultSequenceFastaFile(seq_fasta_file)

    # An empty result means the file could not be parsed as FASTA.
    if not record:
        print(
            "You have a problem with your FASTA file. Hint: check if the first character is '>'"
        )
        sys.exit(1)  # error code 1

    # Gap costs are used as penalties, so flip the sign of positive input.
    if cost_gap_open > 0:
        print(
            "Your gap cost is positive. I assume you want it to be negative, I have added a minus"
        )
        cost_gap_open = -cost_gap_open
        # Keep the instance attribute in sync with the normalised value so
        # the tree construction and the later alignment steps use the same
        # gap cost (previously the attribute kept the positive input).
        self.gapOpenCost = cost_gap_open

    # The number of sequences is the number of parsed records.
    num_sequences = len(record)

    ids = []
    s = []
    for entry in record:
        ids.append(entry.id)
        sequence = str(entry.seq)  # convert from Bio.Seq.Seq to str
        s.append(sequence)
        # Make sure the sequence doesn't contain non-amino acid characters
        if fr.validateAminoSequence(sequence) == 0:
            print("You have invalid character(s) in your FASTA file")
            sys.exit(11)  # error code 11

    # Build the UPGMA/WPGMA tree selected by 'clustering'. Three Newick
    # strings come back: with distances, without distances (the one used
    # for the alignment below) and one carrying the original FASTA ids,
    # which is only needed for display. The distance matrix is unused here.
    gma = Xpgma()
    newick, newickNoDistance, _distance_matrix, newickIds = gma.UandWpgma(
        ids, s, num_sequences, subst_matrix_fn, cost_gap_open, clustering
    )

    # Map cluster names 'C0', 'C1', ... to the sequences in input order.
    seqClusterMap = {"C" + str(index): seq for index, seq in enumerate(s)}

    # Parse the Newick string; per the original notes this drives the
    # grouping and produces the final multiple sequence alignment.
    self.processNewickString(newickNoDistance, seqClusterMap)

    SOP = self.sumOfPairs()
    self.printer(newickIds, SOP)

    return SOP, newick, newickNoDistance