def run(self, seq1_fasta_file, seq2_fasta_file, subst_matrix_fn,
        cost_gap_open, complete_traceback):
    """Align the first sequence of two FASTA files and print the result.

    Parses both FASTA files, validates the two sequences, builds the
    Needleman-Wunsch and traceback matrices through ``buildMatrices``,
    derives the alignments through ``getAlignmentsFromTracebacks`` and
    hands everything to ``printer``.

    Args:
        seq1_fasta_file: First FASTA input, forwarded to
            ``FriendClass.parseFastaFiles``.
        seq2_fasta_file: Second FASTA input, forwarded to
            ``FriendClass.parseFastaFiles``.
        subst_matrix_fn: Substitution matrix argument, forwarded unchanged
            to ``buildMatrices`` and ``printer``.
        cost_gap_open: Gap cost. A positive value is negated (with a
            notice on stdout) before it is used.
        complete_traceback: When falsy, only one alignment, chosen at
            random, is kept; otherwise all alignments are kept.

    Returns:
        The tuple ``(id1, s1, id2, s2, optimalScore, alignment_strings,
        num_alignments)``. ``num_alignments`` counts every alignment that
        was found, even when only one of them is kept.

    Exits:
        ``sys.exit(1)`` if either FASTA file yields no records,
        ``sys.exit(11)`` / ``sys.exit(12)`` if the first / second sequence
        fails ``FriendClass.validateAminoSequence``.
    """
    # One helper instance serves both parsing and validation.
    fr = FriendClass()

    # record1 and record2 hold the records of FASTA file 1 and 2.
    # A broken FASTA file results in an empty list.
    record1, record2 = fr.parseFastaFiles(seq1_fasta_file, seq2_fasta_file)
    if not record1 or not record2:
        print(
            "You have a problem with one of your FASTA files. Hint: check if the first character is '>'"
        )
        sys.exit(1)  # error code 1

    # Only the first record of each file takes part in the alignment.
    id1 = record1[0].id
    s1 = str(record1[0].seq)  # convert from Bio.Seq.Seq to str
    if fr.validateAminoSequence(s1) == 0:
        print("You have invalid character(s) in your 1st file")
        sys.exit(11)  # error code 11

    id2 = record2[0].id
    s2 = str(record2[0].seq)  # convert from Bio.Seq.Seq to str
    if fr.validateAminoSequence(s2) == 0:
        print("You have invalid character(s) in your 2nd file")
        sys.exit(12)  # error code 12

    # Gap costs are used as negative values; flip a positive one.
    if cost_gap_open > 0:
        print(
            "Your gap cost is positive. I assume you want it to be negative, I have added a minus"
        )
        cost_gap_open = -cost_gap_open

    traceback_data, optimalScore = self.buildMatrices(
        s1, s2, subst_matrix_fn, cost_gap_open)
    alignment_strings = self.getAlignmentsFromTracebacks(
        s1, s2, traceback_data)
    num_alignments = len(alignment_strings)

    if not complete_traceback:
        # randint is kept (rather than random.choice) so that seeded runs
        # keep selecting the same alignment as before.
        chosen = random.randint(0, num_alignments - 1)
        alignment_strings = [alignment_strings[chosen]]

    # Print the alignment(s) on the console.
    self.printer(alignment_strings, num_alignments, optimalScore,
                 complete_traceback, id1, id2, s1, s2, subst_matrix_fn)

    return (id1, s1, id2, s2, optimalScore, alignment_strings,
            num_alignments)
def run(self, seq_fasta_file, subst_matrix_fn, cost_gap_open, clustering):
    """Build a UPGMA/WPGMA tree for the sequences of one FASTA file.

    The FASTA file is parsed, every sequence is validated, the tree is
    built through ``UandWpgma`` (the variant is selected by ``clustering``)
    and the tree is printed through ``printTree``.

    Args:
        seq_fasta_file: FASTA input, forwarded to
            ``FriendClass.parseMultSequenceFastaFile``.
        subst_matrix_fn: Substitution matrix argument, forwarded unchanged
            to ``UandWpgma``.
        cost_gap_open: Gap cost. A positive value is negated (with a
            notice on stdout) before it is used.
        clustering: Clustering selector, forwarded to ``UandWpgma`` and
            ``printTree``.

    Returns:
        The tuple ``(newick, newickNoDistance, newickIds)``: the Newick
        output with distances, the one without distances, and the one
        using the original FASTA ids.

    Exits:
        ``sys.exit(1)`` if the FASTA file yields no records,
        ``sys.exit(11)`` if a sequence fails
        ``FriendClass.validateAminoSequence``.
    """
    helper = FriendClass()

    # A broken FASTA file results in an empty list of records.
    records = helper.parseMultSequenceFastaFile(seq_fasta_file)
    if not len(records):
        print(
            "You have a problem with your FASTA file. Hint: check if the first character is '>'"
        )
        sys.exit(1)  # error code 1

    # Gap costs are used as negative values; flip a positive one.
    if cost_gap_open > 0:
        print(
            "Your gap cost is positive. I assume you want it to be negative, I have added a minus"
        )
        cost_gap_open = -cost_gap_open

    num_sequences = len(records)

    # Collect ids and plain-string sequences, stopping at the first
    # sequence that contains a non-amino-acid character.
    ids = []
    sequences = []
    for entry in records:
        ids.append(entry.id)
        sequence = str(entry.seq)  # convert from Bio.Seq.Seq to str
        sequences.append(sequence)
        if helper.validateAminoSequence(sequence) == 0:
            print("You have invalid character(s) in your FASTA file")
            sys.exit(11)  # error code 11

    # The distance matrix is returned as well but is not needed here.
    newick, newick_no_distance, _distance_matrix, newick_ids = self.UandWpgma(
        ids, sequences, num_sequences, subst_matrix_fn, cost_gap_open,
        clustering)

    # Print the tree on the console.
    self.printTree(newick_ids, clustering)

    return newick, newick_no_distance, newick_ids
def run(self, seq_fasta_file, subst_matrix_fn, cost_gap_open, clustering):
    """Align all sequences of one FASTA file along a UPGMA/WPGMA guide tree.

    The FASTA file is parsed and validated, a guide tree is built through
    ``Xpgma.UandWpgma``, the distance-free Newick string is processed by
    ``processNewickString`` and the sum-of-pairs score from ``sumOfPairs``
    is printed through ``printer``.

    Args:
        seq_fasta_file: FASTA input, forwarded to
            ``FriendClass.parseMultSequenceFastaFile``.
        subst_matrix_fn: Substitution matrix argument; stored in
            ``self.subsMat`` and forwarded to ``Xpgma.UandWpgma``.
        cost_gap_open: Gap cost. A positive value is negated (with a
            notice on stdout); the normalised value is stored in
            ``self.gapOpenCost`` and forwarded to ``Xpgma.UandWpgma``.
        clustering: Clustering selector, forwarded to ``Xpgma.UandWpgma``.

    Returns:
        The tuple ``(SOP, newick, newickNoDistance)``: the value returned
        by ``sumOfPairs``, the Newick output with distances and the one
        without distances.

    Exits:
        ``sys.exit(1)`` if the FASTA file yields no records,
        ``sys.exit(11)`` if a sequence fails
        ``FriendClass.validateAminoSequence``.
    """
    self.subsMat = subst_matrix_fn
    self.gapOpenCost = cost_gap_open

    fr = FriendClass()

    # A broken FASTA file results in an empty list of records.
    record = fr.parseMultSequenceFastaFile(seq_fasta_file)
    if not record:
        print(
            "You have a problem with your FASTA file. Hint: check if the first character is '>'"
        )
        sys.exit(1)  # error code 1

    # Gap costs are used as negative values; flip a positive one.
    if cost_gap_open > 0:
        print(
            "Your gap cost is positive. I assume you want it to be negative, I have added a minus"
        )
        cost_gap_open = -cost_gap_open
        # Keep the stored cost in sync with the normalised value so that
        # code reading self.gapOpenCost sees the same cost the guide tree
        # was built with.
        self.gapOpenCost = cost_gap_open

    num_sequences = len(record)

    # Collect ids and plain-string sequences, stopping at the first
    # sequence that contains a non-amino-acid character.
    ids = []
    s = []
    for entry in record:
        ids.append(entry.id)
        sequence = str(entry.seq)  # convert from Bio.Seq.Seq to str
        s.append(sequence)
        if fr.validateAminoSequence(sequence) == 0:
            print("You have invalid character(s) in your FASTA file")
            sys.exit(11)  # error code 11

    # Build the guide tree. Of the returned values, the Newick string
    # without distances drives the alignment, the one with the original
    # FASTA ids is only used for display, and the distance matrix is
    # not needed here.
    gma = Xpgma()
    newick, newickNoDistance, _distance_matrix, newickIds = gma.UandWpgma(
        ids, s, num_sequences, subst_matrix_fn, cost_gap_open, clustering)

    # Map cluster names 'C0', 'C1', ... to the sequences in file order.
    seqClusterMap = {'C' + str(index): seq for index, seq in enumerate(s)}

    # Parse the Newick string; per the original notes this step groups the
    # sequences and produces the final multiple sequence alignment.
    self.processNewickString(newickNoDistance, seqClusterMap)

    SOP = self.sumOfPairs()
    self.printer(newickIds, SOP)
    return SOP, newick, newickNoDistance