def infer_longest_peptide(masses):
    '''Returns the longest protein string that matches the spectrum graph of the given masses.

    An edge (masses[i], masses[j]) with i < j is added to the graph whenever
    find_weight_match(masses[j] - masses[i], 0.001) returns something other than
    None; that return value labels the edge.  The result is the concatenation of
    the edge labels along the path found by the longest-path pass below.

    masses: indexable sequence of numeric masses.  The inner loop stops at the
        first gap that exceeds the largest known weight, which presumably relies
        on masses being sorted in ascending order -- confirm against the caller.

    Returns the label string of the longest path, or '' if no pair of masses
    matches a known weight (the graph has no edges).
    '''
    protein_weight_dict = ProteinWeightDict()
    # Loop invariant: the largest gap that is still worth testing against the table.
    max_gap = max(protein_weight_dict.values()) + 1

    # Build the graph from the given masses.
    graph = dict()
    for i in range(len(masses)):
        for j in range(i + 1, len(masses)):
            gap = masses[j] - masses[i]
            # Break the inner loop if we've exceeded the maximum weight.
            if gap > max_gap:
                break

            # Check if the gap between masses i and j approximately matches a known protein.
            temp_protein = find_weight_match(gap, 0.001)
            if temp_protein is not None:
                graph[masses[i], masses[j]] = temp_protein

    # Without edges there is no path; avoid calling max() on an empty sequence below.
    if not graph:
        return ''

    # Index the edges by their head node once, instead of rescanning every edge per node.
    incoming = {}
    for tail, head in graph:
        incoming.setdefault(head, []).append(tail)

    # Get the topological ordering of the graph.
    top_order = topological_ordering(graph.keys())

    # Build the longest path to each node.  The "+ 1" counts one unit per edge,
    # i.e. it treats every edge label as a single character.
    S = {node: '' for node in top_order}
    for node in top_order:
        for predecessor in incoming.get(node, ()):
            if len(S[predecessor]) + 1 > len(S[node]):
                S[node] = S[predecessor] + graph[(predecessor, node)]

    # Return the longest path.
    return max(S.values(), key=len)
def infer_longest_peptide(masses):
    '''Returns the longest protein string that matches the spectrum graph of the given masses.

    An edge (masses[i], masses[j]) with i < j is added to the graph whenever
    find_weight_match(masses[j] - masses[i], 0.001) returns something other than
    None; that return value labels the edge.  The result is the concatenation of
    the edge labels along the path found by the longest-path pass below.

    masses: indexable sequence of numeric masses.  The inner loop stops at the
        first gap that exceeds the largest known weight, which presumably relies
        on masses being sorted in ascending order -- confirm against the caller.

    Returns the label string of the longest path, or '' if no pair of masses
    matches a known weight (the graph has no edges).
    '''
    protein_weight_dict = ProteinWeightDict()
    # Loop invariant: the largest gap that is still worth testing against the table.
    max_gap = max(protein_weight_dict.values()) + 1

    # Build the graph from the given masses.
    graph = dict()
    for i in range(len(masses)):
        for j in range(i + 1, len(masses)):
            gap = masses[j] - masses[i]
            # Break the inner loop if we've exceeded the maximum weight.
            if gap > max_gap:
                break

            # Check if the gap between masses i and j approximately matches a known protein.
            temp_protein = find_weight_match(gap, 0.001)
            if temp_protein is not None:
                graph[masses[i], masses[j]] = temp_protein

    # Without edges there is no path; avoid calling max() on an empty sequence below.
    if not graph:
        return ''

    # Index the edges by their head node once, instead of rescanning every edge per node.
    incoming = {}
    for tail, head in graph:
        incoming.setdefault(head, []).append(tail)

    # Get the topological ordering of the graph.
    top_order = topological_ordering(graph.keys())

    # Build the longest path to each node.  The "+ 1" counts one unit per edge,
    # i.e. it treats every edge label as a single character.
    S = {node: '' for node in top_order}
    for node in top_order:
        for predecessor in incoming.get(node, ()):
            if len(S[predecessor]) + 1 > len(S[node]):
                S[node] = S[predecessor] + graph[(predecessor, node)]

    # Return the longest path.
    return max(S.values(), key=len)
# Read the cyclospectrum as a list of whitespace-separated mass strings.
with open('data/textbook/rosalind_2d.txt') as input_data:
    cyclospec = input_data.read().strip().split()

# Create the protein weight dictionary.
weight = ProteinWeightDict()

# Let n be the length of a given peptide, and L be the length of its cyclospectrum.  Then L = n(n-1) + 2.
# Using the quadratic formula to solve for n:  n = (sqrt(4L-7) + 1)/2
n = int((sqrt(4 * len(cyclospec) - 7) + 1) / 2)

# Precompute, once, the integer masses of the known proteins and one name per
# integer mass.  setdefault keeps the first name met in weight.items() order,
# matching "if multiple, only take one".
known_masses = set()
mass_to_name = {}
for name, mass in weight.items():
    known_masses.add(int(mass))
    mass_to_name.setdefault(int(mass), name)

# Find the first n protein masses in the peptide.
# Need to be careful: two small proteins can add to be less than a larger one, so we can't just take
# the first n nonzero entries; only entries equal to a known protein mass are accepted.
# NOTE(review): the original comment claimed that no two small protein masses add up to that of a
# larger protein.  With the usual integer mass table 57 + 57 = 114 and 57 + 71 = 128 are themselves
# protein masses, so a pair sum could be mistaken for a single protein here -- confirm.
# Index 0 is skipped (presumably the zero mass of the empty peptide).  An IndexError is raised if the
# spectrum runs out before n masses are found.
protein, i = [], 1
while len(protein) != n:
    if int(cyclospec[i]) in known_masses:
        protein.append(cyclospec[i])
    i += 1

# Get the name of each protein corresponding to a given weight (if multiple, only take one).
names = [mass_to_name[int(w)] for w in protein]

# Build the possible sequences: extend every candidate by one protein, then keep only the distinct
# candidates whose spectrum is a proper subset of the given cyclospectrum.
spectrum_masses = set(cyclospec)
seq = append_char(names, names)
for repeat in range(1, n):
    # A list (rather than a lazy filter object) so seq can be reused on Python 3 as well.
    seq = [subpeptide for subpeptide in set(seq)
           if set(spectrum(subpeptide)) < spectrum_masses]
    # No extension after the last round: the candidates already have full length.
    if repeat != n - 1:
        seq = append_char(seq, names)
with open('data/textbook/rosalind_2d.txt') as input_data: cyclospec = input_data.read().strip().split() # Create the protein weight dictionary. weight = ProteinWeightDict() # Let n be the length of a given peptide, and L be the length of its cyclospectrum. Then L = n(n-1) + 2. # Using the quadratic formula to to solve for n: n = (sqrt(4L-7) + 1)/2 n = int((sqrt(4 * len(cyclospec) - 7) + 1) / 2) # Find the first n protein in the peptide. # Need to be careful: two small proteins can add to be less than a larger one, so we can't just take the first n nonzero entries. # Fortunately, no two small proteins masses add to that of a larger protein. protein, i = [], 1 while len(protein) != n: if int(cyclospec[i]) in map(int, weight.values()): protein.append(cyclospec[i]) i += 1 # Get the name of each protein corresponding to a given weight (if multiple, only take one). names = [] for w in protein: names.append( [items[0] for items in weight.items() if int(items[1]) == int(w)][0]) # Build the possible sequences. seq = append_char(names, names) for repeat in xrange(1, n): seq = filter(lambda subpeptide: set(spectrum(subpeptide)) < set(cyclospec), set(seq)) if repeat != n - 1: