Exemplo n.º 1
0
def infer_longest_peptide(masses):
    '''Returns the longest protein string that matches the spectrum graph of the given masses.

    Every pair of masses whose difference is recognised by find_weight_match
    (tolerance 0.001) becomes a directed edge, lighter mass -> heavier mass,
    labelled with the matching protein.  The result is the label string of a
    longest path (by number of edges) through that graph.

    masses -- indexable sequence of numeric masses.  The early exit in the
              inner loop only prunes correctly if the masses are in ascending
              order; this is assumed, not checked.

    Returns '' when no pair of masses matches any known protein weight.
    '''
    # Loop-invariant cut-off: a gap larger than the heaviest protein (plus
    # slack) cannot match anything, and neither can any later, larger gap.
    max_gap = max(ProteinWeightDict().values()) + 1

    # Build the graph from the given masses: (lighter, heavier) -> protein.
    graph = dict()
    for i in range(len(masses)):
        for j in range(i + 1, len(masses)):
            gap = masses[j] - masses[i]
            if gap > max_gap:
                break

            # Check if the gap approximately matches a known protein.
            temp_protein = find_weight_match(gap, 0.001)
            if temp_protein is not None:
                graph[masses[i], masses[j]] = temp_protein

    # Without edges there is no path to report.
    if not graph:
        return ''

    # Index incoming edges once instead of rescanning every edge per node.
    # Predecessors keep the graph's own iteration order, so ties between
    # equally long paths are resolved exactly as before.
    predecessors = dict()
    for source, target in graph:
        predecessors.setdefault(target, []).append(source)

    # Get the topological ordering of the graph.
    top_order = topological_ordering(graph.keys())

    # Build the longest path to each node.
    S = {node: '' for node in top_order}
    for node in top_order:
        for predecessor in predecessors.get(node, ()):
            if len(S[predecessor]) + 1 > len(S[node]):
                S[node] = S[predecessor] + graph[(predecessor, node)]

    # Return the longest path.
    return max(S.values(), key=len)
Exemplo n.º 2
0
def infer_longest_peptide(masses):
    '''Returns the longest protein string that matches the spectrum graph of the given masses.

    Every pair of masses whose difference is recognised by find_weight_match
    (tolerance 0.001) becomes a directed edge, lighter mass -> heavier mass,
    labelled with the matching protein.  The result is the label string of a
    longest path (by number of edges) through that graph.

    masses -- indexable sequence of numeric masses.  The early exit in the
              inner loop only prunes correctly if the masses are in ascending
              order; this is assumed, not checked.

    Returns '' when no pair of masses matches any known protein weight.
    '''
    # Loop-invariant cut-off: a gap larger than the heaviest protein (plus
    # slack) cannot match anything, and neither can any later, larger gap.
    max_gap = max(ProteinWeightDict().values()) + 1

    # Build the graph from the given masses: (lighter, heavier) -> protein.
    graph = dict()
    for i in range(len(masses)):
        for j in range(i + 1, len(masses)):
            gap = masses[j] - masses[i]
            if gap > max_gap:
                break

            # Check if the gap approximately matches a known protein.
            temp_protein = find_weight_match(gap, 0.001)
            if temp_protein is not None:
                graph[masses[i], masses[j]] = temp_protein

    # Without edges there is no path to report.
    if not graph:
        return ''

    # Index incoming edges once instead of rescanning every edge per node.
    # Predecessors keep the graph's own iteration order, so ties between
    # equally long paths are resolved exactly as before.
    predecessors = dict()
    for source, target in graph:
        predecessors.setdefault(target, []).append(source)

    # Get the topological ordering of the graph.
    top_order = topological_ordering(graph.keys())

    # Build the longest path to each node.
    S = {node: '' for node in top_order}
    for node in top_order:
        for predecessor in predecessors.get(node, ()):
            if len(S[predecessor]) + 1 > len(S[node]):
                S[node] = S[predecessor] + graph[(predecessor, node)]

    # Return the longest path.
    return max(S.values(), key=len)
def spectrum(peptide):
	'''Returns the linear spectrum of a given peptide.

	The spectrum holds 0, the mass of the whole peptide, and the mass of every
	contiguous subpeptide of length 1 to len(peptide) - 1.  Each protein's
	weight is truncated to an integer before summing.

	Returns a list of strings: the masses in ascending numeric order, each
	converted with str().
	'''
	# Dictionary giving the weight of each protein.
	weight = ProteinWeightDict()

	# prefix[k] is the integer mass of peptide[:k]; any subpeptide mass is then
	# a difference of two entries instead of a fresh sum.
	prefix = [0]
	for protein in peptide:
		prefix.append(prefix[-1] + int(weight[protein]))

	size = len(peptide)
	# Initialize as the mass 0 and the mass of the entire peptide.
	spec = [0, prefix[-1]]
	# Find the masses of the adjacent intermediary subpeptides.
	spec += [prefix[j + i] - prefix[j] for i in range(1, size) for j in range(size - i + 1)]

	# Sort in ascending order and convert to strings.  A real list is returned
	# (not a lazy map object) because callers index and count the result.
	return [str(mass) for mass in sorted(spec)]
Exemplo n.º 4
0
def cyclospectrum(peptide):
    '''Returns the cyclic spectrum of a given peptide.

    The spectrum holds 0, the mass of the whole peptide, and the mass of every
    subpeptide of length 1 to len(peptide) - 1 starting at each position, with
    subpeptides allowed to wrap around the end of the peptide.  Each protein's
    weight is truncated to an integer before summing.

    Returns a list of strings: the masses in ascending numeric order, each
    converted with str().
    '''
    # Dictionary giving the weight of each protein.
    weight = ProteinWeightDict()

    size = len(peptide)

    # prefix[k] is the integer mass of the first k residues of the peptide
    # written twice in a row; wrapping subpeptides are plain slices of that.
    prefix = [0]
    for protein in peptide * 2:
        prefix.append(prefix[-1] + int(weight[protein]))

    # Initialize as the mass 0 and the mass of the entire peptide.
    cyclospec = [0, prefix[size]]

    # Find the masses of the adjacent intermediary subpeptides.
    cyclospec += [
        prefix[j + i] - prefix[j]
        for i in range(1, size) for j in range(size)
    ]

    # Sort in ascending order and convert to strings.  A real list is returned
    # (not a lazy map object) so the result can be indexed and counted.
    return [str(mass) for mass in sorted(cyclospec)]
Exemplo n.º 5
0
# -*- coding: utf-8 -*-
"""
Created on Tue May 31 19:32:46 2016

@author: Johnqiu
"""
"""
问题1:真实的数据是按大小的顺序排列,模拟数据需不要排序?

"""
from scripts import ProteinWeightDict, IonTypeDict
import random
from operator import itemgetter

aa_table = ProteinWeightDict()
ion_table = IonTypeDict()

def simulatePeptide(pep_len):
    """Return a random peptide string of pep_len residues.

    Each residue is picked independently with random.choice from the entries
    obtained by iterating the module-level aa_table.
    """
    residues = list(aa_table)
    return ''.join(random.choice(residues) for _ in range(pep_len))
    
def generateSpectrum(peptide, ion_table,intensity = 100):
    """
    Args:
       -peptide:   a peptide string
       -iontables:   {ion:(offset,prob)}
    """
    spectrum = []
    prefix_mass = 0
Exemplo n.º 6
0
#!/usr/bin/env python
'''
A solution to a ROSALIND bioinformatics problem.

Problem Title: Calculating Protein Mass
Rosalind ID: PRTM
Rosalind #: 020
URL: http://rosalind.info/problems/prtm/
'''

from scripts import ProteinWeightDict

# Load the data.  The context manager closes the file even if reading fails.
with open('data/rosalind_prtm.txt') as input_data:
    protein_str = input_data.read().strip()

# Load the dictionary that translates protein to monoisotopic weight.
weight_dict = ProteinWeightDict()

# Calculate the weight protein by protein.
monoisotopic_weight = sum(weight_dict[protein] for protein in protein_str)

# Print and save the weight.  The parenthesised form of print behaves the same
# on Python 2 and Python 3 for a single argument.
print(monoisotopic_weight)
with open('output/020_PRTM.txt', 'w') as output_data:
    output_data.write(str(monoisotopic_weight))
def spectrum_score(peptide, exp_spec):
	'''Returns the number of matching masses from the spectrum of peptide when compared with the spectrum exp_spec.

	peptide  -- peptide passed on to spectrum().
	exp_spec -- experimental spectrum: masses given as ints or as integer
	            strings.  Its last entry is taken as its largest mass.

	Both spectra are converted to ints before comparing, so string masses and
	int masses can be mixed.  Without this a str-vs-int comparison is always
	True on Python 2 and raises TypeError on Python 3, and counts never match.

	Returns -1 if the last mass of the peptide's spectrum exceeds the last
	mass of exp_spec; otherwise the size of the multiset intersection of the
	two spectra.
	'''
	from collections import Counter

	pep_spec = [int(mass) for mass in spectrum(peptide)]
	exp_masses = [int(mass) for mass in exp_spec]

	# Return -1 if the peptide has more mass than exp_spec.
	if pep_spec[-1] > exp_masses[-1]:
		return -1

	# Counter intersection keeps min(count_a, count_b) for every mass.
	return sum((Counter(pep_spec) & Counter(exp_masses)).values())

if __name__ == '__main__':

	with open('data/stepic_2e.txt') as input_data:
		n, spec = [int(line.strip()) if i==0 else map(int,line.strip().split()) for i, line in enumerate(input_data.readlines())]
	
	# Create the protein weight dictionary.
	weight = ProteinWeightDict()
	# Initialize the scores dictionary.
	scores = dict()
	# Build the initial peptides.
	seq = filter(lambda L: L[0] != -1, [[spectrum_score(peptide,spec), peptide] for peptide in append_protein(weight.keys())]) 

	# Build the sequence until the masses all grow too large.
	while seq != []:
		# Store the scores of the current sequence in a dictionary.
		scores = dict()
		for item in seq:
			if item[0] in scores:
				scores[item[0]].append(item[1])
			else:
				scores[item[0]] = [item[1]]
Exemplo n.º 8
0
	# Dictionary translating RNA to Protein
	weight = ProteinWeightDict()
	# Initialize as the mass 0 and the mass of the entire peptide.
	spec = [0, sum([int(weight[protein]) for protein in peptide])]
	# Find the masses of the adjacent intermediary subpeptides
	spec += [sum([int(weight[protein]) for protein in peptide[j:j+i]]) for i in xrange(1,len(peptide)) for j in xrange(len(peptide)-i+1)]
	# Sort the list in ascending order and convert to strings.
	spec = map(str,sorted(spec))

	return spec

# Read the cyclospectrum as whitespace-separated mass tokens (kept as strings).
with open('data/textbook/rosalind_2d.txt') as input_data:
	cyclospec = input_data.read().strip().split()

# Create the protein weight dictionary.
weight = ProteinWeightDict()

# Let n be the length of a given peptide, and L be the length of its cyclospectrum.  Then L = n(n-1) + 2.
# Using the quadratic formula to solve for n:  n = (sqrt(4L-7) + 1)/2
n = int((sqrt(4*len(cyclospec)-7)+1)/2)

# Integer protein masses, built once instead of on every loop iteration.
known_masses = set(map(int, weight.values()))

# Find the first n protein in the peptide.
# Need to be careful: two small proteins can add to be less than a larger one, so we can't just take the first n nonzero entries.
# NOTE(review): the original comment also claims no two small protein masses add up to a larger protein's mass - confirm.
# Index 0 is skipped; the loop raises IndexError if fewer than n entries match.
protein, i = [], 1
while len(protein) != n:
	if int(cyclospec[i]) in known_masses:
		protein.append(cyclospec[i])
	i += 1

# Get the name of each protein corresponding to a given weight (if multiple, only take one).
Exemplo n.º 9
0
    # Find the masses of the adjacent intermediary subpeptides
    spec += [
        sum([int(weight[protein]) for protein in peptide[j:j + i]])
        for i in xrange(1, len(peptide)) for j in xrange(len(peptide) - i + 1)
    ]
    # Sort the list in ascending order and convert to strings.
    spec = map(str, sorted(spec))

    return spec


# Read the cyclospectrum as whitespace-separated mass tokens (kept as strings).
with open('data/textbook/rosalind_2d.txt') as input_data:
    cyclospec = input_data.read().strip().split()

# Create the protein weight dictionary.
weight = ProteinWeightDict()

# Let n be the length of a given peptide, and L be the length of its cyclospectrum.  Then L = n(n-1) + 2.
# Using the quadratic formula to solve for n:  n = (sqrt(4L-7) + 1)/2
n = int((sqrt(4 * len(cyclospec) - 7) + 1) / 2)

# Integer protein masses, built once instead of on every loop iteration.
known_masses = set(map(int, weight.values()))

# Find the first n protein in the peptide.
# Need to be careful: two small proteins can add to be less than a larger one, so we can't just take the first n nonzero entries.
# NOTE(review): the original comment also claims no two small protein masses add up to a larger protein's mass - confirm.
# Index 0 is skipped; the loop raises IndexError if fewer than n entries match.
protein, i = [], 1
while len(protein) != n:
    if int(cyclospec[i]) in known_masses:
        protein.append(cyclospec[i])
    i += 1

# Get the name of each protein corresponding to a given weight (if multiple, only take one).
Exemplo n.º 10
0
def append_protein(add_list):
    '''Returns a list containing all peptides from add_list with every possible protein suffix.

    add_list -- iterable of peptide strings.

    The result keeps the order of add_list; for each peptide the suffixes
    follow the key order of ProteinWeightDict().
    '''
    # Build the protein alphabet once rather than once per peptide.
    proteins = list(ProteinWeightDict().keys())
    return [item + ch for item in add_list for ch in proteins]
Exemplo n.º 11
0
        min(pep_spec.count(protein), exp_spec.count(protein))
        for protein in set(pep_spec)
    ])


if __name__ == '__main__':

    with open('data/stepic_2e.txt') as input_data:
        n, spec = [
            int(line.strip()) if i == 0 else map(int,
                                                 line.strip().split())
            for i, line in enumerate(input_data.readlines())
        ]

    # Create the protein weight dictionary.
    weight = ProteinWeightDict()
    # Initialize the scores dictionary.
    scores = dict()
    # Build the initial peptides.
    seq = filter(lambda L: L[0] != -1,
                 [[spectrum_score(peptide, spec), peptide]
                  for peptide in append_protein(weight.keys())])

    # Build the sequence until the masses all grow too large.
    while seq != []:
        # Store the scores of the current sequence in a dictionary.
        scores = dict()
        for item in seq:
            if item[0] in scores:
                scores[item[0]].append(item[1])
            else:
Exemplo n.º 12
0
def find_weight_match(approx_weight, error):
    '''Returns the first protein whose weight is strictly within error of approx_weight.

    Entries are examined in the iteration order of ProteinWeightDict().items();
    None is returned when no entry is close enough.
    '''
    candidates = (
        protein
        for protein, exact_weight in ProteinWeightDict().items()
        if abs(exact_weight - approx_weight) < error
    )
    return next(candidates, None)
Exemplo n.º 13
0
Problem Title: Inferring Protein from Spectrum
Rosalind ID: SPEC
Rosalind #: 053
URL: http://rosalind.info/problems/spec/
'''

from scripts import ProteinWeightDict

# The only major issue is that the given values aren't as precise as those in the table.
# Need to find the closest match (or rewrite the weight dictionary with less precision).
with open('data/rosalind_spec.txt') as input_data:
	# float() ignores surrounding whitespace; blank lines are skipped.
	masses = [float(line) for line in input_data if line.strip()]

# Load a list of (protein, weight) pairs.  list() keeps it reusable on
# Python 3, where items() is a view rather than a list.
weight_list = list(ProteinWeightDict().items())


def closest_prot(weight):
	'''Returns the protein whose mass is closest to the specified weight.

	Ties go to the pair that comes first in weight_list.
	'''
	return min(weight_list, key=lambda pair: abs(pair[1] - weight))[0]


# Determine each protein from the gap between consecutive masses.
prot = [closest_prot(heavier - lighter) for lighter, heavier in zip(masses, masses[1:])]

# Concatenate to get the desired protein.
print(''.join(prot))
with open('output/053_SPEC.txt', 'w') as output_data:
	output_data.write(''.join(prot))
Exemplo n.º 14
0
def spectrum_score(peptide, exp_spec):
	'''Returns the number of matching masses from the spectrum of peptide when compared with the spectrum exp_spec.

	peptide  -- peptide passed on to spectrum().
	exp_spec -- experimental spectrum: masses given as ints or as integer
	            strings.  Its last entry is taken as its largest mass.

	Both spectra are converted to ints before comparing, so string masses and
	int masses can be mixed.  Without this a str-vs-int comparison is always
	True on Python 2 and raises TypeError on Python 3, and counts never match.

	Returns -1 if the last mass of the peptide's spectrum exceeds the last
	mass of exp_spec; otherwise the size of the multiset intersection of the
	two spectra.
	'''
	from collections import Counter

	pep_spec = [int(mass) for mass in spectrum(peptide)]
	exp_masses = [int(mass) for mass in exp_spec]

	# Return -1 if the peptide has more mass than exp_spec.
	if pep_spec[-1] > exp_masses[-1]:
		return -1

	# Counter intersection keeps min(count_a, count_b) for every mass.
	return sum((Counter(pep_spec) & Counter(exp_masses)).values())

if __name__ == '__main__':

	with open('data/textbook/rosalind_2e.txt') as input_data:
		n, spec = [int(line.strip()) if i==0 else map(int,line.strip().split()) for i, line in enumerate(input_data.readlines())]
	
	# Create the protein weight dictionary.
	weight = ProteinWeightDict()
	# Initialize the scores dictionary.
	scores = dict()
	# Build the initial peptides.
	seq = filter(lambda L: L[0] != -1, [[spectrum_score(peptide,spec), peptide] for peptide in append_protein(weight.keys())]) 

	# Build the sequence until the masses all grow too large.
	while seq != []:
		# Store the scores of the current sequence in a dictionary.
		scores = dict()
		for item in seq:
			if item[0] in scores:
				scores[item[0]].append(item[1])
			else:
				scores[item[0]] = [item[1]]