def fragments(peptide, types=('b', 'y'), max_charge=1):
    '''
    Return the theoretical fragments of a peptide, grouped by ion type.

    Modeled from:
    https://pyteomics.readthedocs.io/en/latest/examples/example_msms.html

    :param peptide: (str) peptide sequence
    :param types: (tuple) types of fragments desired
    :param max_charge: (int) maximum charge state of fragment ions
    :return: (dict) one list per entry of ``types`` holding the
        ``mass.fast_mass`` value of every fragment, ordered by split
        position (1 .. len(peptide) - 1) and then by charge
        (1 .. max_charge)
    '''
    d = {}
    for ion_type in types:
        d[ion_type] = [
            # Ion types starting with a/b/c keep the prefix peptide[:i];
            # every other type keeps the suffix peptide[i:].
            mass.fast_mass(
                peptide[:i] if ion_type[0] in 'abc' else peptide[i:],
                ion_type=ion_type, charge=charge)
            for i in range(1, len(peptide))
            for charge in range(1, max_charge + 1)
        ]
    return d
def getIonMasses(peptide, types=('b', 'y'), maxcharge=2):
    """
    Generate fragment m/z lists for ion types `types`, charges from 1 to
    `maxcharge` and the loss labels '', 'n' and 'o'.

    The result is a dict keyed by ion type + loss label + charge
    (e.g. "b1", "bn2", "yo1"). The keys that were always present stay
    present (possibly empty); any other key is created the first time a
    fragment is stored under it, so combinations such as "yo1" no longer
    raise KeyError.
    """
    ions = {
        "b1": [], "b2": [],
        "bn1": [], "bn2": [],
        "bo1": [], "bo2": [],
        "y1": [], "y2": [],
        "yn1": [],
    }
    losses = ['', 'n', 'o']
    for ion_type in types:
        for charge in range(1, maxcharge + 1):
            for lossT in losses:
                key = ion_type + lossT + str(charge)
                # NOTE(review): `loss` is computed but never applied, so
                # the 'n'/'o' lists currently hold the same values as the
                # loss-free list. The sign/unit of lossConvert's result is
                # not visible here -- confirm before subtracting it.
                loss = lossConvert(lossT, charge)
                # Split positions 1 .. len(peptide) - 1; the previous
                # upper bound of len(peptide) - 1 skipped the last one.
                for i in range(1, len(peptide)):
                    if ion_type[0] in 'abc':
                        fragment = peptide[:i]
                    else:
                        fragment = peptide[i:]
                    ions.setdefault(key, []).append(
                        massC.fast_mass(fragment, ion_type=ion_type,
                                        charge=charge))
    return ions
def _fragments(self, peptide, types=("b", "y"), maxcharge=1):
    """
    Yield the m/z of every fragment of `peptide`.

    For each split position i in 1 .. len(peptide) - 1, each ion type in
    `types` and each charge from 1 to `maxcharge`, yields
    ``mass.fast_mass`` of the prefix ``peptide[:i]`` (ion types starting
    with a, b or c) or of the suffix ``peptide[i:]`` (all other types).
    """
    # The upper bound is len(peptide), not len(peptide) - 1: the shorter
    # range skipped the last split position.
    for i in range(1, len(peptide)):
        for ion_type in types:
            for charge in range(1, maxcharge + 1):
                if ion_type[0] in "abc":
                    fragment = peptide[:i]
                else:
                    fragment = peptide[i:]
                yield mass.fast_mass(
                    fragment, ion_type=ion_type, charge=charge
                )
def create_theoretical_peak_map(peptide, ion_type_list, charge_set=(1,)):
    """
    Map theoretical peak annotations to peak values for a modified peptide.

    :param peptide: peptide string; letters are residues, signed numbers
        (e.g. "+15.995") are modification masses
    :param ion_type_list: ion types to generate; a trailing "-iso" adds
        1.007276 / charge to the peak and is stripped before the ion type
        is passed to ``mass.fast_mass``
    :param charge_set: iterable of charge states
    :return: dict mapping "<ion_type>:<index>:<charge>" to the peak value
    """
    amino_acid_list = get_peptide_modification_list_inspect_format(peptide)
    only_letters_list = [letter for letter in peptide if letter.isalpha()]

    # Signed number with an optional fractional part. The decimal point is
    # matched literally; an unescaped '.' would glue adjacent integer
    # modifications such as "+16-18" into one unparsable token.
    mod_pattern = re.compile(r'[+-](?:\d+\.?\d*|\.\d+)')
    only_mods_mass_add_list = []
    for amino_acid in amino_acid_list:
        mod_mass_to_add = 0.0
        for mod_tokenized in mod_pattern.findall(
                re.sub("[A-Z]", "", amino_acid)):
            mod_mass_to_add += float(mod_tokenized)
        only_mods_mass_add_list.append(mod_mass_to_add)

    ion_to_mass_mapping = {}
    residue_count = len(amino_acid_list)
    for charge in charge_set:
        for ion_type in ion_type_list:
            iso_topic_added_mass = 0.0
            real_ion_type = ion_type
            if ion_type[-4:] == "-iso":
                iso_topic_added_mass = 1.007276 / float(charge)
                real_ion_type = ion_type[:-4]

            for i in range(residue_count):
                if real_ion_type[0] in "abc":
                    # Prefix ion: residues 0..i, numbered from 1.
                    index = i + 1
                    letters = only_letters_list[:i + 1]
                    mods = only_mods_mass_add_list[:i + 1]
                else:
                    # Suffix ion: residues i..end, numbered by length.
                    index = residue_count - i
                    letters = only_letters_list[i:]
                    mods = only_mods_mass_add_list[i:]

                peak_annotation = (
                    ion_type + ":" + str(index) + ":" + str(charge))
                peak_mass = mass.fast_mass(
                    "".join(letters), ion_type=real_ion_type,
                    charge=charge) + sum(mods) / (
                        float(charge)) + iso_topic_added_mass
                ion_to_mass_mapping[peak_annotation] = peak_mass

    return ion_to_mass_mapping
def fragments(peptide, types=('b', 'y'), maxcharge=1):
    """
    Generate all possible m/z for fragments of types `types` and of
    charges from 1 to `maxcharge`.

    Values are yielded split position by split position
    (1 .. len(peptide) - 1), then by ion type, then by charge. Ion types
    starting with a, b or c use the prefix ``peptide[:i]``; all other
    types use the suffix ``peptide[i:]``.
    """
    # `range` works on both Python 2 and 3 (`xrange` is Python 2 only).
    # The upper bound is len(peptide): the former len(peptide) - 1 skipped
    # the last split position.
    for i in range(1, len(peptide)):
        for ion_type in types:
            for charge in range(1, maxcharge + 1):
                if ion_type[0] in 'abc':
                    fragment = peptide[:i]
                else:
                    fragment = peptide[i:]
                yield mass.fast_mass(fragment, ion_type=ion_type,
                                     charge=charge)
def calc_precursor_mz(self, peptide, modifications, charge):
    """
    Compute the precursor mass and m/z of a peptide with modifications.

    The mass shift of every modification is looked up in
    ``self.mass_shifts`` and added to the unmodified mass, instead of
    relying on the Pyteomics modification handling (which would require
    the atomic composition of each modification).

    Parameters
    ----------
    peptide: str
        stripped peptide sequence
    modifications: str
        MS2PIP-style formatted modifications list
        (e.g. `0|Acetyl|2|Oxidation`)
    charge: int
        precursor charge

    Returns
    -------
    prec_mass, prec_mz: tuple(float, float)
    """
    z = int(charge)
    base_mass = mass.fast_mass(peptide)

    # Fields alternate position|name|position|name...; the names are the
    # odd-indexed entries.
    mod_names = modifications.split("|")[1::2]
    shift_total = sum([self.mass_shifts[name] for name in mod_names])

    prec_mass = base_mass + shift_total
    return prec_mass, (prec_mass + z * PROTON_MASS) / z
def mgf_library_upload_quant(fileName, scanDict, digDict, aaDict, maxPeaks):
    """
    Build light/heavy y-ion peak lists from an MGF library file.

    Only spectra whose (rounded precursor m/z, sequence) key is present in
    `scanDict` are used. For each of them the y-ion m/z of every suffix
    ``sequence[x:]`` (x in 1 .. len(sequence) - 2) is looked up in the
    spectrum; matched fragments are ranked by intensity, optionally capped
    at `maxPeaks`, and paired with their heavy counterpart.

    :param fileName: path of the MGF file, read with ``mgf.read``
    :param scanDict: maps (round(precursor m/z, 2), sequence) to the key
        used in the returned dict
    :param digDict: maps decimal modification strings such as "+57.021"
        to the placeholder substituted into the sequence
    :param aaDict: passed to ``mass.fast_mass`` as ``aa_mass``
    :param maxPeaks: maximum number of fragments kept per spectrum;
        0 keeps all of them
    :return: defaultdict(list) mapping ``scanDict[key]`` to a list of
        ``(mz, intensity, (is_heavy, intensity_rank, seq))`` tuples sorted
        by m/z
    """
    # mgf file is read in using the pyteomics mgf module
    libMGF = mgf.read(fileName)
    lib = defaultdict(list)

    for spec in libMGF:
        seq = spec['params']['seq']
        precMz = spec['params']['pepmass'][0]
        key = (round(precMz, 2), seq)
        if key not in scanDict:
            continue

        # Decimal values are replaced with placeholders so that they can
        # be included in the analysis.
        sequence = re.sub(r'\+\d+\.\d+',
                          lambda m: digDict.get(m.group()), seq)

        mz = list(spec['m/z array'])
        intensity = list(spec['intensity array'])

        # The y-ion m/z of each fragment is calculated; when it is found
        # in the library spectrum it is stored with its intensity.
        # NOTE: y-ions are singled out because they should have at least
        # one lysine or arginine, so will have a heavy counterpart that
        # can show up. B-ions don't have that guarantee.
        fragList = []
        for x in range(1, len(sequence) - 1):
            fragseq = sequence[x:]
            # Only charge 1 is considered here (open question in the
            # original code: should other charges be used as well?).
            lightfragmz = mass.fast_mass(sequence=fragseq, ion_type='y',
                                         charge=1, aa_mass=aaDict)
            i = smf.approx_list(lightfragmz, mz)
            if i == -1:
                continue
            fragList.append((intensity[i], lightfragmz, fragseq))

        # Highest intensity first; lower-intensity peaks are dropped.
        fragList.sort(reverse=True)
        if maxPeaks != 0 and len(fragList) >= maxPeaks:
            fragList = fragList[:maxPeaks]

        # Light and heavy peaks share intensity and rank; the first tag
        # field is 0 for the light peak and 1 for the heavy one.
        peaks = []
        for rank, (fragInt, fragMz, fragseq) in enumerate(fragList):
            peaks.append((fragMz, fragInt, (0, rank, seq)))
            peaks.append((smf.calculate_heavy_mz(fragseq, fragMz, 1),
                          fragInt, (1, rank, seq)))

        peaks.sort(key=lambda peak: peak[0])
        lib[scanDict[key]] += peaks
    return lib
def getIons(sequence, charge):
    '''
    Return the theoretical b and y ions of `sequence`.

    For every fragment from ``function.bIon`` (then ``function.yIon``) the
    output list receives the fragment itself followed by its
    ``mass.fast_mass`` value for each charge from 1 to ``charge - 1``.
    The list is meant to be compared with observed MS2 data.
    '''
    outcome = []
    ion_series = (('b', function.bIon(sequence)),
                  ('y', function.yIon(sequence)))
    for ion_type, ion_fragments in ion_series:
        for fragment in ion_fragments:
            outcome.append(fragment)
            outcome.extend(
                float(mass.fast_mass(fragment, ion_type=ion_type, charge=z))
                for z in range(1, charge))
    return outcome
def generateAminoAcidDeltaList(self, path_dir, length, starter_mass=0.0):
    """
    Append an exclusion list to a CSV file.

    The file is ``<path_dir>exclusionListDelta_<length>_<starter_mass>.csv``
    and is opened in append mode. A header row ("mz", "comment",
    "position") is written, followed by one row per peptide produced by
    ``generatePeptides(length)`` holding its ``mass.fast_mass`` value
    (b ion, charge 0), an empty comment and "absolute".
    """
    path = "".join((path_dir, "exclusionListDelta", "_", str(length), "_",
                    str(starter_mass), ".csv"))
    with open(path, "a") as csvfile:
        writr = csv.writer(csvfile, lineterminator=os.linesep)
        writr.writerow(("mz", "comment", "position"))
        for peptide_group in generatePeptides(length):
            writr.writerows(
                (mass.fast_mass(pep, charge=0, ion_type='b'), "", "absolute")
                for pep in peptide_group)
def fastmass(pep, ion_type, charge, mod=None, cam=True):
    """
    Return ``mass.fast_mass`` of `pep` with modification shifts added.

    :param pep: peptide sequence
    :param ion_type: ion type passed to ``mass.fast_mass``
    :param charge: charge passed to ``mass.fast_mass``; also divides the
        cysteine and ``mod == 1`` shifts
    :param mod: optional array-like of per-position modification codes;
        entries equal to 1 add 15.995 each, negative entries are
        subtracted (i.e. their absolute value is added)
    :param cam: when true, add 57.021 for every 'C' in `pep`
    """
    base = mass.fast_mass(pep, ion_type=ion_type, charge=charge)
    if cam:
        base += 57.021 * pep.count('C') / charge
    if mod is not None:
        # Accept plain lists/tuples as well as numpy arrays.
        mod = np.asarray(mod)
        base += 15.995 * np.sum(mod == 1) / charge
        # NOTE(review): unlike the two shifts above, this term is not
        # divided by `charge` -- confirm whether that is intended.
        base += -np.sum(mod[mod < 0])
    return base
def bIon_db(sequence, charge):
    '''
    Create a lookup of b-ion values to compare with observed data.

    Returns a dict mapping the ``mass.fast_mass`` value of every fragment
    from ``function.bIon(sequence)`` -- for each charge from 1 to
    ``charge - 1`` -- to that fragment.
    '''
    b_fragments = function.bIon(sequence)
    return {
        float(mass.fast_mass(fragment, ion_type='b', charge=z)): fragment
        for z in range(1, charge)
        for fragment in b_fragments
    }
def yIon_db(sequence, charge):
    '''
    Create a lookup of y-ion values to compare with observed data.

    Returns a dict mapping the ``mass.fast_mass`` value of every fragment
    from ``function.yIon(sequence)`` -- for each charge from 1 to
    ``charge - 1`` -- to that fragment.
    '''
    y_fragments = function.yIon(sequence)
    return {
        float(mass.fast_mass(fragment, ion_type='y', charge=z)): fragment
        for z in range(1, charge)
        for fragment in y_fragments
    }
def create_theoretical_peak_map(peptide, ion_type_list, charge_set=(1,)):
    """
    Map theoretical peak annotations to peak values for a modified peptide.

    :param peptide: peptide string; letters are residues, signed numbers
        (e.g. "+15.995") are modification masses
    :param ion_type_list: ion types to generate; a trailing "-iso" adds
        1.007276 / charge to the peak and is stripped before the ion type
        is passed to ``mass.fast_mass``
    :param charge_set: iterable of charge states
    :return: dict mapping "<ion_type>:<index>:<charge>" to the peak value
    """
    amino_acid_list = get_peptide_modification_list_inspect_format(peptide)
    only_letters_list = [letter for letter in peptide if letter.isalpha()]

    # Signed number with an optional fractional part. The decimal point is
    # matched literally; an unescaped '.' would glue adjacent integer
    # modifications such as "+16-18" into one unparsable token.
    mod_pattern = re.compile(r'[+-](?:\d+\.?\d*|\.\d+)')
    only_mods_mass_add_list = []
    for amino_acid in amino_acid_list:
        mod_mass_to_add = 0.0
        for mod_tokenized in mod_pattern.findall(
                re.sub("[A-Z]", "", amino_acid)):
            mod_mass_to_add += float(mod_tokenized)
        only_mods_mass_add_list.append(mod_mass_to_add)

    ion_to_mass_mapping = {}
    residue_count = len(amino_acid_list)
    for charge in charge_set:
        for ion_type in ion_type_list:
            iso_topic_added_mass = 0.0
            real_ion_type = ion_type
            if ion_type[-4:] == "-iso":
                iso_topic_added_mass = 1.007276 / float(charge)
                real_ion_type = ion_type[:-4]

            for i in range(residue_count):
                if real_ion_type[0] in "abc":
                    # Prefix ion: residues 0..i, numbered from 1.
                    index = i + 1
                    letters = only_letters_list[:i + 1]
                    mods = only_mods_mass_add_list[:i + 1]
                else:
                    # Suffix ion: residues i..end, numbered by length.
                    index = residue_count - i
                    letters = only_letters_list[i:]
                    mods = only_mods_mass_add_list[i:]

                peak_annotation = (
                    ion_type + ":" + str(index) + ":" + str(charge))
                peak_mass = mass.fast_mass(
                    "".join(letters), ion_type=real_ion_type,
                    charge=charge) + sum(mods) / (
                        float(charge)) + iso_topic_added_mass
                ion_to_mass_mapping[peak_annotation] = peak_mass

    return ion_to_mass_mapping
def return_frag_mzs(peptide, z):
    """
    Return the y- and b-ion m/z values of a peptide with inline mods.

    Modifications are written inline as "+<digits>.<digits>" after the
    residue they apply to. The y ions of the suffixes ``seq[i:]`` for
    i in 1 .. len(seq) - 2 come first, followed by the b ions of the
    prefixes ``seq[:i]`` for i from len(seq) - 1 down to 2.

    :param peptide: peptide string with inline modification masses
    :param z: charge; also divides each modification mass
    :return: list of m/z values
    """
    digPat = r'\+\d+\.\d+'
    digs = re.findall(digPat, peptide)
    pepFrags = re.split(digPat, peptide)

    # modValues maps "number of residues preceding the modification" to
    # the charge-scaled mass shift at that position.
    modValues = {}
    seq = ''
    for dig, frag in zip(digs, pepFrags):
        seq += frag
        position = len(seq)
        # Accumulate: two modifications on the same residue share one
        # position and must both count instead of overwriting each other.
        modValues[position] = modValues.get(position, 0.0) + float(dig[1:]) / z
    # re.split yields one more piece than there are matches.
    seq += pepFrags[len(digs)]

    mzValues = []
    for i in range(1, len(seq) - 1):
        mz = mass.fast_mass(sequence=seq[i:], ion_type='y', charge=z)
        mz += sum([shift for pos, shift in modValues.items() if pos > i])
        mzValues.append(mz)

    for i in range(len(seq) - 1, 1, -1):
        mz = mass.fast_mass(sequence=seq[:i], ion_type='b', charge=z)
        mz += sum([shift for pos, shift in modValues.items() if pos <= i])
        mzValues.append(mz)
    return mzValues
def fragments(peptide, types, max_charge):
    '''
    Generate all possible m/z for fragments of types `types` and of
    charges from 1 to `max_charge`.

    :return: list of ``(m/z, ion_type, charge)`` tuples ordered by split
        position (1 .. len(peptide) - 1), then ion type, then charge. Ion
        types starting with a, b or c use the prefix ``peptide[:i]``; all
        other types use the suffix ``peptide[i:]``.
    '''
    frags = []
    # `range` works on both Python 2 and 3 (`xrange` is Python 2 only).
    # The upper bound is len(peptide): the former len(peptide) - 1 skipped
    # the last split position.
    for i in range(1, len(peptide)):
        for ion_type in types:
            for charge in range(1, max_charge + 1):
                sub_pep = peptide[:i] if ion_type[0] in 'abc' else peptide[i:]
                frags.append((
                    mass.fast_mass(sub_pep, ion_type=ion_type, charge=charge),
                    ion_type,
                    charge,
                ))
    return frags
def create_theoretical_peak_map(peptide, ion_type_list, charge_set=(1,)):
    """
    Map theoretical peak annotations to peak values for a modified peptide.

    :param peptide: peptide string; letters are residues, signed numbers
        (e.g. "+80" or "+15.995") are modification masses
    :param ion_type_list: ion types to generate
    :param charge_set: iterable of charge states
    :return: dict mapping "<ion_type>:<index>:<charge>" to the peak value
    """
    amino_acid_list = get_peptide_modification_list_inspect_format(peptide)
    only_letters_list = [letter for letter in peptide if letter.isalpha()]

    # Signed number with an optional fractional part. The former pattern
    # '[+-][1-9]*' stopped at the first zero or decimal point, so "+80"
    # was read as 8 and "+15.995" as 15.
    mod_pattern = re.compile(r'[+-](?:\d+\.?\d*|\.\d+)')
    only_mods_mass_add_list = []
    for amino_acid in amino_acid_list:
        mod_mass_to_add = 0.0
        for mod_tokenized in mod_pattern.findall(
                re.sub("[A-Z]", "", amino_acid)):
            mod_mass_to_add += float(mod_tokenized)
        only_mods_mass_add_list.append(mod_mass_to_add)

    ion_to_mass_mapping = {}
    residue_count = len(amino_acid_list)
    for charge in charge_set:
        for ion_type in ion_type_list:
            for i in range(residue_count):
                if ion_type in "abc":
                    # Prefix ion: residues 0..i, numbered from 1.
                    index = i + 1
                    letters = only_letters_list[:i + 1]
                    mods = only_mods_mass_add_list[:i + 1]
                else:
                    # Suffix ion: residues i..end, numbered by length.
                    index = residue_count - i
                    letters = only_letters_list[i:]
                    mods = only_mods_mass_add_list[i:]

                peak_annotation = (
                    ion_type + ":" + str(index) + ":" + str(charge))
                # The modification mass is scaled by the charge, like the
                # peptide value computed by fast_mass for that charge.
                peak_mass = mass.fast_mass(
                    "".join(letters), ion_type=ion_type,
                    charge=charge) + sum(mods) / float(charge)
                ion_to_mass_mapping[peak_annotation] = peak_mass

    return ion_to_mass_mapping
def calculate_theoretical_peptide_mass(peptide_sequence, charge):
    """
    Return the theoretical value of a modified peptide at `charge`.

    The letters of `peptide_sequence` are passed to ``mass.fast_mass``;
    the signed modification masses embedded in the string are summed and
    added after division by `charge`.

    :param peptide_sequence: peptide string with inline signed
        modification masses (e.g. "+15.995")
    :param charge: charge state; must be non-zero because it divides the
        modification mass
    :raises ZeroDivisionError: if `charge` is 0
    """
    amino_acid_list = get_peptide_modification_list_inspect_format(
        peptide_sequence)
    only_letters_list = [
        letter for letter in peptide_sequence if letter.isalpha()]

    # Signed number with an optional fractional part. The decimal point is
    # matched literally; an unescaped '.' would glue adjacent integer
    # modifications such as "+16-18" into one unparsable token.
    mod_pattern = re.compile(r'[+-](?:\d+\.?\d*|\.\d+)')
    only_mods_mass_add_list = []
    for amino_acid in amino_acid_list:
        mod_mass_to_add = 0.0
        for mod_tokenized in mod_pattern.findall(
                re.sub("[A-Z]", "", amino_acid)):
            mod_mass_to_add += float(mod_tokenized)
        only_mods_mass_add_list.append(mod_mass_to_add)

    total_peptide_mass = (
        mass.fast_mass("".join(only_letters_list), charge=charge)
        + sum(only_mods_mass_add_list) / (float(charge)))
    return total_peptide_mass
def get_mass(peptide, mass_dic={}, fixed={"C": 57.021464}):
    """
    Compute the mass of a peptide, either from the sequence or from a
    dictionary look-up.

    :param peptide: peptide sequence
    :param mass_dic: cache mapping peptide -> mass; it is updated in
        place. The shared default dict is kept on purpose so that results
        are memoised across calls. The cache is keyed by peptide only, so
        a cached value is returned even if `fixed` differs.
    :param fixed: maps a residue to the fixed modification mass added for
        every occurrence of that residue
    :return: the peptide mass including all fixed modifications
    """
    if peptide in mass_dic:
        return mass_dic[peptide]

    # Sum the contribution of every fixed modification (a plain
    # assignment here would keep only the last one).
    add = sum(peptide.count(residue) * delta
              for residue, delta in fixed.items())

    pep_mass = mass.fast_mass(peptide) + add
    mass_dic[peptide] = pep_mass
    return pep_mass
def __init__(self, sequence, settings, pcharge=0, evalue=0, note='unknown', mass_exp=0, modifications=None, modification_list=None, custom_aa_mass=None, sumI=0, mc=None, infile='unknown', frag_mt=None, tags=None):
    """
    Initialise a peptide identification.

    :param sequence: peptide sequence
    :param settings: config object; ``getfloat('modifications', ...)`` is
        read for 'protein nterm cleavage' and 'protein cterm cleavage'
    :param pcharge: precursor charge (converted with ``int``)
    :param evalue: e-value (converted with ``float``)
    :param mass_exp: experimental mass (converted with ``float``)
    :param modifications: list of dicts with 'mass' and 'position' keys;
        position 0 and len(sequence) + 1 denote the two termini. Defaults
        to no modifications.
    :param modification_list: stored as ``self.modification_list``;
        defaults to a fresh dict per instance
    :param tags: stored as ``self.tags``; empty or missing tags become
        None
    :raises ZeroDivisionError: if `pcharge` is 0, because the m/z is
        computed by dividing by the charge
    """
    # Fresh containers per instance: mutable default arguments would be
    # shared between all instances created without these arguments.
    if modifications is None:
        modifications = []
    if modification_list is None:
        modification_list = {}

    self.sequence = sequence
    self.modified_sequence = sequence
    self.modification_out_str = ''
    self.modification_list = modification_list
    self.pcharge = int(pcharge)
    self.aa_mass = custom_aa_mass

    nterm_cleavage = settings.getfloat('modifications', 'protein nterm cleavage')
    cterm_cleavage = settings.getfloat('modifications', 'protein cterm cleavage')
    self.pmass = float(mass.fast_mass(sequence=self.sequence, charge=0)) - 18.0105646837 + nterm_cleavage + cterm_cleavage
    for modif in modifications:
        self.pmass += modif['mass']
        if modif['position'] not in [0, len(self.sequence) + 1]:
            # In-sequence modification: its mass replaces the standard
            # mass of the residue at that (1-based) position.
            aminoacid = self.sequence[modif['position'] - 1]
            self.pmass -= mass.std_aa_mass[aminoacid]
        elif modif['position'] == 0:
            self.pmass -= nterm_cleavage
        else:
            self.pmass -= cterm_cleavage

    # Use the converted values so that string inputs (accepted by the
    # int()/float() conversions elsewhere in this method) work here too.
    self.mz = (float(mass_exp) + self.pcharge * 1.007276) / self.pcharge
    self.modified_peptide(modifications)
    self.evalue = float(evalue)
    self.massdiff = float(mass_exp) - float(self.pmass)
    self.num_missed_cleavages = dict()
    self.mc = mc
    self.note = note
    self.note2 = ''
    self.note3 = ''
    self.protscore2 = 1
    self.peptscore = 1
    self.peptscore2 = 1
    self.spectrum_mz = None
    self.fragment_mt = frag_mt
    self.sumI = sumI
    self.it = 1.0
    self.infile = infile
    self.fragments = defaultdict(dict)
    self.valid_sequence = dict()
    # `tags` defaults to None, which has no len(); guard before measuring.
    self.tags = tags if tags is not None and len(tags) else None
def calculate_theoretical_peptide_mass(peptide_sequence, charge):
    """
    Return the theoretical value of a modified peptide at `charge`.

    The letters of `peptide_sequence` are passed to ``mass.fast_mass``;
    the signed modification masses embedded in the string are summed and
    added after division by `charge`.

    :param peptide_sequence: peptide string with inline signed
        modification masses (e.g. "+15.995")
    :param charge: charge state; must be non-zero because it divides the
        modification mass
    :raises ZeroDivisionError: if `charge` is 0
    """
    amino_acid_list = get_peptide_modification_list_inspect_format(
        peptide_sequence)
    only_letters_list = [
        letter for letter in peptide_sequence if letter.isalpha()
    ]

    # Signed number with an optional fractional part. The decimal point is
    # matched literally; an unescaped '.' would glue adjacent integer
    # modifications such as "+16-18" into one unparsable token.
    mod_pattern = re.compile(r'[+-](?:\d+\.?\d*|\.\d+)')
    only_mods_mass_add_list = []
    for amino_acid in amino_acid_list:
        mod_mass_to_add = 0.0
        for mod_tokenized in mod_pattern.findall(
                re.sub("[A-Z]", "", amino_acid)):
            mod_mass_to_add += float(mod_tokenized)
        only_mods_mass_add_list.append(mod_mass_to_add)

    total_peptide_mass = (
        mass.fast_mass("".join(only_letters_list), charge=charge) +
        sum(only_mods_mass_add_list) / (float(charge)))
    return total_peptide_mass
def generateMascotIons(length, starter_mass):
    """
    Yield all Mascot ions used for scoring.

    For every peptide produced by ``generatePeptides(length)`` and for
    each of the ion types 'b' and 'y', yields a 6-tuple of
    ``[value, label]`` pairs: the singly charged ion (plus
    `starter_mass`), that ion minus the ammonia mass ("_star"), that ion
    minus the water mass ("_o"), and the
    ``calculateDoubleCharged`` counterpart of each of the three.
    """
    water_mass = 2.0 * Nist_mass('H') + Nist_mass('O')
    amin_mass = 3.0 * Nist_mass('H') + Nist_mass('N')

    for peptide_group in generatePeptides(length):
        for peptides in peptide_group:
            for ion_type in ('b', 'y'):
                single = mass.fast_mass(
                    sequence=peptides, charge=1,
                    ion_type=ion_type) + starter_mass
                single_star = single - amin_mass
                single_o = single - water_mass

                double = calculateDoubleCharged(single)
                double_star = calculateDoubleCharged(single_star)
                double_o = calculateDoubleCharged(single_o)

                # Common label prefix, e.g. "PEP_ion_b".
                label = "".join(peptides) + "_ion_" + str(ion_type)
                yield (
                    [single, label + "_1"],
                    [single_star, label + "_1_star"],
                    [single_o, label + "_1_o"],
                    [double, label + "_2"],
                    [double_star, label + "_2_star"],
                    [double_o, label + "_2_o"],
                )
def transform_sequence_to_masssequence(sequence, mods):
    """
    Convert an amino acid sequence into a list of per-position masses.

    Parameters
    ----------
    sequence: str
        Sequence of a peptide
    mods: list
        Modification value for each position of the peptide; must hold
        at least ``len(sequence)`` entries

    Returns
    -------
    list
        ``mass.fast_mass`` of each residue plus the modification at the
        same index
    """
    return [
        mass.fast_mass(residue) + mods[position]
        for position, residue in enumerate(sequence)
    ]
def add_mass(self):
    """
    Add a 'mass' column to ``self.data_frame``.

    Each value is ``mass.fast_mass`` of the row's 'sequence' with every
    'X' character removed.
    """
    def _mass_without_x(sequence):
        return mass.fast_mass(sequence.replace('X', ''))

    sequences = self.data_frame['sequence']
    self.data_frame['mass'] = sequences.apply(_mass_without_x)
def handcrafted_features(data, tags):
    """
    Build the handcrafted feature table for the alpha and beta chains.

    Based on DOI 10.1007/s00251-017-1023-5 and
    https://github.com/bittremieux/TCR-Classifier/blob/master/tcr_classifier.ipynb
    with these modifications:
      * the features are computed twice, once for the alpha chain ('tra')
        and once for the beta chain ('trb');
      * the split into training, validation and test cohorts is carried
        through;
      * multinomial classification is supported (one label column per
        entry of `tags`).

    :param data: DataFrame with the columns '<chain>_vgene',
        '<chain>_jgene' and '<chain>_cdr3' for both chains, plus
        'weights', 'split' and 'labels_<tag>' for every tag
    :param tags: label names
    :return: DataFrame of all features, weights, labels and split
        concatenated column-wise
    """
    # physicochemical amino acid properties
    basicity = {
        'A': 206.4, 'B': 210.7, 'C': 206.2, 'D': 208.6, 'E': 215.6,
        'F': 212.1, 'G': 202.7, 'H': 223.7, 'I': 210.8, 'K': 221.8,
        'L': 209.6, 'M': 213.3, 'N': 212.8, 'P': 214.4, 'Q': 214.2,
        'R': 237.0, 'S': 207.6, 'T': 211.7, 'V': 208.7, 'W': 216.1,
        'X': 210.2, 'Y': 213.1, 'Z': 214.9
    }
    hydrophobicity = {
        'A': 0.16, 'B': -3.14, 'C': 2.50, 'D': -2.49, 'E': -1.50,
        'F': 5.00, 'G': -3.31, 'H': -4.63, 'I': 4.41, 'K': -5.00,
        'L': 4.76, 'M': 3.23, 'N': -3.79, 'P': -4.92, 'Q': -2.76,
        'R': -2.77, 'S': -2.85, 'T': -1.08, 'V': 3.02, 'W': 4.88,
        'X': 4.59, 'Y': 2.00, 'Z': -2.13
    }
    helicity = {
        'A': 1.24, 'B': 0.92, 'C': 0.79, 'D': 0.89, 'E': 0.85,
        'F': 1.26, 'G': 1.15, 'H': 0.97, 'I': 1.29, 'K': 0.88,
        'L': 1.28, 'M': 1.22, 'N': 0.94, 'P': 0.57, 'Q': 0.96,
        'R': 0.95, 'S': 1.00, 'T': 1.09, 'V': 1.27, 'W': 1.07,
        'X': 1.29, 'Y': 1.11, 'Z': 0.91
    }
    mutation_stability = {
        'A': 13, 'C': 52, 'D': 11, 'E': 12, 'F': 32, 'G': 27, 'H': 15,
        'I': 10, 'K': 24, 'L': 34, 'M': 6, 'N': 6, 'P': 20, 'Q': 10,
        'R': 17, 'S': 10, 'T': 11, 'V': 17, 'W': 55, 'Y': 31
    }

    def _average_of(table):
        # Build a function returning the mean table value over a sequence.
        def _average(seq):
            return sum([table[aa] for aa in seq]) / parser.length(seq)
        return _average

    def _sequence_length(sequence):
        return parser.length(sequence)

    def _sequence_pI(seq):
        return electrochem.pI(seq)

    def _sequence_mass(seq):
        return mass.fast_mass(seq)

    # feature conversion and generation
    features_list = []
    for chain in ['tra', 'trb']:
        cdr3_column = chain + '_cdr3'
        cdr3 = data[cdr3_column]

        def _as_feature(series, feature_name):
            # Single-column frame renamed from the CDR3 column name.
            return series.to_frame().rename(
                columns={cdr3_column: feature_name})

        # one-hot encoded V and J genes
        onehot_encoder = feature_extraction.DictVectorizer(sparse=False)
        gene_records = data[[chain + '_vgene', chain + '_jgene']].to_dict(
            orient='records')
        features_list.append(
            pd.DataFrame(onehot_encoder.fit_transform(gene_records),
                         columns=onehot_encoder.feature_names_))

        # sequence length
        features_list.append(
            _as_feature(cdr3.apply(_sequence_length), 'length'))

        # number of occurences of each amino acid
        aa_counts = pd.DataFrame.from_records([
            parser.amino_acid_composition(sequence) for sequence in cdr3
        ]).fillna(0)
        aa_counts.columns = [
            chain + '_count_{}'.format(column)
            for column in aa_counts.columns
        ]
        features_list.append(aa_counts)

        # physicochemical properties: (average) basicity, (average)
        # hydrophobicity, (average) helicity, pI, (average) mutation
        # stability
        for feature_name, table in (('avg_basicity', basicity),
                                    ('avg_hydrophobicity', hydrophobicity),
                                    ('avg_helicity', helicity)):
            features_list.append(
                _as_feature(cdr3.apply(_average_of(table)), feature_name))
        features_list.append(_as_feature(cdr3.apply(_sequence_pI), 'pI'))
        features_list.append(
            _as_feature(cdr3.apply(_average_of(mutation_stability)),
                        'avg_mutation_stability'))

        # peptide mass
        features_list.append(_as_feature(cdr3.apply(_sequence_mass), 'mass'))

        # positional features: amino acid occurence and physicochemical
        # properties at a given position from the center
        pos_aa = []
        pos_basicity = []
        pos_hydro = []
        pos_helicity = []
        pos_pI = []
        pos_mutation = []
        for sequence in cdr3:
            length = parser.length(sequence)
            start_pos = -1 * (length // 2)
            if length % 2 == 1:
                pos_range = list(range(start_pos, start_pos + length))
            else:
                # Even length: there is no position 0.
                pos_range = list(range(start_pos, 0)) + list(
                    range(1, start_pos + length + 1))
            placed = list(zip(pos_range, sequence))

            pos_aa.append({
                chain + '_pos_{}_{}'.format(pos, aa): 1
                for pos, aa in placed
            })
            pos_basicity.append({
                chain + '_pos_{}_basicity'.format(pos): basicity[aa]
                for pos, aa in placed
            })
            pos_hydro.append({
                chain + '_pos_{}_hydrophobicity'.format(pos):
                hydrophobicity[aa]
                for pos, aa in placed
            })
            pos_helicity.append({
                chain + '_pos_{}_helicity'.format(pos): helicity[aa]
                for pos, aa in placed
            })
            pos_pI.append({
                chain + '_pos_{}_pI'.format(pos): electrochem.pI(aa)
                for pos, aa in placed
            })
            pos_mutation.append({
                chain + '_pos_{}_mutation_stability'.format(pos):
                mutation_stability[aa]
                for pos, aa in placed
            })

        for records in (pos_aa, pos_basicity, pos_hydro, pos_helicity,
                        pos_pI, pos_mutation):
            features_list.append(pd.DataFrame.from_records(records).fillna(0))

    features_list.append(data['weights'])
    for tag in tags:
        features_list.append(data['labels_' + tag])
    features_list.append(data['split'])

    # combine all features
    data_processed = pd.concat(features_list, axis=1)
    return data_processed
def generate_fragments_from_peptide(self, peptide, ion_types, label_format=None, min_charge=1, max_charge=1, aa_mass_dict=None, polarity="+", ion_composition=None, modification_dict=None, verbose=False):
    """
    Generate the theoretical fragments of `peptide`.

    :param peptide: peptide sequence
    :param ion_types: ion types (or shortcuts, expanded by
        ``self.replace_ion_composition_shortcut``)
    :param label_format: not used by this method
    :param min_charge: lowest charge; values below 1 are raised to 1
    :param max_charge: highest charge; swapped with `min_charge` if it is
        the smaller of the two
    :param aa_mass_dict: not used by this method
    :param polarity: polarity string used in the labels
    :param ion_composition: passed to ``mass.fast_mass`` as ``ion_comp``;
        defaults to ``self.ion_composition``
    :param modification_dict: modifications handed to
        ``self.check_modification``; modifications are only considered
        when it is non-empty
    :param verbose: print a one-line summary when true
    :return: dict mapping a fragment label to a dict with the keys 'mz',
        'z' and 'seq' (plus 'full_label' and 'label' for sequence ions)
    """
    tstart = ttime()

    # Avoid a shared mutable default argument.
    if modification_dict is None:
        modification_dict = {}

    # specify charges
    if min_charge < 1:
        min_charge = 1
    if max_charge < min_charge:
        max_charge, min_charge = min_charge, max_charge
    # `range` works on both Python 2 and 3 (`xrange` is Python 2 only).
    charges = range(min_charge, max_charge + 1)

    include_modifications = len(modification_dict) > 0

    # determine ion composition
    if ion_composition is None:
        ion_composition = self.ion_composition

    # check if shortcuts were used
    ion_types = self.replace_ion_composition_shortcut(ion_types)

    fragment_dict = {}
    for ion_type in ion_types:
        # Whole-peptide ion.
        if ion_type in [self._M_all_]:
            for charge in charges:
                ion_mz = mass.fast_mass(peptide, ion_type=ion_type, charge=charge, ion_comp=ion_composition)
                ion_label = "{}{}{}".format(ion_type, polarity, charge)
                fragment_dict[ion_label] = {
                    'mz': ion_mz,
                    'z': charge,
                    'seq': peptide
                }

        # Prefix fragments peptide[:i].
        if ion_type in self._all_abc_all_:
            if not self.check_peptide_rules(ion_type, peptide):
                continue
            for i in range(1, len(peptide)):
                peptide_seq = peptide[:i]
                if not self.check_peptide_rules(ion_type, peptide_seq):
                    continue

                mod_peptide_seq, modification_mass = peptide_seq, 0
                if include_modifications:
                    mod_peptide_seq, modification_mass = self.check_modification(
                        i, peptide_seq, modification_dict)

                for charge in charges:
                    ion_mz = mass.fast_mass(peptide_seq, ion_type=ion_type, charge=charge, ion_comp=ion_composition)
                    # modify ion mass with modification mass
                    ion_mz = ion_mz + (modification_mass / charge)
                    ion_label, ion_label_full = self.generate_label(
                        ion_type[0], i, polarity, charge, mod_peptide_seq)
                    # NOTE(review): 'seq' is the unmodified sequence here
                    # but the modified one for suffix fragments below --
                    # confirm which is intended.
                    fragment_dict[ion_label_full] = {
                        'mz': ion_mz,
                        'z': charge,
                        'seq': peptide_seq,
                        'full_label': ion_label_full,
                        'label': ion_label
                    }

        # Suffix fragments peptide[i:].
        if ion_type in self._all_xyz_all_:
            if not self.check_peptide_rules(ion_type, peptide):
                continue
            # Fragment numbers count residues from the suffix side:
            # index i maps to len(peptide) - i.
            _frag_label_length = np.arange(len(peptide), 0, -1)
            for i in range(1, len(peptide)):
                peptide_seq = peptide[i:]
                if not self.check_peptide_rules(ion_type, peptide_seq):
                    continue

                mod_peptide_seq, modification_mass = peptide_seq, 0
                if include_modifications:
                    mod_peptide_seq, modification_mass = self.check_modification(
                        i + 1, peptide_seq, modification_dict)

                for charge in charges:
                    ion_mz = mass.fast_mass(peptide_seq, ion_type=ion_type, charge=charge, ion_comp=ion_composition)
                    # modify ion mass with modification mass
                    ion_mz = ion_mz + (modification_mass / charge)
                    ion_label, ion_label_full = self.generate_label(
                        ion_type[0], _frag_label_length[i], polarity, charge,
                        mod_peptide_seq, full_ion_type=ion_type)
                    fragment_dict[ion_label_full] = {
                        'mz': ion_mz,
                        'z': charge,
                        'seq': mod_peptide_seq,
                        'full_label': ion_label_full,
                        'label': ion_label
                    }

    # print verbose information
    if verbose:
        msg = "Peptide length: {} | # Fragments: {} | Time to generate: {:.4f}".format(
            len(peptide), len(fragment_dict), ttime() - tstart)
        print(msg)

    return fragment_dict
def get_peptide_results(resfile, mgfDataArray, options):
    '''
    Retrieve peptide assignments and PTM specifications from a mascot
    .dat file.

    Only hits that pass all of these filters are kept: ions score at or
    above the identity threshold, charge 2 with exactly one cysteine,
    exactly one K/R in total, exactly one occurrence of
    ``options.targetMod`` in the variable-mod string, and an MGF intensity
    of at least ``options.minIntensity``.

    If ``options.printVarMods`` is set, the variable modifications are
    printed and the process exits.

    :param resfile: msparser results file object
    :param mgfDataArray: 2-D array; column 0 holds retention times and
        column 2 the intensity used for filtering
    :param options: object providing ``printVarMods``, ``targetMod`` and
        ``minIntensity``
    :return: tuple of
        1) list of mascotHit objects,
        2) list of varMod objects,
        3) number of hits collected (0 when the peptide summary is not
           used)
    '''
    # get file header data
    params = resfile.params()

    # get mgf rt vector
    mgfRTs = mgfDataArray[:, 0]

    # build list of variable modifications and associated mass offsets:
    var_mods = []
    i = 1
    while params.getVarModsName(i):
        modName = params.getVarModsName(i)
        modDelta = params.getVarModsDelta(i)
        modNeutralLoss = params.getVarModsNeutralLoss(i)
        var_mods.append(varMod(i, modName, modDelta, modNeutralLoss))
        i += 1

    if options.printVarMods:
        for var_mod in var_mods:
            # Parenthesised form prints identically on Python 2 and 3.
            print(var_mod)
        sys.exit()

    (scriptName, flags, minProbability, maxHitsToReport, ignoreIonsScoreBelow,
     minPepLenInPepSummary, usePeptideSummary,
     flags2) = resfile.get_ms_mascotresults_params(msparser.ms_mascotoptions())

    results = msparser.ms_peptidesummary(resfile, flags, 1, 999999999, '',
                                         ignoreIonsScoreBelow,
                                         minPepLenInPepSummary, '', flags2)

    mascot_hits = []
    # Defined up front so that the return statement also works when the
    # peptide summary is not used.
    total_index = 0

    if usePeptideSummary:
        pepsum = msparser.ms_peptidesummary(
            resfile,  # results file object
            flags,  # MSRES_group_proteins
            1,  # minProbability
            999999999,  # maxHits
            '',  # unigeneIndexFile
            ignoreIonsScoreBelow,  # ignore hits below
            minPepLenInPepSummary,  # minPepLenINPepSummary
            '',  # singleHit
            flags2)  # flags2

        # Protein hits are numbered from 1; stop at the first missing hit
        # (or at the historical upper bound). A counter is used instead of
        # xrange/range so no huge list is built on Python 2.
        x = 1
        while x < 10000000:
            prot = pepsum.getHit(x)
            if prot is None:
                break

            num_peps = prot.getNumPeptides()
            prot_acc = prot.getAccession()
            prot_index = x

            for i in range(1, num_peps + 1):
                query = prot.getPeptideQuery(i)
                p = prot.getPeptideP(i)
                pep = pepsum.getPeptide(query, p)

                # skip queries that have no peptide assigned
                if not pep.getAnyMatch():
                    continue

                query = pep.getQuery()  # returns index of query
                queryData = msparser.ms_inputquery(resfile, query)
                rank = pep.getRank()
                charge = pep.getCharge()
                mz = pep.getObserved()
                seq = pep.getPeptideStr()
                score = pep.getIonsScore()
                mod_string = pep.getVarModsStr()
                prot_matches = pep.getProteins()
                rt = queryData.getRetentionTimes()
                miss = pep.getMissedCleavages()
                identity = results.getPeptideIdentityThreshold(query, 20)
                homology = results.getHomologyThreshold(query, 20)
                pep_score = pep.getIonsScore()

                # TODO: connect UI threshold setting to this conditional
                if float(score) < float(identity):
                    continue

                # get 2+ peptides with 1 cysteine
                if seq.count('C') != 1 or int(charge) != 2:
                    continue

                # exclude missed cleavages and terminal peptides
                if seq.count('R') + seq.count('K') != 1:
                    continue

                # make sure the target modification occurs exactly once
                if mod_string.count(str(options.targetMod)) != 1:
                    continue

                # Without any MGF retention time there is nothing to match
                # against (argmin would fail on an empty array). The old
                # check compared the function `np.shape` with 1 and could
                # never trigger the warning.
                if np.size(mgfRTs) < 1:
                    print('Warning, no MGF intensity found for entry: mz: %s, rt: %s' % (
                        mz, rt))
                    continue

                index = np.argmin(np.absolute(mgfRTs - float(rt)))

                # NOTE(review): the difference is signed, so an MGF entry
                # with a much larger rt still passes -- confirm whether
                # abs() was intended.
                assert float(rt) - float(mgfDataArray[index][0]) < 0.001

                intensity = mgfDataArray[index][2]
                if intensity < options.minIntensity:
                    continue

                hit = mascotHit(float(mz), charge, float(rt), miss, score,
                                seq, mod_string, query, rank, total_index,
                                prot_index, prot_acc, prot_matches, identity,
                                homology, pep_score)

                hit.exptFragments = []
                num_peaks = queryData.getNumberOfPeaks(1)
                for j in range(1, 1 + num_peaks):
                    hit.exptFragments.append([
                        queryData.getPeakMass(1, j),
                        queryData.getPeakIntensity(1, j)
                    ])

                hit.intensity = intensity
                hit.sequence_mass = float(mass.fast_mass(seq, charge=2))
                hit.index = total_index

                mascot_hits.append(hit)
                total_index += 1

            x += 1

    return mascot_hits, var_mods, total_index
def get_fragments(peptide, mod_string, var_mods, types=('b', 'y'), maxcharge=2):
    '''
    Generate theoretical sequence ions for a given peptide sequence
    string.

    - `types` is to be replaced by user specified match ions
    - `maxcharge` is to be replaced by a user specified value; it is an
      exclusive bound: charges 1 .. maxcharge - 1 are generated

    :param peptide: peptide sequence
    :param mod_string: per-residue modification string with one extra
        entry at each end for the N and C termini
        (``len(mod_string) == len(peptide) + 2``); the terminal entries
        are ignored
    :param var_mods: variable modifications, passed to
        ``get_fragment_mod_masses``
    :return: PeptideFragments object. Ions whose type is found in 'abc'
        are stored in its ``b`` attribute and ions whose type is found in
        'xyz' in its ``y`` attribute; an attribute is only set when at
        least one such ion was generated. Every entry has the structure
        ``[fragment sequence, m/z, ion_type, charge]``.
    :raises AssertionError: if `mod_string` (without its terminal
        entries) and `peptide` differ in length
    '''
    # drop the N- and C-terminal entries
    mod_string = mod_string[1:len(mod_string) - 1]
    assert len(peptide) == len(mod_string)

    # mass to add/subtract for each residue
    frag_mod_mass = get_fragment_mod_masses(list(mod_string), var_mods)

    pepFrags = PeptideFragments()
    b, y = [], []

    # generate fragment ions and apply mods (native or CRM) as needed
    # (`range` works on both Python 2 and 3; `xrange` is Python 2 only)
    for i in range(1, len(peptide)):
        for ion_type in types:
            for charge in range(1, maxcharge):
                if ion_type in 'abc':
                    frag = peptide[:i]
                    mz = mass.fast_mass(frag, ion_type=ion_type, charge=charge)
                    # add total mass of modifications - including CRM
                    mz = mz + sum(frag_mod_mass[:i]) / float(charge)
                    b.append([frag, float(mz), ion_type, charge])

                if ion_type in 'xyz':
                    frag = peptide[i:]
                    mz = mass.fast_mass(frag, ion_type=ion_type, charge=charge)
                    # The modification mass is scaled by the charge here
                    # as well, consistent with the prefix ions above.
                    mz = mz + sum(frag_mod_mass[i:]) / float(charge)
                    y.append([frag, float(mz), ion_type, charge])

    if len(b) != 0:
        pepFrags.b = b
    if len(y) != 0:
        pepFrags.y = y

    return pepFrags
def fastmass(pep, ion_type, charge):
    """
    Return ``mass.fast_mass`` of `pep` plus 57.021 / charge for every
    'C' in the sequence.
    """
    unmodified = mass.fast_mass(pep, ion_type=ion_type, charge=charge)
    cysteine_shift = 57.021 * pep.count('C') / charge
    return unmodified + cysteine_shift
def fragments_b(peptide, maxcharge=1):
    """
    Yield the b-ion value of every prefix of `peptide`.

    Prefixes run from the first residue up to and including the whole
    peptide. Every value is computed at the single charge `maxcharge`.
    """
    # `range` works on both Python 2 and 3 (`xrange` is Python 2 only).
    for i in range(1, len(peptide) + 1):
        yield mass.fast_mass(peptide[:i], ion_type='b', charge=maxcharge)
def getNativeMass(self):
    """Return ``mass.fast_mass`` of ``self.sequence``."""
    native_mass = mass.fast_mass(sequence=self.sequence)
    return native_mass
def test_fast_mass(self):
    """
    Check ``mass.fast_mass`` with a custom residue mass table.

    For every random peptide the result must almost equal the sum of the
    residue masses from ``self.test_aa_mass`` plus two hydrogens and one
    oxygen.
    """
    residue_masses = self.test_aa_mass
    for pep in self.random_peptides:
        computed = mass.fast_mass(pep, aa_mass=residue_masses)
        expected = sum(
            pep.count(aa) * m for aa, m in residue_masses.items()
        ) + self.mass_H * 2.0 + self.mass_O
        self.assertAlmostEqual(computed, expected)
def test_aa_mass(self):
    """
    Check that ``mass.fast_mass`` of a single residue equals its entry in
    ``mass.std_aa_mass`` plus the mass of H2O.
    """
    water = mass.calculate_mass(formula='H2O')
    for residue, residue_mass in mass.std_aa_mass.items():
        expected = residue_mass + water
        self.assertEqual(expected, mass.fast_mass(residue))
def fragments_y(peptide, maxcharge=1):
    """
    Yield the y-ion value of every suffix of `peptide`.

    Suffixes run from the whole peptide down to the last residue alone.
    Every value is computed at the single charge `maxcharge`.
    """
    # `range` works on both Python 2 and 3 (`xrange` is Python 2 only).
    for i in range(0, len(peptide)):
        yield mass.fast_mass(peptide[i:], ion_type='y', charge=maxcharge)