def Model_2(train, test):
    ''' Fits a bagged XGBoost classifier and writes its test-set output to two CSV files
        train : Training set (read through its 'Sequence' and ' Label' columns)
        test : Test set (read through its ' Sequence' and 'ID' columns)

        Writes "Submission_2.csv" (class-1 probabilities) and
        "Prediction_2.csv" (predicted labels); returns nothing.
    '''
    def featurize(seq):
        # Feature vector: AAC(seq) followed by mass per residue, charge and pI.
        # NOTE(review): electrochem.charge receives len(seq) as its second
        # argument, which pyteomics interprets as the pH - confirm intended.
        per_residue_mass = mass.calculate_mass(sequence=seq) / len(seq)
        net_charge = electrochem.charge(seq, len(seq))
        iso_point = ProteinAnalysis(seq).isoelectric_point()
        return AAC(seq) + [per_residue_mass] + [net_charge] + [iso_point]

    # Preprocessing
    train_features = [featurize(seq) for seq in train['Sequence']]
    test_features = [featurize(seq) for seq in test[' Sequence']]
    train_labels = train[' Label']

    X_train = np.array(train_features)
    Y_train = np.array(train_labels)
    X_test = np.array(test_features)
    X_train, Y_train = shuffle(X_train, Y_train, random_state=3)

    # Training
    param = {
        'max_depth': 25,
        'objective': 'reg:logistic',
        'n_estimators': 100,
        'booster': 'gbtree',
        'colsample_bylevel': 0.7,
        'colsample_bytree': 1,
        'n_thread': 2,
    }
    booster = XGBClassifier(**param, random_state=3)
    clf = BaggingClassifier(base_estimator=booster,
                            n_estimators=23,
                            random_state=3,
                            n_jobs=-1)
    clf.fit(X_train, Y_train)

    # Predicting: second column of predict_proba first, then hard labels.
    Y_prob = [row[1] for row in clf.predict_proba(X_test)]
    Y_pred = clf.predict(X_test)

    # The same frame is written twice, swapping the "Label" column in between.
    result = pd.DataFrame()
    result["ID"] = test["ID"]
    for column, target in ((Y_prob, "Submission_2.csv"),
                           (Y_pred, "Prediction_2.csv")):
        result["Label"] = column
        result.to_csv(target, index=False)
Exemplo n.º 2
0
 def test_charge_calculations_dict(self):
     """Exercise charge() with composition dicts: two values and five rejected inputs."""
     def custom_pk():
         # Fresh terminal pK table for every call.
         return {'H-': [(9., 1)], '-OH': [(8., -1)]}

     def nterm_a_pk():
         # Fresh N-terminal override table for residue 'A'.
         return {'H-': {'A': [(3., 1)]}}

     # This composition together with a pK_nterm table is expected to raise.
     self.assertRaises(
         PyteomicsError, charge,
         {'H-': 1, '-OH': 1, 'E': 1}, 7,
         pK_nterm={'H-': {'A': [(9., 1)]}})

     # No pK overrides: the charge at pH 14 must be within 0.01 of -1.
     plain_charge = charge({'A': 3, 'H-': 1, '-OH': 1}, 14.0)
     self.assertTrue(abs(plain_charge + 1.0) < 0.01)

     # Custom pK tables with explicit 'ntermB'/'ctermA' entries: same target.
     custom_charge = charge(
         {'A': 1, 'H-': 1, '-OH': 1, 'ntermB': 1, 'ctermA': 1},
         14.0,
         pK=custom_pk(),
         pK_nterm={'H-': {'A': [(3., 1)], 'B': [(3., 1)]}})
     self.assertTrue(abs(custom_charge + 1.0) < 0.01)

     # Each of these compositions must be rejected, checked in this order.
     rejected_compositions = (
         {'A': 1, 'H-': 1, '-OH': 1, 'ctermA': 1},
         {'A': 1, 'H-': 1, '-OH': 1, 'ntermA': 1},
         {'A': 1, 'H-': 1, '-OH': 1, 'ntermA': 2, 'ctermA': 1},
         {'A': 1, 'H-': 1, 'ntermA': 1, 'ctermA': 1},
     )
     for composition in rejected_compositions:
         self.assertRaises(PyteomicsError, charge,
                           composition, 14.0,
                           pK=custom_pk(),
                           pK_nterm=nterm_a_pk())
Exemplo n.º 3
0
 def test_charge_input(self):
     """charge() must agree for str, list and dict forms of the same peptide."""
     peptide = 'H-ACDEFGH-OH'

     # String form versus list-of-labels form, for pH 0..13.
     for pH in range(14):
         labels = ['H-'] + list('ACDEFGH') + ['-OH']
         self.assertAlmostEqual(charge(peptide, pH), charge(labels, pH))

     # String form versus composition-dict form (each label counted once).
     for pH in range(14):
         composition = dict.fromkeys(['H-'] + list('ACDEFGH') + ['-OH'], 1)
         self.assertAlmostEqual(charge(peptide, pH),
                                charge(composition, pH))
Exemplo n.º 4
0
 def test_charge_calculations_list(self):
     """Exercise charge() with peptides given as lists of labels."""
     def tripeptide():
         # Fresh list for every call.
         return ['H-', 'A', 'A', 'A', '-OH']

     # A list with no 'H-'/'-OH' entries plus custom pK tables is expected
     # to raise.
     self.assertRaises(
         PyteomicsError, charge,
         ['A', 'A', 'A'], 5.0,
         pK={'H-': [(9., 1)], '-OH': [(8., -1)]},
         pK_nterm={'H-': {'A': [(3., 1)]}})

     # Expected values (tolerance 0.01): +1 at pH 0, -1 at pH 14, and 0 at
     # the midpoint of 2.34 and 9.69.
     acidic = charge(tripeptide(), 0.0)
     self.assertTrue(abs(acidic - 1.0) < 0.01)

     basic = charge(tripeptide(), 14.0)
     self.assertTrue(abs(basic + 1.0) < 0.01)

     neutral = charge(tripeptide(), (2.34 + 9.69) / 2.0)
     self.assertTrue(abs(neutral) < 0.01)
Exemplo n.º 5
0
 def test_charge_calculations_str(self):
     """Exercise charge() with peptides given as strings."""
     tolerance = 0.01

     # 'AAA' with custom pK tables: charge at pH 5 must be ~0.
     with_custom_pk = charge('AAA', 5.0,
                             pK={'H-': [(9., 1)], '-OH': [(8., -1)]},
                             pK_nterm={'H-': {'A': [(3., 1)]}})
     self.assertTrue(abs(with_custom_pk) < tolerance)

     # Explicit termini, no pK overrides: +1 at pH 0, -1 at pH 14, and 0 at
     # the midpoint of 2.34 and 9.69.
     acidic = charge('H-AAA-OH', 0.0)
     self.assertTrue(abs(acidic - 1.0) < tolerance)

     basic = charge('H-AAA-OH', 14.0)
     self.assertTrue(abs(basic + 1.0) < tolerance)

     neutral = charge('H-AAA-OH', (2.34 + 9.69) / 2.0)
     self.assertTrue(abs(neutral) < tolerance)
Exemplo n.º 6
0
def get_theor_spectrum(peptide,
                       acc_frag,
                       types=('b', 'y'),
                       maxcharge=None,
                       **kwargs):
    """
    Builds the theoretical fragment spectrum of a peptide in two forms:
    float m/z arrays and integer bins (m/z divided by `acc_frag`).
    `peptide` - peptide sequence
    `acc_frag` - accuracy of matching; used as the bin width.
    `types` - ion types; a type whose first letter is a, b or c is built from
    `peptide[:-1]`, any other from `peptide[1:]`.
    `maxcharge` - maximum charge; when falsy it is set to
    1 + int(ec.charge(peptide, pH=2)).
    `kwargs` - forwarded to `cmass.fast_mass`.

    ----------
    Returns (peaks, theoretical_set):
    `peaks` maps (ion_type, charge) to a sorted float array of fragment m/z;
    `theoretical_set` maps ion_type to the set of binned integer values
    accumulated over all charges.
    """
    n_fragments = len(peptide) - 1
    if not maxcharge:
        maxcharge = 1 + int(ec.charge(peptide, pH=2))

    peaks = {}
    theoretical_set = defaultdict(set)

    for z in range(1, maxcharge + 1):
        for ion_type in types:
            from_nterm = ion_type[0] in 'abc'
            fragment = peptide[:-1] if from_nterm else peptide[1:]

            # m/z of the longest fragment; shorter ones are derived from it
            # by subtracting one residue at a time.
            longest_mz = cmass.fast_mass(fragment,
                                         ion_type=ion_type,
                                         charge=z,
                                         **kwargs)
            series = np.zeros((n_fragments, ), dtype=float)

            if from_nterm:
                series[0] = longest_mz
                for k in range(1, n_fragments):
                    residue_mass = mass.fast_mass2([fragment[-k]])
                    series[k] = series[k - 1] - residue_mass / z
            else:
                series[n_fragments - 1] = longest_mz
                # NOTE(review): the residue removed at step k is
                # fragment[-(k + 2)]; the original marked both loops
                # "recalculate" - confirm the indexing is as intended.
                for k in range(n_fragments - 2, -1, -1):
                    residue_mass = mass.fast_mass2([fragment[-(k + 2)]])
                    series[k] = series[k + 1] - residue_mass / z

            # Bin before sorting; the set is order-independent anyway.
            binned = (series / acc_frag).astype(int)
            theoretical_set[ion_type].update(binned)

            series.sort()
            peaks[ion_type, z] = series

    return peaks, theoretical_set
def Model_1(train, test):
    ''' Fits a bagged random-forest classifier and writes its test-set output to two CSV files
        train : Training set (read through its 'Sequence' and ' Label' columns)
        test : Test set (read through its ' Sequence' and 'ID' columns)

        Writes "Submission_1.csv" (class-1 probabilities) and
        "Prediction_1.csv" (predicted labels); returns nothing.
    '''
    def featurize(seq):
        # Feature vector: AAC(seq) + DPC(seq), then mass per residue, charge, pI.
        # NOTE(review): electrochem.charge receives len(seq) as its second
        # argument, which pyteomics interprets as the pH - confirm intended.
        per_residue_mass = mass.calculate_mass(sequence=seq) / len(seq)
        net_charge = electrochem.charge(seq, len(seq))
        iso_point = ProteinAnalysis(seq).isoelectric_point()
        return (AAC(seq) + DPC(seq)
                + [per_residue_mass] + [net_charge] + [iso_point])

    # Preprocessing
    X_train = [featurize(seq) for seq in train['Sequence']]
    X_test = [featurize(seq) for seq in test[' Sequence']]
    Y_train = train[' Label']

    # Training
    forest = RandomForestClassifier(random_state=2)
    clf = BaggingClassifier(base_estimator=forest,
                            n_estimators=100,
                            random_state=2,
                            n_jobs=-1)
    clf.fit(X_train, Y_train)

    # Predicting: second column of predict_proba first, then hard labels.
    Y_prob = [row[1] for row in clf.predict_proba(X_test)]
    Y_pred = clf.predict(X_test)

    # The same frame is written twice, swapping the "Label" column in between.
    result = pd.DataFrame()
    result["ID"] = test["ID"]
    for column, target in ((Y_prob, "Submission_1.csv"),
                           (Y_pred, "Prediction_1.csv")):
        result["Label"] = column
        result.to_csv(target, index=False)
Exemplo n.º 8
0
# One record per unique peptide; the passes below add further keys to it.
peptides = [{'sequence': i} for i in unique_peptides]

# Progress messages use the call form print('...'), which prints the same
# text on Python 2 and Python 3 (the statement form is a SyntaxError on 3).
print('Parsing peptide sequences...')
for peptide in peptides:
    peptide['parsed_sequence'] = parser.parse(peptide['sequence'],
                                              show_unmodified_termini=True)
    peptide['length'] = parser.length(peptide['parsed_sequence'])
print('Done!')

# Drop every peptide whose length exceeds 100.
peptides = [peptide for peptide in peptides if peptide['length'] <= 100]

print('Calculating the mass, charge and m/z...')
for peptide in peptides:
    # Charge is evaluated at pH 2.0 and rounded to the nearest integer.
    peptide['charge'] = int(
        round(electrochem.charge(peptide['parsed_sequence'], pH=2.0)))
    peptide['mass'] = mass.calculate_mass(peptide['parsed_sequence'])
    # Same mass call, this time with the integer charge supplied.
    peptide['m/z'] = mass.calculate_mass(peptide['parsed_sequence'],
                                         charge=peptide['charge'])
print('Done!')

print('Calculating the retention time...')
for peptide in peptides:
    # Two retention-time estimates, one per retention-coefficient set.
    peptide['RT_RP'] = achrom.calculate_RT(peptide['parsed_sequence'],
                                           achrom.RCs_zubarev)
    peptide['RT_normal'] = achrom.calculate_RT(peptide['parsed_sequence'],
                                               achrom.RCs_yoshida_lc)
print('Done!')

# Histogram of m/z values: 2000 bins over the range 0..4000.
plt.figure()
plt.hist([peptide['m/z'] for peptide in peptides], bins=2000, range=(0, 4000))
# Copy of the standard amino-acid composition table, extended with the
# modification labels 'Ac-', 'cam' and 'ox' so mass.calculate_mass can
# resolve them.
aa_comp = dict(mass.std_aa_comp)
aa_comp['Ac-'] = mass.Composition({'C': 2, 'H': 3, 'N': 0, 'O': 1, 'P': 0})
aa_comp['cam'] = mass.Composition({'C': 2, 'H': 3, 'N': 1, 'O': 1, 'P': 0})
aa_comp['ox'] = mass.Composition({'O':1})

# Calculate peptide isoelectric points, masses, and charge at pH = 7.
# The isoelectric point and charge are not used from this point on; they were
# used for examining other predictive components of apparent cofragmentation
# bias.

print('Calculating peptide physicochemical properties...')
iso_electric_points = []
pep_charges = []
pep_mass = []
# Running count of processed peptides (not read within this fragment).
i = 0

# mod_pep is defined outside this fragment.
for peptide in mod_pep:
    peptide_isoelectric_point = electrochem.pI(peptide)
    peptide_charge = electrochem.charge(peptide, 7)
    peptide_mass = mass.calculate_mass(sequence = peptide, aa_comp = aa_comp)
    pep_charges.append(peptide_charge)
    iso_electric_points.append(peptide_isoelectric_point)
    pep_mass.append(peptide_mass)
    i += 1

print('LC-retention time prediction with the following parameters:')

# lc_params is defined outside this fragment; it is indexed below by column
# name and then by row 0.
print(lc_params)

# Column length: taken from row 0 and required to be numeric.
column_length = lc_params['column_length'][0]
if isinstance(column_length, numbers.Number) != True:
    raise NameError('Error in parameter input file, column_length takes only Numeric.')
Exemplo n.º 10
0
import pylab
import numpy as np
from pyteomics import electrochem

# pH axis: values from 1 up to (but excluding) 14 in steps of 0.5.
pHs = np.arange(1, 14, 0.5)
# The whole pH array is handed to electrochem.charge in a single call.
charges = electrochem.charge('PEPTIDE', pHs)

# Draw the curve, label the axes, title the figure, then display it.
pylab.figure()
pylab.plot(pHs, charges)
pylab.xlabel('pH')
pylab.ylabel('Charge')
pylab.title("Charge of peptide 'PEPTIDE' vs pH")
pylab.show()
Exemplo n.º 11
0
The hyperparameters of the SVM model were tuned using GridSearchCV.
The model was tested by using 5-fold cross validation.
'''
import pandas as pd
from sklearn import svm
from sklearn.metrics import roc_auc_score,accuracy_score,confusion_matrix,make_scorer
from sklearn.model_selection import GridSearchCV, cross_val_score, KFold
from Bio.SeqUtils.ProtParam import ProteinAnalysis
from pyteomics import electrochem,mass,parser

# Training set: raw table plus two further tables read from
# "AAC_train.csv" and "DP2F_train.csv".
Train = pd.read_csv("train.csv")
Train_AAC = pd.read_csv("AAC_train.csv")
Train_DP2 = pd.read_csv("DP2F_train.csv")

# Per-sequence scalar features: charge, mass per residue, isoelectric point.
# NOTE(review): electrochem.charge receives len(x) as its second argument,
# which pyteomics interprets as the pH - confirm this is intended.
Train_C = [electrochem.charge(x,len(x)) for x in Train["Sequence"]]
Train_M = [mass.calculate_mass(sequence=x)/len(x) for x in Train["Sequence"]]
Train_PI = [ProteinAnalysis(x).isoelectric_point() for x in Train["Sequence"]]

# Test set: same layout and the same three derived features as above.
Test = pd.read_csv("test.csv")
Test_AAC = pd.read_csv("AAC_test.csv")
Test_DP2 = pd.read_csv("DP2F_test.csv")

Test_C = [electrochem.charge(x,len(x)) for x in Test["Sequence"]]
Test_M = [mass.calculate_mass(sequence=x)/len(x) for x in Test["Sequence"]]
Test_PI = [ProteinAnalysis(x).isoelectric_point() for x in Test["Sequence"]]

# NOTE(review): the column is spelled "Lable" here; confirm it matches the
# header in train.csv.
Labels = Train["Lable"]

# Assembling parameters of the training set
# Numeric LC parameters in validation order, each paired with the exact
# message raised when the value read from the parameter file is not a number.
_NUMERIC_LC_PARAM_ERRORS = (
    ('column_length',
     'Error in parameter input file, column_length takes only Numeric.'),
    ('column_diameter',
     'Error in parameter input file, column_diameter takes only Numeric.'),
    ('column_pore_size',
     'Error in parameter input file, column_pore_size takes only Numeric.'),
    ('second_solvent_concentration_a',
     'Error in parameter input file, second_solvent_concentration_a takes only Numeric.'),
    ('second_solvent_concentration_b',
     'Error in parameter input file, second_solvent_concentration_b takes only Numeric.'),
    ('gradient_0',
     'Error in parameter input file, gradient_0 takes only Numeric.'),
    ('gradient_1',
     'Error in parameter input file, gradient_1 takes only Numeric.'),
    ('gradient_2',
     'Error in parameter input file, gradient_2 takes only Numeric.'),
    ('flow_rate',
     'Error in parameter input file, flow_rate takes only Numeric'),
)


def _tryptic_peptides(line, code_format):
    """Return the list of tryptic peptides for one FASTA sequence line."""
    # Note that code_format == 'genes' and code_format == 'rna' are not
    # functional yet (they rely on Codon_to_Aminoacid).
    if code_format == 'genes':
        protein = Codon_to_Aminoacid(line)
    elif code_format == 'rna':
        # Translate the RNA alphabet to DNA before codon lookup.
        protein = Codon_to_Aminoacid(line.replace('U', 'T'))
    elif code_format == 'aas':
        protein = line
    else:
        raise NameError(
            "Error in parameter LC input file, code_format must be "
            "'genes', 'rna' or 'aas'.")
    return list(
        pyteomics.parser.cleave(str(protein),
                                pyteomics.parser.expasy_rules['trypsin']))


def _read_tryptic_peptides(fasta_file_name, code_format):
    """Digest every sequence line of a FASTA file.

    Returns two parallel lists: peptides of length >= 5 and, for each one,
    the most recent header (without '>') seen before it.
    """
    seq_vec = []
    contig_vec = []
    last_seq = None
    with open(fasta_file_name, 'r') as fasta_in:
        for line in fasta_in:
            line = line.strip()
            if not line:  # blank line
                continue
            if line[0] == '>':  # header line
                last_seq = line[1:]
                continue
            for tryp_pep in _tryptic_peptides(line, code_format):
                # Peptides shorter than 5 amino acids are removed.
                if len(tryp_pep) < 5:
                    continue
                seq_vec.append(tryp_pep)
                contig_vec.append(last_seq)
    return seq_vec, contig_vec


def _clean_peptides(seq_vec, contig_vec):
    """Drop ambiguous peptides and normalise the rest, keeping contigs aligned.

    Peptides containing '*', 'X' (unknown) or 'U' (selenocysteine) are
    removed together with their contig. Survivors get the '-OH' terminus
    appended, B changed to N (asparagine) and Z changed to Q (glutamine).
    """
    peptides = []
    contigs = []
    for pep, contig in zip(seq_vec, contig_vec):
        if '*' in pep or 'X' in pep or 'U' in pep:
            continue
        peptides.append((pep + '-OH').replace('B', 'N').replace('Z', 'Q'))
        contigs.append(contig)
    return peptides, contigs


def peptide_mod_biolccc_rt_prediction(lc_params_file, fasta_file_name,
                                      custom_gradient, output_name):
    """Predict LC retention times for the tryptic peptides of a FASTA file.

    Sequences are digested with trypsin, filtered, given fixed modifications
    ('ox' on M, 'cam' on C), and passed to BioLCCC. Retention time,
    isoelectric point, charge at pH 7, mass and source contig are written to
    ``<output_name>_lc-retention-times.csv``.

    lc_params_file : CSV file whose columns must be exactly the names in
        ``all_required_params``; values are taken from its first row.
    fasta_file_name : path of the FASTA file to digest.
    custom_gradient : CSV file with the gradient set points; read only when
        the 'linear' parameter is false. Rows 0 and 1 of every column are
        passed as the two arguments of ``Gradient.addPoint``.
    output_name : prefix of the output CSV file name.

    Raises NameError when the parameter columns do not match, when a numeric
    parameter is not a number, or when 'code_format' / 'model' hold an
    unsupported value. Returns nothing.
    """
    lc_params = pd.read_csv(lc_params_file)

    all_required_params = [
        'column_length', 'column_diameter', 'column_pore_size',
        'second_solvent_concentration_a', 'second_solvent_concentration_b',
        'gradient_0', 'gradient_1', 'gradient_2', 'flow_rate', 'code_format',
        'linear', 'model'
    ]

    # Check if all parameters are in parameter input file.
    if sorted(all_required_params) != sorted(lc_params.keys()):
        raise NameError(
            'Error in parameter LC input file, check for typos or missing parameter.'
        )

    # Whether the fasta file is in codons or in amino acids. Currently there
    # is no working method for a nucleotide sequence fasta file as input.
    code_format = lc_params['code_format'][0]
    linear_gradient = lc_params['linear'][0]

    # If not a linear gradient, a gradient file must be supplied.
    if not linear_gradient:
        gradient_file = pd.read_csv(custom_gradient)

    # Which type of model to use for prediction (from TFA or FA).
    model_type = lc_params['model'][0]
    if model_type == 'FA':
        print('formic acid')
    elif model_type == 'TFA':
        print('tri')

    seq_vec, contig_vec = _read_tryptic_peptides(fasta_file_name, code_format)

    print('Removing xs and *s from seqs...')
    z_removed_peps, contig_vec_no_x = _clean_peptides(seq_vec, contig_vec)

    # Modifying peptides: oxidation of methionine, carbamidomethylation of
    # cysteine, acetylation of N terminal (this one was done upstream).
    print('Modifying peptides...')
    mod_pep = []
    for tryp_pep in z_removed_peps:
        mod_pep.extend(
            pyteomics.parser.isoforms(tryp_pep,
                                      fixed_mods={
                                          'ox': ['M'],
                                          'cam': ['C']
                                      },
                                      show_unmodified_termini=True))

    # Modified amino acid dictionary for mass calculation.
    aa_comp = dict(mass.std_aa_comp)
    aa_comp['Ac-'] = mass.Composition({'C': 2, 'H': 3, 'N': 0, 'O': 1, 'P': 0})
    aa_comp['cam'] = mass.Composition({'C': 2, 'H': 3, 'N': 1, 'O': 1, 'P': 0})
    aa_comp['ox'] = mass.Composition({'O': 1})

    # Calculate peptide isoelectric points, masses, and charge at pH = 7.
    # The isoelectric point and charge are not used from this point on, but
    # were used for examining other predictive components of apparent
    # cofragmentation bias.
    print('Calculating peptide physicochemical properties...')
    iso_electric_points = []
    pep_charges = []
    pep_mass = []
    for peptide in mod_pep:
        iso_electric_points.append(electrochem.pI(peptide))
        pep_charges.append(electrochem.charge(peptide, 7))
        pep_mass.append(mass.calculate_mass(sequence=peptide, aa_comp=aa_comp))

    print('LC-retention time prediction with the following parameters:')

    print(lc_params)

    # Read and validate every numeric parameter before touching BioLCCC.
    numeric = {}
    for param_name, error_message in _NUMERIC_LC_PARAM_ERRORS:
        value = lc_params[param_name][0]
        if not isinstance(value, numbers.Number):
            raise NameError(error_message)
        numeric[param_name] = value

    # biolccc predicting RT times
    myChromoConditions = biolccc.ChromoConditions()

    # The column length in mm.
    myChromoConditions.setColumnLength(numeric['column_length'])

    # The internal column diameter in mm.
    myChromoConditions.setColumnDiameter(numeric['column_diameter'])

    # The average pore size in A.
    myChromoConditions.setColumnPoreSize(numeric['column_pore_size'])

    # The concentration of the eluting solvent (ACN for the reversed
    # phase) in component A in %.
    myChromoConditions.setSecondSolventConcentrationA(
        numeric['second_solvent_concentration_a'])

    # The concentration of the eluting solvent (ACN for the reversed
    # phase) in component B in %.
    myChromoConditions.setSecondSolventConcentrationB(
        numeric['second_solvent_concentration_b'])

    # The shape of the gradient: either linear from gradient_0% to
    # gradient_1% of component B over gradient_2 minutes, or a custom one
    # built from one set point per column of the gradient file.
    if linear_gradient:
        myChromoConditions.setGradient(
            biolccc.Gradient(numeric['gradient_0'], numeric['gradient_1'],
                             numeric['gradient_2']))
    else:
        myGradient = biolccc.Gradient()
        for set_point in range(len(gradient_file.columns)):
            myGradient.addPoint(gradient_file.iloc[0, set_point],
                                gradient_file.iloc[1, set_point])
        myChromoConditions.setGradient(myGradient)

    # The flow rate in ml/min.
    myChromoConditions.setFlowRate(numeric['flow_rate'])

    print('Calculating retention times...')

    # Designating BioLCCC model to use:
    if model_type == 'TFA':
        model_to_use = biolccc.rpAcnTfaChain
    elif model_type == 'FA':
        model_to_use = biolccc.rpAcnFaRod
    else:
        raise NameError(
            "Error in parameter LC input file, model must be 'TFA' or 'FA'.")

    # The progress message is emitted a second time right before the loop,
    # keeping the console output unchanged.
    print('Calculating retention times...')
    peptide_rts = [
        biolccc.calculateRT(tryp_pep, model_to_use, myChromoConditions)
        for tryp_pep in mod_pep
    ]

    # Combining the sequences, times, and physicochemical characteristics.
    peptide_dataframe = pd.concat([
        pd.Series(z_removed_peps, name='peptide_sequence'),
        pd.Series(peptide_rts, name='rts'),
        pd.Series(iso_electric_points, name='iso_point'),
        pd.Series(pep_charges, name='charge'),
        pd.Series(pep_mass, name='mass'),
        pd.Series(contig_vec_no_x, name='contig'),
    ],
                                  axis=1)

    file_name = output_name + '_lc-retention-times.csv'
    peptide_dataframe.to_csv(file_name)
Exemplo n.º 13
0
def openms_modelled_rt(rtfilename, output_name):
    """Attach physicochemical properties to externally modelled retention times.

    rtfilename : text file read with pandas; each row is split at its first
        space into a peptide sequence and a retention time.
    output_name : prefix of the output CSV file name.

    Peptides containing '*', 'X' or 'U' are dropped together with their
    retention time. The rest get the '-OH' terminus appended, B changed to N
    and Z changed to Q, then fixed modifications ('ox' on M, 'cam' on C).
    Sequence, retention time, isoelectric point, charge at pH 7 and mass are
    written to ``<output_name>_lc-retention-times.csv``. Returns nothing.
    """
    seq_rt_df = pd.read_csv(rtfilename, names=['seq_rt'])
    # `n` is passed by keyword: pandas >= 2.0 rejects it as a positional
    # argument of Series.str.split.
    df = pd.DataFrame(seq_rt_df.seq_rt.str.split(' ', n=1).tolist(),
                      columns=['pep_seq', 'rts'])
    seq_vec = df['pep_seq'].tolist()
    all_rts = df['rts'].tolist()

    print('Removing xs and *s from seqs...')

    # Filter sequences and retention times in lockstep so that every kept
    # peptide stays on the same row as its own retention time.
    z_removed_peps = []
    kept_rts = []
    for central_pep, rt in zip(seq_vec, all_rts):
        # Unknown amino acids ('*' or 'X') and selenocysteine ('U').
        if '*' in central_pep or 'X' in central_pep or 'U' in central_pep:
            continue
        # Add the C-terminus, change B to asparagine and Z to glutamine.
        cleaned = (central_pep + '-OH').replace('B', 'N').replace('Z', 'Q')
        z_removed_peps.append(cleaned)
        kept_rts.append(rt)

    # Modifying peptides: oxidation of methionine, carbamidomethylation of
    # cysteine, acetylation of N terminal (this one was done upstream).
    print('Modifying peptides...')
    mod_pep = []
    for tryp_pep in z_removed_peps:
        mod_pep.extend(
            pyteomics.parser.isoforms(tryp_pep,
                                      fixed_mods={
                                          'ox': ['M'],
                                          'cam': ['C']
                                      },
                                      show_unmodified_termini=True))

    # Modified amino acid dictionary for mass calculation.
    aa_comp = dict(mass.std_aa_comp)
    aa_comp['Ac-'] = mass.Composition({'C': 2, 'H': 3, 'N': 0, 'O': 1, 'P': 0})
    aa_comp['cam'] = mass.Composition({'C': 2, 'H': 3, 'N': 1, 'O': 1, 'P': 0})
    aa_comp['ox'] = mass.Composition({'O': 1})

    # Calculate peptide isoelectric points, masses, and charge at pH = 7.
    print('Calculating peptide physicochemical properties...')
    iso_electric_points = []
    pep_charges = []
    pep_mass = []
    for peptide in mod_pep:
        iso_electric_points.append(electrochem.pI(peptide))
        pep_charges.append(electrochem.charge(peptide, 7))
        pep_mass.append(mass.calculate_mass(sequence=peptide, aa_comp=aa_comp))

    # Combining the sequences, times, and physicochemical characteristics.
    peptide_dataframe = pd.concat([
        pd.Series(z_removed_peps, name='peptide_sequence'),
        pd.Series(kept_rts, name='rts'),
        pd.Series(iso_electric_points, name='iso_point'),
        pd.Series(pep_charges, name='charge'),
        pd.Series(pep_mass, name='mass'),
    ],
                                  axis=1)

    file_name = output_name + '_lc-retention-times.csv'
    print(file_name)
    peptide_dataframe.to_csv(file_name)