def Model_2(train, test):
    """Fit a bagged XGBoost classifier and write its predictions to CSV.

    Every sequence is converted into one feature vector: the output of
    ``AAC(x)`` followed by the mass per residue, the value of
    ``electrochem.charge(x, len(x))`` and the isoelectric point reported
    by ``ProteinAnalysis``.

    train : Training set; columns 'Sequence' and ' Label' are read
    test : Test set; columns ' Sequence' and 'ID' are read

    Two files are written: "Submission_2.csv" holds the second column of
    ``predict_proba`` and "Prediction_2.csv" holds the predicted labels.
    """
    def featurize(seq):
        # AAC features plus three scalar descriptors of the sequence.
        return (AAC(seq)
                + [mass.calculate_mass(sequence=seq) / len(seq)]
                + [electrochem.charge(seq, len(seq))]
                + [ProteinAnalysis(seq).isoelectric_point()])

    # Preprocessing
    train_rows = [featurize(seq) for seq in train['Sequence']]
    test_rows = [featurize(seq) for seq in test[' Sequence']]
    train_labels = train[' Label']

    X_train = np.array(train_rows)
    Y_train = np.array(train_labels)
    X_test = np.array(test_rows)
    X_train, Y_train = shuffle(X_train, Y_train, random_state=3)

    # Training
    xgb = XGBClassifier(max_depth=25,
                        objective='reg:logistic',
                        n_estimators=100,
                        booster='gbtree',
                        colsample_bylevel=0.7,
                        colsample_bytree=1,
                        n_thread=2,
                        random_state=3)
    clf = BaggingClassifier(base_estimator=xgb, n_estimators=23,
                            random_state=3, n_jobs=-1)
    clf.fit(X_train, Y_train)

    # Predicting
    Y_prob = [row[1] for row in clf.predict_proba(X_test)]
    Y_pred = clf.predict(X_test)

    # The same frame is reused: first with probabilities, then with labels.
    result = pd.DataFrame()
    result["ID"] = test["ID"]
    for filename, labels in (("Submission_2.csv", Y_prob),
                             ("Prediction_2.csv", Y_pred)):
        result["Label"] = labels
        result.to_csv(filename, index=False)
Пример #2
0
 def total_mz(self):
     """Return the cached total m/z, computing it on first use.

     On a cache miss the peptide masses are summed, one H2 mass is
     subtracted for every peptide beyond the first, the result is divided
     by ``self.total_charge`` and the mass of H is added.
     """
     if self._total_mz is not None:
         return self._total_mz

     mod = (len(self.peptides) - 1) * mass.calculate_mass(formula="H2")
     total_mass = sum(pep.mass for pep in self.peptides) - mod
     per_charge = total_mass / self.total_charge
     self._total_mz = per_charge + mass.calculate_mass(formula="H")
     return self._total_mz
Пример #3
0
def fragments_multi(prot_seq, obs_mass, cal_type, dataframe, tolerance):
    """Find sub-sequences of ``prot_seq`` whose mass matches ``obs_mass``.

    For every start position a window of candidate end positions is
    scanned; a fragment is reported when its mass, rounded to one decimal,
    is within ``tolerance`` (absolute) of ``obs_mass``.

    Parameters
    ----------
    prot_seq : str
        Protein sequence to search.
    obs_mass : float
        Observed mass to match.
    cal_type : str
        'mono' selects monoisotopic masses; any other value selects
        average masses.
    dataframe :
        Unused; kept so existing callers keep working.
    tolerance : float
        Absolute tolerance passed to ``math.isclose``.

    Returns
    -------
    list of list
        One entry per hit:
        ``[label, first_residue, start (1-based), last_residue, end,
        obs_mass, fragment_mass, obs_mass - fragment_mass]``.
        ``label`` is 'Single' when the fragment ends at the last residue
        of ``prot_seq`` and 'Double' otherwise.
    """
    # Monoisotopic vs. average is selected through ``average``.  ``aa_comp``
    # must hold elemental compositions in both modes; the table of residue
    # masses (``std_aa_mass``) is not a valid value for it.
    ave_cal = cal_type != 'mono'
    aa_comp = dict(mass.std_aa_comp)

    found = []
    seq_len = len(prot_seq)
    # Window of candidate fragment lengths (in residues) derived from the
    # observed mass.  Presumably 107 and 95 bound the mean residue mass --
    # TODO confirm.
    min_len = int(obs_mass) // 107
    max_len = int(obs_mass) // 95

    for start in range(seq_len):
        for end in range(min_len + start, max_len + start):
            if end > seq_len:
                break
            if end <= start:
                # An empty slice is not a fragment.
                continue
            # Compute the fragment mass once and reuse it for the match
            # test and for the reported columns.
            frag_mass = round(
                mass.calculate_mass(prot_seq[start:end],
                                    average=ave_cal,
                                    aa_comp=aa_comp), 1)
            if not math.isclose(frag_mass, obs_mass, abs_tol=tolerance):
                continue
            label = 'Single' if end == seq_len else 'Double'
            found.append([
                label, prot_seq[start],
                int(start + 1), prot_seq[end - 1],
                int(end), obs_mass,
                frag_mass,
                round(obs_mass - frag_mass, 1)
            ])

    return found
Пример #4
0
def in_silico_fragmentation(fn):
    """Build theoretical singly-charged b/y fragment lists per peptide.

    ``fn`` is read with ``pandas.read_table`` and must provide the columns
    'Precursor Charge', 'Base Peptide Sequence' and 'Peptide Sequence'.

    Returns a dict keyed by 'Peptide Sequence'.  Each value is a flat list
    of alternating b and y fragment m/z values, or ``[0.0, 0.0]`` when
    ``calc_precursor_theoretical`` could not parse the sequence.  Only the
    first row seen for a given peptide sequence is processed.
    """
    df = pandas.read_table(fn)
    products = {}
    for _, row in df.iterrows():
        xchg = row['Precursor Charge']
        bseq = row['Base Peptide Sequence']
        seq = row['Peptide Sequence']
        # ``in`` works on Python 2 and 3; ``dict.has_key`` is Python 2 only.
        if seq in products:
            continue

        parseq, theomass, theomz = calc_precursor_theoretical(seq, int(xchg))
        if parseq is None:
            products[seq] = [0.0, 0.0]
            continue

        theoSpec = []
        # Only charge 1 is generated; the original author left a note about
        # iterating up to int(xchg / 2) instead.
        for c in [1]:
            # ``range`` instead of the Python-2-only ``xrange``.
            for n in range(1, len(bseq)):
                # parseq is produced with explicit termini, so the b product
                # re-appends the last element and the y product is given an
                # 'H-' N-terminus.
                bproduct = parseq[:n + 1] + [parseq[-1]]
                yproduct = ['H-'] + parseq[n + 1:]

                bp = mass.calculate_mass(parsed_sequence=bproduct,
                                         ion_type='b',
                                         aa_comp=composition,
                                         charge=c)
                yp = mass.calculate_mass(parsed_sequence=yproduct,
                                         ion_type='y',
                                         aa_comp=composition,
                                         charge=c)
                theoSpec.append(bp)
                theoSpec.append(yp)

        products[seq] = theoSpec
    return products
Пример #5
0
def lossConvert(loss, charge):
    """Translate a neutral-loss code into a mass at the given charge.

    '' gives 0, 'n' gives the mass of NH3 and 'o' the mass of H2O (both
    computed with ``charge``).  Any other code falls through and yields
    ``None``.
    """
    if loss == '':
        return 0

    formula = None
    if loss == 'n':
        formula = 'NH3'
    elif loss == 'o':
        formula = 'H2O'

    if formula is None:
        return None
    return massC.calculate_mass(formula=formula, charge=charge)
Пример #6
0
def calc_precursor_theoretical(seq, z):
    """Parse ``seq`` and compute its theoretical mass and m/z.

    Returns ``(parsed_sequence, neutral_mass, mz_at_charge_z)``.  When
    parsing or either mass calculation fails, ``(None, None, None)`` is
    returned instead of raising.
    """
    try:
        parseq = parser.parse(seqModX(seq), labels=modLabels, show_unmodified_termini=True)
        theomass = mass.calculate_mass(parsed_sequence=parseq, aa_comp=composition)
        theomz = mass.calculate_mass(parsed_sequence=parseq, aa_comp=composition, charge=z)
    except Exception:
        # Best-effort: callers test for None.  ``Exception`` (rather than a
        # bare ``except``) lets KeyboardInterrupt/SystemExit propagate.
        return (None, None, None)
    return (parseq, theomass, theomz)
Пример #7
0
def test_calculate_mass(get_mass):
    """Check ``mass.calculate_mass`` on fixed inputs and on the fixture.

    ``get_mass`` is iterated as records whose first item is a sequence
    and whose second item is the expected mass.
    """
    acde_mass = pytest.approx(436.12639936, REL)

    # Same peptide, passed as a string and as a Composition.
    assert mass.calculate_mass("ACDE") == acde_mass
    assert mass.calculate_mass(mass.Composition("ACDE")) == acde_mass
    # Passing the string as an already parsed sequence gives a different value.
    assert mass.calculate_mass(parsed_sequence="ACDE") == pytest.approx(
        418.115834, REL)
    assert mass.calculate_mass("A") == pytest.approx(89.04767846841, REL)

    for record in get_mass:
        sequence, expected = record[0], record[1]
        assert mass.calculate_mass(sequence) == pytest.approx(expected, REL)
Пример #8
0
def calc_precursor_theoretical(seq, z):
    """Return ``(parsed_sequence, mass, m/z at charge z)`` for ``seq``.

    The sequence is converted with ``seqModX`` and parsed with explicit
    termini.  If any step fails the function returns
    ``(None, None, None)`` rather than raising.
    """
    try:
        parseq = parser.parse(seqModX(seq),
                              labels=modLabels,
                              show_unmodified_termini=True)
        theomass = mass.calculate_mass(parsed_sequence=parseq,
                                       aa_comp=composition)
        theomz = mass.calculate_mass(parsed_sequence=parseq,
                                     aa_comp=composition,
                                     charge=z)
        return (parseq, theomass, theomz)
    except Exception:
        # Narrowed from a bare ``except`` so that KeyboardInterrupt and
        # SystemExit are no longer swallowed; ordinary errors still map to
        # the all-None result callers check for.
        return (None, None, None)
Пример #9
0
    def __init__(self, sequence='', z=1, ion='H+', modification=''):
        """
        Constructor.

        Fills ``self.DNA_in_base`` with masses computed from chemical
        formulas and sets ``self.std_aa_mass`` to a table of amino-acid
        residue masses.  None of the arguments is used here.

        NOTE(review): ``self.DNA_in_base`` must already exist when this
        runs (presumably a class-level dict) -- confirm.

        Notes carried over from the original author (intended
        compositions; "make it calculate it based on what type of ion
        it is"):
            PO4H     = P + 4 O + H
            Ribose   = 5 C + 7 H + 2 O
            Adenine  = 5 C + 4 H + 5 N + Ribose
            Cytosine = 4 C + 4 H + 3 N + O + Ribose
            Guanine  = 5 C + 4 H + 5 N + O + Ribose
            Thymine  = 5 C + 5 H + 2 N + O + Ribose
            Uracil   = 4 C + 3 H + 2 N + 2 O + Ribose
        """
        # (key, formula) pairs, in the order they are stored.
        base_formulas = (
            ('A', 'C5H4N5'),
            ('G', 'C5H4N5O'),
            ('C', 'C4H4N3O'),
            ('T', 'C5H5N2O2'),
            ('U', 'C4H3N2O2'),
            ('DeoxyRibose', 'C5H7O2'),
            ('H', 'H'),
            ('H+', 'H+'),
            ('O', 'O'),
            ('P', 'P'),
        )
        for label, formula in base_formulas:
            self.DNA_in_base[label] = mass.calculate_mass(formula=formula)

        self.std_aa_mass = dict(
            G=57.02146,
            A=71.03711,
            S=87.03203,
            P=97.05276,
            V=99.06841,
            T=101.04768,
            C=103.00919,
            L=113.08406,
            I=113.08406,
            N=114.04293,
            D=115.02694,
            Q=128.05858,
            K=128.09496,
            E=129.04259,
            M=131.04049,
            H=137.05891,
            F=147.06841,
            R=156.10111,
            Y=163.06333,
            W=186.07931,
        )
Пример #10
0
 def parse_data(self):
     """Collect the masses of all "standard" losses found in ``self.data``.

     Every rule whose "annotation" equals "standard" contributes the mass
     of each formula in its "losses".  Afterwards ``self.masses`` is
     replaced by a de-duplicated list (order not preserved).
     """
     for group in self.data:
         for rule in self.data[group]:
             if rule["annotation"] != "standard":
                 continue
             self.masses.extend(
                 mass.calculate_mass(formula=loss) for loss in rule["losses"])
     self.masses = list(set(self.masses))
Пример #11
0
 def test_Unimod_mass(self):
     """Each Unimod entry's 'mono_mass' must match its composition.

     The mass recomputed from ``x['composition']`` with the database's
     own mass data has to agree with the stored value to within 1e-5.
     """
     # Open the archive in a ``with`` block so the handle is closed even
     # when an assertion fails; all use of ``db`` stays inside the block.
     with gzip.open('unimod.xml.gz') as source:
         db = mass.Unimod(source)
         for x in db.mods:
             self.assertGreater(
                 0.00001,
                 abs(x['mono_mass'] - mass.calculate_mass(
                     x['composition'], mass_data=db.mass_data)))
Пример #12
0
    def ntps_updateMzBand(self):
        """Refresh the m/z band entry from the current atom ranges.

        ``self.get_atom_range_dict(silent=True)`` supplies a mapping from
        key to a collection of counts.  The masses of the minimum and the
        maximum composition are computed and written, truncated to ints,
        into ``self.NTPS_mz_band_entry`` as 'min-max'.  If the atom ranges
        cannot be obtained the method returns without touching the entry.
        """
        try:
            atom_dict = self.get_atom_range_dict(silent=True)
        except Exception:
            # Best-effort update.  ``Exception`` instead of a bare
            # ``except`` keeps KeyboardInterrupt/SystemExit propagating.
            return

        # ``items()`` works on Python 2 and 3; ``iteritems`` is Python 2 only.
        minMassD = {k: min(v) for k, v in atom_dict.items()}
        maxMassD = {k: max(v) for k, v in atom_dict.items()}

        minMass = mass.calculate_mass(composition=minMassD)
        maxMass = mass.calculate_mass(composition=maxMassD)

        self.NTPS_mz_band_entry.setText('%s-%s' % (int(minMass), int(maxMass)))
Пример #13
0
def write_to_csv(output_mapping_dict):
    """Write all information about hits to "spectra_map.csv".

    ``output_mapping_dict`` maps a chemical formula to a sequence of
    equally long columns.  The data is written column-wise to a temporary
    CSV (formulas, their computed masses, then the zipped values),
    transposed into a second temporary CSV with a header row, re-read with
    pandas to reorder the columns, and finally saved without the index.
    Both temporary files are removed at the end.
    """
    temp1_path = 'spectra_map_temp1.csv'
    temp2_path = 'spectra_map_temp2.csv'

    with open(temp1_path, "w") as temp1_file:
        writer = csv.writer(temp1_file)
        writer.writerow(output_mapping_dict.keys())
        writer.writerow(
            [mass.calculate_mass(formula=key) for key in output_mapping_dict])
        for val in zip(*output_mapping_dict.values()):
            writer.writerow(val)

    # Read back inside a ``with`` block so the handle is closed; the
    # transpose is materialised before the file goes away.
    with open(temp1_path, "rt") as temp1_file:
        transpose = list(zip(*csv.reader(temp1_file)))

    headers = [
        "Antibase Chemical Formula", "Antibase Molecular Weight", "Adduct",
        "Scan/Alignment Number", "RT", "Scan/Alignment M/Z", "PPM"
    ]

    with open(temp2_path, "w") as temp2_file:
        writer2 = csv.writer(temp2_file)
        writer2.writerow(headers)
        writer2.writerows(transpose)

    df = pd.read_csv(temp2_path)
    # Rearrange the columns for the final file.
    df_reorder = df[[
        'Scan/Alignment Number', 'Scan/Alignment M/Z', 'Adduct',
        'Antibase Chemical Formula', 'Antibase Molecular Weight', 'PPM', 'RT'
    ]]
    df_reorder.to_csv('spectra_map.csv', index=False)

    # ``os.remove`` avoids spawning a shell and does not depend on ``rm``.
    os.remove(temp1_path)
    os.remove(temp2_path)
Пример #14
0
def test_annotate_peptide_fragments():
    """All theoretical fragments hidden in a noisy spectrum get annotated.

    For each peptide the theoretical fragment m/z values are jittered by
    less than the tolerance, placed at the start of a 150-peak random
    spectrum, and the number of non-zero annotations is compared with the
    number of fragments.
    """
    fragment_tol_mass, fragment_tol_mode = 0.02, 'Da'
    num_peaks = 150
    charge = 2
    max_jitter = 0.9 * fragment_tol_mass

    for peptide in ('SYELPDGQVITIGNER', 'MFLSFPTTK',
                    'DLYANTVLSGGTTMYPGIADR', 'YLYEIAR',
                    'VAPEEHPVLLTEAPLNPK'):
        theoretical = [
            fragment.calc_mz for fragment in
            spectrum._get_theoretical_peptide_fragments(peptide)
        ]
        fragment_mz = np.asarray(theoretical)
        jitter = np.random.uniform(-max_jitter, max_jitter, len(fragment_mz))
        fragment_mz += jitter

        # Random background peaks; the first ones are overwritten with the
        # jittered fragments.
        mz = np.random.uniform(100, 1400, num_peaks)
        mz[:len(fragment_mz)] = fragment_mz
        intensity = np.random.lognormal(0, 1, num_peaks)

        precursor_mz = mass.calculate_mass(sequence=peptide, charge=charge)
        spec = spectrum.MsmsSpectrum('test_spectrum', precursor_mz, charge,
                                     mz, intensity, peptide=peptide)
        spec.annotate_peptide_fragments(fragment_tol_mass, fragment_tol_mode)
        assert np.count_nonzero(spec.annotation) == len(fragment_mz)
Пример #15
0
def compute_mass_spectrum(sequence, charge=1):
    """Return the b-ion m/z values of ``sequence`` as a numpy array.

    The array has ``len(sequence) - 1`` slots; slot *i* receives the b-ion
    mass of the *i*-th fragment produced by ``b_ionts(sequence)`` at the
    given ``charge``.
    """
    masses = numpy.zeros(len(sequence) - 1)
    slot = 0
    for fragment in b_ionts(sequence):
        masses[slot] = mass.calculate_mass(sequence=fragment,
                                           ion_type='b',
                                           charge=charge)
        slot += 1
    return masses
def main():
    """Compute m/z values for peptides and their two-piece splits.

    Command line: ``<input table> <ppm tolerance>``.  Each entry of the
    "Peptides" column is added together with the prefix/suffix pairs
    obtained by cutting it after 4 to 13 residues.  For every distinct
    sequence the 'M' ion m/z at charges 2, 3 and 4 is stored under
    "<sequence>.<charge>" and the map is passed to
    ``find_resolveable_peptides``.
    """
    input_filename = sys.argv[1]
    ppm_tolerance = float(sys.argv[2])
    line_counts, table_data = ming_fileio_library.parse_table_with_headers(
        input_filename)

    all_sub_peptides = []
    for row in range(line_counts):
        peptide = table_data["Peptides"][row]
        all_sub_peptides.append(peptide)
        for cut in range(4, 14):
            # Prefix and suffix around the cut position.
            all_sub_peptides.extend([peptide[:cut], peptide[cut:]])

    # Drop duplicates.
    all_sub_peptides = list(set(all_sub_peptides))

    peptide_mass_map = {}
    for peptide in all_sub_peptides:
        for charge in (2, 3, 4):
            peptide_key = peptide + "." + str(charge)
            peptide_mass_map[peptide_key] = mass.calculate_mass(
                sequence=peptide, ion_type='M', charge=charge)

    # Determine uniqueness
    find_resolveable_peptides(peptide_mass_map, ppm_tolerance)
Пример #17
0
def calculate_b_y_ion(sequence, ion_charge):
    """Return the b- and y-ion m/z series of ``sequence`` as two tuples.

    Cysteine is given the composition C5 H8 N2 O2 S (presumably the
    carbamidomethylated residue -- confirm).  The b series is ordered by
    growing prefix and the y series by growing suffix, so both run from
    the shortest fragment to the longest.
    """
    aa_comp = dict(mass.std_aa_comp)
    aa_comp['C'] = mass.Composition({'H': 8, 'C': 5, 'S': 1, 'O': 2, 'N': 2})

    b_ion = []
    y_ion = []
    for cut in range(1, len(sequence)):  # cut = number of residues in the prefix
        b_ion.append(mass.calculate_mass(sequence[:cut],
                                         ion_type='b',
                                         charge=ion_charge,
                                         aa_comp=aa_comp))
    for cut in range(1, len(sequence)):
        y_ion.append(mass.calculate_mass(sequence[cut:],
                                         ion_type='y',
                                         charge=ion_charge,
                                         aa_comp=aa_comp))

    # y ions were collected longest-first; report them from small to big.
    return (tuple(b_ion), tuple(y_ion[::-1]))
Пример #18
0
def mass_diff(amino_acid, mass):
    """Return ``mass`` minus the unmodified mass of ``amino_acid``.

    The unmodified mass is computed from the composition of a parsed
    sequence containing only ``amino_acid``.

    >>> round(mass_diff("M", 147.04), 2)
    16.0
    """
    residue = Composition(parsed_sequence=[amino_acid])
    return mass - calculate_mass(composition=residue)
def main():
    """Build a charge-state m/z map for peptides and their halves.

    Reads the table named by ``sys.argv[1]``; ``sys.argv[2]`` is the ppm
    tolerance.  Every peptide is kept as-is and also split into a prefix
    and a suffix at each cut position from 4 to 13.  The unique sequences
    are mapped to their 'M' ion m/z at charges 2, 3 and 4 (keys
    "<sequence>.2", ".3", ".4"), then handed to
    ``find_resolveable_peptides``.
    """
    input_filename = sys.argv[1]
    ppm_tolerance = float(sys.argv[2])
    line_counts, table_data = ming_fileio_library.parse_table_with_headers(input_filename)

    collected = []
    for index in range(line_counts):
        peptide = table_data["Peptides"][index]
        collected.append(peptide)
        for offset in range(10):
            split_at = offset + 4
            collected += [peptide[:split_at], peptide[split_at:]]

    unique_peptides = list(set(collected))

    peptide_mass_map = {}
    for peptide in unique_peptides:
        for z in (2, 3, 4):
            key = peptide + ".%d" % z
            peptide_mass_map[key] = mass.calculate_mass(sequence=peptide, ion_type='M', charge=z)

    # Determine uniqueness
    find_resolveable_peptides(peptide_mass_map, ppm_tolerance)
def get_peptide_data(peptide):
    """Return sequence, parsed sequence and mass for ``peptide``.

    The result is a dict with the keys 'sequence', 'parsed_sequence' and
    'mass'.
    """
    # Keep the termini in the parsed form; they are needed for the mass.
    parsed = parser.parse(peptide, show_unmodified_termini=True)
    return {
        'sequence': peptide,
        'parsed_sequence': parsed,
        'mass': mass.calculate_mass(parsed),
    }
Пример #21
0
 def get_seq_mass(self, start=None, end=None, term="n", **kwds):
     """Return the mass of a slice of ``self.sequence``.

     ``start``, ``end`` and ``term`` select the slice; remaining keyword
     arguments (average, ion_type, ...) are forwarded to
     ``calculate_mass``.  'ion_comp' in ``kwds`` is always overwritten
     with ``ION_COMP``.  Residue and terminal modification masses are
     added on top of the composition mass.
     """
     region = _Slice(self.sequence, start, end, term)
     residues = list(self.sub_sequence(slice=region))
     residue_comp = Composition(residues)
     # Original author's open question: why is adding H2O needed, and is
     # there a more pyteomics way of doing this?
     kwds['ion_comp'] = ION_COMP
     total = calculate_mass(composition=residue_comp, **kwds)
     total = self.__add_res_mod_masses(total, region, term, **kwds)
     return self.__add_term_mod_mass(total, region, term, **kwds)
Пример #22
0
def main():
    """Print the m/z of every peptide substring at charges 2, 3 and 4.

    Reads the table named by ``sys.argv[1]``.  For each entry of the
    "Peptides" column, ``find_all_substring_of_length`` is called for the
    lengths 4 to 13.  One tab-separated line
    ``<substring> <charge> <m/z>`` is printed per distinct substring and
    charge.
    """
    input_filename = sys.argv[1]
    line_counts, table_data = ming_fileio_library.parse_table_with_headers(input_filename)

    all_sub_peptides = []

    for i in range(line_counts):
        # The peptide does not depend on the length, so look it up once.
        peptide = table_data["Peptides"][i]
        for length in range(10):
            all_sub_peptides += find_all_substring_of_length(peptide, length + 4)

    all_sub_peptides = list(set(all_sub_peptides))

    for peptide in all_sub_peptides:
        for charge in (2, 3, 4):
            # A parenthesised single argument prints identically on
            # Python 2 and Python 3.
            print(peptide + "\t" + str(charge) + "\t" + str(
                mass.calculate_mass(sequence=peptide, ion_type='M', charge=charge)))
Пример #23
0
def _get_theoretical_peptide_fragments(peptide: str, types: str = 'by',
                                       max_charge: int = 1):
    """
    Get theoretical fragments for the given peptide.

    Parameters
    ----------
    peptide : str
        The peptide sequence for which the fragments will be generated.
    types : str, optional
        The fragment type. Can be any combination of 'a', 'b', 'c', 'x', 'y',
        and 'z' (the default is 'by', which means that b-ions and y-ions will
        be generated).
    max_charge : int, optional
        All fragments up to and including the given charge will be generated
        (the default is 1 to only generate singly-charged fragments).

    Returns
    -------
        A list of all fragments as (`FragmentAnnotation`, m/z) tuples sorted in
        ascending m/z order.
    """
    ions = []
    amino_acids = parser.parse(peptide)
    num_residues = len(amino_acids)
    for i in range(1, num_residues):
        for ion_type in types:
            for charge in range(1, max_charge + 1):
                if ion_type in 'abc':
                    # N-terminal fragment made of the first i residues.
                    ion_index = i
                    fragment = amino_acids[:i]
                else:
                    # C-terminal fragment.  Its index is the number of
                    # residues it contains; counting characters of the raw
                    # string would be wrong when a residue label is longer
                    # than one character.
                    ion_index = num_residues - i
                    fragment = amino_acids[i:]
                ions.append((
                    FragmentAnnotation(ion_type, ion_index, charge),
                    mass.calculate_mass(sequence=''.join(fragment),
                                        ion_type=ion_type,
                                        charge=charge)))
    return sorted(ions, key=operator.itemgetter(1))
Пример #24
0
def getCIDFragmentIons(sequence, charge):
    """
    Generate CID fragments for a given peptide sequence and charge,
    and calculates their monoisotopic m/z values.

    First, all possible b and y ion fragments are generated.
    Then, the monoisotopic m/z values are calculated for the given charge.

    This method makes use of the pyteomics package to compute the
    monoisotopic m/z values. For more information, please refer to:
    https://pythonhosted.org/pyteomics/mass.html

    Parameters
    ----------
    sequence : str
        The peptide which will be fragmented.
    charge: int
        The charge of the b and y ions that will be computed.

    Returns
    -------
    yFragmentMasses : ndarray
        A numpy array containing the monoisotopic m/z values of the y ion fragments.
    bFragmentMasses : ndarray
        A numpy array containing the monoisotopic m/z values of the b ion fragments.
    """

    # generate y and b fragment sequences in a list (suffixes from longest
    # to shortest, prefixes from shortest to longest)
    yFragments = [sequence[i:] for i in range(len(sequence))]
    bFragments = [sequence[:i + 1] for i in range(len(sequence))]

    # calculate masses for sequences in y/b-lists.  The builtin ``float`` is
    # used as dtype: the ``np.float`` alias was removed from NumPy.
    yFragmentMasses = np.fromiter(
        (mass.calculate_mass(sequence=yIon, ion_type='y', charge=charge)
         for yIon in yFragments), float)
    bFragmentMasses = np.fromiter(
        (mass.calculate_mass(sequence=bIon, ion_type='b', charge=charge)
         for bIon in bFragments), float)

    return yFragmentMasses, bFragmentMasses
Пример #25
0
    def __init__(self):
        """
        Constructor.

        Starts with an empty ``self.modifications`` list and fills
        ``self.my_mods`` with the mass shift of each known modification
        label: positive for additions, negative for losses.  '-dHA' and
        '+thio' use fixed integer shifts.
        """

        def gain(formula):
            # Mass added by a group with this formula.
            return mass.calculate_mass(formula=formula)

        def loss(formula):
            # Mass removed when a group with this formula is lost.
            return -mass.calculate_mass(formula=formula)

        self.modifications = []

        self.my_mods = {
            '': 0,
            '+BS3': gain('C8H10O2'),
            'BS3x2': gain('C16H20O4'),
            '-H2O': loss('H2O'),
            '-NH3': loss('NH3'),
            'S-S': loss('H2'),
            'S-Sx2': loss('H4'),
            'S-Sx3': loss('H6'),
            '-H20x2': loss('H4O2'),
            '-H20x3': loss('H6O3'),
            '-H2O-NH3': loss('H5ON'),
            '-dHA': -34,
            '+thio': +32,
        }
Пример #26
0
def _create_mgf_entry(peptide, charge=2):
    """Create a MassIVE-KB style MGF entry for a single PSM.

    Fragment peaks are the b and y ion m/z values for every split position
    of the peptide at charge states 1 up to, but not including, ``charge``;
    each peak is written with intensity 1.

    Parameters
    ----------
    peptide : str
        A peptide sequence.
    charge : int, optional
        The peptide charge state.

    Returns
    -------
    str
        The PSM entry in an MGF file format.
    """
    precursor_mz = calculate_mass(peptide, charge=int(charge))

    peaks = []
    for n_res in range(1, len(peptide) + 1):
        prefix = peptide[:n_res]
        suffix = peptide[n_res - 1:]
        for z in range(1, charge):
            peaks.append(str(calculate_mass(prefix, charge=z, ion_type="b")))
            peaks.append(str(calculate_mass(suffix, charge=z, ion_type="y")))

    # One "<m/z> 1" line per peak.
    frag_string = " 1\n".join(peaks) + " 1"

    lines = ["BEGIN IONS"]
    lines.append(f"SEQ={peptide}")
    lines.append(f"PEPMASS={precursor_mz}")
    lines.append(f"CHARGE={charge}+")
    lines.append(frag_string)
    lines.append("END IONS")
    return "\n".join(lines)
Пример #27
0
def read_compounds(filename,
                   separator="\t",
                   calculate=True,
                   lib_adducts=[],
                   filename_atoms=""):
    """Read a compound table and return one record per usable row.

    Parameters
    ----------
    filename :
        Table readable by ``read_csv``.  The columns ``molecular_formula``,
        ``compound_id`` and ``compound_name`` are read; ``exact_mass`` is
        read when ``calculate`` is false; ``retention_time`` (or ``rt``)
        and ``adduct`` are used when present.
    separator : str
        Column separator passed to ``read_csv``.
    calculate : bool
        When true, the exact mass is computed from the formula with the
        bundled NIST table (rounded to 6 decimals); otherwise the table's
        ``exact_mass`` column is used.
    lib_adducts :
        Optional adduct library; when truthy it must expose
        ``lib[adduct]["mass"]``, which is added to computed masses.  The
        default is never mutated here.
    filename_atoms :
        Unused; kept for interface compatibility.

    Returns
    -------
    list of collections.OrderedDict
        Rows whose formula yields an empty composition are skipped and a
        warning is issued for each of them.
    """
    import warnings

    if calculate:
        path_nist_database = os.path.join(
            os.path.dirname(os.path.abspath(__file__)), 'data',
            'nist_database.txt')
        nist_database = nist_database_to_pyteomics(path_nist_database)

    df = read_csv(filename, sep=separator, float_precision="round_trip")
    records = []
    for index, row in df.iterrows():
        formula = str(row.molecular_formula)
        record = collections.OrderedDict()
        comp = pyteomics_mass.Composition(formula)
        if not comp:
            # The original built a ``Warning`` object and discarded it, so
            # skipped rows went unnoticed; actually emit the warning.
            warnings.warn("{} Skipped".format(row))
            continue

        record["composition"] = collections.OrderedDict(
            (k, comp[k]) for k in order_composition_by_hill(comp.keys()))
        sum_CHNOPS = sum(
            [comp[e] for e in comp if e in ["C", "H", "N", "O", "P", "S"]])
        # True when the formula consists of C, H, N, O, P and S only.
        record["CHNOPS"] = sum_CHNOPS == sum(list(comp.values()))
        if calculate:
            record["exact_mass"] = round(
                pyteomics_mass.calculate_mass(formula=formula,
                                              mass_data=nist_database), 6)
        else:
            record["exact_mass"] = float(row.exact_mass)

        record["compound_id"] = row.compound_id
        record["compound_name"] = row.compound_name
        # Parsed again, as in the original implementation.
        # NOTE(review): ``comp`` could probably be reused here -- confirm
        # that the helpers above do not modify it.
        comp = pyteomics_mass.Composition(formula)
        record["molecular_formula"] = composition_to_string(comp)

        if "retention_time" in df.columns:
            record["retention_time"] = row.retention_time
        elif "rt" in df.columns:
            record["retention_time"] = row.rt
        if "adduct" in df.columns:
            record["adduct"] = row.adduct
            if lib_adducts and calculate:
                record["exact_mass"] += lib_adducts.lib[row.adduct]["mass"]

        records.append(record)

    return records
Пример #28
0
def in_silico_fragmentation(fn):
    """Compute theoretical b/y fragment m/z lists for a table of peptides.

    The table ``fn`` is loaded with ``pandas.read_table``; the columns
    'Precursor Charge', 'Base Peptide Sequence' and 'Peptide Sequence' are
    used.  The result maps each distinct 'Peptide Sequence' to a list of
    charge-1 fragment m/z values (b, y, b, y, ...).  Sequences that
    ``calc_precursor_theoretical`` cannot handle map to ``[0.0, 0.0]``.
    """
    df = pandas.read_table(fn)
    products = {}
    for i, x in df.iterrows():
        xchg = x['Precursor Charge']
        bseq = x['Base Peptide Sequence']
        seq = x['Peptide Sequence']
        # Membership test that works on Python 2 and 3 (``has_key`` was
        # removed in Python 3).
        if seq not in products:
            parseq, theomass, theomz = calc_precursor_theoretical(
                seq, int(xchg))
            if parseq is None:
                products[seq] = [0.0, 0.0]
                continue

            theoSpec = []
            # Fragment charge is fixed to 1; a range up to int(xchg / 2)
            # was considered by the original author but left disabled.
            for c in [1]:
                # ``range`` replaces the Python-2-only ``xrange``.
                for n in range(1, len(bseq)):
                    # Prefix plus the final parsed element for the b ion;
                    # 'H-' plus the remaining elements for the y ion.
                    bproduct = parseq[:n + 1] + [parseq[-1]]
                    yproduct = ['H-'] + parseq[n + 1:]

                    bp = mass.calculate_mass(parsed_sequence=bproduct,
                                             ion_type='b',
                                             aa_comp=composition,
                                             charge=c)
                    yp = mass.calculate_mass(parsed_sequence=yproduct,
                                             ion_type='y',
                                             aa_comp=composition,
                                             charge=c)
                    theoSpec.append(bp)
                    theoSpec.append(yp)

            products[seq] = theoSpec
    return products
Пример #29
0
def getCIDFragmentIons(sequence,charge):
    """
    Generate CID fragments for a given peptide sequence and charge,
    and calculates their monoisotopic m/z values.

    First, all possible b and y ion fragments are generated.
    Then, the monoisotopic m/z values are calculated for the given charge.

    This method makes use of the pyteomics package to compute the
    monoisotopic m/z values. For more information, please refer to:
    https://pythonhosted.org/pyteomics/mass.html

    Parameters
    ----------
    sequence : str
        The peptide which will be fragmented.
    charge: int
        The charge of the b and y ions that will be computed.

    Returns
    -------
    yFragmentMasses : ndarray
        A numpy array containing the monoisotopic m/z values of the y ion fragments.
    bFragmentMasses : ndarray
        A numpy array containing the monoisotopic m/z values of the b ion fragments.
    """

    # generate y and b fragment sequences in a list
    yFragments = [sequence[i:] for i in range(len(sequence))]
    bFragments = [sequence[:i+1] for i in range(len(sequence))]

    # calculate masses for sequences in y/b-lists; dtype is the builtin
    # ``float`` because the ``np.float`` alias no longer exists in NumPy.
    yFragmentMasses = np.fromiter(
        (mass.calculate_mass(sequence=yIon, ion_type='y', charge=charge)
         for yIon in yFragments), float)
    bFragmentMasses = np.fromiter(
        (mass.calculate_mass(sequence=bIon, ion_type='b', charge=charge)
         for bIon in bFragments), float)

    return yFragmentMasses, bFragmentMasses
Пример #30
0
def createLinks(antiBase_map):
    """Build a list of link dicts from an Antibase hit mapping.

    ``antiBase_map`` maps a chemical formula to indexable columns; index 0
    is read as adduct type, 1 as scan/alignment number, 3 as m/z and 4 as
    the value whose scaled reciprocal becomes the link weight.  One dict
    with the keys "source", "target" and "value" is produced per entry of
    column 1.
    """
    links_list = []
    for formula in antiBase_map.keys():
        columns = antiBase_map[formula]
        for j in range(0, len(columns[1])):
            source = ("Antibase Chem Formula: " + formula + " , "
                      + "Antibase MW: "
                      + str(mass.calculate_mass(formula=formula)))
            target = ("Scan/alignment num: " + str(columns[1][j]) + " , "
                      + "M/Z: " + str(columns[3][j]) + " , "
                      + "Adduct type: " + str(columns[0][j]))
            links_list.append({
                "source": source,
                "target": target,
                "value": 1 / (columns[4][j] * 10**7),
            })
    return links_list
Пример #31
0
def Model_1(train, test):
    """Fit a bagged random-forest classifier and write predictions to CSV.

    A sequence's feature vector is ``AAC(x) + DPC(x)`` followed by its
    mass per residue, ``electrochem.charge(x, len(x))`` and the
    isoelectric point from ``ProteinAnalysis``.

    train : Training set; columns 'Sequence' and ' Label' are read
    test : Test set; columns ' Sequence' and 'ID' are read

    "Submission_1.csv" receives the second column of ``predict_proba`` and
    "Prediction_1.csv" the predicted labels.
    """
    def featurize(seq):
        # AAC and DPC features plus three scalar descriptors.
        return (AAC(seq) + DPC(seq)
                + [mass.calculate_mass(sequence=seq) / len(seq)]
                + [electrochem.charge(seq, len(seq))]
                + [ProteinAnalysis(seq).isoelectric_point()])

    # Preprocessing
    X_train = [featurize(seq) for seq in train['Sequence']]
    X_test = [featurize(seq) for seq in test[' Sequence']]
    Y_train = train[' Label']

    # Training
    forest = RandomForestClassifier(random_state=2)
    clf = BaggingClassifier(base_estimator=forest, n_estimators=100,
                            random_state=2, n_jobs=-1)
    clf.fit(X_train, Y_train)

    # Predicting
    Y_prob = [row[1] for row in clf.predict_proba(X_test)]
    Y_pred = clf.predict(X_test)

    # One frame, written twice: probabilities first, then labels.
    result = pd.DataFrame()
    result["ID"] = test["ID"]
    for filename, labels in (("Submission_1.csv", Y_prob),
                             ("Prediction_1.csv", Y_pred)):
        result["Label"] = labels
        result.to_csv(filename, index=False)
Пример #32
0
def process_stack(stack, topN=None):
    """Turn one record (7 header lines + fragment lines) into transition rows.

    The first seven entries of ``stack`` are header lines; header lines 0, 1
    and 2 are split on ": " to get the sum formula, the integer charge and
    the float retention time. Every remaining entry is a tab-separated
    "<mz>\\t<intensity>" fragment line.

    The module-level counters ``cnt`` (one per fragment) and ``cnt_trgr``
    (one per call) are incremented as a side effect, and ``stack`` is printed.

    Args:
        stack: sequence of text lines, more than 7 long.
        topN: if truthy, keep only the ``topN`` most intense fragments.

    Returns:
        A list of 14-element rows sorted by intensity, highest first.
    """
    global cnt, cnt_trgr
    assert len(stack) > 7
    print(stack)

    header, fragment_lines = stack[:7], stack[7:]

    # These three values are not used below; they are still parsed so that a
    # malformed header fails at the same point as before.
    _nr_samples = stack[3].split(":")[1].split("/")[0]
    _nr_isotopes = stack[5].split(":")[1]
    _nr_spectra = stack[6].split(":")[1]

    formula = header[0].split(": ")[1]
    z = int(header[1].split(": ")[1])
    retention_time = float(header[2].split(": ")[1])
    precursor = abs(mass.calculate_mass(formula=formula, charge=z))

    rows = []
    for line in fragment_lines:
        mz_text, intensity = line.split("\t")
        fragment_mz = float(mz_text)
        rows.append([
            precursor,
            fragment_mz,
            retention_time,
            "%s_%s"  % (cnt, formula),
            -1,
            intensity,
            "%s_%s"  % (cnt_trgr, formula),  # transition_group_id
            1,  # decoy
            "",  # sum-formula
            "",  # protein name
            formula,
            formula,
            z,
            "light",
        ])
        cnt += 1

    cnt_trgr += 1

    # Most intense first; intensity is kept as text in the row and only
    # converted for the comparison.
    ranked = sorted(rows, key=lambda row: float(row[5]), reverse=True)
    return ranked[:topN] if topN else ranked
Пример #33
0
 def loadDB(self, peptideList, minlen=1, maxlen=100):
     '''Insert peptides into the peptide_fragment table.

     Should take a list such as that generated by ms1pep.digestpeptidedb():
     each entry is a dict with the keys "sequence", "start", "end" and
     "proteinID". Entries whose sequence length lies within
     [minlen, maxlen] are inserted together with their calculated mass.

     A failure on one entry (missing key, mass calculation or insert error)
     is reported through warnings.warn and does not stop the upload. The
     number of inserted peptides is reported the same way at the end.
     '''
     sql='''INSERT INTO peptide_fragment (protein_accession,fragment_database_id, fragment_sequence, fragment_start, fragment_end, fragment_mono_mass) values (%s,%s,%s,%s,%s,%s)'''
     required=("sequence", "start", "end", "proteinID")
     with database.ConnectMySQL(self.host, self.user, self.password,self.database) as sqlCon:
         cursor=sqlCon.cursor
         peps=0
         for p in peptideList:
             try:
                 # Explicit check instead of assert: asserts vanish under -O.
                 missing=[k for k in required if k not in p]
                 if missing:
                     raise KeyError("missing field(s): %s"%", ".join(missing))
                 mr=mass.calculate_mass(p['sequence'])
                 if minlen<=len(p['sequence'])<=maxlen:
                     cursor.execute(sql, (p['proteinID'],self.dbid, p['sequence'],p['start'],p['end'],mr))
                     peps=peps+1
             except Exception as e:
                 # .get() so that reporting an entry with a missing key cannot
                 # itself raise KeyError from inside the handler.
                 warnings.warn("error including sequence %s::%s : %s"%(p.get('proteinID'), p.get('sequence'), e))
         warnings.warn("Uploaded %s peptides to database %s"%(peps, self.dbtag))
Пример #34
0
    def __init__(self, peptide, charge, mods_mass):
        """Set up the target value, its ppm window and empty result containers.

        Args:
            peptide: peptide sequence handed to ``mass.calculate_mass``.
            charge: charge state; used both in the mass call and as the
                divisor for ``mods_mass``.
            mods_mass: total modification mass; converted with ``float``.
        """
        self.peptide = peptide
        self.charge = charge

        # The charge is passed to calculate_mass and the modification mass is
        # divided by the same charge, so the target is presumably an m/z
        # rather than a neutral mass -- confirm against pyteomics docs.
        unmodified = mass.calculate_mass(
            sequence=self.peptide, ion_type='M', charge=charge)
        self.target = unmodified + float(mods_mass) / float(charge)

        # Symmetric window of options.ppm parts-per-million around the target.
        half_window = self.target / 1000000 * options.ppm
        self.targetLL = self.target - half_window
        self.targetHL = self.target + half_window

        self.targetIntensityDIct = {}
        self.targetScanCounter = {}
Пример #35
0
def test_annotate_peaks_most_intense():
    """Of two peaks 0.01 Da either side of a fragment, only the more intense one is annotated."""
    peptide = 'YLYEIAR'
    charge = 2
    tol_mass, tol_mode = 0.02, 'Da'

    theoretical = spectrum._get_theoretical_peptide_fragments(peptide)
    fragment_mz = np.asarray([frag_mz for _, frag_mz in theoretical])

    # Two peaks placed symmetrically around the first theoretical fragment;
    # the second one has the higher intensity.
    first = fragment_mz[0]
    peak_mz = np.asarray([first - 0.01, first + 0.01])
    peak_intensity = np.asarray([10, 20])

    precursor_mz = mass.calculate_mass(sequence=peptide, charge=charge)
    spec = spectrum.MsmsSpectrum('test_spectrum', precursor_mz, charge,
                                 peak_mz, peak_intensity, peptide=peptide)
    spec.annotate_peaks(tol_mass, tol_mode, peak_assignment='most_intense')

    assert spec.annotation[0] is None
    assert spec.annotation[1] is not None
Пример #36
0
    def create_search_space_peptide_fragments(self, fragments, charges,
                                              modifications):
        """
        Tabulate the calculated mass value of every fragment at every charge.

        Args:
            fragments: An iterable of Fragment objects; ``sequence`` and
                ``ion`` are read from each, and ``ion[0]`` is used as the
                ion type for the mass calculation
            charges: The charges to consider
            modifications: The modifications to consider (a dictionary with
                the mod name as key and the affected residue(s) or positions
                as value). Currently ignored.

        Returns:
            A pandas dataframe with one row per (fragment, charge) pair and
            the columns Sequence, Ion, Charge, Mass_Theor, Modifications.
            The Modifications column is always the empty string.

        """
        # TODO implement modifications!

        def row_for(frag, z):
            # Mass first, then the descriptive fields, as one table row.
            theoretical = mass.calculate_mass(sequence=frag.sequence,
                                              ion_type=frag.ion[0],
                                              charge=z)
            return [frag.sequence, frag.ion, z, theoretical, '']

        rows = [row_for(frag, z) for frag in fragments for z in charges]

        return pd.DataFrame(
            rows,
            columns=['Sequence', 'Ion', 'Charge', 'Mass_Theor',
                     'Modifications'])
def get_frag_mz(one_hot, ion_position, ion_type, ion_charge):
    """Return the calculated m/z of one b- or y-fragment of an encoded peptide.

    The peptide is decoded with ``reverse_one_hot_encode``; a 'b' ion takes
    the first ``ion_position`` characters and a 'y' ion the last
    ``ion_position`` characters. Every key of ``dumb_reversal`` found in the
    fragment is then replaced by its value before the mass calculation.

    Args:
        one_hot: encoded peptide, passed to ``reverse_one_hot_encode``.
        ion_position: number of residues in the fragment.
        ion_type: ``'b'`` or ``'y'``.
        ion_charge: fragment charge; converted with ``int``.

    Returns:
        The value returned by ``mass.calculate_mass`` for the fragment.

    Raises:
        ValueError: if ``ion_type`` is neither ``'b'`` nor ``'y'``.
    """
    pep_seq = reverse_one_hot_encode(one_hot, amino_acid_modified_codes)

    if ion_type == 'b':
        ion_seq = pep_seq[:ion_position]
    elif ion_type == 'y':
        # pep_seq[-0:] is the whole peptide, so position 0 needs its own
        # branch to yield an empty fragment like the 'b' case does.
        ion_seq = pep_seq[-ion_position:] if ion_position else pep_seq[:0]
    else:
        raise ValueError(
            "Unsupported ion_type %r (expected 'b' or 'y')" % (ion_type,))

    # TODO: Figure out how to use aacomp for modified residues instead of
    # substituting characters (an earlier attempt counted "!" and added a
    # fixed mass per occurrence).
    for key, value in dumb_reversal.items():
        ion_seq = ion_seq.replace(key, value)

    return mass.calculate_mass(sequence=ion_seq,
                               ion_type=ion_type,
                               charge=int(ion_charge),
                               aa_comp=aa_comp)
Пример #38
0
def test_annotate_peaks_nearest_mz():
    """Of two peaks near a fragment, only the one closest in m/z is annotated."""
    peptide = 'YLYEIAR'
    charge = 2
    tol_mass, tol_mode = 0.02, 'Da'

    theoretical = spectrum._get_theoretical_peptide_fragments(peptide)
    fragment_mz = np.asarray([frag.calc_mz for frag in theoretical])

    # First peak is 0.005 below the first theoretical fragment, second peak
    # 0.015 above it; the more distant peak is the more intense one.
    first = fragment_mz[0]
    peak_mz = np.asarray([first - 0.005, first + 0.015])
    peak_intensity = np.asarray([10, 20])

    precursor_mz = mass.calculate_mass(sequence=peptide, charge=charge)
    spec = spectrum.MsmsSpectrum('test_spectrum', precursor_mz, charge,
                                 peak_mz, peak_intensity, peptide=peptide)
    spec.annotate_peaks(tol_mass, tol_mode, peak_assignment='nearest_mz')

    expected = spectrum.FragmentAnnotation('b', 1, 1, fragment_mz[0])
    assert spec.annotation[0] == expected
    assert spec.annotation[1] is None
Пример #39
0
def listmz(peptide, charges=[2,3,4], fixedmods={},modifications=[]):
  ''' Calculates the mz values for a given peptide with modifications for each of the charges listed in charges
    Default is to calculate 2+, 3+ and 4+
    listmz(peptide, charges=[2,3,4], fixedmods={"C": 56.0987}, modifications=['3 Phospho (STY)'])

    peptide       : peptide sequence
    charges       : charge states to report; one value is returned per entry
    fixedmods     : maps a residue letter to a mass added for every occurrence
                    of that residue in the peptide
    modifications : strings of the form "<position> <label> (<residues>)" with
                    a 1-based position; the delta mass of <label> is added when
                    the residue at that position is one of <residues>. Entries
                    that do not parse, point outside the peptide, or name a
                    non-matching residue are ignored.

    Returns a list with hmass + mass/charge for every charge, in the order given.
    The default arguments are never mutated here.
    '''
  hmass=float(Unimod.unimod.database.get_element('H')['mono_mass'])
  total=float(mass.calculate_mass(peptide))
  for p in modifications:
      m=re.match(r'(\d+) +(.*) +\(([^\)]*)\) *$', p)
      if not m:
          continue
      pos=int(m.group(1))
      # Positions are 1-based. Position 0 would silently wrap to the last
      # residue through peptide[-1] and a position past the end would raise
      # IndexError, so both are skipped.
      if pos < 1 or pos > len(peptide):
          continue
      label=m.group(2)
      aa=m.group(3)
      if peptide[pos-1] in aa:
          total = total + float(Unimod.unimod.database.get_label(label)['delta_mono_mass'])
  for k in fixedmods.keys():
      for a in peptide:
          if a==k:
              total = total + float(fixedmods[k])
  return [hmass+(total/c) for c in charges]
        sys.exit()

inputfile01 = open(input_file, "r")
# outputfile1 = open(output_file,'w')

from pyteomics import parser
from pyteomics import mass


# gene_list = ['SAA1']
# gene_list = open(gene_list,'r')
counter = 0
errcounter = 0
pepinput = "MALTSEYWIILR"
ps0 = parser.parse(pepinput, show_unmodified_termini=True)
referencemass = mass.calculate_mass(parsed_sequence=ps0)
mass_tolerance = 7  # unit: ppm
targetmass = 1422.730378
total_pep_list = []
for num, x in enumerate(SeqIO.parse(inputfile01, "fasta")):
    if num % 10000 == 0:
        print num
    # if num > 5000:
    #    break
    pro = str(x.seq)
    peplist = digest(pro, enzyme, missed_cleavage, min_pep_length, max_pep_length)
    if len(peplist) > 0:
        for p in peplist:
            total_pep_list.append(p)
sort_list = list(set(total_pep_list))
for num1, pep in enumerate(sort_list):