Пример #1
0
 def parse_mz_id(self):
     """
     Read the mzid file at ``self.path`` and store fragment ions per scan and rank.

     For every entry yielded by the reader, the first ``max_rank`` (2)
     identification items are visited. The ``IonType`` entries of each item
     are converted to ``Fragment`` objects and stored as
     ``self.scan1.data[scan_id][pos]``, where ``pos`` is the 1-based position
     of the item inside the ``SpectrumIdentificationItem`` list.

     returns:
     None
     """
     max_rank = 2
     with mzid.read(self.path) as data:
         for d in data:
             scan_id = parse_scan_id(d['spectrum title'])
             items = d['SpectrumIdentificationItem']
             for pos, item in enumerate(items[:max_rank], start=1):
                 # A fresh list per rank: sharing one list across ranks made
                 # every rank entry point at the same, merged fragment list.
                 fragments = []
                 for fragmentation in item['IonType']:
                     # Reset per ion type so values never leak over from the
                     # previous ion type when an array is missing.
                     mz = None
                     error = None
                     for f in fragmentation['FragmentArray']:
                         if f['measure_ref'] == 'm_mz':
                             mz = f['values']
                         elif f['measure_ref'] == 'm_error':
                             error = f['values']
                     fragments.append(Fragment(name=fragmentation['name'],
                                               indice=fragmentation['index'],
                                               charge=fragmentation['charge'],
                                               mz=mz,
                                               error=error,
                                               scanid=scan_id))
                 if scan_id not in self.scan1.data:
                     self.scan1.data[scan_id] = {}
                 self.scan1.data[scan_id][pos] = fragments
Пример #2
0
    def parse_mz_id(self):
        """
        Read the mzid file at ``self.path`` and collect identifications.

        For each entry yielded by the reader, the first identification item
        is stored under key 1 and, when more than one item is present, the
        second item is stored under key 2. The resulting dict is appended to
        ``self.identifications``.

        returns:
        None
        """
        for spectrum in mzid.read(self.path):
            scan_title = parse_scan_id(spectrum['spectrum title'])
            items = spectrum['SpectrumIdentificationItem']
            # Position 0 is always read; position 1 only when it exists.
            positions = (0, 1) if len(items) > 1 else (0,)

            by_rank = {}
            for position in positions:
                item = items[position]
                peptide_ref = item['peptide_ref']
                by_rank[position + 1] = Identification(
                    mzid_info_lvl_fragmentation=item['IonType'],
                    peptide_ref=peptide_ref,
                    title=scan_title)

            self.identifications.append(by_rank)
    def testPyteomicsParsing(self):
        """Count identification items by their ``passThreshold`` flag in the test file."""
        passing = 0
        failing = 0

        # Layout of each entry yielded by the reader, as used below:
        #   spectrumID
        #   spectrum title
        #   SpectrumIdentificationItem (list) of dicts with
        #     - Mascot:score
        #     - Mascot:identify threshold
        #     - Scaffold:Peptide Probability
        #     - chargeState
        #     - experimentalMassToCharge
        #     - passThreshold (bool)
        #     - peptide_ref (str = sequence)
        #     - rank (int)
        for result in mzid.read(self.testfile):
            for item in result["SpectrumIdentificationItem"]:
                if not item["passThreshold"]:
                    failing += 1
                else:
                    passing += 1

        self.assertEqual(6573, passing)
        self.assertEqual(14095, failing)
Пример #4
0
def get_PSM_mzid(psm_file):
    '''
    Parse an mzid file into the structure used by proBAMconvert.

    :param psm_file: mzid file
    :return: list with one dict per entry of the file; each dict has the keys
             "assumed_charge", "spectrum" and "search_hit" (a list with one
             dict per identification item of that entry)
    '''
    with mzid.read(psm_file) as PSM:
        psm_hash = []
        accession_hash = _get_accessions_(psm_file)
        for row in PSM:
            temp_hash = {
                "assumed_charge": row['SpectrumIdentificationItem'][0]['chargeState'],
                "spectrum": row['spectrumID'],
                "search_hit": [],
            }
            for psm in row["SpectrumIdentificationItem"]:
                # Items without modifications carry no 'Modification' key.
                mod_list = psm.get('Modification', [])
                massdiff = _cal_massdiff_(psm['experimentalMassToCharge'],
                                          psm['calculatedMassToCharge'])
                proteins = [
                    {"protein": accession_hash[protein['peptideEvidence_ref']]}
                    for protein in psm["PeptideEvidenceRef"]
                ]
                mod_peptide = _get_mod_peptide_sequence_(psm['peptide_ref'], mod_list)
                modifications = _get_peptide_modifications_(mod_list)
                # Computed once and reused; it used to be computed twice with
                # the first result thrown away.
                score = _get_score_(psm)

                temp_hash['search_hit'].append({
                    "hit_rank": psm['rank'],
                    "modifications": modifications,
                    "modified_peptide": mod_peptide,
                    "peptide": psm['peptide_ref'],
                    "search_score": {"score": score, "evalue": _get_evalue_(psm)},
                    "proteins": proteins,
                    # Hard-coded: the number of missed cleavages is not derived here.
                    "num_missed_cleavages": "0",
                    "massdiff": massdiff,
                })
            psm_hash.append(temp_hash)
    return psm_hash
Пример #5
0
 def load(self, f, source_statistic_names):
     """
     Read identification results from ``f`` and convert every item to a PSM.

     For each result a ``ScanReference`` is built from its ``spectrumID``, the
     scan number parsed from the optional ``spectrum title`` and the scan
     source matched by the result's ``name``. The first ``dbg_limit`` results
     are additionally echoed to stdout.

     :param f: input handed to ``read(f, retrieve_refs=True)``
     :param source_statistic_names: passed through to ``_identification_to_psm``
     :return: list of the values returned by ``_identification_to_psm``
     """
     dbg_limit = 10
     # Raw string: '\.' in a plain literal is an invalid escape sequence.
     # The title must end in ".<digits>.<single digit>"; the digits are the scan number.
     title_pat = re.compile(r'^.*\.([0-9]+)\.[0-9]$')
     psms = []
     for counter, identification_result in enumerate(read(f, retrieve_refs=True)):
         debug = counter < dbg_limit
         scan_id = identification_result['spectrumID']
         scan_num = None
         spectrum_title = identification_result.get('spectrum title', None)
         if spectrum_title:
             m = title_pat.match(spectrum_title)
             if m:
                 scan_num = int(m.group(1))
         scan_source = self.scan_source_manager.match_by_name(identification_result['name'])
         if debug:
             # sys.stdout.write works on Python 2 and 3 alike, unlike the
             # ``print >> stream`` statement form.
             sys.stdout.write("MzIdLoader %s\n" % (identification_result,))
         scan_reference = ScanReference(id=scan_id, number=scan_num, source=scan_source)
         if debug:
             sys.stdout.write("MzIdLoader scan %s\t%s\t%s\t%s\t%s\n" % (
                 scan_reference.id, scan_reference.index, scan_reference.number,
                 scan_reference.base_peak_mz, scan_reference.source))
         for identification_item in identification_result['SpectrumIdentificationItem']:
             psm = self._identification_to_psm(identification_item, scan_reference, source_statistic_names)
             psms.append(psm)
     return psms
Пример #6
0
 def test_structure_normalization(self):
     """Check the cvquery result of the first two ``SpectraData`` elements of the snippet."""
     spectra_data = read('mzid_snippet.xml').iterfind("SpectraData")
     expected = (
         ('MS:1000768', 'Thermo nativeID format'),
         ('MS:1000774', 'multiple peak list nativeID format'),
     )
     # Elements are consumed in document order, one per expected pair.
     for accession, label in expected:
         cv_index = aux.cvquery(next(spectra_data))
         assert cv_index[accession] == label
Пример #7
0
 def load(self, f, source_statistic_names):
     """
     Read identification results from ``f`` and convert every item to a PSM.

     :param f: input handed to ``read(f, retrieve_refs=True)``
     :param source_statistic_names: passed through to ``_identification_to_psm``
     :return: list of the values returned by ``_identification_to_psm``
     """
     collected = []
     for result in read(f, retrieve_refs=True):
         # One scan reference is shared by all items of the same result.
         reference = ScanReference(
             id=result['spectrumID'],
             source=self.scan_source_manager.match_by_name(result['name']),
         )
         collected.extend(
             self._identification_to_psm(item, reference, source_statistic_names)
             for item in result['SpectrumIdentificationItem']
         )
     return collected
Пример #8
0
def mzidentml(fil):
    """Return the peptide sequence of the first identification item of every entry.

    The mzIdentML file ``fil`` is read with schema reading disabled; the
    result is a list with one ``PeptideSequence`` value per entry.
    """
    from pyteomics import mzid

    sequences = []
    with mzid.read(fil, read_schema=False) as reader:
        for entry in reader:
            # Only the item at position 0 of each entry is used.
            first_item = entry['SpectrumIdentificationItem'][0]
            sequences.append(first_item['PeptideSequence'])
    return sequences
Пример #9
0
def get_indices_pyteomics(path_to_mzid):
    """
    Map Percolator-style indices to spectrum titles for an .mzid file.

    For every entry read from ``path_to_mzid`` the key is the part of
    ``spectrumID`` after the first '=' joined by '_' with the rank of the
    first identification item; the value is the entry's ``spectrum title``.
    https://github.com/percolator/percolator/issues/147
    """
    index_map = {}
    for result in mzid.read(path_to_mzid):
        title = result['spectrum title']
        native_index = result['spectrumID'].split('=')[1]
        first_rank = result['SpectrumIdentificationItem'][0]['rank']
        index_map[native_index + '_' + str(first_rank)] = title
    return index_map
Пример #10
0
def load_mzid(fn, qval=0.01):
    """
    Collect precursor mass errors (ppm) of accepted matches in an mzid file.

    A match is accepted when its ``MS-GF:QValue`` is below ``qval``, its rank
    is 1, its ``IsotopeError`` is 0 and its charge state lies in 2..4.

    :param fn: mzid file handed to ``mzid.read``
    :param qval: exclusive upper bound for ``MS-GF:QValue``
    :return: tuple ``(numpy array of ppm errors, largest spectrum id seen)``;
             the id list is seeded with 0
    """
    ppm_errors = []
    spectrum_ids = [0]
    for result in mzid.read(fn):
        if 'SpectrumIdentificationItem' not in result:
            continue

        # Prefer the explicit scan number; fall back to the tail of spectrumID.
        try:
            spectrum_id = int(result['scan number(s)'])
        except KeyError:
            spectrum_id = int(result['spectrumID'].split('=')[-1])
        spectrum_ids.append(spectrum_id)

        for match in result['SpectrumIdentificationItem']:
            accepted = (
                match['MS-GF:QValue'] < qval
                and match['rank'] == 1
                and match['IsotopeError'] == 0
                and 2 <= match['chargeState'] <= 4
            )
            if not accepted:
                continue
            delta = match['experimentalMassToCharge'] - match['calculatedMassToCharge']
            ppm_errors.append(delta * 1e6 / match['calculatedMassToCharge'])
    return numpy.array(ppm_errors), max(spectrum_ids)
Пример #11
0
def mzid2pin(fn):
    """
    Write the accepted matches of an mzid file to ``fn + '.idxml'`` as IdXML.

    A match is written when its ``MS-GF:QValue`` is at most 0.01 and its
    charge state lies in 2..4. The retention time is taken from the third
    ':'-separated field of the spectrum title with its last character removed.

    The output goes to a file handle that is closed on exit. Earlier versions
    rebound ``sys.stdout`` to the output file and never restored it, which
    silently redirected all later printing of the process.

    :param fn: path of the mzid file; also the stem of the output file
    :return: None
    """
    header = r'''<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="http://open-ms.sourceforge.net/XSL/IdXML.xsl" ?>
<IdXML version="1.2" xsi:noNamespaceSchemaLocation="http://open-ms.sourceforge.net/SCHEMAS/IdXML_1_2.xsd" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
    <SearchParameters id="SP_0" db="" db_version="" taxonomy="" mass_type="monoisotopic" charges="" enzyme="Trypsin/P" missed_cleavages="3" precursor_peak_tolerance="10" peak_mass_tolerance="20" ></SearchParameters>
    <IdentificationRun date="" search_engine="" search_engine_version="" search_parameters_ref="SP_0" > '''
    hit_template = ''' <PeptideIdentification score_type="" higher_score_better="false" significance_threshold="0" MZ="%f" RT="%s" >
            <PeptideHit score="%f" sequence="%s" charge="%d" aa_before="%s" aa_after="%s"></PeptideHit>
        </PeptideIdentification>
                '''

    db = _dbrefs(fn)
    with mzid.read(fn) as psm_reader, open(fn + '.idxml', 'w') as out:
        # Each write appends the newline the print statement used to add.
        out.write(header + '\n')

        for psm in psm_reader:
            if 'SpectrumIdentificationItem' not in psm:
                continue
            spec_title = psm['spectrum title'].split(':')
            rt = spec_title[2][:-1]

            for match in psm['SpectrumIdentificationItem']:
                if match['MS-GF:QValue'] > 0.01:
                    continue

                z = match['chargeState']
                if not 2 <= z <= 4:
                    continue

                mz = match['calculatedMassToCharge']
                score = match['MS-GF:EValue']

                # db is keyed both by peptide-evidence refs (4-tuples) and by
                # peptide refs (2-tuples); only some fields are written out.
                _flag, enzN, enzC, _prot = db[match['PeptideEvidenceRef'][0]['peptideEvidence_ref']]
                _bseq, seq = db[match['peptide_ref']]

                out.write(hit_template % (mz, rt, score, seq, z, enzN, enzC) + '\n')

        out.write('\n</IdentificationRun>\n</IdXML>\n')
Пример #12
0
def load_mzid(fn, qval=0.001):
    """
    Collect precursor mass errors (ppm) of accepted matches in an mzid file.

    A match is accepted when its ``MS-GF:QValue`` is below ``qval``, its rank
    is 1, its ``IsotopeError`` is 0 and its charge state lies in 2..4.

    :param fn: mzid file handed to ``mzid.read``
    :param qval: exclusive upper bound for ``MS-GF:QValue``
    :return: tuple ``(numpy array of ppm errors, largest spectrum id seen)``;
             the id list is seeded with 0
    """
    psms = []
    specids = [0]
    psmReader = mzid.read(fn)
    for psm in psmReader:
        # ``in`` instead of dict.has_key(), which no longer exists on Python 3.
        if 'SpectrumIdentificationItem' not in psm:
            continue

        # Prefer the explicit scan number; fall back to the tail of spectrumID.
        try:
            specids.append(int(psm['scan number(s)']))
        except KeyError:
            specids.append(int(psm['spectrumID'].split('=')[-1]))

        for match in psm['SpectrumIdentificationItem']:
            if match['MS-GF:QValue'] < qval and match['rank'] == 1 and match['IsotopeError'] == 0 and 2 <= match['chargeState'] <= 4:
                dm = match['experimentalMassToCharge'] - match['calculatedMassToCharge']
                dm = dm * 1e6 / match['calculatedMassToCharge']
                psms.append(dm)
    return numpy.array(psms), max(specids)
Пример #13
0
def get_PSM_mzid(psm_file):
    '''
    Parse an mzid file into the structure used by proBAMconvert.

    Only identification items whose ``passThreshold`` flag is set become
    search hits. When an entry carries a ``spectraData_ref``, its spectrum id
    is prefixed with ``ms_run[<n>]:`` where ``<n>`` numbers the distinct
    ``spectraData_ref`` values in order of first appearance, starting at 0.

    :param psm_file: mzid file
    :return: list with one dict per entry of the file; each dict has the keys
             "assumed_charge", "spectrum" and "search_hit"
    '''
    psm_hash = []
    with mzid.read(psm_file) as PSM:
        accession_hash = _get_accessions_(psm_file)
        mod_hash = _get_modification_(psm_file)
        sequence_hash = _get_peptide_sequence_hash(psm_file)
        run_index = {}
        for row in PSM:
            spectrum_id = row['spectrumID']
            if 'spectraData_ref' in row:
                run_ref = row['spectraData_ref']
                if run_ref not in run_index:
                    run_index[run_ref] = len(run_index)
                spectrum_id = 'ms_run[' + str(run_index[run_ref]) + ']:' + spectrum_id
            temp_hash = {
                "assumed_charge": row['SpectrumIdentificationItem'][0]['chargeState'],
                "spectrum": spectrum_id,
                "search_hit": [],
            }
            for psm in row["SpectrumIdentificationItem"]:
                if not psm['passThreshold']:
                    continue
                massdiff = _cal_massdiff_(psm['experimentalMassToCharge'],
                                          psm['calculatedMassToCharge'])
                proteins = [
                    {"protein": accession_hash[protein['peptideEvidence_ref']]}
                    for protein in psm["PeptideEvidenceRef"]
                ]
                temp_hash['search_hit'].append({
                    "hit_rank": psm['rank'],
                    "modifications": mod_hash[psm['peptide_ref']],
                    # NOTE(review): the experimental value feeds the "calc_*"
                    # key and the calculated value feeds the "precursor_*" key.
                    # This looks swapped, and both are m/z rather than neutral
                    # masses. Left unchanged; confirm against the consumers.
                    "calc_neutral_pep_mass": psm['experimentalMassToCharge'],
                    "precursor_neutral_mass": psm['calculatedMassToCharge'],
                    "peptide": sequence_hash[psm['peptide_ref']],
                    "search_score": {"score": _get_score_(psm), "evalue": _get_evalue_(psm)},
                    "proteins": proteins,
                    # Hard-coded: the number of missed cleavages is not derived here.
                    "num_missed_cleavages": "0",
                    "massdiff": massdiff,
                })
            psm_hash.append(temp_hash)
    # The with-block has already closed the reader, and the lookup tables are
    # locals released on return, so no explicit close()/del is needed.
    return psm_hash
Пример #14
0
def mzid2pin(fn, max_charge=7):
    """
    Write the accepted matches of an mzid file to ``fn + '.idxml'`` as IdXML.

    A match is written when its ``MS-GF:QValue`` is at most 0.01 and its
    charge state lies in ``2..max_charge``. The retention time is the entry's
    ``scan start time``.

    :param fn: path of the mzid file; also the stem of the output file
    :param max_charge: largest accepted charge state (inclusive)
    :return: None
    """
    hit_template = ''' <PeptideIdentification score_type="" higher_score_better="false" significance_threshold="0" MZ="%f" RT="%s" >
            <PeptideHit score="%f" sequence="%s" charge="%d" aa_before="%s" aa_after="%s"></PeptideHit>
        </PeptideIdentification>
                '''

    # The with-statement closes (and flushes) the output file; it used to be
    # left open, so buffered output could be lost.
    with open(fn + '.idxml', 'w') as fh:
        db = _dbrefs(fn)
        print(r'''<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="http://open-ms.sourceforge.net/XSL/IdXML.xsl" ?>
<IdXML version="1.2" xsi:noNamespaceSchemaLocation="http://open-ms.sourceforge.net/SCHEMAS/IdXML_1_2.xsd" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
    <SearchParameters id="SP_0" db="" db_version="" taxonomy="" mass_type="monoisotopic" charges="" enzyme="Trypsin/P" missed_cleavages="3" precursor_peak_tolerance="10" peak_mass_tolerance="20" ></SearchParameters>
    <IdentificationRun date="" search_engine="" search_engine_version="" search_parameters_ref="SP_0" > ''',
              file=fh)

        with mzid.read(fn) as psm_reader:
            for psm in psm_reader:
                if 'SpectrumIdentificationItem' not in psm:
                    continue
                rt = psm['scan start time']
                for match in psm['SpectrumIdentificationItem']:
                    if match['MS-GF:QValue'] > 0.01:
                        continue
                    z = match['chargeState']
                    if not 2 <= z <= max_charge:
                        continue
                    mz = match['calculatedMassToCharge']
                    score = match['MS-GF:EValue']

                    # db is keyed both by peptide-evidence refs (4-tuples) and
                    # by peptide refs (2-tuples); only some fields are written.
                    _flag, enzN, enzC, _prot = db[match['PeptideEvidenceRef'][0]
                                                  ['peptideEvidence_ref']]
                    _bseq, seq = db[match['peptide_ref']]
                    print(hit_template % (mz, rt, score, seq, z, enzN, enzC),
                          file=fh)

        print('\n</IdentificationRun>\n</IdXML>', file=fh)
Пример #15
0
 def create_parser(self):
     """Open ``self.filename`` with ``mzid.read`` and return the resulting reader."""
     parser = mzid.read(self.filename)
     return parser
Пример #16
0
    def read_df_from_mzid(self) -> pd.DataFrame:
        """Build a DataFrame with one row per entry of the mzid file.

        Each entry read from ``self.path_to_id_file`` is flattened with
        ``self._convert_to_flat_dict`` and reduced to the PSM columns below.
        After all rows are collected, ``Label`` is mapped to -1 for truthy
        decoy flags and 1 otherwise, and the mass difference columns ``dM``
        (experimental minus calculated) and ``absdM`` are added.
        """
        logger.info("Processing mzid file")
        records = []
        with mzid.read(self.path_to_id_file) as reader:
            for result in tqdm(reader):
                flat = dict(self._convert_to_flat_dict(result))

                # File name without directory and extension; first ',' -> '_'.
                raw_file = (
                    flat["location"]
                    .rsplit("/", 1)[1]
                    .split(".", 1)[0]
                    .replace(",", "_", 1)
                )
                record = {"spec_id": raw_file + ":" + flat["spectrumID"]}

                peptide = flat["SpectrumIdentificationItem_PeptideSequence"]
                record["peptide"] = peptide
                record["peptide_length"] = len(peptide)

                # A KeyError while resolving modifications yields "-".
                try:
                    record["modifications"] = self._get_peprec_modifications(
                        flat["SpectrumIdentificationItem_Modification"]
                    )
                except KeyError:
                    record["modifications"] = "-"

                record["charge"] = flat["SpectrumIdentificationItem_chargeState"]

                # Proteins come from the evidence of the first item only.
                first_item = result["SpectrumIdentificationItem"][0]
                record["protein_list"] = [
                    evidence["accession"]
                    for evidence in first_item["PeptideEvidenceRef"]
                    if "accession" in evidence
                ]

                record["PEAKS:peptideScore"] = flat[
                    "SpectrumIdentificationItem_PEAKS:peptideScore"
                ]
                record["Label"] = flat[
                    "SpectrumIdentificationItem_PeptideEvidenceRef_isDecoy"
                ]
                record["Raw file"] = raw_file
                record["calculatedMassToCharge"] = flat[
                    "SpectrumIdentificationItem_calculatedMassToCharge"
                ]
                record["experimentalMassToCharge"] = flat[
                    "SpectrumIdentificationItem_experimentalMassToCharge"
                ]

                # NOTE: exporting "retention time" is currently disabled.
                if "inverse reduced ion mobility" in flat:
                    record["ion_mobility"] = flat["inverse reduced ion mobility"]

                records.append(record)

        df = pd.DataFrame(records)

        df["Label"] = df["Label"].apply(lambda x: -1 if x else 1)
        experimental = df["experimentalMassToCharge"].astype(float)
        calculated = df["calculatedMassToCharge"].astype(float)
        df["dM"] = experimental - calculated
        df["absdM"] = df["dM"].abs()

        return df
Пример #17
0
def test_write(output_path):
    """Write an mzIdentML document to ``output_path`` and read it back.

    The document is assembled from the fixtures in ``mzid_data`` with
    ``MzIdentMLWriter``; the written file is then reopened with ``mzid.read``
    and a few element counts and values are asserted.

    :param output_path: destination path handed to the writer and reopened
                        through ``compression.get``
    :return: the writer object ``f``
    """
    # Fixture data used to populate the document.
    software = mzid_data.software
    spectra_data = mzid_data.spectra_data
    search_database = mzid_data.search_database
    spectrum_identification_list = mzid_data.spectrum_identification_list
    protein_detect_list = mzid_data.protein_detect_list

    proteins = mzid_data.proteins
    peptides = mzid_data.peptides
    peptide_evidence = mzid_data.peptide_evidence

    spectrum_id_protocol = mzid_data.spectrum_id_protocol
    protein_detection_protocol = mzid_data.protein_detection_protocol
    analysis = mzid_data.analysis
    source_file = mzid_data.source_file

    f = MzIdentMLWriter(output_path, close=True)
    with f:
        f.controlled_vocabularies()
        f.provenance(software=software)
        # Register the ids of the elements that are written further below.
        f.register("SpectraData", spectra_data['id'])
        f.register("SearchDatabase", search_database['id'])
        f.register("SpectrumIdentificationList", spectrum_identification_list["id"])
        f.register("SpectrumIdentificationProtocol", spectrum_id_protocol['id'])
        f.register("ProteinDetectionProtocol", protein_detection_protocol['id'])
        f.register("ProteinDetectionList", 1)

        # Sequence collection: proteins, peptides, then peptide evidence.
        with f.sequence_collection():
            for prot in proteins:
                f.write_db_sequence(**prot)
            for pep in peptides:
                f.write_peptide(**pep)
            for evid in peptide_evidence:
                f.write_peptide_evidence(**evid)

        with f.analysis_collection():
            f.SpectrumIdentification(*analysis).write(f)
            f.ProteinDetection(spectrum_identification_ids_used=[spectrum_identification_list["id"]]).write(f)
        with f.analysis_protocol_collection():
            f.spectrum_identification_protocol(**spectrum_id_protocol)
            f.protein_detection_protocol(**protein_detection_protocol)
        with f.data_collection():
            f.inputs(source_file, search_database, spectra_data)
            with f.analysis_data():
                with f.spectrum_identification_list(id=spectrum_identification_list['id']):
                    for result_ in spectrum_identification_list['identification_results']:
                        # Copy so that popping "identifications" leaves the fixture intact.
                        result = dict(result_)
                        identifications = result.pop("identifications")
                        result = f.spectrum_identification_result(**result)
                        # The result element has no context manager until it is entered.
                        assert result._context_manager is None
                        with result:
                            assert result._context_manager is not None
                            assert result._is_open
                            for item in identifications:
                                f.write_spectrum_identification_item(**item)

                with f.protein_detection_list(id=protein_detect_list['id'], count=len(
                        protein_detect_list['protein_ambiguity_groups'])):
                    for pag in protein_detect_list['protein_ambiguity_groups']:
                        a = f.protein_ambiguity_group(**pag)
                        # Entered and left without writing any content inside.
                        with a:
                            pass

    # Explicit close after the with-block; an OSError from it is ignored.
    try:
        f.close()
    except OSError:
        pass
    opener = compression.get(output_path)
    # assert opener == compressor
    reader = mzid.read(opener(output_path, 'rb'), read_schema=False)

    def reset():
        """Reset the reader and rewind it to offset 0 before the next query."""
        reader.reset()
        reader.seek(0)

    # Element counts read back must match the fixtures.
    n_peptide_evidence = len(peptide_evidence)
    assert n_peptide_evidence == len(list(reader.iterfind("PeptideEvidence")))
    n_spectrum_identification_results = len(spectrum_identification_list['identification_results'])
    reset()
    spectrum_identification_results = list(reader.iterfind("SpectrumIdentificationResult"))
    assert n_spectrum_identification_results == len(spectrum_identification_results)
    assert spectrum_identification_results[0]['scan start time'] != 0
    reset()
    # Two search modifications are expected: the first fixed, the second not.
    protocol = next(reader.iterfind("SpectrumIdentificationProtocol"))
    mods = protocol['ModificationParams']['SearchModification']
    assert len(mods) == 2
    assert mods[0]['fixedMod']
    assert not mods[1]['fixedMod']
    assert "unknown modification" in mods[1]
    reset()
    is_valid, schema = f.validate()
    assert is_valid, schema.error_log
    reset()
    # The first line of the file must be the XML declaration.
    line = reader.readline()
    assert line.startswith(b"""<?xml version='1.0' encoding='utf-8'?>""")
    reader.close()
    return f
def parser_mzident(filename,
                   score_field,
                   title_field=None,
                   fdr=0.01,
                   larger_score_is_better=False,
                   decoy_string="DECOY",
                   include_decoy=False):
    """
    A general parsing function for mzIdentML files.

    Several exporters of mzIdentML do not report the correct spectrum indexes. X!Tandem, for example,
    uses the spectrum's title as "id" instead of the correct "index=N" format for MGF files. Therefore,
    the field holding the spectrum's title can be supplied through title_field. PSMs whose index
    cannot be derived get Psm.MISSING_INDEX so that they can later be resolved through the titles.

    Only rank 1 identifications are used. Unless fdr is 2, PSMs are sorted by score and collected
    until the running FDR estimate (2 * decoys / (targets + decoys)) first exceeds fdr.

    :param filename: The path to the mzIdentML file
    :param score_field: The name of the score's field (**Important**: do not supply the accession
                        but only the name)
    :param title_field: The name of the field supplying the spectrum's title (in SpectrumIdentificationResult).
    :param fdr: Target FDR (default 0.01). If set to "2" the original cut-off is used.
    :param larger_score_is_better: Logical indicating whether better scores mean a more reliable
                                   result. Default is False as most search engines report
                                   probabilities
    :param decoy_string: String used to identify decoy proteins.
    :param include_decoy: If set to True decoy hits are also returned.
    :return: A list of PSM objects
    :raises Exception: if score_field (or a supplied title_field) is missing from the file
    """
    mzid_psms = list()

    # load all PSMs from the file (a single reader is enough; a second,
    # unused reader on the same file used to be opened as well)
    with mzid.read(filename) as reader:
        for spec_ref in reader:
            for spec_ident in spec_ref["SpectrumIdentificationItem"]:
                # filter based on original FDR if set right away
                if fdr == 2 and not spec_ident["passThreshold"]:
                    continue

                # only use rank 1 ids
                if spec_ident["rank"] > 1:
                    continue

                if score_field not in spec_ident:
                    raise Exception(
                        "Failed to find supplied score field '" +
                        score_field + "' in mzIdentML file.")
                if title_field is not None and title_field not in spec_ref:
                    raise Exception(
                        "Failed to find supplied title field '" +
                        title_field + "' in mzIdentML file.")

                mzid_psm = dict()

                mzid_psm["score"] = spec_ident[score_field]

                # the index should be used as id
                if spec_ref["spectrumID"][:6] == "index=":
                    mzid_psm["index"] = int(spec_ref["spectrumID"][6:])
                elif "scan number(s)" in spec_ref:
                    # TODO: This has only been tested for X!Tandem
                    mzid_psm["index"] = int(spec_ref["scan number(s)"]) - 1
                else:
                    mzid_psm["index"] = Psm.MISSING_INDEX

                # spectrum title is optional in mzIdentML
                if title_field is not None:
                    mzid_psm["title"] = spec_ref[title_field].strip()
                elif "spectrum title" in spec_ref:
                    mzid_psm["title"] = spec_ref["spectrum title"].strip()

                # get the sequence in an mzIdentML "secure" way
                mzid_psm["sequence"] = spec_ident["PeptideSequence"]
                # TODO: PTMs are stored in peptide["Modification"]

                # only the first peptide evidence decides the decoy status
                peptide_evidence = spec_ident["PeptideEvidenceRef"][0]

                is_decoy = False
                if "accession" in peptide_evidence:
                    is_decoy = decoy_string in peptide_evidence["accession"]
                if "protein description" in peptide_evidence:
                    is_decoy = is_decoy or decoy_string in peptide_evidence[
                        "protein description"]
                mzid_psm["is_decoy"] = is_decoy

                mzid_psms.append(mzid_psm)

    # sort the psms based on probability
    mzid_psms.sort(key=operator.itemgetter('score'),
                   reverse=larger_score_is_better)

    # filter decoys
    filtered_psms = list()
    n_target = 0
    n_decoy = 0

    for mzid_psm in mzid_psms:
        # only filter if the FDR wasn't set to 2
        if fdr != 2:
            if mzid_psm["is_decoy"]:
                n_decoy += 1
            else:
                n_target += 1

            # 2.0 forces true division on Python 2 as well
            current_fdr = n_decoy * 2.0 / (n_target + n_decoy)

            if current_fdr > fdr:
                break

        # convert the psm
        if not mzid_psm["is_decoy"] or include_decoy:
            # The title is optional, so a missing one is passed on as None
            # instead of raising KeyError here.
            # NOTE(review): assumes Psm accepts a None title - confirm.
            filtered_psms.append(
                Psm(mzid_psm["index"],
                    mzid_psm["sequence"],
                    mzid_psm.get("title"),
                    is_decoy=mzid_psm["is_decoy"]))

    return filtered_psms