def parse_mz_id(self):
    """Read the mzid file and store the fragment ions per scan and rank.

    For every spectrum identification result the fragment annotations of
    the first ``max_rank`` identification items are turned into Fragment
    objects and stored as ``self.scan1.data[scan_id][rank]`` (rank starts
    at 1).

    returns: None
    """
    max_rank = 2
    with mzid.read(self.path) as data:
        for d in data:
            scan_id = parse_scan_id(d['spectrum title'])
            items = d['SpectrumIdentificationItem']
            for rank, item in enumerate(items[:max_rank], start=1):
                # A fresh list per rank: sharing one list across ranks would
                # make every rank of a scan point at the same, merged fragments.
                fragments = []
                for fragmentation in item['IonType']:
                    # Reset per ion type so values from a previous ion type
                    # are never carried over when an array is missing.
                    mz = None
                    error = None
                    for f in fragmentation['FragmentArray']:
                        if f['measure_ref'] == 'm_mz':
                            mz = f['values']
                        elif f['measure_ref'] == 'm_error':
                            error = f['values']
                    fragments.append(Fragment(name=fragmentation['name'],
                                              indice=fragmentation['index'],
                                              charge=fragmentation['charge'],
                                              mz=mz,
                                              error=error,
                                              scanid=scan_id))
                if scan_id not in self.scan1.data:
                    self.scan1.data[scan_id] = {}
                self.scan1.data[scan_id][rank] = fragments
def parse_mz_id(self):
    """
    Read the mzid file and keep the identifications of every spectrum.

    For each spectrum the first identification item is stored under key 1;
    when the spectrum has more than one item the second one is stored
    under key 2 as well. The resulting dict is appended to
    self.identifications.

    returns: None
    """
    for spectrum in mzid.read(self.path):
        scan_title = parse_scan_id(spectrum['spectrum title'])
        items = spectrum['SpectrumIdentificationItem']
        # Two ranks when available, otherwise only the first one.
        positions = (0, 1) if len(items) > 1 else (0,)
        by_rank = {}
        for position in positions:
            item = items[position]
            peptide_ref = item['peptide_ref']
            by_rank[position + 1] = Identification(
                mzid_info_lvl_fragmentation=item['IonType'],
                peptide_ref=peptide_ref,
                title=scan_title)
        self.identifications.append(by_rank)
def testPyteomicsParsing(self):
    """Count identification items by their passThreshold flag."""
    # Layout of each record yielded by the reader:
    #   spectrumID
    #   spectrum title
    #   SpectrumIdentificationItem (list)
    #     - dict
    #       - Mascot:score
    #       - Mascot:identify threshold
    #       - Scaffold:Peptide Probability
    #       - chargeState
    #       - experimentalMassToCharge
    #       - passThreshold (bool)
    #       - peptide_ref (str = sequence)
    #       - rank (int)
    passed = [
        bool(spec_id["passThreshold"])
        for spec_ref in mzid.read(self.testfile)
        for spec_id in spec_ref["SpectrumIdentificationItem"]
    ]
    n_target = sum(passed)
    n_decoy = len(passed) - n_target

    self.assertEqual(6573, n_target)
    self.assertEqual(14095, n_decoy)
def get_PSM_mzid(psm_file):
    '''
    Parse an mzid file into the PSM structure used by proBAMconvert.

    :param psm_file: mzid file
    :return: list with one dictionary per spectrum identification result;
             each holds "assumed_charge", "spectrum" and a "search_hit"
             list with one entry per SpectrumIdentificationItem
    '''
    with mzid.read(psm_file) as PSM:
        psm_hash = []
        accession_hash = _get_accessions_(psm_file)
        for row in PSM:
            temp_hash = {"assumed_charge": row['SpectrumIdentificationItem'][0]['chargeState'],
                         "spectrum": row['spectrumID'],
                         "search_hit": []}
            for psm in row["SpectrumIdentificationItem"]:
                # Items without modifications simply have no 'Modification' key.
                mod_list = psm.get('Modification', [])
                massdiff = _cal_massdiff_(psm['experimentalMassToCharge'],
                                          psm['calculatedMassToCharge'])
                proteins = [{"protein": accession_hash[protein['peptideEvidence_ref']]}
                            for protein in psm["PeptideEvidenceRef"]]
                mod_peptide = _get_mod_peptide_sequence_(psm['peptide_ref'], mod_list)
                modifications = _get_peptide_modifications_(mod_list)
                # Computed once and reused below.
                score = _get_score_(psm)
                temp_hash['search_hit'].append({"hit_rank": psm['rank'],
                                                "modifications": modifications,
                                                "modified_peptide": mod_peptide,
                                                "peptide": psm['peptide_ref'],
                                                "search_score": {"score": score,
                                                                 "evalue": _get_evalue_(psm)},
                                                "proteins": proteins,
                                                "num_missed_cleavages": "0",
                                                "massdiff": massdiff})
            psm_hash.append(temp_hash)
    return psm_hash
def load(self, f, source_statistic_names):
    """Read identification results from ``f`` and convert them to PSMs.

    For each result a ScanReference is built from its spectrumID, the scan
    number parsed out of the 'spectrum title' (when present and matching
    ``<anything>.<number>.<digit>``) and the scan source matched by the
    result's name. Every SpectrumIdentificationItem is then converted with
    ``self._identification_to_psm``.

    The first ``dbg_limit`` results are also echoed to stdout for debugging.

    :param f: source passed to ``read`` (called with retrieve_refs=True)
    :param source_statistic_names: forwarded to ``_identification_to_psm``
    :return: list of the converted PSMs
    """
    dbg_limit = 10
    # Raw string: the pattern contains backslash escapes meant for ``re``.
    title_pat = re.compile(r'^.*\.([0-9]+)\.[0-9]$')
    psms = []
    for counter, identification_result in enumerate(read(f, retrieve_refs=True)):
        debug = counter < dbg_limit
        scan_id = identification_result['spectrumID']
        scan_num = None
        spectrum_title = identification_result.get('spectrum title', None)
        if spectrum_title:
            m = title_pat.match(spectrum_title)
            if m:
                scan_num = int(m.group(1))
        scan_source = self.scan_source_manager.match_by_name(identification_result['name'])
        if debug:
            # sys.stdout.write works the same on Python 2 and 3.
            sys.stdout.write("MzIdLoader %s\n" % identification_result)
        ## source, index=None, number=None, id=None, base_peak_mz=None)
        scan_reference = ScanReference(id=scan_id, number=scan_num, source=scan_source)
        if debug:
            sys.stdout.write("MzIdLoader scan %s\t%s\t%s\t%s\t%s\n" % (
                scan_reference.id, scan_reference.index, scan_reference.number,
                scan_reference.base_peak_mz, scan_reference.source))
        for identification_item in identification_result['SpectrumIdentificationItem']:
            psm = self._identification_to_psm(identification_item, scan_reference,
                                              source_statistic_names)
            psms.append(psm)
    return psms
def test_structure_normalization(self):
    """Check the cvquery index of the first two SpectraData elements."""
    spectra_data = read('mzid_snippet.xml').iterfind("SpectraData")

    first_index = aux.cvquery(next(spectra_data))
    assert first_index['MS:1000768'] == 'Thermo nativeID format'

    second_index = aux.cvquery(next(spectra_data))
    assert second_index['MS:1000774'] == 'multiple peak list nativeID format'
def load(self, f, source_statistic_names):
    """Convert every identification item read from ``f`` into a PSM.

    Each result yields one ScanReference (spectrumID + scan source matched
    by the result's name) that is shared by all of its items.

    :param f: source passed to ``read`` (called with retrieve_refs=True)
    :param source_statistic_names: forwarded to ``_identification_to_psm``
    :return: list of the converted PSMs
    """
    collected = []
    for result in read(f, retrieve_refs=True):
        spectrum_id = result['spectrumID']
        source = self.scan_source_manager.match_by_name(result['name'])
        reference = ScanReference(id=spectrum_id, source=source)
        collected.extend(
            self._identification_to_psm(item, reference, source_statistic_names)
            for item in result['SpectrumIdentificationItem']
        )
    return collected
def mzidentml( fil ):
    """Return the peptide sequences identified in an mzIdentML file.

    For each record of the file, the 'PeptideSequence' of its first
    SpectrumIdentificationItem is collected.
    """
    from pyteomics import mzid

    sequences = []
    with mzid.read(fil, read_schema=False) as reader:  # reads the mzid file
        for record in reader:
            first_item = record['SpectrumIdentificationItem'][0]
            sequences.append(first_item['PeptideSequence'])
    return sequences
def get_indices_pyteomics(path_to_mzid):
    """
    Given the path to a .mzid file, go through every
    SpectrumIdentificationResult and match the mgf TITLE to the initial
    part of the Percolator index. Save these correspondences in a
    dictionary.

    The key is ``<value after the first '=' of spectrumID>_<rank of the
    first SpectrumIdentificationItem>`` and the value is the
    'spectrum title'.

    https://github.com/percolator/percolator/issues/147
    """
    index_map = {}
    # The context manager closes the reader once all results are consumed.
    with mzid.read(path_to_mzid) as reader:
        for result in reader:
            spectrum_index = result['spectrumID'].split('=')[1]
            rank = result['SpectrumIdentificationItem'][0]['rank']
            index_map[spectrum_index + '_' + str(rank)] = result['spectrum title']
    return index_map
def load_mzid(fn, qval=0.01):
    """Collect precursor mass errors (ppm) of confident PSMs from an mzid file.

    A match is used when its 'MS-GF:QValue' is below ``qval``, its rank is
    1, its 'IsotopeError' is 0 and its charge state is between 2 and 4.
    Results without a 'SpectrumIdentificationItem' entry are skipped.

    :param fn: mzid file to read
    :param qval: exclusive upper bound for 'MS-GF:QValue'
    :return: tuple (numpy array of ppm errors, highest spectrum id seen);
             the id is taken from 'scan number(s)' or, when that is
             missing, from the part of 'spectrumID' after the last '='
    """
    psms = []
    specids = [0]  # seeded so max() is defined for files without results
    with mzid.read(fn) as psmReader:
        for psm in psmReader:
            if 'SpectrumIdentificationItem' not in psm:
                continue
            try:
                specids.append(int(psm['scan number(s)']))
            except KeyError:
                specids.append(int(psm['spectrumID'].split('=')[-1]))
            for match in psm['SpectrumIdentificationItem']:
                if (match['MS-GF:QValue'] < qval
                        and match['rank'] == 1
                        and match['IsotopeError'] == 0
                        and 2 <= match['chargeState'] <= 4):
                    dm = match['experimentalMassToCharge'] - match['calculatedMassToCharge']
                    # relative error in parts per million
                    dm = dm * 1e6 / match['calculatedMassToCharge']
                    psms.append(dm)
    return numpy.array(psms), max(specids)
def mzid2pin(fn):
    """Write the confident PSMs of an mzid file to ``fn + '.idxml'``.

    A match is written when its 'MS-GF:QValue' is at most 0.01 and its
    charge state is between 2 and 4. The retention time is taken from the
    third ':'-separated field of the 'spectrum title' (last character
    dropped).

    The output is written through an explicit file handle that is closed
    on exit, instead of rebinding ``sys.stdout`` for the whole process.

    :param fn: mzid file to read; also the prefix of the output file name
    """
    # NOTE(review): the whitespace inside both templates is reproduced as it
    # appears in the reviewed source; confirm against the intended layout.
    header = (
        '<?xml version="1.0" encoding="UTF-8"?> '
        '<?xml-stylesheet type="text/xsl" href="http://open-ms.sourceforge.net/XSL/IdXML.xsl" ?> '
        '<IdXML version="1.2" xsi:noNamespaceSchemaLocation="http://open-ms.sourceforge.net/SCHEMAS/IdXML_1_2.xsd" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"> '
        '<SearchParameters id="SP_0" db="" db_version="" taxonomy="" mass_type="monoisotopic" charges="" enzyme="Trypsin/P" missed_cleavages="3" precursor_peak_tolerance="10" peak_mass_tolerance="20" ></SearchParameters> '
        '<IdentificationRun date="" search_engine="" search_engine_version="" search_parameters_ref="SP_0" > '
    )
    hit_template = (
        ' <PeptideIdentification score_type="" higher_score_better="false" significance_threshold="0" MZ="%f" RT="%s" > '
        '<PeptideHit score="%f" sequence="%s" charge="%d" aa_before="%s" aa_after="%s"></PeptideHit> '
        '</PeptideIdentification> '
    )

    db = _dbrefs(fn)
    with mzid.read(fn) as psmReader, open(fn + '.idxml', 'w') as out:
        out.write(header + '\n')
        for psm in psmReader:
            if 'SpectrumIdentificationItem' not in psm:
                continue
            specTitle = psm['spectrum title'].split(':')
            rt = specTitle[2][:-1]
            for match in psm['SpectrumIdentificationItem']:
                if match['MS-GF:QValue'] > 0.01:
                    # if match['MS-GF:PepQValue'] > 0.01:
                    continue
                z = match['chargeState']
                if not 2 <= z <= 4:
                    continue
                mz = match['calculatedMassToCharge']  # - (expMZ - featMZ)
                score = match['MS-GF:EValue']
                # db maps a peptide evidence ref to a 4-tuple and a peptide
                # ref to a 2-tuple; only the fields written out are kept.
                _, enzN, enzC, _ = db[match['PeptideEvidenceRef'][0]['peptideEvidence_ref']]
                _, seq = db[match['peptide_ref']]
                out.write(hit_template % (mz, rt, score, seq, z, enzN, enzC) + '\n')
        out.write('\n</IdentificationRun>\n</IdXML>' + '\n')
def load_mzid(fn, qval=0.001):
    """Collect precursor mass errors (ppm) of confident PSMs from an mzid file.

    A match is used when its 'MS-GF:QValue' is below ``qval``, its rank is
    1, its 'IsotopeError' is 0 and its charge state is between 2 and 4.
    Results without a 'SpectrumIdentificationItem' entry are skipped.

    :param fn: mzid file to read
    :param qval: exclusive upper bound for 'MS-GF:QValue'
    :return: tuple (numpy array of ppm errors, highest spectrum id seen);
             the id is taken from 'scan number(s)' or, when that is
             missing, from the part of 'spectrumID' after the last '='
    """
    psms = []
    specids = [0]  # seeded so max() is defined for files without results
    with mzid.read(fn) as psmReader:
        for psm in psmReader:
            # ``in`` instead of dict.has_key, which no longer exists on Python 3.
            if 'SpectrumIdentificationItem' not in psm:
                continue
            try:
                specids.append(int(psm['scan number(s)']))
            except KeyError:
                specids.append(int(psm['spectrumID'].split('=')[-1]))
            for match in psm['SpectrumIdentificationItem']:
                if (match['MS-GF:QValue'] < qval
                        and match['rank'] == 1
                        and match['IsotopeError'] == 0
                        and 2 <= match['chargeState'] <= 4):
                    dm = match['experimentalMassToCharge'] - match['calculatedMassToCharge']
                    # relative error in parts per million
                    dm = dm * 1e6 / match['calculatedMassToCharge']
                    psms.append(dm)
    return numpy.array(psms), max(specids)
def get_PSM_mzid(psm_file):
    '''
    Parse an mzid file into the PSM structure used by proBAMconvert.

    Only identification items whose passThreshold is True are reported.
    When a result carries a 'spectraData_ref', its spectrumID is prefixed
    with 'ms_run[<n>]:' where n numbers the distinct refs in order of
    first appearance (starting at 0).

    :param psm_file: mzid file
    :return: list with one dictionary per spectrum identification result;
             each holds "assumed_charge", "spectrum" and a "search_hit" list
    '''
    with mzid.read(psm_file) as PSM:
        psm_hash = []
        accession_hash = _get_accessions_(psm_file)
        mod_hash = _get_modification_(psm_file)
        sequence_hash = _get_peptide_sequence_hash(psm_file)
        spectraData_ref = {}
        for row in PSM:
            if 'spectraData_ref' in row:
                # First appearance gets the next free index (= current size).
                run_index = spectraData_ref.setdefault(row['spectraData_ref'],
                                                       len(spectraData_ref))
                row['spectrumID'] = 'ms_run[' + str(run_index) + ']:' + row['spectrumID']
            temp_hash = {"assumed_charge": row['SpectrumIdentificationItem'][0]['chargeState'],
                         "spectrum": row['spectrumID'],
                         "search_hit": []}
            for psm in row["SpectrumIdentificationItem"]:
                # Explicit comparison kept on purpose: only a real boolean
                # True (or 1) is accepted, not arbitrary truthy values.
                if psm['passThreshold'] == True:  # noqa: E712
                    massdiff = _cal_massdiff_(psm['experimentalMassToCharge'],
                                              psm['calculatedMassToCharge'])
                    proteins = [{"protein": accession_hash[protein['peptideEvidence_ref']]}
                                for protein in psm["PeptideEvidenceRef"]]
                    # NOTE(review): "calc_neutral_pep_mass" is filled from the
                    # experimental m/z and "precursor_neutral_mass" from the
                    # calculated m/z - this looks swapped; confirm with consumers.
                    temp_hash['search_hit'].append({"hit_rank": psm['rank'],
                                                    "modifications": mod_hash[psm['peptide_ref']],
                                                    "calc_neutral_pep_mass": psm['experimentalMassToCharge'],
                                                    "precursor_neutral_mass": psm['calculatedMassToCharge'],
                                                    "peptide": sequence_hash[psm['peptide_ref']],
                                                    "search_score": {"score": _get_score_(psm),
                                                                     "evalue": _get_evalue_(psm)},
                                                    "proteins": proteins,
                                                    "num_missed_cleavages": "0",
                                                    "massdiff": massdiff})
            psm_hash.append(temp_hash)
    # The reader is closed by the ``with`` block; the lookup tables are
    # locals and are released when the function returns.
    return psm_hash
def mzid2pin(fn, max_charge=7):
    """Write the confident PSMs of an mzid file to ``fn + '.idxml'``.

    A match is written when its 'MS-GF:QValue' is at most 0.01 and its
    charge state is between 2 and ``max_charge``. Results without a
    'SpectrumIdentificationItem' entry are skipped. The retention time is
    the result's 'scan start time'.

    :param fn: mzid file to read; also the prefix of the output file name
    :param max_charge: highest charge state that is still written
    """
    # NOTE(review): the whitespace inside both templates is reproduced as it
    # appears in the reviewed source; confirm against the intended layout.
    header = (
        '<?xml version="1.0" encoding="UTF-8"?> '
        '<?xml-stylesheet type="text/xsl" href="http://open-ms.sourceforge.net/XSL/IdXML.xsl" ?> '
        '<IdXML version="1.2" xsi:noNamespaceSchemaLocation="http://open-ms.sourceforge.net/SCHEMAS/IdXML_1_2.xsd" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"> '
        '<SearchParameters id="SP_0" db="" db_version="" taxonomy="" mass_type="monoisotopic" charges="" enzyme="Trypsin/P" missed_cleavages="3" precursor_peak_tolerance="10" peak_mass_tolerance="20" ></SearchParameters> '
        '<IdentificationRun date="" search_engine="" search_engine_version="" search_parameters_ref="SP_0" > '
    )
    hit_template = (
        ' <PeptideIdentification score_type="" higher_score_better="false" significance_threshold="0" MZ="%f" RT="%s" > '
        '<PeptideHit score="%f" sequence="%s" charge="%d" aa_before="%s" aa_after="%s"></PeptideHit> '
        '</PeptideIdentification> '
    )

    # Both the output file and the reader are closed when the block exits,
    # also when an exception interrupts the conversion.
    with open(fn + '.idxml', 'w') as fh:
        db = _dbrefs(fn)
        with mzid.read(fn) as psmReader:
            print(header, file=fh)
            for psm in psmReader:
                if 'SpectrumIdentificationItem' not in psm:
                    continue
                rt = psm['scan start time']
                for match in psm['SpectrumIdentificationItem']:
                    if match['MS-GF:QValue'] > 0.01:
                        # if match['MS-GF:PepQValue'] > 0.01:
                        continue
                    z = match['chargeState']
                    if not 2 <= z <= max_charge:
                        continue
                    mz = match['calculatedMassToCharge']  # - (expMZ - featMZ)
                    score = match['MS-GF:EValue']
                    # db maps a peptide evidence ref to a 4-tuple and a peptide
                    # ref to a 2-tuple; only the fields written out are kept.
                    _, enzN, enzC, _ = db[match['PeptideEvidenceRef'][0]['peptideEvidence_ref']]
                    _, seq = db[match['peptide_ref']]
                    print(hit_template % (mz, rt, score, seq, z, enzN, enzC), file=fh)
            print('\n</IdentificationRun>\n</IdXML>', file=fh)
def create_parser(self):
    """Open ``self.filename`` with ``mzid.read`` and return the reader."""
    parser = mzid.read(self.filename)
    return parser
def read_df_from_mzid(self) -> pd.DataFrame:
    """Read the mzid file at ``self.path_to_id_file`` into a DataFrame.

    One row is produced per spectrum identification result. After loading,
    "Label" is mapped to -1 for truthy values and 1 otherwise, and the
    columns "dM" (experimental minus calculated m/z) and "absdM" are added.
    """
    logger.info("Processing mzid file")

    def to_label(is_decoy):
        return -1 if is_decoy else 1

    rows = []
    with mzid.read(self.path_to_id_file) as reader:
        for result in tqdm(reader):
            flat = dict(self._convert_to_flat_dict(result))

            # File name of the location without directory and extension;
            # the first comma (if any) becomes an underscore.
            location_name = flat["location"].rsplit("/", 1)[1]
            raw_file = location_name.split(".", 1)[0].replace(",", "_", 1)
            spec_id = raw_file + ":" + flat["spectrumID"]

            peptide = flat["SpectrumIdentificationItem_PeptideSequence"]
            try:
                modifications = self._get_peprec_modifications(
                    flat["SpectrumIdentificationItem_Modification"]
                )
            except KeyError:
                modifications = "-"

            first_item = result["SpectrumIdentificationItem"][0]
            row = {
                "spec_id": spec_id,
                "peptide": peptide,
                "peptide_length": len(peptide),
                "modifications": modifications,
                "charge": flat["SpectrumIdentificationItem_chargeState"],
                "protein_list": [
                    evidence["accession"]
                    for evidence in first_item["PeptideEvidenceRef"]
                    if "accession" in evidence
                ],
                "PEAKS:peptideScore": flat[
                    "SpectrumIdentificationItem_PEAKS:peptideScore"
                ],
                "Label": flat[
                    "SpectrumIdentificationItem_PeptideEvidenceRef_isDecoy"
                ],
                "Raw file": raw_file,
                "calculatedMassToCharge": flat[
                    "SpectrumIdentificationItem_calculatedMassToCharge"
                ],
                "experimentalMassToCharge": flat[
                    "SpectrumIdentificationItem_experimentalMassToCharge"
                ],
            }
            # The "retention time" field is currently not exported
            # (it used to feed "observed_retention_time").
            if "inverse reduced ion mobility" in flat:
                row["ion_mobility"] = flat["inverse reduced ion mobility"]
            rows.append(row)

    df = pd.DataFrame(rows)
    df["Label"] = df["Label"].apply(to_label)
    experimental = df["experimentalMassToCharge"].astype(float)
    calculated = df["calculatedMassToCharge"].astype(float)
    df["dM"] = experimental - calculated
    df["absdM"] = abs(df["dM"])
    return df
def test_write(output_path):
    """Write a document with MzIdentMLWriter and check it by reading it back.

    The fixture data comes from ``mzid_data``. After writing, the file at
    ``output_path`` is re-opened with ``mzid.read`` and the element counts,
    the search modifications, the writer's own validation and the XML
    declaration are asserted.

    Returns the writer instance.
    """
    software = mzid_data.software
    spectra_data = mzid_data.spectra_data
    search_database = mzid_data.search_database
    spectrum_identification_list = mzid_data.spectrum_identification_list
    protein_detect_list = mzid_data.protein_detect_list
    proteins = mzid_data.proteins
    peptides = mzid_data.peptides
    peptide_evidence = mzid_data.peptide_evidence
    spectrum_id_protocol = mzid_data.spectrum_id_protocol
    protein_detection_protocol = mzid_data.protein_detection_protocol
    analysis = mzid_data.analysis
    source_file = mzid_data.source_file

    f = MzIdentMLWriter(output_path, close=True)
    with f:
        f.controlled_vocabularies()
        f.provenance(software=software)
        # Register the ids up front, before the elements themselves are written.
        f.register("SpectraData", spectra_data['id'])
        f.register("SearchDatabase", search_database['id'])
        f.register("SpectrumIdentificationList", spectrum_identification_list["id"])
        f.register("SpectrumIdentificationProtocol", spectrum_id_protocol['id'])
        f.register("ProteinDetectionProtocol", protein_detection_protocol['id'])
        f.register("ProteinDetectionList", 1)

        with f.sequence_collection():
            for prot in proteins:
                f.write_db_sequence(**prot)
            for pep in peptides:
                f.write_peptide(**pep)
            for evid in peptide_evidence:
                f.write_peptide_evidence(**evid)

        with f.analysis_collection():
            f.SpectrumIdentification(*analysis).write(f)
            f.ProteinDetection(spectrum_identification_ids_used=[spectrum_identification_list["id"]]).write(f)

        with f.analysis_protocol_collection():
            f.spectrum_identification_protocol(**spectrum_id_protocol)
            f.protein_detection_protocol(**protein_detection_protocol)

        with f.data_collection():
            f.inputs(source_file, search_database, spectra_data)
            with f.analysis_data():
                with f.spectrum_identification_list(id=spectrum_identification_list['id']):
                    for result_ in spectrum_identification_list['identification_results']:
                        # Work on a copy so the shared fixture is not mutated by pop().
                        result = dict(result_)
                        identifications = result.pop("identifications")
                        result = f.spectrum_identification_result(**result)
                        # The result element is only opened once it is entered.
                        assert result._context_manager is None
                        with result:
                            assert result._context_manager is not None
                            assert result._is_open
                            for item in identifications:
                                f.write_spectrum_identification_item(**item)

                with f.protein_detection_list(id=protein_detect_list['id'], count=len(
                        protein_detect_list['protein_ambiguity_groups'])):
                    for pag in protein_detect_list['protein_ambiguity_groups']:
                        a = f.protein_ambiguity_group(**pag)
                        # Entering and leaving is enough here; nothing extra is written inside.
                        with a:
                            pass

    # An OSError from this explicit close is ignored.
    try:
        f.close()
    except OSError:
        pass

    opener = compression.get(output_path)
    # assert opener == compressor
    reader = mzid.read(opener(output_path, 'rb'), read_schema=False)

    def reset():
        # Rewind the reader so the next check starts from the beginning of the file.
        reader.reset()
        reader.seek(0)

    n_peptide_evidence = len(peptide_evidence)
    assert n_peptide_evidence == len(list(reader.iterfind("PeptideEvidence")))

    n_spectrum_identification_results = len(spectrum_identification_list['identification_results'])
    reset()
    spectrum_identification_results = list(reader.iterfind("SpectrumIdentificationResult"))
    assert n_spectrum_identification_results == len(spectrum_identification_results)
    assert spectrum_identification_results[0]['scan start time'] != 0

    reset()
    protocol = next(reader.iterfind("SpectrumIdentificationProtocol"))
    mods = protocol['ModificationParams']['SearchModification']
    assert len(mods) == 2
    assert mods[0]['fixedMod']
    assert not mods[1]['fixedMod']
    assert "unknown modification" in mods[1]

    reset()
    is_valid, schema = f.validate()
    assert is_valid, schema.error_log

    reset()
    line = reader.readline()
    assert line.startswith(b"""<?xml version='1.0' encoding='utf-8'?>""")
    reader.close()
    return f
def parser_mzident(filename, score_field, title_field=None,
                   fdr=0.01, larger_score_is_better=False,
                   decoy_string="DECOY", include_decoy=False):
    """
    A general parsing function for mzIdentML files.

    Several exporters of mzIdentML do not report the correct spectrum
    indexes. X!Tandem, for example, uses the spectrum's title as "id"
    instead of the correct "index=N" format for MGF files. Therefore, the
    field holding the spectrum's title can be supplied separately. PSMs
    whose index cannot be derived get Psm.MISSING_INDEX so that missing
    indexes can later be resolved through the titles.

    Only rank 1 identifications are used.

    :param filename: The path to the mzIdentML file
    :param score_field: The name of the score's field (**Important**: do
                        not supply the accession but only the name)
    :param title_field: The name of the field supplying the spectrum's
                        title (in SpectrumIdentificationResult).
    :param fdr: Target FDR (default 0.01). If set to "2" the original
                cut-off (passThreshold) is used.
    :param larger_score_is_better: Logical indicating whether better scores
                                   mean a more reliable result. Default is
                                   False as most search engines report
                                   probabilities
    :param decoy_string: String used to identify decoy proteins.
    :param include_decoy: If set to True decoy hits are also returned.
    :return: A list of PSM objects
    :raises Exception: if score_field (or a supplied title_field) is not
                       present in the file
    """
    mzid_psms = list()

    # load all PSMs from the file (the file is opened exactly once)
    with mzid.read(filename) as reader:
        for spec_ref in reader:
            for spec_ident in spec_ref["SpectrumIdentificationItem"]:
                # filter based on original FDR if set right away
                if fdr == 2 and not spec_ident["passThreshold"]:
                    continue

                # only use rank 1 ids
                if spec_ident["rank"] > 1:
                    continue

                if score_field not in spec_ident:
                    raise Exception(
                        "Failed to find supplied score field '" + score_field +
                        "' in mzIdentML file.")

                if title_field is not None and title_field not in spec_ref:
                    raise Exception(
                        "Failed to find supplied title field '" + title_field +
                        "' in mzIdentML file.")

                mzid_psm = dict()
                mzid_psm["score"] = spec_ident[score_field]

                # the index should be used as id
                if spec_ref["spectrumID"][:6] == "index=":
                    mzid_psm["index"] = int(spec_ref["spectrumID"][6:])
                elif "scan number(s)" in spec_ref:
                    # TODO: This has only been tested for X!Tandem
                    mzid_psm["index"] = int(spec_ref["scan number(s)"]) - 1
                else:
                    mzid_psm["index"] = Psm.MISSING_INDEX

                # spectrum title is optional in mzIdentML
                # NOTE(review): when neither title source is available no
                # "title" key is set and building the Psm below raises
                # KeyError - confirm what Psm should receive in that case.
                if title_field is not None:
                    mzid_psm["title"] = spec_ref[title_field].strip()
                elif "spectrum title" in spec_ref:
                    mzid_psm["title"] = spec_ref["spectrum title"].strip()

                # get the sequence in an mzIdentML "secure" way
                mzid_psm["sequence"] = spec_ident["PeptideSequence"]

                # TODO: PTMs are stored in peptide["Modification"]

                # only the first peptide evidence decides about decoy status
                peptide_evidence = spec_ident["PeptideEvidenceRef"][0]
                is_decoy = False
                if "accession" in peptide_evidence:
                    is_decoy = decoy_string in peptide_evidence["accession"]
                if "protein description" in peptide_evidence:
                    is_decoy = (is_decoy or
                                decoy_string in peptide_evidence["protein description"])
                mzid_psm["is_decoy"] = is_decoy

                mzid_psms.append(mzid_psm)

    # sort the psms based on probability
    mzid_psms.sort(key=operator.itemgetter('score'), reverse=larger_score_is_better)

    # filter decoys
    filtered_psms = list()
    n_target = 0
    n_decoy = 0

    for mzid_psm in mzid_psms:
        # only filter if the FDR wasn't set to 2
        if fdr != 2:
            if mzid_psm["is_decoy"]:
                n_decoy += 1
            else:
                n_target += 1

            # 2.0 forces true division, so the ratio is not truncated when
            # the module runs under integer-division semantics.
            current_fdr = 2.0 * n_decoy / (n_target + n_decoy)

            if current_fdr > fdr:
                break

        # convert the psm
        if not mzid_psm["is_decoy"] or include_decoy:
            filtered_psms.append(
                Psm(mzid_psm["index"], mzid_psm["sequence"], mzid_psm["title"],
                    is_decoy=mzid_psm["is_decoy"]))

    return filtered_psms