def __iter__(self):
    """Stream protein groups from the protXML file at ``self.protxml``.

    The file is read incrementally with ``etree.iterparse``:

    * every ``start-ns`` event adds its ``(prefix, uri)`` pair to
      ``self.nsmap``;
    * every completed ``protein_group`` element is converted with
      ``parse_protein_group`` and yielded;
    * a completed ``proteinprophet_details`` element is converted with
      ``parse_protein_probabilities`` and stored in ``self.distribution``.

    Handled elements are cleared afterwards so the in-memory tree stays
    small.

    When ``self.is_debug`` is true, each yielded group is pretty-printed
    into ``<protxml>.dump`` (kept open as ``self.debug_file`` for the
    duration of the iteration) and the distribution is pretty-printed into
    ``<protxml>.distribution.dump``.  Both files are closed reliably, even
    if the consumer stops iterating early or parsing raises.

    Yields:
        The value returned by ``parse_protein_group`` for each group.
    """
    dump_opened = False
    if self.is_debug:
        fname = self.protxml + '.dump'
        logging.debug('Dumping protxml reads into ' + fname)
        self.debug_file = open(fname, 'w')
        dump_opened = True
    try:
        if dump_opened:
            self.debug_file.write('{\n')
        for event, elem in etree.iterparse(
                self.protxml, events=('end', 'start-ns')):
            if event == 'start-ns':
                # For start-ns events ``elem`` is a (prefix, uri) pair.
                self.nsmap.update({elem})
            if event == 'end':
                if elem.tag == parse.fixtag('', 'protein_group', self.nsmap):
                    group = parse_protein_group(elem, self.nsmap)
                    yield group
                    # The dump is written after the consumer has seen the
                    # group, matching the original ordering.
                    if self.is_debug:
                        pprint(group, stream=self.debug_file)
                        self.debug_file.write(',\n')
                    elem.clear()
                elif elem.tag == parse.fixtag(
                        '', 'proteinprophet_details', self.nsmap):
                    self.distribution = parse_protein_probabilities(
                        elem, self.nsmap)
                    if self.is_debug:
                        fname = self.protxml + '.distribution.dump'
                        # Context manager so the handle is not leaked.
                        with open(fname, 'w') as distribution_file:
                            pprint(self.distribution, distribution_file)
                    elem.clear()
        if self.is_debug:
            self.debug_file.write('}\n')
    finally:
        # Runs on normal completion, on errors, and when the generator is
        # closed before exhaustion.
        if dump_opened:
            self.debug_file.close()
def read_protxml(protxml):
    """Read a whole protXML file into memory.

    Args:
        protxml: file name (or file object) handed to ``etree.iterparse``.

    Returns:
        A ``(protein_groups, distribution)`` tuple.  ``protein_groups``
        maps each group's ``'group_number'`` to the value returned by
        ``parse_protein_group``; ``distribution`` is the result of
        ``parse_protein_probabilities`` for the last
        ``proteinprophet_details`` element seen, or ``{}`` if there was
        none.
    """
    namespaces = {}
    error_table = {}
    groups_by_number = {}
    for kind, node in etree.iterparse(protxml, events=('end', 'start-ns')):
        if kind == 'start-ns':
            # Namespace events carry a (prefix, uri) pair, not an element.
            prefix, uri = node
            namespaces[prefix] = uri
        if kind != 'end':
            continue
        if node.tag == parse.fixtag('', 'protein_group', namespaces):
            parsed = parse_protein_group(node, namespaces)
            groups_by_number[parsed['group_number']] = parsed
            node.clear()
        elif node.tag == parse.fixtag(
                '', 'proteinprophet_details', namespaces):
            error_table = parse_protein_probabilities(node, namespaces)
            node.clear()
    return groups_by_number, error_table
def parse_scan(scan_elem, nsmap):
    """Convert a ``spectrum_query`` element into a scan dictionary.

    The scan starts as ``parse.parse_attrib(scan_elem)`` and gains a
    ``'matches'`` list with one entry per ``search_result`` child.  Only the
    first child of each ``search_result`` is used as the hit.  Each match
    holds the hit's attributes plus:

    * ``'modified_sequence'`` - the ``peptide`` attribute, replaced by
      ``modified_peptide`` when a ``modification_info`` element is present;
    * ``'other_seqids'`` - ``protein`` attributes of the
      ``alternative_protein`` children;
    * ``'modifications'`` - attributes of each ``mod_aminoacid_mass``
      element, with ``'position'`` replaced by the zero-based ``'i'``;
    * the name/value pairs of every ``search_score`` element;
    * the attributes and parameters of a ``peptideprophet_result`` found
      under the first ``analysis_result`` element.

    A ``search_result`` without any child, a hit without an
    ``analysis_result``, and a ``peptideprophet_result`` without children
    are tolerated instead of raising ``IndexError``/``TypeError``.

    Args:
        scan_elem: the ``spectrum_query`` element.
        nsmap: namespace mapping passed through to ``parse.fixtag``.

    Returns:
        The scan dictionary described above.
    """
    def tag(tag_id):
        # Qualified tag name for ``tag_id`` under the current namespaces.
        return parse.fixtag('', tag_id, nsmap)

    scan = parse.parse_attrib(scan_elem)
    scan['matches'] = []
    for search_elem in scan_elem.findall(tag('search_result')):
        if len(search_elem) == 0:
            # Search result with no hit: nothing to record.
            continue
        search_hit_elem = search_elem[0]
        match = parse.parse_attrib(search_hit_elem)
        match['modified_sequence'] = match['peptide']

        match['other_seqids'] = [
            alt_protein.attrib['protein']
            for alt_protein in search_hit_elem.findall(
                tag('alternative_protein'))
        ]

        match['modifications'] = []
        for modified_elem in search_hit_elem.findall(
                tag('modification_info')):
            info = parse.parse_attrib(modified_elem)
            match['modified_sequence'] = info['modified_peptide']
            for modification_elem in modified_elem.findall(
                    tag('mod_aminoacid_mass')):
                modification = parse.parse_attrib(modification_elem)
                # NOTE(review): relies on parse.parse_attrib returning
                # 'position' as a number - confirm against the helper.
                modification['i'] = modification['position'] - 1
                del modification['position']
                match['modifications'].append(modification)

        for score_elem in search_hit_elem.findall(tag('search_score')):
            match.update(parse.parse_name_value(score_elem))

        analysis_result = search_hit_elem.find(tag('analysis_result'))
        if analysis_result is not None:
            for analysis_elem in analysis_result:
                if analysis_elem.tag != tag('peptideprophet_result'):
                    continue
                match.update(parse.parse_attrib(analysis_elem))
                if len(analysis_elem) > 0:
                    # The first child holds the parameter elements.
                    for param_elem in analysis_elem[0]:
                        match.update(parse.parse_name_value(param_elem))

        scan['matches'].append(match)
    return scan
def parse_peptide_probabilities(elem, nsmap):
    """Extract the probability/error table from a summary element.

    ``error_point`` elements directly under ``elem`` are used when present.
    Otherwise the ``error_point`` children of the first ``roc_error_data``
    element whose ``charge`` attribute equals ``'all'`` are used.

    Args:
        elem: the element to search.
        nsmap: namespace mapping passed through to ``parse.fixtag``.

    Returns:
        A list of ``{'error': ..., 'prob': ...}`` dictionaries (taken from
        the ``error`` and ``min_prob`` attributes) sorted by ascending
        ``'error'``; empty when no error points are found.
    """
    points = elem.findall(parse.fixtag('', 'error_point', nsmap))
    if not points:
        # Fall back to the combined ("all" charges) ROC section.
        for roc_elem in elem.findall(
                parse.fixtag('', 'roc_error_data', nsmap)):
            if roc_elem.attrib['charge'] == 'all':
                points = roc_elem.findall(
                    parse.fixtag('', 'error_point', nsmap))
                break

    table = []
    for point in points:
        fields = parse.parse_attrib(point)
        table.append({'error': fields['error'], 'prob': fields['min_prob']})
    return sorted(table, key=lambda row: row['error'])
def __iter__(self):
    """Stream scans from the pepXML file at ``self.pepxml``.

    The file is read incrementally with ``etree.iterparse``:

    * ``start-ns`` events add their ``(prefix, uri)`` pair to
      ``self.nsmap``;
    * the start of an ``msms_run_summary`` element appends its
      ``base_name`` to ``self.source_names`` and points ``self.i_source``
      at it;
    * a completed ``spectrum_query`` element is converted with
      ``parse_scan``; every match gets an ``'fpe'`` value from
      ``probability_to_error(self.distribution, match['probability'])``
      (a diagnostic line is printed when that returns ``None``), the scan
      is tagged with its ``'source'`` name when one is known, and the scan
      is yielded;
    * a completed ``peptideprophet_summary`` element is converted with
      ``parse_peptide_probabilities`` into ``self.distribution``, which is
      then padded with the end points ``prob=1.0/error=0.0`` and
      ``prob=0.0/error=1.0`` when they are not already covered.

    Handled elements are cleared afterwards so the in-memory tree stays
    small.

    When ``self.is_debug`` is true, each scan is pretty-printed into
    ``<pepxml>.dump`` (kept open as ``self.debug_file`` during iteration)
    and the distribution into ``<pepxml>.distribution.dump``.  Both files
    are closed reliably, even if the consumer stops iterating early or
    parsing raises.

    Yields:
        The scan dictionary returned by ``parse_scan``, augmented as
        described above.
    """
    dump_opened = False
    if self.is_debug:
        fname = self.pepxml + '.dump'
        logging.debug('Dumping pepxml reads into ' + fname)
        self.debug_file = open(fname, 'w')
        dump_opened = True
    try:
        if dump_opened:
            self.debug_file.write('[\n')
        for event, elem in etree.iterparse(
                self.pepxml, events=('start', 'end', 'start-ns')):
            if event == 'start-ns':
                # For start-ns events ``elem`` is a (prefix, uri) pair.
                self.nsmap.update({elem})
            elif event == 'start':
                if elem.tag == parse.fixtag(
                        '', 'msms_run_summary', self.nsmap):
                    fname = elem.attrib['base_name']
                    self.source_names.append(fname)
                    self.i_source = len(self.source_names) - 1
            elif event == 'end':
                if elem.tag == parse.fixtag(
                        '', 'spectrum_query', self.nsmap):
                    scan = parse_scan(elem, self.nsmap)
                    for match in scan['matches']:
                        fpe = probability_to_error(
                            self.distribution, match['probability'])
                        if fpe is None:
                            print("WTF", match['probability'],
                                  self.distribution)
                        else:
                            match['fpe'] = fpe
                    if self.i_source is not None:
                        scan['source'] = self.source_names[self.i_source]
                    if self.is_debug:
                        pprint(scan, stream=self.debug_file)
                        self.debug_file.write(',\n')
                    yield scan
                    elem.clear()
                elif elem.tag == parse.fixtag(
                        '', 'peptideprophet_summary', self.nsmap):
                    self.distribution = parse_peptide_probabilities(
                        elem, self.nsmap)
                    # NOTE(review): indexing assumes the summary produced
                    # at least one error point - confirm for inputs
                    # without a PeptideProphet error table.
                    if self.distribution[0]['prob'] < 1.0:
                        self.distribution.insert(
                            0, {'prob': 1.0, 'error': 0.0})
                    if self.distribution[-1]['prob'] > 0.0:
                        self.distribution.append(
                            {'prob': 0.0, 'error': 1.0})
                    if self.is_debug:
                        fname = self.pepxml + '.distribution.dump'
                        # Context manager so the handle is not leaked.
                        with open(fname, 'w') as distribution_file:
                            pprint(self.distribution, distribution_file)
                    elem.clear()
        if self.is_debug:
            self.debug_file.write(']\n')
    finally:
        # Runs on normal completion, on errors, and when the generator is
        # closed before exhaustion.
        if dump_opened:
            self.debug_file.close()
def parse_protein_probabilities(elem, nsmap):
    """Extract the protein-level probability/error table from ``elem``.

    Args:
        elem: element whose ``protein_summary_data_filter`` children are
            read.
        nsmap: namespace mapping passed through to ``parse.fixtag``.

    Returns:
        A list of ``{'error': ..., 'prob': ...}`` dictionaries built from
        each child's ``false_positive_error_rate`` and ``min_probability``
        attributes, sorted by ascending ``'error'``.
    """
    filter_tag = parse.fixtag('', 'protein_summary_data_filter', nsmap)
    table = []
    for filter_elem in elem.findall(filter_tag):
        fields = parse.parse_attrib(filter_elem)
        row = {
            'error': fields['false_positive_error_rate'],
            'prob': fields['min_probability'],
        }
        table.append(row)
    return sorted(table, key=lambda row: row['error'])
def parse_protein_group(elem, nsmap):
    """Convert a ``protein_group`` element into a nested dictionary.

    The group starts as ``parse.parse_attrib(elem)`` and gains a
    ``'proteins'`` list.  Each protein holds its own attributes plus:

    * ``'group_number'`` - copied from the group;
    * ``'description'`` - the ``protein_description`` attribute of the
      ``annotation`` child, when that child exists;
    * ``'other_seqids'`` - ``protein_name`` attributes of the
      ``indistinguishable_protein`` children;
    * ``'peptides'`` - one dictionary per ``peptide`` child, each with
      ``'modified_sequence'`` (the ``peptide_sequence`` attribute, replaced
      by ``modified_peptide`` when a ``modification_info`` element is
      present) and ``'modifications'`` (attributes of every
      ``mod_aminoacid_mass`` element).

    Args:
        elem: the ``protein_group`` element.
        nsmap: namespace mapping passed through to ``parse.fixtag``.

    Returns:
        The group dictionary described above.
    """
    def tag(tag_id):
        # Qualified tag name for ``tag_id`` under the current namespaces.
        return parse.fixtag('', tag_id, nsmap)

    group = parse.parse_attrib(elem)
    group['proteins'] = []
    for protein_elem in elem.findall(tag('protein')):
        protein = parse.parse_attrib(protein_elem)
        protein['group_number'] = group['group_number']

        annotation_elem = protein_elem.find(tag('annotation'))
        if annotation_elem is not None:
            protein['description'] = \
                annotation_elem.attrib['protein_description']

        protein['other_seqids'] = [
            alt_protein.attrib['protein_name']
            for alt_protein in protein_elem.findall(
                tag('indistinguishable_protein'))
        ]

        protein['peptides'] = []
        for peptide_elem in protein_elem.findall(tag('peptide')):
            peptide = parse.parse_attrib(peptide_elem)
            peptide['modifications'] = []
            peptide['modified_sequence'] = peptide['peptide_sequence']
            for modified_elem in peptide_elem.findall(
                    tag('modification_info')):
                info = parse.parse_attrib(modified_elem)
                peptide['modified_sequence'] = info['modified_peptide']
                for modification_elem in modified_elem.findall(
                        tag('mod_aminoacid_mass')):
                    peptide['modifications'].append(
                        parse.parse_attrib(modification_elem))
            protein['peptides'].append(peptide)

        group['proteins'].append(protein)
    return group
def iter(self):
    """Stream scans from the pepXML file at ``self.pepxml``.

    The file is read incrementally with ``etree.iterparse``:

    * ``start-ns`` events add their ``(prefix, uri)`` pair to
      ``self.nsmap``;
    * the start of an ``msms_run_summary`` element appends its
      ``base_name`` to ``self.source_names`` and points ``self.i_source``
      at it;
    * a completed ``spectrum_query`` element is converted with
      ``parse_scan``, tagged with its ``'source'`` name when one is known,
      and yielded if ``self.prob_cutoff`` is ``None`` or the scan's
      ``'probability'`` is at least ``self.prob_cutoff``;
    * a completed ``peptideprophet_summary`` element is converted with
      ``parse_peptide_probabilities`` into ``self.probs``; if only
      ``self.error_cutoff`` was given, ``self.prob_cutoff`` is derived from
      it with ``error_to_probability``.

    Handled elements are cleared afterwards so the in-memory tree stays
    small.

    Yields:
        Scan dictionaries returned by ``parse_scan`` that pass the cutoff.
    """
    for event, elem in etree.iterparse(
            self.pepxml, events=('start', 'end', 'start-ns')):
        if event == 'start-ns':
            # For start-ns events ``elem`` is a (prefix, uri) pair.
            self.nsmap.update({elem})
        elif event == 'start':
            if elem.tag == parse.fixtag('', 'msms_run_summary', self.nsmap):
                fname = elem.attrib['base_name']
                self.source_names.append(fname)
                self.i_source = len(self.source_names) - 1
        elif event == 'end':
            if elem.tag == parse.fixtag('', 'spectrum_query', self.nsmap):
                scan = parse_scan(elem, self.nsmap)
                if self.i_source is not None:
                    scan['source'] = self.source_names[self.i_source]
                # NOTE(review): parse_scan stores PeptideProphet values on
                # the entries of scan['matches']; confirm that the scan
                # itself carries a 'probability' key.
                if (self.prob_cutoff is None
                        or scan['probability'] >= self.prob_cutoff):
                    yield scan
                elem.clear()
            elif elem.tag == parse.fixtag(
                    '', 'peptideprophet_summary', self.nsmap):
                self.probs = parse_peptide_probabilities(elem, self.nsmap)
                if self.prob_cutoff is None and self.error_cutoff is not None:
                    # Convert the requested error cutoff; the previous code
                    # passed prob_cutoff here, which is always None in this
                    # branch.
                    self.prob_cutoff = error_to_probability(
                        self.probs, self.error_cutoff)
                elem.clear()
def read_pepxml(pepxml):
    """Read a whole pepXML file into memory.

    Args:
        pepxml: file name (or file object) handed to ``etree.iterparse``.

    Returns:
        A ``(scan_sources, probs)`` tuple.  ``scan_sources`` has one
        ``{'scans': [...], 'filename': base_name}`` dictionary per
        ``msms_run_summary`` element, where ``'scans'`` collects the
        ``parse_scan`` result of each ``spectrum_query`` completed after
        that run started.  ``probs`` is the result of
        ``parse_peptide_probabilities`` for the last
        ``peptideprophet_summary`` element seen, or ``[]`` if there was
        none.
    """
    namespaces = {}
    error_table = []
    runs = []
    for kind, node in etree.iterparse(
            pepxml, events=('start', 'end', 'start-ns')):
        if kind == 'start-ns':
            # Namespace events carry a (prefix, uri) pair, not an element.
            prefix, uri = node
            namespaces[prefix] = uri
            continue
        if kind == 'start':
            if node.tag == parse.fixtag('', 'msms_run_summary', namespaces):
                current_run = {
                    'scans': [],
                    'filename': node.attrib['base_name'],
                }
                runs.append(current_run)
            continue
        if kind != 'end':
            continue
        if node.tag == parse.fixtag('', 'spectrum_query', namespaces):
            current_run['scans'].append(parse_scan(node, namespaces))
            node.clear()
        elif node.tag == parse.fixtag(
                '', 'peptideprophet_summary', namespaces):
            error_table = parse_peptide_probabilities(node, namespaces)
            node.clear()
    return runs, error_table
def parse_protein_group(elem, nsmap):
    """Convert a ``protein_group`` element into a nested dictionary.

    The group starts as ``parse.parse_attrib(elem)`` and gains a
    ``'proteins'`` list.  Each protein holds its own attributes plus:

    * ``'group_number'`` - copied from the group;
    * one entry per ``parameter`` child, keyed by its ``name`` attribute
      and holding its ``value`` attribute (as the raw attribute string);
    * ``'description'`` - the ``protein_description`` attribute of the
      ``annotation`` child, when that child exists;
    * ``'other_seqids'`` - ``protein_name`` attributes of the
      ``indistinguishable_protein`` children;
    * ``'peptides'`` - one dictionary per ``peptide`` child, each with
      ``'modified_sequence'`` (the ``peptide_sequence`` attribute, replaced
      by ``modified_peptide`` when a ``modification_info`` element is
      present) and ``'modifications'`` (attributes of every
      ``mod_aminoacid_mass`` element).

    Args:
        elem: the ``protein_group`` element.
        nsmap: namespace mapping passed through to ``parse.fixtag``.

    Returns:
        The group dictionary described above.
    """
    def tag(tag_id):
        # Qualified tag name for ``tag_id`` under the current namespaces.
        return parse.fixtag('', tag_id, nsmap)

    group = parse.parse_attrib(elem)
    group['proteins'] = []
    for protein_elem in elem.findall(tag('protein')):
        protein = parse.parse_attrib(protein_elem)
        protein['group_number'] = group['group_number']

        # Parameters are applied after the attributes, so a parameter whose
        # name matches an existing key replaces that value.
        for parameter_elem in protein_elem.findall(tag('parameter')):
            key = parameter_elem.attrib['name']
            protein[key] = parameter_elem.attrib['value']

        annotation_elem = protein_elem.find(tag('annotation'))
        if annotation_elem is not None:
            protein['description'] = \
                annotation_elem.attrib['protein_description']

        protein['other_seqids'] = [
            alt_protein.attrib['protein_name']
            for alt_protein in protein_elem.findall(
                tag('indistinguishable_protein'))
        ]

        protein['peptides'] = []
        for peptide_elem in protein_elem.findall(tag('peptide')):
            peptide = parse.parse_attrib(peptide_elem)
            peptide['modifications'] = []
            peptide['modified_sequence'] = peptide['peptide_sequence']
            for modified_elem in peptide_elem.findall(
                    tag('modification_info')):
                info = parse.parse_attrib(modified_elem)
                peptide['modified_sequence'] = info['modified_peptide']
                for modification_elem in modified_elem.findall(
                        tag('mod_aminoacid_mass')):
                    peptide['modifications'].append(
                        parse.parse_attrib(modification_elem))
            protein['peptides'].append(peptide)

        group['proteins'].append(protein)
    return group
def search_tag(self, tag):
    """Return ``parse.fixtag('', tag, self.nsmap)`` for ``tag``.

    Args:
        tag: the tag name to pass to ``parse.fixtag``.

    Returns:
        Whatever ``parse.fixtag`` returns for an empty prefix, ``tag`` and
        the namespace mapping stored on ``self.nsmap``.
    """
    namespaces = self.nsmap
    return parse.fixtag('', tag, namespaces)