Exemplo n.º 1
0
 def __iter__(self):
   """Stream the protein groups of the protXML file at ``self.protxml``.

   The file is parsed incrementally with ``etree.iterparse``.  Each
   completed ``protein_group`` element is converted with
   ``parse_protein_group`` and yielded; each completed
   ``proteinprophet_details`` element is converted with
   ``parse_protein_probabilities`` and stored in ``self.distribution``.
   Processed elements are cleared to release their children.

   When ``self.is_debug`` is true, every yielded group is pretty-printed
   into ``<protxml>.dump`` and the distribution into
   ``<protxml>.distribution.dump``.  The dump file is closed even if the
   consumer stops iterating early or parsing raises.

   Yields:
     The value returned by ``parse_protein_group`` for each group.
   """
   if self.is_debug:
     fname = self.protxml + '.dump'
     logging.debug('Dumping protxml reads into ' + fname)
     self.debug_file = open(fname, 'w')
     self.debug_file.write('{\n')
   try:
     for event, elem in etree.iterparse(self.protxml, events=('end', 'start-ns')):
       if event == 'start-ns':
         # For 'start-ns' events the payload is a (prefix, uri) pair;
         # wrapping it in a set lets dict.update() take it as one item.
         self.nsmap.update({elem})
       if event == 'end':
         if elem.tag == parse.fixtag('', 'protein_group', self.nsmap):
           group = parse_protein_group(elem, self.nsmap)
           yield group
           if self.is_debug:
             pprint(group, stream=self.debug_file)
             self.debug_file.write(',\n')
           elem.clear()
         elif elem.tag == parse.fixtag('', 'proteinprophet_details', self.nsmap):
           self.distribution = parse_protein_probabilities(elem, self.nsmap)
           if self.is_debug:
             fname = self.protxml + '.distribution.dump'
             # Context manager so the dump is flushed and closed right away.
             with open(fname, 'w') as distribution_file:
               pprint(self.distribution, stream=distribution_file)
           elem.clear()
     if self.is_debug:
       # Only a fully parsed file gets the closing brace.
       self.debug_file.write('}\n')
   finally:
     # Runs on normal completion, on errors and when the generator is closed.
     if self.is_debug:
       self.debug_file.close()
Exemplo n.º 2
0
def read_protxml(protxml):
  """Load every protein group of a protXML file into memory.

  Args:
    protxml: file name (or file object) handed to ``etree.iterparse``.

  Returns:
    A ``(protein_groups, distribution)`` tuple.  ``protein_groups`` maps
    each group's ``'group_number'`` to the dictionary produced by
    ``parse_protein_group``; ``distribution`` is the result of
    ``parse_protein_probabilities`` for the last
    ``proteinprophet_details`` element seen, or ``{}`` if there was none.
  """
  namespaces = {}
  groups_by_number = {}
  probabilities = {}
  for kind, node in etree.iterparse(protxml, events=('end', 'start-ns')):
    if kind == 'start-ns':
      # The payload of a namespace event is a single (prefix, uri) pair.
      namespaces.update([node])
      continue
    if kind != 'end':
      continue
    if node.tag == parse.fixtag('', 'protein_group', namespaces):
      parsed_group = parse_protein_group(node, namespaces)
      groups_by_number[parsed_group['group_number']] = parsed_group
      node.clear()
    elif node.tag == parse.fixtag('', 'proteinprophet_details', namespaces):
      probabilities = parse_protein_probabilities(node, namespaces)
      node.clear()
  return groups_by_number, probabilities
Exemplo n.º 3
0
def parse_scan(scan_elem, nsmap):
  """Convert a ``spectrum_query`` element into a scan dictionary.

  The scan starts as ``parse.parse_attrib(scan_elem)`` and gains a
  ``'matches'`` list with one entry per ``search_result`` child.  Only the
  first child of each ``search_result`` is used as the hit; a
  ``search_result`` without children is skipped.

  Each match holds the hit's attributes plus:
    * ``'modified_sequence'``: the ``modified_peptide`` of a
      ``modification_info`` child if present, otherwise ``'peptide'``.
    * ``'other_seqids'``: ``protein`` of every ``alternative_protein``.
    * ``'modifications'``: one dict per ``mod_aminoacid_mass`` whose
      ``'position'`` is replaced by ``'i' = position - 1``.
    * every ``search_score`` name/value pair.
    * attributes and first-child parameters of any
      ``peptideprophet_result`` found under any ``analysis_result``.

  Args:
    scan_elem: the ``spectrum_query`` element.
    nsmap: namespace map passed through to ``parse.fixtag``.

  Returns:
    The scan dictionary described above.
  """
  def tag(tag_id):
    return parse.fixtag('', tag_id, nsmap)

  scan = parse.parse_attrib(scan_elem)
  scan['matches'] = []
  for search_elem in scan_elem.findall(tag('search_result')):
    if len(search_elem) == 0:
      # A search_result without any hit has nothing to report.
      continue
    search_hit_elem = search_elem[0]
    match = parse.parse_attrib(search_hit_elem)
    match['modified_sequence'] = match['peptide']

    match['other_seqids'] = [
      alt_protein.attrib['protein']
      for alt_protein in search_hit_elem.findall(tag('alternative_protein'))
    ]

    match['modifications'] = []
    for modified_elem in search_hit_elem.findall(tag('modification_info')):
      attr = parse.parse_attrib(modified_elem)
      match['modified_sequence'] = attr['modified_peptide']
      for modification_elem in modified_elem.findall(tag('mod_aminoacid_mass')):
        attr = parse.parse_attrib(modification_elem)
        # Store the position under 'i', shifted down by one.
        # NOTE(review): relies on parse_attrib returning a numeric position.
        attr['i'] = attr.pop('position') - 1
        match['modifications'].append(attr)

    for score_elem in search_hit_elem.findall(tag('search_score')):
      match.update(parse.parse_name_value(score_elem))

    # findall() instead of find(): a hit without analysis_result no longer
    # raises TypeError, and a peptideprophet_result is found even when its
    # analysis_result is not the first one.
    for result_elem in search_hit_elem.findall(tag('analysis_result')):
      for analysis_elem in result_elem:
        if analysis_elem.tag != tag('peptideprophet_result'):
          continue
        match.update(parse.parse_attrib(analysis_elem))
        if len(analysis_elem) > 0:
          # The parameters live in the first child of the result element.
          for param_elem in analysis_elem[0]:
            match.update(parse.parse_name_value(param_elem))

    scan['matches'].append(match)
  return scan
Exemplo n.º 4
0
def parse_peptide_probabilities(elem, nsmap):
  """Extract the error/probability table from a summary element.

  ``error_point`` children directly under ``elem`` are used when present.
  Otherwise the ``error_point`` children of the first ``roc_error_data``
  child whose ``charge`` attribute is ``'all'`` are used.

  Args:
    elem: the summary element to read.
    nsmap: namespace map passed through to ``parse.fixtag``.

  Returns:
    A list of ``{'error': ..., 'prob': ...}`` dictionaries taken from the
    ``error`` and ``min_prob`` attributes, sorted by ascending ``'error'``.
    The list is empty when no error points are found.
  """
  error_point_tag = parse.fixtag('', 'error_point', nsmap)
  # try with error_point directly under the element first
  error_points = elem.findall(error_point_tag)
  if not error_points:
    for charge_elem in elem.findall(parse.fixtag('', 'roc_error_data', nsmap)):
      # .get() so an element without a charge attribute is simply skipped.
      if charge_elem.attrib.get('charge') == 'all':
        error_points = charge_elem.findall(error_point_tag)
        break
  probs = []
  for point_elem in error_points:
    attrib = parse.parse_attrib(point_elem)
    probs.append({
      'error': attrib['error'],
      'prob': attrib['min_prob'],
    })
  probs.sort(key=lambda d: d['error'])
  return probs
Exemplo n.º 5
0
 def __iter__(self):
   """Stream the scans of the pepXML file at ``self.pepxml``.

   The file is parsed incrementally with ``etree.iterparse``:

   * the start of each ``msms_run_summary`` appends its ``base_name`` to
     ``self.source_names`` and makes it the current source;
   * the end of each ``peptideprophet_summary`` stores the table from
     ``parse_peptide_probabilities`` in ``self.distribution``, padded
     with a ``prob=1.0/error=0.0`` first entry and a
     ``prob=0.0/error=1.0`` last entry when those ends are missing;
   * the end of each ``spectrum_query`` is converted with ``parse_scan``;
     every match receives ``'fpe'`` from ``probability_to_error`` (when
     that returns a value), the scan receives ``'source'`` when a source
     is known, and the scan is yielded.

   When ``self.is_debug`` is true, every scan is pretty-printed into
   ``<pepxml>.dump`` and the distribution into
   ``<pepxml>.distribution.dump``.  The dump file is closed even if the
   consumer stops iterating early or parsing raises.

   Yields:
     The scan dictionary for each ``spectrum_query``.
   """
   if self.is_debug:
     fname = self.pepxml + '.dump'
     logging.debug('Dumping pepxml reads into ' + fname)
     self.debug_file = open(fname, 'w')
     self.debug_file.write('[\n')
   try:
     for event, elem in etree.iterparse(self.pepxml, events=('start', 'end', 'start-ns')):
       if event == 'start-ns':
         # For 'start-ns' events the payload is a (prefix, uri) pair;
         # wrapping it in a set lets dict.update() take it as one item.
         self.nsmap.update({elem})
       elif event == 'start':
         if elem.tag == parse.fixtag('', 'msms_run_summary', self.nsmap):
           fname = elem.attrib['base_name']
           self.source_names.append(fname)
           self.i_source = len(self.source_names) - 1
       elif event == 'end':
         if elem.tag == parse.fixtag('', 'spectrum_query', self.nsmap):
           scan = parse_scan(elem, self.nsmap)
           for match in scan['matches']:
             fpe = probability_to_error(self.distribution, match['probability'])
             if fpe is None:
               # No error could be derived; the match is left without 'fpe'.
               print("WTF", match['probability'], self.distribution)
             else:
               match['fpe'] = fpe
           if self.i_source is not None:
             scan['source'] = self.source_names[self.i_source]
           if self.is_debug:
             pprint(scan, stream=self.debug_file)
             self.debug_file.write(',\n')
           yield scan
           elem.clear()
         elif elem.tag == parse.fixtag('', 'peptideprophet_summary', self.nsmap):
           self.distribution = parse_peptide_probabilities(elem, self.nsmap)
           # Pad both ends so the table spans probabilities 1.0 down to 0.0.
           if self.distribution[0]['prob'] < 1.0:
             self.distribution.insert(0, {'prob':1.0, 'error':0.0})
           if self.distribution[-1]['prob'] > 0.0:
             self.distribution.append({'prob':0.0, 'error':1.0})
           if self.is_debug:
             fname = self.pepxml + '.distribution.dump'
             # Context manager so the dump is flushed and closed right away.
             with open(fname, 'w') as distribution_file:
               pprint(self.distribution, stream=distribution_file)
           elem.clear()
     if self.is_debug:
       # Only a fully parsed file gets the closing bracket.
       self.debug_file.write(']\n')
   finally:
     # Runs on normal completion, on errors and when the generator is closed.
     if self.is_debug:
       self.debug_file.close()
Exemplo n.º 6
0
def parse_protein_probabilities(elem, nsmap):
  """Build the protein error/probability table from ``elem``.

  Args:
    elem: element whose ``protein_summary_data_filter`` children are read.
    nsmap: namespace map passed through to ``parse.fixtag``.

  Returns:
    A list of ``{'error': ..., 'prob': ...}`` dictionaries, one per child,
    taken from its ``false_positive_error_rate`` and ``min_probability``
    attributes and ordered by ascending ``'error'``.
  """
  filter_tag = parse.fixtag('', 'protein_summary_data_filter', nsmap)
  rows = []
  for node in elem.findall(filter_tag):
    values = parse.parse_attrib(node)
    error_rate = values['false_positive_error_rate']
    min_probability = values['min_probability']
    rows.append({'error': error_rate, 'prob': min_probability})
  return sorted(rows, key=lambda row: row['error'])
Exemplo n.º 7
0
def parse_protein_group(elem, nsmap):
  """Convert a ``protein_group`` element into a nested dictionary.

  The group starts as ``parse.parse_attrib(elem)`` and gains a
  ``'proteins'`` list.  Each protein holds its own attributes plus:
    * ``'group_number'``: copied from the group.
    * ``'description'``: ``protein_description`` of the ``annotation``
      child, only when that child exists.
    * ``'other_seqids'``: ``protein_name`` of every
      ``indistinguishable_protein`` child.
    * ``'peptides'``: one dict per ``peptide`` child, each with
      ``'modified_sequence'`` (the ``modified_peptide`` of a
      ``modification_info`` child if present, otherwise
      ``'peptide_sequence'``) and ``'modifications'`` (attributes of every
      ``mod_aminoacid_mass``).

  Args:
    elem: the ``protein_group`` element.
    nsmap: namespace map passed through to ``parse.fixtag``.

  Returns:
    The group dictionary described above.
  """
  group = parse.parse_attrib(elem)
  group['proteins'] = []
  for protein_elem in elem.findall(parse.fixtag('', 'protein', nsmap)):
    protein = parse.parse_attrib(protein_elem)
    protein['group_number'] = group['group_number']

    annotation_elem = protein_elem.find(parse.fixtag('', 'annotation', nsmap))
    if annotation_elem is not None:
      protein['description'] = annotation_elem.attrib['protein_description']

    protein['other_seqids'] = [
      alt_protein.attrib['protein_name']
      for alt_protein in protein_elem.findall(
        parse.fixtag('', 'indistinguishable_protein', nsmap))
    ]

    protein['peptides'] = []
    for peptide_elem in protein_elem.findall(parse.fixtag('', 'peptide', nsmap)):
      peptide = parse.parse_attrib(peptide_elem)
      peptide['modifications'] = []
      # Falls back to the plain sequence unless modification_info overrides it.
      peptide['modified_sequence'] = peptide['peptide_sequence']
      for modified_elem in peptide_elem.findall(parse.fixtag('', 'modification_info', nsmap)):
        attr = parse.parse_attrib(modified_elem)
        peptide['modified_sequence'] = attr['modified_peptide']
        for modification_elem in modified_elem.findall(parse.fixtag('', 'mod_aminoacid_mass', nsmap)):
          peptide['modifications'].append(parse.parse_attrib(modification_elem))
      protein['peptides'].append(peptide)

    group['proteins'].append(protein)
  return group
Exemplo n.º 8
0
 def iter(self):
   """Stream the scans of the pepXML file at ``self.pepxml``.

   The file is parsed incrementally with ``etree.iterparse``:

   * the start of each ``msms_run_summary`` appends its ``base_name`` to
     ``self.source_names`` and makes it the current source;
   * the end of each ``peptideprophet_summary`` stores the table from
     ``parse_peptide_probabilities`` in ``self.probs``; if no probability
     cutoff is set but ``self.error_cutoff`` is, the probability cutoff
     is derived from the error cutoff with ``error_to_probability``;
   * the end of each ``spectrum_query`` is converted with ``parse_scan``,
     tagged with ``'source'`` when a source is known, and yielded when no
     probability cutoff is set or the scan meets it.

   Yields:
     The scan dictionary for each accepted ``spectrum_query``.
   """
   for event, elem in etree.iterparse(self.pepxml, events=('start', 'end', 'start-ns')):
     if event == 'start-ns':
       # For 'start-ns' events the payload is a (prefix, uri) pair;
       # wrapping it in a set lets dict.update() take it as one item.
       self.nsmap.update({elem})
     elif event == 'start':
       if elem.tag == parse.fixtag('', 'msms_run_summary', self.nsmap):
         fname = elem.attrib['base_name']
         self.source_names.append(fname)
         self.i_source = len(self.source_names) - 1
     elif event == 'end':
       if elem.tag == parse.fixtag('', 'spectrum_query', self.nsmap):
         scan = parse_scan(elem, self.nsmap)
         if self.i_source is not None:
           scan['source'] = self.source_names[self.i_source]
         # NOTE(review): this expects a 'probability' key on the scan itself;
         # confirm parse_scan provides it at this level and not only on the
         # individual matches.
         if self.prob_cutoff is None or scan['probability'] >= self.prob_cutoff:
           yield scan
         elem.clear()
       elif elem.tag == parse.fixtag('', 'peptideprophet_summary', self.nsmap):
         self.probs = parse_peptide_probabilities(elem, self.nsmap)
         if self.prob_cutoff is None and self.error_cutoff is not None:
           # Convert the requested error cutoff; prob_cutoff is None in this
           # branch, so it cannot be the value to convert.
           self.prob_cutoff = error_to_probability(self.probs, self.error_cutoff)
         elem.clear()
Exemplo n.º 9
0
def read_pepxml(pepxml):
  """Load every scan of a pepXML file into memory, grouped by source.

  Args:
    pepxml: file name (or file object) handed to ``etree.iterparse``.

  Returns:
    A ``(scan_sources, probs)`` tuple.  ``scan_sources`` has one
    ``{'scans': [...], 'filename': base_name}`` dictionary per
    ``msms_run_summary``, each scan being the result of ``parse_scan``;
    ``probs`` is the result of ``parse_peptide_probabilities`` for the
    last ``peptideprophet_summary`` seen, or ``[]`` if there was none.
  """
  namespaces = {}
  sources = []
  probabilities = []
  for kind, node in etree.iterparse(pepxml, events=('start', 'end', 'start-ns')):
    if kind == 'start-ns':
      # The payload of a namespace event is a single (prefix, uri) pair.
      namespaces.update([node])
      continue
    if kind == 'start':
      if node.tag == parse.fixtag('', 'msms_run_summary', namespaces):
        # Scans that follow are collected under this run until the next one.
        current_source = dict(scans=[], filename=node.attrib['base_name'])
        sources.append(current_source)
      continue
    if kind != 'end':
      continue
    if node.tag == parse.fixtag('', 'spectrum_query', namespaces):
      parsed_scan = parse_scan(node, namespaces)
      current_source['scans'].append(parsed_scan)
      node.clear()
    elif node.tag == parse.fixtag('', 'peptideprophet_summary', namespaces):
      probabilities = parse_peptide_probabilities(node, namespaces)
      node.clear()
  return sources, probabilities
Exemplo n.º 10
0
def parse_protein_group(elem, nsmap):
    """Convert a ``protein_group`` element into a nested dictionary.

    The group starts as ``parse.parse_attrib(elem)`` and gains a
    ``'proteins'`` list.  Each protein holds its own attributes plus:
      * ``'group_number'``: copied from the group.
      * one key per ``parameter`` child, mapping its ``name`` attribute to
        its raw ``value`` attribute.
      * ``'description'``: ``protein_description`` of the ``annotation``
        child, only when that child exists.
      * ``'other_seqids'``: ``protein_name`` of every
        ``indistinguishable_protein`` child.
      * ``'peptides'``: one dict per ``peptide`` child, each with
        ``'modified_sequence'`` (the ``modified_peptide`` of a
        ``modification_info`` child if present, otherwise
        ``'peptide_sequence'``) and ``'modifications'`` (attributes of
        every ``mod_aminoacid_mass``).

    Args:
        elem: the ``protein_group`` element.
        nsmap: namespace map passed through to ``parse.fixtag``.

    Returns:
        The group dictionary described above.
    """
    group = parse.parse_attrib(elem)
    group['proteins'] = []
    for protein_elem in elem.findall(parse.fixtag('', 'protein', nsmap)):
        protein = parse.parse_attrib(protein_elem)
        protein['group_number'] = group['group_number']

        for parameter_elem in protein_elem.findall(
                parse.fixtag('', 'parameter', nsmap)):
            protein[parameter_elem.attrib['name']] = \
                parameter_elem.attrib['value']

        annotation_elem = protein_elem.find(
            parse.fixtag('', 'annotation', nsmap))
        if annotation_elem is not None:
            protein['description'] = annotation_elem.attrib[
                'protein_description']

        protein['other_seqids'] = [
            alt_protein.attrib['protein_name']
            for alt_protein in protein_elem.findall(
                parse.fixtag('', 'indistinguishable_protein', nsmap))
        ]

        protein['peptides'] = []
        for peptide_elem in protein_elem.findall(
                parse.fixtag('', 'peptide', nsmap)):
            peptide = parse.parse_attrib(peptide_elem)
            peptide['modifications'] = []
            # Falls back to the plain sequence unless modification_info
            # overrides it.
            peptide['modified_sequence'] = peptide['peptide_sequence']
            for modified_elem in peptide_elem.findall(
                    parse.fixtag('', 'modification_info', nsmap)):
                attr = parse.parse_attrib(modified_elem)
                peptide['modified_sequence'] = attr['modified_peptide']
                for modification_elem in modified_elem.findall(
                        parse.fixtag('', 'mod_aminoacid_mass', nsmap)):
                    peptide['modifications'].append(
                        parse.parse_attrib(modification_elem))
            protein['peptides'].append(peptide)

        group['proteins'].append(protein)
    return group
Exemplo n.º 11
0
 def search_tag(self, tag):
     """Return ``tag`` run through ``parse.fixtag`` with this object's ``nsmap``."""
     namespaces = self.nsmap
     return parse.fixtag('', tag, namespaces)