Exemplo n.º 1
0
def get_proteins_and_sources(protxml,
                             pepxml,
                             n_peptide_cutoff=1,
                             is_skip_no_unique=True,
                             errors=[0.01]):
    """
    Assemble the proteins structure and source names from a protxml/pepxml pair.

    Proteins are created from the groups returned by read_protxml, the scans
    returned by read_pepxml are loaded into them, and peptides and proteins
    are then filtered with probability cutoffs that error_to_probability
    derives from the requested error levels.

    Args:
        protxml: path handed to read_protxml; debug dumps are written to
            the directory that contains it.
        pepxml: path handed to read_pepxml.
        n_peptide_cutoff: forwarded to parse_proteins.count_peptides.
        is_skip_no_unique: forwarded to parse_proteins.count_peptides.
        errors: error levels; the largest one drives the peptide and protein
            filters, and every level yields one probability for make_mask.
            The default list is only read here, never modified.

    Returns:
        Tuple (proteins, source_names), where source_names holds the
        'filename' entry of each element of the pepxml scan data.

    Basic structure of proteins, sketched in YAML format:

        "sample_seqid":
          sequence: "AAAAAAAAAA"
          description: "sample protein"
          attr:
            param: value
          sources:
            -
              peptides:
                -
                  sequence: "AAA"
                  i: 0
                  j: 3
                  attr:
                    is_unique: True
                    param: value
    """
    largest_error = max(errors)
    protein_groups, protein_probs = read_protxml(protxml)
    proteins = make_proteins_from_protxml(protein_groups)

    dump_dir = os.path.dirname(protxml)

    def dump_when_debugging(data, basename, message):
        # Intermediate data is only written out when the root logger level
        # is DEBUG or lower.
        if logger.root.level <= logging.DEBUG:
            path = os.path.join(dump_dir, basename)
            logger.debug(message + path)
            parse.save_data_dict(data, path)

    dump_when_debugging(
        protein_groups, 'protxml.dump',
        'Dumping protxml data structure to ')
    dump_when_debugging(
        protein_probs, 'proterror.dump',
        'Dumping protein error distribution to ')

    scans_by_sources, peptide_probs = read_pepxml(pepxml)

    dump_when_debugging(
        scans_by_sources, 'pepxml.dump',
        'Dumping pepxml data structure to ')
    dump_when_debugging(
        peptide_probs, 'peperror.dump',
        'Dumping peptide error distribution to ')

    source_names = [entry['filename'] for entry in scans_by_sources]
    load_pepxml(proteins, scans_by_sources)

    # Peptides: filter with the cutoff for the largest error, then build the
    # mask from one cutoff per requested error level.
    peptide_cutoff = error_to_probability(peptide_probs, largest_error)
    filter_peptides(proteins, peptide_cutoff)
    mask_cutoffs = [
        error_to_probability(peptide_probs, error) for error in errors]
    make_mask(proteins, mask_cutoffs)

    # Proteins: filter with the cutoff for the largest error.
    protein_cutoff = error_to_probability(protein_probs, largest_error)
    filter_proteins(proteins, protein_cutoff)

    parse_proteins.determine_unique_peptides(proteins)
    parse_proteins.count_peptides(
        proteins, n_peptide_cutoff, is_skip_no_unique)

    dump_when_debugging(
        proteins, 'proteins.dump',
        'Dumping protein data structure to ')

    return proteins, source_names
Exemplo n.º 2
0
def get_proteins_and_sources(
    protxml, pepxml,
    n_peptide_cutoff=1,
    is_skip_no_unique=True,
    errors=[0.01]):
  """
  Read a protxml/pepxml pair and return (proteins, source_names).

  The protein groups from read_protxml become the proteins, the scans from
  read_pepxml are loaded into them, and peptides and proteins are filtered
  with the probability that error_to_probability gives for max(errors).
  make_mask receives one probability per entry of errors.
  n_peptide_cutoff and is_skip_no_unique are passed straight through to
  parse_proteins.count_peptides. source_names lists the 'filename' of every
  entry in the pepxml scan data.

  When the root logger level is DEBUG or lower, the intermediate data is
  also dumped into the directory that contains protxml.

  Basic structure of proteins in YAML format:

    "sample_seqid":
      sequence: "AAAAAAAAAA"
      description: "sample protein"
      attr:
        param: value
      sources:
        -
          peptides:
            -
              sequence: "AAA"
              i: 0
              j: 3
              attr:
                is_unique: True
                param: value
  """
  worst_error = max(errors)
  protein_groups, protein_probs = read_protxml(protxml)
  proteins = make_proteins_from_protxml(protein_groups)

  out_dir = os.path.dirname(protxml)
  if logger.root.level <= logging.DEBUG:
    for data, basename, message in (
        (protein_groups, 'protxml.dump',
         'Dumping protxml data structure to '),
        (protein_probs, 'proterror.dump',
         'Dumping protein error distribution to ')):
      target = os.path.join(out_dir, basename)
      logger.debug(message + target)
      parse.save_data_dict(data, target)

  scans_by_sources, peptide_probs = read_pepxml(pepxml)

  if logger.root.level <= logging.DEBUG:
    for data, basename, message in (
        (scans_by_sources, 'pepxml.dump',
         'Dumping pepxml data structure to '),
        (peptide_probs, 'peperror.dump',
         'Dumping peptide error distribution to ')):
      target = os.path.join(out_dir, basename)
      logger.debug(message + target)
      parse.save_data_dict(data, target)

  source_names = []
  for source_scans in scans_by_sources:
    source_names.append(source_scans['filename'])
  load_pepxml(proteins, scans_by_sources)

  # Peptide-level filtering and masking.
  filter_peptides(
      proteins, error_to_probability(peptide_probs, worst_error))
  make_mask(
      proteins,
      [error_to_probability(peptide_probs, level) for level in errors])

  # Protein-level filtering.
  filter_proteins(
      proteins, error_to_probability(protein_probs, worst_error))

  parse_proteins.determine_unique_peptides(proteins)
  parse_proteins.count_peptides(proteins, n_peptide_cutoff, is_skip_no_unique)

  if logger.root.level <= logging.DEBUG:
    target = os.path.join(out_dir, 'proteins.dump')
    logger.debug('Dumping protein data structure to ' + target)
    parse.save_data_dict(proteins, target)

  return proteins, source_names
Exemplo n.º 3
0
def get_proteins_and_sources(
    in_dir,
    is_leu_ile_isomeric=False,
):
    """
    Build the proteins structure and the source names from read(in_dir).

    The peptide, scan, protein-group and evidence records returned by
    read(in_dir) are indexed by their integer 'id' and dumped to
    '<in_dir>/*.dump'. One protein is created per protein group, keyed in
    the result by the first entry of its 'protein ids'. Every scan then
    contributes one peptide to each protein group listed in its
    'protein group ids', filed under the source given by the 'raw file'
    of the scan's evidence record.

    Args:
        in_dir: directory passed to read(); the dump files are written
            there as well.
        is_leu_ile_isomeric: accepted for interface compatibility; it is
            not used by this function.

    Returns:
        Tuple (proteins, sources): proteins maps the first seqid of each
        group to its protein dict, and sources is the sorted list of
        raw-file names as str. Index k of protein['sources'] corresponds
        to sources[k].

    Raises:
        KeyError: if a scan refers to an evidence, peptide or protein
            group id that is absent from the corresponding table.
    """
    peptide_list, scan_list, protein_group_list, evidence_list = \
        read(in_dir)

    peptides = {int(p['id']): p for p in peptide_list}
    scans = {int(s['id']): s for s in scan_list}
    protein_groups = {int(p['id']): p for p in protein_group_list}
    evidence_dict = {int(e['id']): e for e in evidence_list}

    parse.save_data_dict(peptides, in_dir + '/peptides.dump')
    parse.save_data_dict(scans, in_dir + '/scans.dump')
    parse.save_data_dict(protein_groups, in_dir + '/protein_groups.dump')
    parse.save_data_dict(evidence_dict, in_dir + '/evidence.dump')

    sources_set = set(e['raw file'] for e in evidence_dict.values())
    sources = [str(s) for s in sorted(sources_set)]
    # Keyed by the str() form of the raw-file value.
    i_sources = {source: k for k, source in enumerate(sources)}

    proteins = {}
    protein_by_group_id = {}
    for group_id, protein_group in protein_groups.items():
        protein = {
            'description': '',
            'attr': {
                'group_id': group_id,
                'other_seqids': [],
            },
            # One peptide bucket per source, in the order of `sources`.
            'sources': [{'peptides': []} for _ in sources],
        }
        transfer_attrs(protein_group, protein['attr'], protein_parse_list)

        # Set after transfer_attrs so these values take precedence.
        seqids = parse.splitter(protein_group['protein ids'])
        proteins[seqids[0]] = protein
        protein['attr']['seqid'] = seqids[0]
        protein['attr']['other_seqids'] = seqids[1:]
        protein_by_group_id[group_id] = protein

    print("Matching sequences and scan in proteins")
    n_scan = len(scans)
    for i_scan, (scan_id, scan) in enumerate(scans.items(), 1):
        if i_scan % 5000 == 0:
            print("{}/{} scans processed".format(i_scan, n_scan))
        evidence = evidence_dict[int(scan['evidence id'])]
        peptide = peptides[int(scan['peptide id'])]

        # i_sources is keyed by str(raw file), so the lookup has to
        # stringify too; a raw-file value that is not already a str would
        # otherwise raise KeyError. The source only depends on the scan's
        # evidence, so it is resolved once per scan.
        i_source = i_sources[str(evidence['raw file'])]

        for group_id in parse.splitter(str(scan['protein group ids'])):
            new_peptide = {
                'sequence': scan['sequence'],
                'spectrum': get_labeled_spectrum(scan),
                'attr': {
                    'modifications': [],
                    'mq_scan_id': scan_id,
                    'is_unique': peptide['unique (groups)'] == 'yes',
                }
            }
            transfer_attrs(scan, new_peptide['attr'], scan_parse_list)
            transfer_attrs(evidence, new_peptide['attr'], evidence_parse_list)
            transfer_attrs(peptide, new_peptide['attr'], peptide_parse_list)
            change_key(new_peptide['attr'], 'scan number', 'scan_id')
            change_key(new_peptide['attr'], 'retention time', 'retention_time')

            protein = protein_by_group_id[int(group_id)]
            protein['sources'][i_source]['peptides'].append(new_peptide)

    parse_proteins.count_peptides(proteins)

    return proteins, sources
Exemplo n.º 4
0
def get_proteins_and_sources(in_dir, is_leu_ile_isomeric=False,):
  """
  Build the proteins structure and the source names from read(in_dir).

  The peptide, scan, protein-group and evidence records returned by
  read(in_dir) are indexed by their integer 'id' and dumped to
  '<in_dir>/*.dump'. One protein is created per protein group, keyed in the
  result by the first entry of its 'protein ids'. Every scan then adds one
  peptide to each protein group named in its 'protein group ids', under the
  source given by the 'raw file' of the scan's evidence record.

  Args:
    in_dir: directory passed to read(); the dump files are written there too.
    is_leu_ile_isomeric: accepted for interface compatibility; it is not
      used by this function.

  Returns:
    Tuple (proteins, sources): proteins maps the first seqid of each group
    to its protein dict, and sources is the sorted list of raw-file names as
    str. Index k of protein['sources'] corresponds to sources[k].

  Raises:
    KeyError: if a scan refers to an evidence, peptide or protein group id
      that is absent from the corresponding table.
  """
  peptide_list, scan_list, protein_group_list, evidence_list = \
        read(in_dir)

  peptides = {int(p['id']): p for p in peptide_list}
  scans = {int(s['id']): s for s in scan_list}
  protein_groups = {int(p['id']): p for p in protein_group_list}
  evidence_dict = {int(e['id']): e for e in evidence_list}

  parse.save_data_dict(peptides, in_dir + '/peptides.dump')
  parse.save_data_dict(scans, in_dir + '/scans.dump')
  parse.save_data_dict(protein_groups, in_dir + '/protein_groups.dump')
  parse.save_data_dict(evidence_dict, in_dir + '/evidence.dump')

  sources_set = set(e['raw file'] for e in evidence_dict.values())
  sources = [str(s) for s in sorted(sources_set)]
  # Keyed by the str() form of the raw-file value.
  i_sources = {source: k for k, source in enumerate(sources)}

  proteins = {}
  protein_by_group_id = {}
  for group_id, protein_group in protein_groups.items():
    protein = {
      'description': '',
      'attr': {
        'group_id': group_id,
        'other_seqids': [],
      },
      # One peptide bucket per source, in the order of `sources`.
      'sources': [{'peptides': []} for _ in sources],
    }
    transfer_attrs(protein_group, protein['attr'], protein_parse_list)

    # Set after transfer_attrs so these values take precedence.
    seqids = parse.splitter(protein_group['protein ids'])
    proteins[seqids[0]] = protein
    protein['attr']['seqid'] = seqids[0]
    protein['attr']['other_seqids'] = seqids[1:]
    protein_by_group_id[group_id] = protein

  print("Matching sequences and scan in proteins")
  n_scan = len(scans)
  for i_scan, (scan_id, scan) in enumerate(scans.items(), 1):
    if i_scan % 5000 == 0:
      print("{}/{} scans processed".format(i_scan, n_scan))
    evidence = evidence_dict[int(scan['evidence id'])]
    peptide = peptides[int(scan['peptide id'])]

    # i_sources is keyed by str(raw file), so the lookup has to stringify
    # too; a raw-file value that is not already a str would otherwise raise
    # KeyError. The source only depends on the scan's evidence, so it is
    # resolved once per scan.
    i_source = i_sources[str(evidence['raw file'])]

    for group_id in parse.splitter(str(scan['protein group ids'])):
      new_peptide = {
        'sequence': scan['sequence'],
        'spectrum': get_labeled_spectrum(scan),
        'attr': {
          'modifications': [],
          'mq_scan_id': scan_id,
          'is_unique': peptide['unique (groups)'] == 'yes',
        }
      }
      transfer_attrs(scan, new_peptide['attr'], scan_parse_list)
      transfer_attrs(evidence, new_peptide['attr'], evidence_parse_list)
      transfer_attrs(peptide, new_peptide['attr'], peptide_parse_list)
      change_key(new_peptide['attr'], 'scan number', 'scan_id')
      change_key(new_peptide['attr'], 'retention time', 'retention_time')

      protein = protein_by_group_id[int(group_id)]
      protein['sources'][i_source]['peptides'].append(new_peptide)

  parse_proteins.count_peptides(proteins)

  return proteins, sources
Exemplo n.º 5
0
def get_proteins_and_sources(in_dir, is_leu_ile_isomeric=False):
    """
    Build the proteins structure and the source names from TSV tables in in_dir.

    Reads "evidence.txt", "proteinGroups.txt", "peptides.txt" and "msms.txt"
    from in_dir with parse.read_tsv. One protein is created per protein
    group, keyed in the result by the first entry of its "protein ids".
    Every scan in "msms.txt" then adds one peptide to each protein group
    named in its "protein group ids", under the source given by the
    "raw file" of the scan's evidence record.

    Args:
        in_dir: directory that holds the four tables.
        is_leu_ile_isomeric: accepted for interface compatibility; it is
            not used by this function.

    Returns:
        Tuple (proteins, sources): proteins maps the first seqid of each
        group to its protein dict, and sources is the sorted list of
        raw-file names as str. Index k of protein["sources"] corresponds
        to sources[k].

    Raises:
        KeyError: if a scan refers to an evidence, peptide or protein
            group id that is absent from the corresponding table.
    """
    evidence_fname = os.path.join(in_dir, "evidence.txt")
    logger.info("Loading evidence file: " + evidence_fname)
    evidence_dict = {int(e["id"]): e for e in parse.read_tsv(evidence_fname)}

    sources_set = set(e["raw file"] for e in evidence_dict.values())
    sources = [str(s) for s in sorted(sources_set)]
    # Keyed by the str() form of the raw-file value.
    i_sources = {source: k for k, source in enumerate(sources)}

    protein_group_fname = os.path.join(in_dir, "proteinGroups.txt")
    logger.info("Loading protein groups: " + protein_group_fname)
    proteins = {}
    protein_by_group_id = {}
    for protein_group in parse.read_tsv(protein_group_fname):
        # Normalised to int like every other id in this function, so the
        # keys of protein_by_group_id match the int(group_id) lookup made
        # while the scans are processed.
        group_id = int(protein_group["id"])
        protein = {
            "description": "",
            "attr": {"group_id": group_id, "other_seqids": []},
            # One peptide bucket per source, in the order of `sources`.
            "sources": [{"peptides": []} for _ in sources],
        }
        transfer_attrs(protein_group, protein["attr"], protein_parse_list)

        # Set after transfer_attrs so these values take precedence.
        seqids = parse.splitter(protein_group["protein ids"])
        proteins[seqids[0]] = protein
        protein["attr"]["seqid"] = seqids[0]
        protein["attr"]["other_seqids"] = seqids[1:]
        protein_by_group_id[group_id] = protein

    peptides_fname = os.path.join(in_dir, "peptides.txt")
    logger.info("Loading peptides file: " + peptides_fname)
    peptides = {int(p["id"]): p for p in parse.read_tsv(peptides_fname)}

    scans_fname = os.path.join(in_dir, "msms.txt")
    logger.info("Loading scans and matching: " + scans_fname)
    for i_scan, scan in enumerate(parse.read_tsv(scans_fname), 1):
        scan_id = int(scan["id"])
        if i_scan % 5000 == 0:
            logger.info("{} scans processed".format(i_scan))
        evidence = evidence_dict[int(scan["evidence id"])]
        peptide = peptides[int(scan["peptide id"])]

        # i_sources is keyed by str(raw file), so the lookup has to
        # stringify too; a raw-file value that is not already a str would
        # otherwise raise KeyError. The source only depends on the scan's
        # evidence, so it is resolved once per scan.
        i_source = i_sources[str(evidence["raw file"])]

        for group_id in parse.splitter(str(scan["protein group ids"])):
            new_peptide = {
                "sequence": scan["sequence"],
                "spectrum": get_labeled_spectrum(scan),
                "attr": {
                    "modifications": [],
                    "mq_scan_id": scan_id,
                    "is_unique": peptide["unique (groups)"] == "yes",
                },
            }
            transfer_attrs(scan, new_peptide["attr"], scan_parse_list)
            transfer_attrs(evidence, new_peptide["attr"], evidence_parse_list)
            transfer_attrs(peptide, new_peptide["attr"], peptide_parse_list)
            change_key(new_peptide["attr"], "scan number", "scan_id")
            change_key(new_peptide["attr"], "retention time", "retention_time")

            protein = protein_by_group_id[int(group_id)]
            protein["sources"][i_source]["peptides"].append(new_peptide)

    parse_proteins.count_peptides(proteins)

    return proteins, sources
Exemplo n.º 6
0
def get_proteins_and_sources(
    in_dir,
    is_leu_ile_isomeric=False,
):
    """
    Build the proteins structure and the source names from TSV tables in in_dir.

    Reads 'evidence.txt', 'proteinGroups.txt', 'peptides.txt' and 'msms.txt'
    from in_dir with parse.read_tsv. One protein is created per protein
    group, keyed in the result by the first entry of its 'protein ids'.
    Every scan in 'msms.txt' then adds one peptide to each protein group
    named in its 'protein group ids', under the source given by the
    'raw file' of the scan's evidence record. Finally
    parse_proteins.count_peptides and parse_proteins.delete_empty_proteins
    are applied to the result.

    Args:
        in_dir: directory that holds the four tables.
        is_leu_ile_isomeric: accepted for interface compatibility; it is
            not used by this function.

    Returns:
        Tuple (proteins, sources): proteins maps the first seqid of each
        group to its protein dict, and sources is the sorted list of
        raw-file names as str. Index k of protein['sources'] corresponds
        to sources[k].

    Raises:
        KeyError: if a scan refers to an evidence, peptide or protein
            group id that is absent from the corresponding table.
    """
    evidence_fname = os.path.join(in_dir, 'evidence.txt')
    logger.info('Loading evidence file: ' + evidence_fname)
    evidence_dict = {int(e['id']): e for e in parse.read_tsv(evidence_fname)}

    sources_set = set(e['raw file'] for e in evidence_dict.values())
    sources = [str(s) for s in sorted(sources_set)]
    # Keyed by the str() form of the raw-file value.
    i_sources = {source: k for k, source in enumerate(sources)}

    protein_group_fname = os.path.join(in_dir, 'proteinGroups.txt')
    logger.info('Loading protein groups: ' + protein_group_fname)
    proteins = {}
    protein_by_group_id = {}
    for protein_group in parse.read_tsv(protein_group_fname):
        # Normalised to int like every other id in this function, so the
        # keys of protein_by_group_id match the int(group_id) lookup made
        # while the scans are processed.
        group_id = int(protein_group['id'])
        protein = {
            'description': '',
            'attr': {
                'group_id': group_id,
                'other_seqids': [],
            },
            # One peptide bucket per source, in the order of `sources`.
            'sources': [{'peptides': []} for _ in sources],
        }
        transfer_attrs(protein_group, protein['attr'], protein_parse_list)

        # Set after transfer_attrs so these values take precedence.
        seqids = parse.splitter(protein_group['protein ids'])
        proteins[seqids[0]] = protein
        protein['attr']['seqid'] = seqids[0]
        protein['attr']['other_seqids'] = seqids[1:]
        protein_by_group_id[group_id] = protein

    peptides_fname = os.path.join(in_dir, 'peptides.txt')
    logger.info('Loading peptides file: ' + peptides_fname)
    peptides = {int(p['id']): p for p in parse.read_tsv(peptides_fname)}

    scans_fname = os.path.join(in_dir, 'msms.txt')
    logger.info('Loading scans and matching: ' + scans_fname)
    for i_scan, scan in enumerate(parse.read_tsv(scans_fname), 1):
        scan_id = int(scan['id'])
        if i_scan % 5000 == 0:
            logger.info("{} scans processed".format(i_scan))
        evidence = evidence_dict[int(scan['evidence id'])]
        peptide = peptides[int(scan['peptide id'])]

        # i_sources is keyed by str(raw file), so the lookup has to
        # stringify too; a raw-file value that is not already a str would
        # otherwise raise KeyError. The source only depends on the scan's
        # evidence, so it is resolved once per scan.
        i_source = i_sources[str(evidence['raw file'])]

        for group_id in parse.splitter(str(scan['protein group ids'])):
            new_peptide = {
                'sequence': scan['sequence'],
                'spectrum': get_labeled_spectrum(scan),
                'attr': {
                    'modifications': [],
                    'mq_scan_id': scan_id,
                    'is_unique': peptide['unique (groups)'] == 'yes',
                }
            }
            transfer_attrs(scan, new_peptide['attr'], scan_parse_list)
            transfer_attrs(evidence, new_peptide['attr'], evidence_parse_list)
            transfer_attrs(peptide, new_peptide['attr'], peptide_parse_list)
            change_key(new_peptide['attr'], 'scan number', 'scan_id')
            change_key(new_peptide['attr'], 'retention time', 'retention_time')

            protein = protein_by_group_id[int(group_id)]
            protein['sources'][i_source]['peptides'].append(new_peptide)

    parse_proteins.count_peptides(proteins)
    parse_proteins.delete_empty_proteins(proteins)

    return proteins, sources