Пример #1
0
def get_proteins_and_sources(in_dir, great_expect=1E-8, cutoff_expect=1E-2):
    """Assemble proteins and their scan matches from the tables in `in_dir`.

    Reads `evidence.txt`, `proteinGroups.txt`, `peptides.txt` and `msms.txt`
    through `parse.read_tsv` and attaches every accepted scan to each protein
    group listed in the scan's 'protein group ids' field.

    Args:
        in_dir: directory holding the four text tables.
        great_expect: forwarded, together with `cutoff_expect`, to
            `parse_proteins.calc_minus_log_intensity` to derive a match's
            'intensity' from the scan's 'pep' value.
        cutoff_expect: scans whose 'pep' value is greater than this are
            skipped entirely.

    Returns:
        A `(proteins, sources)` tuple. `proteins` maps the first seqid of
        each protein group to its protein dict. `sources` is the list of
        distinct 'raw file' values of the evidence table, sorted and then
        converted to `str`. Entry `i` of a protein's 'sources' list collects
        the matches that came from `sources[i]`.

    Raises:
        KeyError: if an accepted scan refers to an evidence, peptide or
            protein-group id that is missing from the loaded tables.
    """
    evidence_fname = os.path.join(in_dir, 'evidence.txt')
    logger.info('Loading evidence file: ' + evidence_fname)
    evidence_dict = {int(e['id']): e for e in parse.read_tsv(evidence_fname)}

    # One source per distinct raw file; a source's index in the sorted list
    # addresses the matching bucket in every protein's 'sources' list.
    sources_set = {e['raw file'] for e in evidence_dict.values()}
    sources = [str(s) for s in sorted(sources_set)]
    i_sources = {source: k for k, source in enumerate(sources)}

    protein_group_fname = os.path.join(in_dir, 'proteinGroups.txt')
    logger.info('Loading protein groups: ' + protein_group_fname)
    proteins = {}
    protein_by_group_id = {}
    for protein_group in parse.read_tsv(protein_group_fname):
        # Normalise to int: the scan loop looks groups up with int(group_id),
        # so the key must be an int whatever type the reader produced.
        group_id = int(protein_group['id'])
        protein = {
            'description': '',
            'attr': {
                'group_id': group_id,
                'other_seqids': [],
            },
            'sources': [{'matches': []} for _ in i_sources],
        }
        transfer_attrs(protein_group, protein['attr'], protein_parse_list)

        # The first seqid names the protein; the remainder are kept as aliases.
        seqids = parse.splitter(protein_group['protein ids'])
        protein['attr']['seqid'] = seqids[0]
        protein['attr']['other_seqids'] = seqids[1:]
        proteins[seqids[0]] = protein
        protein_by_group_id[group_id] = protein

    peptides_fname = os.path.join(in_dir, 'peptides.txt')
    logger.info('Loading peptides file: ' + peptides_fname)
    peptides = {int(p['id']): p for p in parse.read_tsv(peptides_fname)}

    scans_fname = os.path.join(in_dir, 'msms.txt')
    logger.info('Loading scans and matching: ' + scans_fname)

    for i_scan, scan in enumerate(parse.read_tsv(scans_fname), 1):
        if i_scan % 5000 == 0:
            logger.info("{} scans processed".format(i_scan))

        # The cutoff depends only on the scan, so reject it before any
        # lookups or spectrum parsing instead of once per protein group.
        if scan['pep'] > cutoff_expect:
            continue

        scan_id = int(scan['id'])
        evidence_id = int(scan['evidence id'])
        evidence = evidence_dict[evidence_id]
        peptide = peptides[int(scan['peptide id'])]
        mod_seq = evidence['modified sequence']
        is_unique = peptide['unique (groups)'] == 'yes'
        # i_sources is keyed by str(raw file), so convert before the lookup.
        i_source = i_sources[str(evidence['raw file'])]

        for group_id in parse.splitter(str(scan['protein group ids'])):
            # A fresh match (with its own spectrum/modification lists) is
            # built per group so proteins never share mutable objects.
            match = {
                'sequence': scan['sequence'],
                'spectrum': get_labeled_spectrum(scan),
                'modifications': get_modifications(scan),
                'attr': {
                    'modified_sequence': mod_seq,
                    'mq_scan_id': scan_id,
                    'evidence_id': evidence_id,
                    'is_unique': is_unique,
                },
                'intensity': parse_proteins.calc_minus_log_intensity(
                    scan['pep'], great_expect, cutoff_expect),
            }

            transfer_attrs(scan, match['attr'], scan_parse_list)
            transfer_attrs(evidence, match['attr'], evidence_parse_list)
            transfer_attrs(peptide, match['attr'], peptide_parse_list)
            change_key(match['attr'], 'scan number', 'scan_id')
            change_key(match['attr'], 'retention time', 'retention_time')

            protein = protein_by_group_id[int(group_id)]
            protein['sources'][i_source]['matches'].append(match)

    # NOTE(review): presumably these tally matches per protein and drop
    # proteins that received none - confirm against parse_proteins.
    parse_proteins.count_matches(proteins)
    parse_proteins.delete_empty_proteins(proteins)

    return proteins, sources
Пример #2
0
def get_labeled_spectrum(scan):
    """Return the scan's peaks as a list of (mass, intensity, label) tuples.

    The 'masses', 'intensities' and 'matches' fields of `scan` are each split
    with `parse.splitter` (the first two with `float` passed as its second
    argument) and zipped together, so the result is as long as the shortest
    of the three sequences.
    """
    peak_columns = (
        parse.splitter(scan['masses'], float),
        parse.splitter(scan['intensities'], float),
        parse.splitter(scan['matches']),
    )
    return list(zip(*peak_columns))
Пример #3
0
def get_proteins_and_sources(
    in_dir,
    is_leu_ile_isomeric=False,
):
    """Assemble proteins and their scan-derived peptides from `in_dir`.

    Loads the peptide, scan, protein-group and evidence tables with
    `read(in_dir)`, indexes each by integer 'id', saves the four indexed
    tables as `*.dump` files inside `in_dir` via `parse.save_data_dict`, and
    attaches every scan to each protein group listed in its
    'protein group ids' field.

    Args:
        in_dir: directory the tables are read from and the dumps written to.
        is_leu_ile_isomeric: accepted for interface compatibility; this
            function does not use it.

    Returns:
        A `(proteins, sources)` tuple. `proteins` maps the first seqid of
        each protein group to its protein dict. `sources` is the list of
        distinct 'raw file' values of the evidence table, sorted and then
        converted to `str`. Entry `i` of a protein's 'sources' list collects
        the peptides that came from `sources[i]`.

    Raises:
        KeyError: if a scan refers to an evidence, peptide or protein-group
            id that is missing from the loaded tables.
    """
    peptide_list, scan_list, protein_group_list, evidence_list = read(in_dir)

    peptides = {int(p['id']): p for p in peptide_list}
    scans = {int(s['id']): s for s in scan_list}
    protein_groups = {int(p['id']): p for p in protein_group_list}
    evidence_dict = {int(e['id']): e for e in evidence_list}

    parse.save_data_dict(peptides, in_dir + '/peptides.dump')
    parse.save_data_dict(scans, in_dir + '/scans.dump')
    parse.save_data_dict(protein_groups, in_dir + '/protein_groups.dump')
    parse.save_data_dict(evidence_dict, in_dir + '/evidence.dump')

    # One source per distinct raw file; a source's index in the sorted list
    # addresses the matching bucket in every protein's 'sources' list.
    sources_set = {e['raw file'] for e in evidence_dict.values()}
    sources = [str(s) for s in sorted(sources_set)]
    i_sources = {source: k for k, source in enumerate(sources)}

    proteins = {}
    protein_by_group_id = {}
    for group_id, protein_group in protein_groups.items():
        protein = {
            'description': '',
            'attr': {
                'group_id': group_id,
                'other_seqids': [],
            },
            'sources': [{'peptides': []} for _ in i_sources],
        }
        transfer_attrs(protein_group, protein['attr'], protein_parse_list)

        # The first seqid names the protein; the remainder are kept as aliases.
        seqids = parse.splitter(protein_group['protein ids'])
        protein['attr']['seqid'] = seqids[0]
        protein['attr']['other_seqids'] = seqids[1:]
        proteins[seqids[0]] = protein
        protein_by_group_id[group_id] = protein

    print("Matching sequences and scan in proteins")
    n_scan = len(scans)
    for i_scan, (scan_id, scan) in enumerate(scans.items(), 1):
        if i_scan % 5000 == 0:
            print("{}/{} scans processed".format(i_scan, n_scan))

        evidence = evidence_dict[int(scan['evidence id'])]
        peptide = peptides[int(scan['peptide id'])]
        is_unique = peptide['unique (groups)'] == 'yes'
        # i_sources is keyed by str(raw file), so convert before the lookup.
        i_source = i_sources[str(evidence['raw file'])]

        for group_id in parse.splitter(str(scan['protein group ids'])):
            # A fresh peptide (with its own spectrum list) is built per group
            # so proteins never share mutable objects.
            new_peptide = {
                'sequence': scan['sequence'],
                'spectrum': get_labeled_spectrum(scan),
                'attr': {
                    'modifications': [],
                    'mq_scan_id': scan_id,
                    'is_unique': is_unique,
                },
            }
            transfer_attrs(scan, new_peptide['attr'], scan_parse_list)
            transfer_attrs(evidence, new_peptide['attr'], evidence_parse_list)
            transfer_attrs(peptide, new_peptide['attr'], peptide_parse_list)
            change_key(new_peptide['attr'], 'scan number', 'scan_id')
            change_key(new_peptide['attr'], 'retention time', 'retention_time')

            protein = protein_by_group_id[int(group_id)]
            protein['sources'][i_source]['peptides'].append(new_peptide)

    # NOTE(review): presumably tallies peptides per protein - confirm
    # against parse_proteins.
    parse_proteins.count_peptides(proteins)

    return proteins, sources
Пример #4
0
def get_labeled_spectrum(scan):
  """Return the scan's peaks as a list of (mass, intensity, label) tuples.

  The 'masses', 'intensities' and 'matches' fields of `scan` are each split
  with `parse.splitter` (the first two with `float` passed as its second
  argument) and zipped together, so the result is as long as the shortest
  of the three sequences.
  """
  peak_columns = (
    parse.splitter(scan['masses'], float),
    parse.splitter(scan['intensities'], float),
    parse.splitter(scan['matches']),
  )
  return list(zip(*peak_columns))
Пример #5
0
def get_proteins_and_sources(
    in_dir,
    great_expect=1E-8,
    cutoff_expect=1E-2):
  """Assemble proteins and their scan matches from the tables in `in_dir`.

  Reads `evidence.txt`, `proteinGroups.txt`, `peptides.txt` and `msms.txt`
  through `parse.read_tsv` and attaches every accepted scan to each protein
  group listed in the scan's 'protein group ids' field.

  Args:
    in_dir: directory holding the four text tables.
    great_expect: forwarded, together with `cutoff_expect`, to
      `parse_proteins.calc_minus_log_intensity` to derive a match's
      'intensity' from the scan's 'pep' value.
    cutoff_expect: scans whose 'pep' value is greater than this are
      skipped entirely.

  Returns:
    A `(proteins, sources)` tuple. `proteins` maps the first seqid of each
    protein group to its protein dict. `sources` is the list of distinct
    'raw file' values of the evidence table, sorted and then converted to
    `str`. Entry `i` of a protein's 'sources' list collects the matches
    that came from `sources[i]`.

  Raises:
    KeyError: if an accepted scan refers to an evidence, peptide or
      protein-group id that is missing from the loaded tables.
  """
  evidence_fname = os.path.join(in_dir, 'evidence.txt')
  logger.info('Loading evidence file: ' + evidence_fname)
  evidence_dict = {int(e['id']): e for e in parse.read_tsv(evidence_fname)}

  # One source per distinct raw file; a source's index in the sorted list
  # addresses the matching bucket in every protein's 'sources' list.
  sources_set = {e['raw file'] for e in evidence_dict.values()}
  sources = [str(s) for s in sorted(sources_set)]
  i_sources = {source: k for k, source in enumerate(sources)}

  protein_group_fname = os.path.join(in_dir, 'proteinGroups.txt')
  logger.info('Loading protein groups: ' + protein_group_fname)
  proteins = {}
  protein_by_group_id = {}
  for protein_group in parse.read_tsv(protein_group_fname):
    # Normalise to int: the scan loop looks groups up with int(group_id),
    # so the key must be an int whatever type the reader produced.
    group_id = int(protein_group['id'])
    protein = {
      'description': '',
      'attr': {
        'group_id': group_id,
        'other_seqids': [],
      },
      'sources': [{'matches': []} for _ in i_sources],
    }
    transfer_attrs(protein_group, protein['attr'], protein_parse_list)

    # The first seqid names the protein; the remainder are kept as aliases.
    seqids = parse.splitter(protein_group['protein ids'])
    protein['attr']['seqid'] = seqids[0]
    protein['attr']['other_seqids'] = seqids[1:]
    proteins[seqids[0]] = protein
    protein_by_group_id[group_id] = protein

  peptides_fname = os.path.join(in_dir, 'peptides.txt')
  logger.info('Loading peptides file: ' + peptides_fname)
  peptides = {int(p['id']): p for p in parse.read_tsv(peptides_fname)}

  scans_fname = os.path.join(in_dir, 'msms.txt')
  logger.info('Loading scans and matching: ' + scans_fname)

  for i_scan, scan in enumerate(parse.read_tsv(scans_fname), 1):
    if i_scan % 5000 == 0:
      logger.info("{} scans processed".format(i_scan))

    # The cutoff depends only on the scan, so reject it before any
    # lookups or spectrum parsing instead of once per protein group.
    if scan['pep'] > cutoff_expect:
      continue

    scan_id = int(scan['id'])
    evidence_id = int(scan['evidence id'])
    evidence = evidence_dict[evidence_id]
    peptide = peptides[int(scan['peptide id'])]
    mod_seq = evidence['modified sequence']
    is_unique = peptide['unique (groups)'] == 'yes'
    # i_sources is keyed by str(raw file), so convert before the lookup.
    i_source = i_sources[str(evidence['raw file'])]

    for group_id in parse.splitter(str(scan['protein group ids'])):
      # A fresh match (with its own spectrum/modification lists) is built
      # per group so proteins never share mutable objects.
      match = {
        'sequence': scan['sequence'],
        'spectrum': get_labeled_spectrum(scan),
        'modifications': get_modifications(scan),
        'attr': {
          'modified_sequence': mod_seq,
          'mq_scan_id': scan_id,
          'evidence_id': evidence_id,
          'is_unique': is_unique,
        },
        'intensity': parse_proteins.calc_minus_log_intensity(
          scan['pep'], great_expect, cutoff_expect),
      }

      transfer_attrs(scan, match['attr'], scan_parse_list)
      transfer_attrs(evidence, match['attr'], evidence_parse_list)
      transfer_attrs(peptide, match['attr'], peptide_parse_list)
      change_key(match['attr'], 'scan number', 'scan_id')
      change_key(match['attr'], 'retention time', 'retention_time')

      protein = protein_by_group_id[int(group_id)]
      protein['sources'][i_source]['matches'].append(match)

  # NOTE(review): presumably these tally matches per protein and drop
  # proteins that received none - confirm against parse_proteins.
  parse_proteins.count_matches(proteins)
  parse_proteins.delete_empty_proteins(proteins)

  return proteins, sources
Пример #6
0
def get_proteins_and_sources(in_dir, is_leu_ile_isomeric=False,):
  """Assemble proteins and their scan-derived peptides from `in_dir`.

  Loads the peptide, scan, protein-group and evidence tables with
  `read(in_dir)`, indexes each by integer 'id', saves the four indexed
  tables as `*.dump` files inside `in_dir` via `parse.save_data_dict`, and
  attaches every scan to each protein group listed in its
  'protein group ids' field.

  Args:
    in_dir: directory the tables are read from and the dumps written to.
    is_leu_ile_isomeric: accepted for interface compatibility; this
      function does not use it.

  Returns:
    A `(proteins, sources)` tuple. `proteins` maps the first seqid of each
    protein group to its protein dict. `sources` is the list of distinct
    'raw file' values of the evidence table, sorted and then converted to
    `str`. Entry `i` of a protein's 'sources' list collects the peptides
    that came from `sources[i]`.

  Raises:
    KeyError: if a scan refers to an evidence, peptide or protein-group id
      that is missing from the loaded tables.
  """
  peptide_list, scan_list, protein_group_list, evidence_list = read(in_dir)

  peptides = {int(p['id']): p for p in peptide_list}
  scans = {int(s['id']): s for s in scan_list}
  protein_groups = {int(p['id']): p for p in protein_group_list}
  evidence_dict = {int(e['id']): e for e in evidence_list}

  parse.save_data_dict(peptides, in_dir + '/peptides.dump')
  parse.save_data_dict(scans, in_dir + '/scans.dump')
  parse.save_data_dict(protein_groups, in_dir + '/protein_groups.dump')
  parse.save_data_dict(evidence_dict, in_dir + '/evidence.dump')

  # One source per distinct raw file; a source's index in the sorted list
  # addresses the matching bucket in every protein's 'sources' list.
  sources_set = {e['raw file'] for e in evidence_dict.values()}
  sources = [str(s) for s in sorted(sources_set)]
  i_sources = {source: k for k, source in enumerate(sources)}

  proteins = {}
  protein_by_group_id = {}
  for group_id, protein_group in protein_groups.items():
    protein = {
      'description': '',
      'attr': {
        'group_id': group_id,
        'other_seqids': [],
      },
      'sources': [{'peptides': []} for _ in i_sources],
    }
    transfer_attrs(protein_group, protein['attr'], protein_parse_list)

    # The first seqid names the protein; the remainder are kept as aliases.
    seqids = parse.splitter(protein_group['protein ids'])
    protein['attr']['seqid'] = seqids[0]
    protein['attr']['other_seqids'] = seqids[1:]
    proteins[seqids[0]] = protein
    protein_by_group_id[group_id] = protein

  print("Matching sequences and scan in proteins")
  n_scan = len(scans)
  for i_scan, (scan_id, scan) in enumerate(scans.items(), 1):
    if i_scan % 5000 == 0:
      print("{}/{} scans processed".format(i_scan, n_scan))

    evidence = evidence_dict[int(scan['evidence id'])]
    peptide = peptides[int(scan['peptide id'])]
    is_unique = peptide['unique (groups)'] == 'yes'
    # i_sources is keyed by str(raw file), so convert before the lookup.
    i_source = i_sources[str(evidence['raw file'])]

    for group_id in parse.splitter(str(scan['protein group ids'])):
      # A fresh peptide (with its own spectrum list) is built per group so
      # proteins never share mutable objects.
      new_peptide = {
        'sequence': scan['sequence'],
        'spectrum': get_labeled_spectrum(scan),
        'attr': {
          'modifications': [],
          'mq_scan_id': scan_id,
          'is_unique': is_unique,
        },
      }
      transfer_attrs(scan, new_peptide['attr'], scan_parse_list)
      transfer_attrs(evidence, new_peptide['attr'], evidence_parse_list)
      transfer_attrs(peptide, new_peptide['attr'], peptide_parse_list)
      change_key(new_peptide['attr'], 'scan number', 'scan_id')
      change_key(new_peptide['attr'], 'retention time', 'retention_time')

      protein = protein_by_group_id[int(group_id)]
      protein['sources'][i_source]['peptides'].append(new_peptide)

  # NOTE(review): presumably tallies peptides per protein - confirm against
  # parse_proteins.
  parse_proteins.count_peptides(proteins)

  return proteins, sources
Пример #7
0
def get_proteins_and_sources(in_dir, is_leu_ile_isomeric=False):
    """Assemble proteins and their scan-derived peptides from `in_dir`.

    Reads `evidence.txt`, `proteinGroups.txt`, `peptides.txt` and `msms.txt`
    through `parse.read_tsv` and attaches every scan to each protein group
    listed in the scan's 'protein group ids' field.

    Args:
        in_dir: directory holding the four text tables.
        is_leu_ile_isomeric: accepted for interface compatibility; this
            function does not use it.

    Returns:
        A `(proteins, sources)` tuple. `proteins` maps the first seqid of
        each protein group to its protein dict. `sources` is the list of
        distinct 'raw file' values of the evidence table, sorted and then
        converted to `str`. Entry `i` of a protein's 'sources' list collects
        the peptides that came from `sources[i]`.

    Raises:
        KeyError: if a scan refers to an evidence, peptide or protein-group
            id that is missing from the loaded tables.
    """
    evidence_fname = os.path.join(in_dir, "evidence.txt")
    logger.info("Loading evidence file: " + evidence_fname)
    evidence_dict = {int(e["id"]): e for e in parse.read_tsv(evidence_fname)}

    # One source per distinct raw file; a source's index in the sorted list
    # addresses the matching bucket in every protein's 'sources' list.
    sources_set = {e["raw file"] for e in evidence_dict.values()}
    sources = [str(s) for s in sorted(sources_set)]
    i_sources = {source: k for k, source in enumerate(sources)}

    protein_group_fname = os.path.join(in_dir, "proteinGroups.txt")
    logger.info("Loading protein groups: " + protein_group_fname)
    proteins = {}
    protein_by_group_id = {}
    for protein_group in parse.read_tsv(protein_group_fname):
        # Normalise to int: the scan loop looks groups up with int(group_id),
        # so the key must be an int whatever type the reader produced.
        group_id = int(protein_group["id"])
        protein = {
            "description": "",
            "attr": {"group_id": group_id, "other_seqids": []},
            "sources": [{"peptides": []} for _ in i_sources],
        }
        transfer_attrs(protein_group, protein["attr"], protein_parse_list)

        # The first seqid names the protein; the remainder are kept as aliases.
        seqids = parse.splitter(protein_group["protein ids"])
        protein["attr"]["seqid"] = seqids[0]
        protein["attr"]["other_seqids"] = seqids[1:]
        proteins[seqids[0]] = protein
        protein_by_group_id[group_id] = protein

    peptides_fname = os.path.join(in_dir, "peptides.txt")
    logger.info("Loading peptides file: " + peptides_fname)
    peptides = {int(p["id"]): p for p in parse.read_tsv(peptides_fname)}

    scans_fname = os.path.join(in_dir, "msms.txt")
    logger.info("Loading scans and matching: " + scans_fname)
    for i_scan, scan in enumerate(parse.read_tsv(scans_fname), 1):
        if i_scan % 5000 == 0:
            logger.info("{} scans processed".format(i_scan))

        scan_id = int(scan["id"])
        evidence = evidence_dict[int(scan["evidence id"])]
        peptide = peptides[int(scan["peptide id"])]
        is_unique = peptide["unique (groups)"] == "yes"
        # i_sources is keyed by str(raw file), so convert before the lookup.
        i_source = i_sources[str(evidence["raw file"])]

        for group_id in parse.splitter(str(scan["protein group ids"])):
            # A fresh peptide (with its own spectrum list) is built per group
            # so proteins never share mutable objects.
            new_peptide = {
                "sequence": scan["sequence"],
                "spectrum": get_labeled_spectrum(scan),
                "attr": {
                    "modifications": [],
                    "mq_scan_id": scan_id,
                    "is_unique": is_unique,
                },
            }
            transfer_attrs(scan, new_peptide["attr"], scan_parse_list)
            transfer_attrs(evidence, new_peptide["attr"], evidence_parse_list)
            transfer_attrs(peptide, new_peptide["attr"], peptide_parse_list)
            change_key(new_peptide["attr"], "scan number", "scan_id")
            change_key(new_peptide["attr"], "retention time", "retention_time")

            protein = protein_by_group_id[int(group_id)]
            protein["sources"][i_source]["peptides"].append(new_peptide)

    # NOTE(review): presumably tallies peptides per protein - confirm
    # against parse_proteins.
    parse_proteins.count_peptides(proteins)

    return proteins, sources