def get_proteins_and_sources(
        protxml, pepxml, n_peptide_cutoff=1, is_skip_no_unique=True,
        errors=(0.01,)):
    """
    Load proteins from a protXML/pepXML file pair.

    Returns (proteins, source_names). The proteins dict has this basic
    structure, shown in YAML format:

      "sample_seqid":
        sequence: "AAAAAAAAAA"
        description: "sample protein"
        attr:
          param: value
        sources:
          - peptides
            - sequence: "AAA"
              i: 0
              j: 3
              attr:
                is_unique: True
                param: value

    :param protxml: path to the protXML file
    :param pepxml: path to the pepXML file
    :param n_peptide_cutoff: minimum peptide count per protein
    :param is_skip_no_unique: drop proteins with no unique peptide
    :param errors: iterable of error cutoffs; max(errors) is used for
        filtering and each value produces one mask level
    """
    # NOTE: the default was a mutable list literal ([0.01]); a tuple
    # avoids the shared-mutable-default pitfall and behaves the same
    # for the max() and iteration below.
    max_error = max(errors)

    protein_groups, protein_probs = read_protxml(protxml)
    proteins = make_proteins_from_protxml(protein_groups)

    dump_dir = os.path.dirname(protxml)
    # Debug dumps are only written when the root logger is at DEBUG.
    if logger.root.level <= logging.DEBUG:
        dump = os.path.join(dump_dir, 'protxml.dump')
        logger.debug('Dumping protxml data structure to ' + dump)
        parse.save_data_dict(protein_groups, dump)
        dump = os.path.join(dump_dir, 'proterror.dump')
        logger.debug('Dumping protein error distribution to ' + dump)
        parse.save_data_dict(protein_probs, dump)

    scans_by_sources, peptide_probs = read_pepxml(pepxml)
    if logger.root.level <= logging.DEBUG:
        dump = os.path.join(dump_dir, 'pepxml.dump')
        logger.debug('Dumping pepxml data structure to ' + dump)
        parse.save_data_dict(scans_by_sources, dump)
        dump = os.path.join(dump_dir, 'peperror.dump')
        logger.debug('Dumping peptide error distribution to ' + dump)
        parse.save_data_dict(peptide_probs, dump)

    source_names = [scans['filename'] for scans in scans_by_sources]
    load_pepxml(proteins, scans_by_sources)

    # Filter peptides at the loosest (maximum) error cutoff, then build
    # one mask level per requested error value.
    probability = error_to_probability(peptide_probs, max_error)
    filter_peptides(proteins, probability)
    probabilities = [error_to_probability(peptide_probs, e) for e in errors]
    make_mask(proteins, probabilities)

    # Filter whole proteins at the maximum error cutoff as well.
    probability = error_to_probability(protein_probs, max_error)
    filter_proteins(proteins, probability)

    parse_proteins.determine_unique_peptides(proteins)
    parse_proteins.count_peptides(
        proteins, n_peptide_cutoff, is_skip_no_unique)

    if logger.root.level <= logging.DEBUG:
        dump = os.path.join(dump_dir, 'proteins.dump')
        logger.debug('Dumping protein data structure to ' + dump)
        parse.save_data_dict(proteins, dump)

    return proteins, source_names
def get_proteins_and_sources(
        protxml,
        pepxml,
        n_peptide_cutoff=1,
        is_skip_no_unique=True,
        errors=(0.01,)):
    """
    Load proteins from a protXML/pepXML file pair.

    Returns (proteins, source_names). The proteins dict has this basic
    structure, shown in YAML format:

      "sample_seqid":
        sequence: "AAAAAAAAAA"
        description: "sample protein"
        attr:
          param: value
        sources:
          - peptides
            - sequence: "AAA"
              i: 0
              j: 3
              attr:
                is_unique: True
                param: value

    :param protxml: path to the protXML file
    :param pepxml: path to the pepXML file
    :param n_peptide_cutoff: minimum peptide count per protein
    :param is_skip_no_unique: drop proteins with no unique peptide
    :param errors: iterable of error cutoffs; max(errors) is used for
        filtering and each value produces one mask level
    """
    # NOTE: default changed from the mutable list literal [0.01] to an
    # equivalent tuple, avoiding the shared-mutable-default pitfall.
    max_error = max(errors)

    protein_groups, protein_probs = read_protxml(protxml)
    proteins = make_proteins_from_protxml(protein_groups)

    dump_dir = os.path.dirname(protxml)
    # Debug dumps are only written when the root logger is at DEBUG.
    if logger.root.level <= logging.DEBUG:
        dump = os.path.join(dump_dir, 'protxml.dump')
        logger.debug('Dumping protxml data structure to ' + dump)
        parse.save_data_dict(protein_groups, dump)
        dump = os.path.join(dump_dir, 'proterror.dump')
        logger.debug('Dumping protein error distribution to ' + dump)
        parse.save_data_dict(protein_probs, dump)

    scans_by_sources, peptide_probs = read_pepxml(pepxml)
    if logger.root.level <= logging.DEBUG:
        dump = os.path.join(dump_dir, 'pepxml.dump')
        logger.debug('Dumping pepxml data structure to ' + dump)
        parse.save_data_dict(scans_by_sources, dump)
        dump = os.path.join(dump_dir, 'peperror.dump')
        logger.debug('Dumping peptide error distribution to ' + dump)
        parse.save_data_dict(peptide_probs, dump)

    source_names = [scans['filename'] for scans in scans_by_sources]
    load_pepxml(proteins, scans_by_sources)

    # Filter peptides at the loosest (maximum) error cutoff, then build
    # one mask level per requested error value.
    probability = error_to_probability(peptide_probs, max_error)
    filter_peptides(proteins, probability)
    probabilities = [error_to_probability(peptide_probs, e) for e in errors]
    make_mask(proteins, probabilities)

    # Filter whole proteins at the maximum error cutoff as well.
    probability = error_to_probability(protein_probs, max_error)
    filter_proteins(proteins, probability)

    parse_proteins.determine_unique_peptides(proteins)
    parse_proteins.count_peptides(
        proteins, n_peptide_cutoff, is_skip_no_unique)

    if logger.root.level <= logging.DEBUG:
        dump = os.path.join(dump_dir, 'proteins.dump')
        logger.debug('Dumping protein data structure to ' + dump)
        parse.save_data_dict(proteins, dump)

    return proteins, source_names
def get_proteins_and_sources(
        in_dir,
        is_leu_ile_isomeric=False,
        ):
    """
    Assemble the proteins data structure from MaxQuant tables.

    Reads the peptide, scan, protein-group and evidence tables from
    in_dir via read(), indexes them by id, writes debug dumps, and
    matches every scan to its protein group(s).

    :param in_dir: directory holding the MaxQuant tables
    :param is_leu_ile_isomeric: accepted for API compatibility; not
        used by this routine
    :return: (proteins, sources) where sources is the sorted list of
        raw-file names and proteins maps each group's lead seqid to
        its entry
    """
    peptide_rows, scan_rows, group_rows, evidence_rows = \
        read(in_dir)

    def index_by_id(rows):
        # Key each table row by its integer 'id' column.
        return {int(row['id']): row for row in rows}

    peptides = index_by_id(peptide_rows)
    scans = index_by_id(scan_rows)
    protein_groups = index_by_id(group_rows)
    evidence_dict = index_by_id(evidence_rows)

    parse.save_data_dict(peptides, in_dir + '/peptides.dump')
    parse.save_data_dict(scans, in_dir + '/scans.dump')
    parse.save_data_dict(protein_groups, in_dir + '/protein_groups.dump')
    parse.save_data_dict(evidence_dict, in_dir + '/evidence.dump')

    # Sources are the distinct raw files, sorted for a stable order.
    raw_files = set(entry['raw file'] for entry in evidence_dict.values())
    sources = [str(name) for name in sorted(raw_files)]
    i_sources = {name: pos for pos, name in enumerate(sources)}

    proteins = {}
    protein_by_group_id = {}
    for group_id, group in protein_groups.items():
        entry = {
            'description': '',
            'attr': {
                'group_id': group_id,
                'other_seqids': [],
            },
            # One empty peptide bucket per source.
            'sources': [{'peptides': []} for _ in i_sources],
        }
        transfer_attrs(group, entry['attr'], protein_parse_list)
        seqids = parse.splitter(group['protein ids'])
        # The first seqid names the protein; the rest are aliases.
        proteins[seqids[0]] = entry
        entry['attr']['seqid'] = seqids[0]
        entry['attr']['other_seqids'] = seqids[1:]
        protein_by_group_id[group_id] = entry

    print("Matching sequences and scan in proteins")
    n_scan = len(scans)
    for i_scan, (scan_id, scan) in enumerate(scans.items(), start=1):
        if i_scan % 5000 == 0:
            print("{}/{} scans processed".format(i_scan, n_scan))
        evidence = evidence_dict[int(scan['evidence id'])]
        peptide = peptides[int(scan['peptide id'])]
        for group_id in parse.splitter(str(scan['protein group ids'])):
            new_peptide = {
                'sequence': scan['sequence'],
                'spectrum': get_labeled_spectrum(scan),
                'attr': {
                    'modifications': [],
                    'mq_scan_id': scan_id,
                    'is_unique': peptide['unique (groups)'] == 'yes',
                }
            }
            transfer_attrs(scan, new_peptide['attr'], scan_parse_list)
            transfer_attrs(evidence, new_peptide['attr'], evidence_parse_list)
            transfer_attrs(peptide, new_peptide['attr'], peptide_parse_list)
            change_key(new_peptide['attr'], 'scan number', 'scan_id')
            change_key(new_peptide['attr'], 'retention time', 'retention_time')
            target = protein_by_group_id[int(group_id)]
            i_source = i_sources[evidence['raw file']]
            target['sources'][i_source]['peptides'].append(new_peptide)

    parse_proteins.count_peptides(proteins)

    return proteins, sources
def get_proteins_and_sources(in_dir, is_leu_ile_isomeric=False,):
    """
    Build the proteins structure from the MaxQuant tables in in_dir.

    Indexes the four tables returned by read() by integer id, saves
    debug dumps, then attaches each scan's peptide record to every
    protein group it maps to.

    :param in_dir: directory with the MaxQuant tables
    :param is_leu_ile_isomeric: kept for API compatibility; unused here
    :return: (proteins, sources)
    """
    tables = read(in_dir)
    peptide_list, scan_list, protein_group_list, evidence_list = tables

    peptides = dict((int(p['id']), p) for p in peptide_list)
    scans = dict((int(s['id']), s) for s in scan_list)
    protein_groups = dict((int(g['id']), g) for g in protein_group_list)
    evidence_dict = dict((int(e['id']), e) for e in evidence_list)

    # Write debug dumps of every indexed table.
    for dump_name, table in (('peptides', peptides),
                             ('scans', scans),
                             ('protein_groups', protein_groups),
                             ('evidence', evidence_dict)):
        parse.save_data_dict(table, in_dir + '/' + dump_name + '.dump')

    # Distinct raw files become the source list, in sorted order.
    sources = [str(s) for s in
               sorted(set(e['raw file'] for e in evidence_dict.values()))]
    i_sources = dict(zip(sources, range(len(sources))))

    proteins = {}
    protein_by_group_id = {}
    for group_id in protein_groups:
        protein_group = protein_groups[group_id]
        attr = {'group_id': group_id, 'other_seqids': []}
        protein = {
            'description': '',
            'attr': attr,
            # One empty peptide bucket per source file.
            'sources': [dict(peptides=[]) for _ in sources],
        }
        transfer_attrs(protein_group, attr, protein_parse_list)
        seqids = parse.splitter(protein_group['protein ids'])
        # Lead seqid keys the protein; the rest become aliases.
        proteins[seqids[0]] = protein
        attr['seqid'] = seqids[0]
        attr['other_seqids'] = seqids[1:]
        protein_by_group_id[group_id] = protein

    print("Matching sequences and scan in proteins")
    n_scan = len(scans)
    i_scan = 0
    for scan_id in scans:
        scan = scans[scan_id]
        i_scan += 1
        if not i_scan % 5000:
            print("{}/{} scans processed".format(i_scan, n_scan))
        evidence = evidence_dict[int(scan['evidence id'])]
        peptide = peptides[int(scan['peptide id'])]
        for group_id in parse.splitter(str(scan['protein group ids'])):
            attr = {
                'modifications': [],
                'mq_scan_id': scan_id,
                'is_unique': peptide['unique (groups)'] == 'yes',
            }
            new_peptide = {
                'sequence': scan['sequence'],
                'spectrum': get_labeled_spectrum(scan),
                'attr': attr,
            }
            # Pull the configured columns from each table row.
            for row, parse_list in ((scan, scan_parse_list),
                                    (evidence, evidence_parse_list),
                                    (peptide, peptide_parse_list)):
                transfer_attrs(row, attr, parse_list)
            change_key(attr, 'scan number', 'scan_id')
            change_key(attr, 'retention time', 'retention_time')
            protein = protein_by_group_id[int(group_id)]
            i_source = i_sources[evidence['raw file']]
            protein['sources'][i_source]['peptides'].append(new_peptide)

    parse_proteins.count_peptides(proteins)

    return proteins, sources
def get_proteins_and_sources(in_dir, is_leu_ile_isomeric=False):
    """
    Stream the MaxQuant .txt tables from in_dir and build the proteins
    data structure.

    Reads evidence.txt, proteinGroups.txt, peptides.txt and msms.txt,
    then matches every MS/MS scan to its protein group(s).

    :param in_dir: directory holding the MaxQuant .txt tables
    :param is_leu_ile_isomeric: accepted for API compatibility; not
        used by this routine
    :return: (proteins, sources) where sources is the sorted list of
        raw-file names and proteins maps each group's lead seqid to
        its entry
    """
    evidence_fname = os.path.join(in_dir, "evidence.txt")
    logger.info("Loading evidence file: " + evidence_fname)
    evidence_iter = parse.read_tsv(evidence_fname)
    evidence_dict = {int(e["id"]): e for e in evidence_iter}

    # Distinct raw files become the source list, in sorted order.
    sources_set = set(e["raw file"] for e in evidence_dict.values())
    sources = [str(s) for s in sorted(sources_set)]
    i_sources = {source: k for k, source in enumerate(sources)}

    protein_group_fname = os.path.join(in_dir, "proteinGroups.txt")
    logger.info("Loading protein groups: " + protein_group_fname)
    proteins = {}
    protein_by_group_id = {}
    for protein_group in parse.read_tsv(protein_group_fname):
        # BUGFIX: read_tsv yields string fields (every other id in this
        # function is int()-converted); convert once here so the
        # int(group_id) lookup in the scan loop below finds the entry —
        # a string key would raise KeyError for every scan.
        group_id = int(protein_group["id"])
        protein = {
            "description": "",
            "attr": {"group_id": group_id, "other_seqids": []},
            # One empty peptide bucket per source file.
            "sources": [{"peptides": []} for k in range(len(i_sources))],
        }
        transfer_attrs(protein_group, protein["attr"], protein_parse_list)
        seqids = parse.splitter(protein_group["protein ids"])
        # Lead seqid keys the protein; the rest become aliases.
        proteins[seqids[0]] = protein
        protein["attr"]["seqid"] = seqids[0]
        protein["attr"]["other_seqids"] = seqids[1:]
        protein_by_group_id[group_id] = protein

    peptides_fname = os.path.join(in_dir, "peptides.txt")
    logger.info("Loading peptides file: " + peptides_fname)
    peptides_iter = parse.read_tsv(peptides_fname)
    peptides = {int(p["id"]): p for p in peptides_iter}

    scans_fname = os.path.join(in_dir, "msms.txt")
    logger.info("Loading scans and matching: " + scans_fname)
    i_scan = 0
    for scan in parse.read_tsv(scans_fname):
        scan_id = int(scan["id"])
        i_scan += 1
        if i_scan % 5000 == 0:
            logger.info("{} scans processed".format(i_scan))
        evidence_id = int(scan["evidence id"])
        evidence = evidence_dict[evidence_id]
        peptide_id = int(scan["peptide id"])
        peptide = peptides[peptide_id]
        for group_id in parse.splitter(str(scan["protein group ids"])):
            new_peptide = {
                "sequence": scan["sequence"],
                "spectrum": get_labeled_spectrum(scan),
                "attr": {
                    "modifications": [],
                    "mq_scan_id": scan_id,
                    "is_unique": peptide["unique (groups)"] == "yes",
                },
            }
            transfer_attrs(scan, new_peptide["attr"], scan_parse_list)
            transfer_attrs(evidence, new_peptide["attr"], evidence_parse_list)
            transfer_attrs(peptide, new_peptide["attr"], peptide_parse_list)
            change_key(new_peptide["attr"], "scan number", "scan_id")
            change_key(new_peptide["attr"], "retention time", "retention_time")
            protein = protein_by_group_id[int(group_id)]
            i_source = i_sources[evidence["raw file"]]
            protein["sources"][i_source]["peptides"].append(new_peptide)

    parse_proteins.count_peptides(proteins)

    return proteins, sources
def get_proteins_and_sources(
        in_dir,
        is_leu_ile_isomeric=False,
        ):
    """
    Stream the MaxQuant .txt tables from in_dir and build the proteins
    data structure, then drop proteins left with no peptides.

    Reads evidence.txt, proteinGroups.txt, peptides.txt and msms.txt,
    then matches every MS/MS scan to its protein group(s).

    :param in_dir: directory holding the MaxQuant .txt tables
    :param is_leu_ile_isomeric: accepted for API compatibility; not
        used by this routine
    :return: (proteins, sources) where sources is the sorted list of
        raw-file names and proteins maps each group's lead seqid to
        its entry
    """
    evidence_fname = os.path.join(in_dir, 'evidence.txt')
    logger.info('Loading evidence file: ' + evidence_fname)
    evidence_iter = parse.read_tsv(evidence_fname)
    evidence_dict = {int(e['id']): e for e in evidence_iter}

    # Distinct raw files become the source list, in sorted order.
    sources_set = set(e['raw file'] for e in evidence_dict.values())
    sources = [str(s) for s in sorted(sources_set)]
    i_sources = {source: k for k, source in enumerate(sources)}

    protein_group_fname = os.path.join(in_dir, 'proteinGroups.txt')
    logger.info('Loading protein groups: ' + protein_group_fname)
    proteins = {}
    protein_by_group_id = {}
    for protein_group in parse.read_tsv(protein_group_fname):
        # BUGFIX: read_tsv yields string fields (every other id in this
        # function is int()-converted); convert once here so the
        # int(group_id) lookup in the scan loop below finds the entry —
        # a string key would raise KeyError for every scan.
        group_id = int(protein_group['id'])
        protein = {
            'description': '',
            'attr': {
                'group_id': group_id,
                'other_seqids': [],
            },
            # One empty peptide bucket per source file.
            'sources': [{
                'peptides': []
            } for k in range(len(i_sources))],
        }
        transfer_attrs(protein_group, protein['attr'], protein_parse_list)
        seqids = parse.splitter(protein_group['protein ids'])
        # Lead seqid keys the protein; the rest become aliases.
        proteins[seqids[0]] = protein
        protein['attr']['seqid'] = seqids[0]
        protein['attr']['other_seqids'] = seqids[1:]
        protein_by_group_id[group_id] = protein

    peptides_fname = os.path.join(in_dir, 'peptides.txt')
    logger.info('Loading peptides file: ' + peptides_fname)
    peptides_iter = parse.read_tsv(peptides_fname)
    peptides = {int(p['id']): p for p in peptides_iter}

    scans_fname = os.path.join(in_dir, 'msms.txt')
    logger.info('Loading scans and matching: ' + scans_fname)
    i_scan = 0
    for scan in parse.read_tsv(scans_fname):
        scan_id = int(scan['id'])
        i_scan += 1
        if i_scan % 5000 == 0:
            logger.info("{} scans processed".format(i_scan))
        evidence_id = int(scan['evidence id'])
        evidence = evidence_dict[evidence_id]
        peptide_id = int(scan['peptide id'])
        peptide = peptides[peptide_id]
        for group_id in parse.splitter(str(scan['protein group ids'])):
            new_peptide = {
                'sequence': scan['sequence'],
                'spectrum': get_labeled_spectrum(scan),
                'attr': {
                    'modifications': [],
                    'mq_scan_id': scan_id,
                    'is_unique': peptide['unique (groups)'] == 'yes',
                }
            }
            transfer_attrs(scan, new_peptide['attr'], scan_parse_list)
            transfer_attrs(evidence, new_peptide['attr'], evidence_parse_list)
            transfer_attrs(peptide, new_peptide['attr'], peptide_parse_list)
            change_key(new_peptide['attr'], 'scan number', 'scan_id')
            change_key(new_peptide['attr'], 'retention time', 'retention_time')
            protein = protein_by_group_id[int(group_id)]
            i_source = i_sources[evidence['raw file']]
            protein['sources'][i_source]['peptides'].append(new_peptide)

    parse_proteins.count_peptides(proteins)
    # This variant additionally prunes proteins that end up empty.
    parse_proteins.delete_empty_proteins(proteins)

    return proteins, sources