def read_modification_dict(modifications_tsv):
    """Map each modification description to the mass of its modified residue.

    Rows are read with ``parse.read_tsv``. Every row must provide
    ``'description'`` and ``'monoisotopic mass shift (da)'``; the residue is
    taken from the ``'residue'`` column or, failing that, from
    ``'amino acid residue'``.

    Rows whose residue is ``'n/a'``, or that carry neither residue column,
    are skipped.

    Args:
        modifications_tsv: file name handed to ``parse.read_tsv``.

    Returns:
        dict of description -> residue mass (looked up in
        ``peptidemass.aa_monoisotopic_mass``) plus the row's mass shift.

    Raises:
        KeyError: if a required column is missing, or a residue is not in
            ``peptidemass.aa_monoisotopic_mass``.
    """
    result = {}
    for entry in parse.read_tsv(modifications_tsv):
        key = entry['description']
        if 'residue' in entry:
            aa = entry['residue']
        elif 'amino acid residue' in entry:
            aa = entry['amino acid residue']
        else:
            # Without this default ``aa`` would be unbound on the first row
            # (NameError) or silently keep the previous row's residue.
            aa = 'n/a'
        delta_mass = entry['monoisotopic mass shift (da)']
        if aa == 'n/a':
            continue
        result[key] = peptidemass.aa_monoisotopic_mass[aa] + float(delta_mass)
    return result
def read_modification_dict(modifications_tsv):
    """Build a description -> mass lookup from a modifications table.

    Each row read by ``parse.read_tsv`` contributes one entry keyed by its
    ``'description'``. The value starts as the row's
    ``'monoisotopic mass shift (da)'`` converted to ``float``. When the row
    names a residue other than ``'n/a'`` (via ``'residue'`` or, as a
    fallback, ``'amino acid residue'``), that residue's entry in
    ``peptidemass.aa_monoisotopic_mass`` is added on top.

    Args:
        modifications_tsv: file name handed to ``parse.read_tsv``.

    Returns:
        dict mapping description to mass.
    """
    masses = {}
    for row in parse.read_tsv(modifications_tsv):
        description = row['description']
        shift = float(row['monoisotopic mass shift (da)'])

        # First residue column present wins; no column means "no residue".
        residue = 'n/a'
        for column in ('residue', 'amino acid residue'):
            if column in row:
                residue = row[column]
                break

        if residue == 'n/a':
            masses[description] = shift
        else:
            masses[description] = (
                shift + peptidemass.aa_monoisotopic_mass[residue])
    return masses
def get_proteins(protein_groups_fname, psm_fname, modifications_fname=None):
    """Build a protein dictionary from protein-group and PSM tables.

    Both tables are read with ``parse.read_tsv``. One protein entry is made
    per protein group, keyed by the first seqid found in the group's
    ``'protein description'`` (descriptions are separated by ``' / '`` and
    the seqid is the first whitespace-delimited token of each). Every PSM is
    then attached to the first protein whose seqid, primary or alternative,
    appears in the PSM's own ``'protein description'``.

    When the root log level is DEBUG or lower, the raw tables and the final
    ``proteins`` structure are dumped next to ``protein_groups_fname`` with
    ``parse.save_data_dict``.

    Args:
        protein_groups_fname: protein-groups table.
        psm_fname: peptide-spectrum-match table.
        modifications_fname: optional modifications table, loaded with
            ``read_modification_dict`` and handed to ``parse_peptide``.

    Returns:
        dict of seqid -> protein dict with keys ``'description'``,
        ``'sequence'``, ``'attr'`` and ``'sources'``; matched peptides are
        appended to ``protein['sources'][0]['peptides']``.

    Raises:
        ValueError: if a PSM's ``'base peptide sequence'`` does not occur in
            the matched protein's sequence (raised by ``.index``).
    """
    dump_dir = os.path.dirname(protein_groups_fname)

    if modifications_fname is not None:
        modification_table = read_modification_dict(modifications_fname)
    else:
        modification_table = {}

    peptides = parse.read_tsv(psm_fname)
    protein_groups = parse.read_tsv(protein_groups_fname)

    if logger.root.level <= logging.DEBUG:
        dump = os.path.join(dump_dir, 'peptides.dump')
        logger.debug('Dumping peptides data structure to ' + dump)
        parse.save_data_dict(peptides, dump)
        dump = os.path.join(dump_dir, 'protein_groups.dump')
        logger.debug('Dumping protein_groups data structure to ' + dump)
        parse.save_data_dict(protein_groups, dump)

    proteins = {}
    for i_group, protein_group in enumerate(protein_groups):
        descriptions = protein_group['protein description'].split(' / ')
        seqids = [desc.split()[0] for desc in descriptions]
        for seqid in seqids:
            if seqid in proteins:
                # The message needs a placeholder: logging applies
                # ``msg % args`` when the record is emitted and fails on a
                # stray positional argument.
                logger.warning(
                    "Different protein groups claim same first seqid %s",
                    seqid)
        protein = {
            'description': descriptions[0],
            'sequence': protein_group['protein sequence'],
            'attr': {
                'coverage': protein_group['protein sequence coverage (%)'],
                'morpheus-score': parse.round_decimal(
                    protein_group['summed morpheus score'], 4),
                'i_group': i_group,
                'other_seqids': seqids[1:],
                'seqid': seqids[0],
            },
            'sources': [{'peptides': []}],
        }
        proteins[seqids[0]] = protein

    # Index every protein under its primary and its alternative seqids.
    protein_by_seqid = {}
    for seqid, protein in proteins.items():
        protein_by_seqid[seqid] = protein
        for alt_seqid in protein['attr']['other_seqids']:
            protein_by_seqid[alt_seqid] = protein

    unmatched_peptides = []
    n_peptide_matched = 0
    for src_peptide in peptides:
        descriptions = src_peptide['protein description'].split(' / ')
        peptide_seqids = [d.split()[0] for d in descriptions]

        protein = None
        for peptide_seqid in peptide_seqids:
            if peptide_seqid in protein_by_seqid:
                protein = protein_by_seqid[peptide_seqid]
                break
        if protein is None:
            unmatched_peptides.append(src_peptide)
            continue
        n_peptide_matched += 1

        sequence = protein['sequence']
        # Only the modifications are used from parse_peptide; the stored
        # sequence is the table's unmodified 'base peptide sequence'.
        _, modifications = parse_peptide(
            src_peptide['peptide sequence'], modification_table)
        peptide_sequence = src_peptide['base peptide sequence']
        i = sequence.index(peptide_sequence)

        peptide = {
            'sequence': peptide_sequence,
            'attr': {
                'scan_id': src_peptide['scan number'],
                'retention_time': parse.round_decimal(
                    src_peptide['retention time (min)'], 4),
                'morpheus_score': parse.round_decimal(
                    src_peptide['morpheus score'], 4),
                'mass': parse.round_decimal(
                    src_peptide['precursor mass (da)'], 4),
                'mass_diff': parse.round_decimal(
                    src_peptide['precursor mass error (da)'], 4),
                'm/z': parse.round_decimal(src_peptide['precursor m/z'], 4),
                'source': parse.basename(src_peptide['filename']),
            },
            'intensity': src_peptide['morpheus score'] / len(peptide_sequence),
            'i': i,
        }
        if modifications:
            for modification in modifications:
                modification['mass'] = parse.round_decimal(
                    modification['mass'], 4)
            peptide['attr']['modifications'] = modifications

        protein['sources'][0]['peptides'].append(peptide)

    dump = os.path.join(dump_dir, 'proteins.dump')
    logger.debug('Dumping proteins data structure to ' + dump)
    if logger.root.level <= logging.DEBUG:
        parse.save_data_dict(proteins, dump)

    # Report matched PSMs out of *all* PSMs seen, not out of the unmatched
    # ones.
    n_peptide_total = n_peptide_matched + len(unmatched_peptides)
    logger.info("Assigned {}/{} of PSMs.tsv to protein_groups.tsv".format(
        n_peptide_matched, n_peptide_total))

    return proteins
def get_proteins_and_sources(in_dir, great_expect=1E-8, cutoff_expect=1E-2):
    """Assemble proteins and their per-source scan matches from ``in_dir``.

    Reads ``evidence.txt``, ``proteinGroups.txt``, ``peptides.txt`` and
    ``msms.txt`` from ``in_dir`` with ``parse.read_tsv``. One protein entry
    is created per protein group. Every scan whose ``'pep'`` value does not
    exceed ``cutoff_expect`` is appended as a match to each protein group
    listed in its ``'protein group ids'``, under the source slot of the
    scan's evidence ``'raw file'``.

    Args:
        in_dir: directory holding the four tab-separated files.
        great_expect: forwarded, together with ``cutoff_expect``, to
            ``parse_proteins.calc_minus_log_intensity`` to derive a match's
            ``'intensity'``.
        cutoff_expect: scans with ``'pep'`` greater than this are ignored.

    Returns:
        ``(proteins, sources)``. ``proteins`` maps the first id of each
        group's ``'protein ids'`` to its protein dict, after
        ``parse_proteins.count_matches`` and
        ``parse_proteins.delete_empty_proteins`` have run. ``sources`` is
        the sorted list of distinct ``'raw file'`` values, as strings.
    """
    evidence_fname = os.path.join(in_dir, 'evidence.txt')
    logger.info('Loading evidence file: ' + evidence_fname)
    evidence_dict = {int(e['id']): e for e in parse.read_tsv(evidence_fname)}

    sources_set = set(e['raw file'] for e in evidence_dict.values())
    sources = [str(s) for s in sorted(sources_set)]
    i_sources = {source: k for k, source in enumerate(sources)}

    protein_group_fname = os.path.join(in_dir, 'proteinGroups.txt')
    logger.info('Loading protein groups: ' + protein_group_fname)
    proteins = {}
    protein_by_group_id = {}
    for protein_group in parse.read_tsv(protein_group_fname):
        group_id = protein_group['id']
        protein = {
            'description': '',
            'attr': {
                'group_id': group_id,
                'other_seqids': [],
            },
            'sources': [{'matches': []} for _ in i_sources],
        }
        transfer_attrs(protein_group, protein['attr'], protein_parse_list)
        seqids = parse.splitter(protein_group['protein ids'])
        proteins[seqids[0]] = protein
        protein['attr']['seqid'] = seqids[0]
        protein['attr']['other_seqids'] = seqids[1:]
        # Keyed with int() because the lookup below uses int(group_id).
        protein_by_group_id[int(group_id)] = protein

    peptides_fname = os.path.join(in_dir, 'peptides.txt')
    logger.info('Loading peptides file: ' + peptides_fname)
    peptides = {int(p['id']): p for p in parse.read_tsv(peptides_fname)}

    scans_fname = os.path.join(in_dir, 'msms.txt')
    logger.info('Loading scans and matching: ' + scans_fname)
    for i_scan, scan in enumerate(parse.read_tsv(scans_fname), start=1):
        scan_id = int(scan['id'])
        if i_scan % 5000 == 0:
            logger.info("{} scans processed".format(i_scan))

        evidence_id = int(scan['evidence id'])
        evidence = evidence_dict[evidence_id]
        mod_seq = evidence['modified sequence']
        peptide = peptides[int(scan['peptide id'])]

        # The cut-off depends only on the scan, so test it once before the
        # per-group loop instead of after building a spectrum per group.
        if scan['pep'] > cutoff_expect:
            continue

        # i_sources is keyed by str(raw file); look it up the same way.
        i_source = i_sources[str(evidence['raw file'])]

        for group_id in parse.splitter(str(scan['protein group ids'])):
            match = {
                'sequence': scan['sequence'],
                'spectrum': get_labeled_spectrum(scan),
                'modifications': get_modifications(scan),
                'attr': {
                    'modified_sequence': mod_seq,
                    'mq_scan_id': scan_id,
                    'evidence_id': evidence_id,
                    'is_unique': peptide['unique (groups)'] == 'yes',
                },
            }
            match['intensity'] = parse_proteins.calc_minus_log_intensity(
                scan['pep'], great_expect, cutoff_expect)
            transfer_attrs(scan, match['attr'], scan_parse_list)
            transfer_attrs(evidence, match['attr'], evidence_parse_list)
            transfer_attrs(peptide, match['attr'], peptide_parse_list)
            change_key(match['attr'], 'scan number', 'scan_id')
            change_key(match['attr'], 'retention time', 'retention_time')

            protein = protein_by_group_id[int(group_id)]
            protein['sources'][i_source]['matches'].append(match)

    parse_proteins.count_matches(proteins)
    parse_proteins.delete_empty_proteins(proteins)

    return proteins, sources
def read(in_dir):
    """Load the four tab-separated tables found in ``in_dir``.

    Each file is read with ``parse.read_tsv``.

    Args:
        in_dir: directory containing ``peptides.txt``, ``msms.txt``,
            ``proteinGroups.txt`` and ``evidence.txt``.

    Returns:
        Tuple ``(peptides, scans, protein_groups, evidence)`` holding the
        ``parse.read_tsv`` result for each file, in that order.
    """
    basenames = (
        'peptides.txt',
        'msms.txt',
        'proteinGroups.txt',
        'evidence.txt',
    )
    return tuple(
        parse.read_tsv(os.path.join(in_dir, basename))
        for basename in basenames)
def get_proteins_and_sources(
        in_dir, great_expect=1E-8, cutoff_expect=1E-2):
    """Collect proteins and their scan matches, grouped by raw-file source.

    The tables ``evidence.txt``, ``proteinGroups.txt``, ``peptides.txt`` and
    ``msms.txt`` are read from ``in_dir`` through ``parse.read_tsv``. Each
    protein group becomes one protein entry. Each scan with ``'pep'`` no
    greater than ``cutoff_expect`` becomes a match on every protein group
    named in the scan's ``'protein group ids'``, stored under the index of
    the evidence row's ``'raw file'``.

    Args:
        in_dir: directory holding the four tab-separated files.
        great_expect: passed with ``cutoff_expect`` to
            ``parse_proteins.calc_minus_log_intensity`` for the match
            ``'intensity'``.
        cutoff_expect: upper bound on ``scan['pep']``; scans above it are
            skipped.

    Returns:
        ``(proteins, sources)``. ``proteins`` is keyed by the first entry
        of each group's ``'protein ids'`` and has been post-processed by
        ``parse_proteins.count_matches`` and
        ``parse_proteins.delete_empty_proteins``. ``sources`` lists the
        distinct ``'raw file'`` values, sorted and converted to ``str``.
    """
    evidence_fname = os.path.join(in_dir, 'evidence.txt')
    logger.info('Loading evidence file: ' + evidence_fname)
    evidence_iter = parse.read_tsv(evidence_fname)
    evidence_dict = {int(e['id']): e for e in evidence_iter}

    sources_set = set(e['raw file'] for e in evidence_dict.values())
    sources = [str(s) for s in sorted(sources_set)]
    i_sources = {source: k for k, source in enumerate(sources)}

    protein_group_fname = os.path.join(in_dir, 'proteinGroups.txt')
    logger.info('Loading protein groups: ' + protein_group_fname)
    proteins = {}
    protein_by_group_id = {}
    for protein_group in parse.read_tsv(protein_group_fname):
        group_id = protein_group['id']
        protein = {
            'description': '',
            'attr': {
                'group_id': group_id,
                'other_seqids': [],
            },
            'sources': [{'matches': []} for _ in i_sources],
        }
        transfer_attrs(protein_group, protein['attr'], protein_parse_list)
        seqids = parse.splitter(protein_group['protein ids'])
        proteins[seqids[0]] = protein
        protein['attr']['seqid'] = seqids[0]
        protein['attr']['other_seqids'] = seqids[1:]
        # Scans look groups up with int(group_id), so store the key the
        # same way.
        protein_by_group_id[int(group_id)] = protein

    peptides_fname = os.path.join(in_dir, 'peptides.txt')
    logger.info('Loading peptides file: ' + peptides_fname)
    peptides_iter = parse.read_tsv(peptides_fname)
    peptides = {int(p['id']): p for p in peptides_iter}

    scans_fname = os.path.join(in_dir, 'msms.txt')
    logger.info('Loading scans and matching: ' + scans_fname)
    i_scan = 0
    for scan in parse.read_tsv(scans_fname):
        scan_id = int(scan['id'])
        i_scan += 1
        if i_scan % 5000 == 0:
            logger.info("{} scans processed".format(i_scan))

        evidence_id = int(scan['evidence id'])
        evidence = evidence_dict[evidence_id]
        mod_seq = evidence['modified sequence']
        peptide_id = int(scan['peptide id'])
        peptide = peptides[peptide_id]

        # Reject the scan before any per-group work: the test does not
        # depend on the group, and building spectra for a scan that is
        # then discarded is wasted effort.
        if scan['pep'] > cutoff_expect:
            continue

        # Source names were stringified when i_sources was built, so the
        # raw value must be stringified for the lookup too.
        i_source = i_sources[str(evidence['raw file'])]

        for group_id in parse.splitter(str(scan['protein group ids'])):
            match = {
                'sequence': scan['sequence'],
                'spectrum': get_labeled_spectrum(scan),
                'modifications': get_modifications(scan),
                'attr': {
                    'modified_sequence': mod_seq,
                    'mq_scan_id': scan_id,
                    'evidence_id': evidence_id,
                    'is_unique': peptide['unique (groups)'] == 'yes',
                },
            }
            match['intensity'] = parse_proteins.calc_minus_log_intensity(
                scan['pep'], great_expect, cutoff_expect)
            transfer_attrs(scan, match['attr'], scan_parse_list)
            transfer_attrs(evidence, match['attr'], evidence_parse_list)
            transfer_attrs(peptide, match['attr'], peptide_parse_list)
            change_key(match['attr'], 'scan number', 'scan_id')
            change_key(match['attr'], 'retention time', 'retention_time')

            protein = protein_by_group_id[int(group_id)]
            protein['sources'][i_source]['matches'].append(match)

    parse_proteins.count_matches(proteins)
    parse_proteins.delete_empty_proteins(proteins)

    return proteins, sources
def get_proteins_and_sources(
        protein_groups_fname, psm_fname, modifications_fname=None,
        q_good=0.0, q_cutoff=10):
    """Build proteins and assign PSM matches to them, grouped by source.

    Protein groups and PSMs are read with ``parse.read_tsv``. Proteins are
    made with ``make_protein`` and keyed by ``protein['attr']['seqid']``;
    matches are made with ``make_match``. A match is attached to the first
    protein that is named in the PSM's ``'protein description'`` (by primary
    or alternative seqid) and whose sequence contains the match sequence.
    PSMs with no such protein are logged at DEBUG level and dropped.

    Every raw row is passed to a ``parse.DictListWriter`` constructed with
    the debug flag, and the final ``proteins`` structure is saved with
    ``parse.save_data_dict`` when the root log level is DEBUG or lower. All
    dump files are placed in the directory of ``protein_groups_fname``.

    Args:
        protein_groups_fname: protein-groups table.
        psm_fname: peptide-spectrum-match table.
        modifications_fname: optional modifications table, loaded with
            ``read_modification_dict`` and handed to ``make_match``.
        q_good: forwarded to ``parse_proteins.calc_intensity``.
        q_cutoff: matches whose ``q_value`` exceeds this are discarded; also
            forwarded to ``parse_proteins.calc_intensity``.

    Returns:
        ``(proteins, sources)``. ``sources`` is the list handed to
        ``get_i_source`` for each assigned match.
    """
    is_debug = logger.root.level <= logging.DEBUG
    dump_dir = os.path.dirname(protein_groups_fname)

    modification_table = {}
    if modifications_fname:
        modification_table = read_modification_dict(modifications_fname)

    proteins = {}
    dict_dump_writer = parse.DictListWriter(
        is_debug, os.path.join(dump_dir, 'protein_groups.dump'))
    try:
        protein_groups = parse.read_tsv(protein_groups_fname)
        for i_group, protein_group in enumerate(protein_groups):
            protein = make_protein(i_group, protein_group)
            proteins[protein['attr']['seqid']] = protein
            dict_dump_writer.dump_dict(protein_group)
    finally:
        # Close the dump even if a row fails to parse.
        dict_dump_writer.close()

    # Index every protein under its primary and its alternative seqids.
    protein_by_seqid = {}
    for seqid, protein in proteins.items():
        protein_by_seqid[seqid] = protein
        for alt_seqid in protein['attr']['other_seqids']:
            protein_by_seqid[alt_seqid] = protein

    n_match = 0
    n_match_assigned = 0
    sources = []
    dict_dump_writer = parse.DictListWriter(
        is_debug, os.path.join(dump_dir, 'peptides.dump'))
    try:
        for psm in parse.read_tsv(psm_fname):
            dict_dump_writer.dump_dict(psm)

            match = make_match(psm, modification_table)
            q_value = match['attr']['q_value']
            # Check the cut-off first so intensity is only computed for
            # matches that are kept.
            if q_value > q_cutoff:
                continue
            match['intensity'] = parse_proteins.calc_intensity(
                q_value, q_good, q_cutoff)

            peptide_sequence = match['sequence']
            n_match += 1

            descriptions = psm['protein description'].split(' / ')
            peptide_seqids = [d.split()[0] for d in descriptions]

            protein = None
            for peptide_seqid in peptide_seqids:
                if peptide_seqid in protein_by_seqid:
                    test_protein = protein_by_seqid[peptide_seqid]
                    if peptide_sequence in test_protein['sequence']:
                        protein = test_protein
                        break
            if protein is None:
                logger.debug(
                    "Couldn't find protein for %s", peptide_sequence)
                continue

            match['i'] = protein['sequence'].find(peptide_sequence)
            n_match_assigned += 1

            i_source = get_i_source(proteins, sources, psm['filename'])
            protein['sources'][i_source]['matches'].append(match)
    finally:
        dict_dump_writer.close()

    if is_debug:
        dump = os.path.join(dump_dir, 'proteins.dump')
        logger.debug('Dumping proteins data structure to ' + dump)
        parse.save_data_dict(proteins, dump)

    logger.info("Assigned {}/{} of PSMs.tsv to protein_groups.tsv".format(
        n_match_assigned, n_match))

    return proteins, sources
def get_proteins_and_sources(in_dir, is_leu_ile_isomeric=False):
    """Assemble proteins and their per-source peptides from ``in_dir``.

    Reads ``evidence.txt``, ``proteinGroups.txt``, ``peptides.txt`` and
    ``msms.txt`` from ``in_dir`` with ``parse.read_tsv``. One protein entry
    is created per protein group. Every scan is appended as a peptide to
    each protein group listed in its ``"protein group ids"``, under the
    source slot of the scan's evidence ``"raw file"``.

    Args:
        in_dir: directory holding the four tab-separated files.
        is_leu_ile_isomeric: accepted but not used by this function.

    Returns:
        ``(proteins, sources)``. ``proteins`` maps the first id of each
        group's ``"protein ids"`` to its protein dict, after
        ``parse_proteins.count_peptides`` has run. ``sources`` is the
        sorted list of distinct ``"raw file"`` values, as strings.
    """
    evidence_fname = os.path.join(in_dir, "evidence.txt")
    logger.info("Loading evidence file: " + evidence_fname)
    evidence_dict = {int(e["id"]): e for e in parse.read_tsv(evidence_fname)}

    sources_set = set(e["raw file"] for e in evidence_dict.values())
    sources = [str(s) for s in sorted(sources_set)]
    i_sources = {source: k for k, source in enumerate(sources)}

    protein_group_fname = os.path.join(in_dir, "proteinGroups.txt")
    logger.info("Loading protein groups: " + protein_group_fname)
    proteins = {}
    protein_by_group_id = {}
    for protein_group in parse.read_tsv(protein_group_fname):
        group_id = protein_group["id"]
        protein = {
            "description": "",
            "attr": {"group_id": group_id, "other_seqids": []},
            "sources": [{"peptides": []} for _ in i_sources],
        }
        transfer_attrs(protein_group, protein["attr"], protein_parse_list)
        seqids = parse.splitter(protein_group["protein ids"])
        proteins[seqids[0]] = protein
        protein["attr"]["seqid"] = seqids[0]
        protein["attr"]["other_seqids"] = seqids[1:]
        # Keyed with int() because the lookup below uses int(group_id).
        protein_by_group_id[int(group_id)] = protein

    peptides_fname = os.path.join(in_dir, "peptides.txt")
    logger.info("Loading peptides file: " + peptides_fname)
    peptides = {int(p["id"]): p for p in parse.read_tsv(peptides_fname)}

    scans_fname = os.path.join(in_dir, "msms.txt")
    logger.info("Loading scans and matching: " + scans_fname)
    for i_scan, scan in enumerate(parse.read_tsv(scans_fname), start=1):
        scan_id = int(scan["id"])
        if i_scan % 5000 == 0:
            logger.info("{} scans processed".format(i_scan))

        evidence = evidence_dict[int(scan["evidence id"])]
        peptide = peptides[int(scan["peptide id"])]

        for group_id in parse.splitter(str(scan["protein group ids"])):
            new_peptide = {
                "sequence": scan["sequence"],
                "spectrum": get_labeled_spectrum(scan),
                "attr": {
                    "modifications": [],
                    "mq_scan_id": scan_id,
                    "is_unique": peptide["unique (groups)"] == "yes",
                },
            }
            transfer_attrs(scan, new_peptide["attr"], scan_parse_list)
            transfer_attrs(evidence, new_peptide["attr"], evidence_parse_list)
            transfer_attrs(peptide, new_peptide["attr"], peptide_parse_list)
            change_key(new_peptide["attr"], "scan number", "scan_id")
            change_key(new_peptide["attr"], "retention time", "retention_time")

            protein = protein_by_group_id[int(group_id)]
            # i_sources is keyed by str(raw file); look it up the same way.
            i_source = i_sources[str(evidence["raw file"])]
            protein["sources"][i_source]["peptides"].append(new_peptide)

    parse_proteins.count_peptides(proteins)

    return proteins, sources