def make_peptide(pepxml_match, pepxml_scan, source):
    """Build a peptide record from one pepXML search hit and its scan.

    Args:
        pepxml_match: mapping describing the search hit. The keys
            'peptide', 'modified_sequence', 'probability', 'fpe',
            'expect', 'modifications', 'num_missed_cleavages',
            'massdiff', 'num_matched_ions' and 'tot_num_ions' are
            required; 'ionscore', 'homologyscore' and 'identityscore'
            are copied only when present.
        pepxml_scan: mapping describing the scan. The keys 'index',
            'start_scan', 'assumed_charge' and 'precursor_neutral_mass'
            are required; 'retention_time_sec' is copied only when
            present.
        source: name of the originating file; only the result of
            ``parse.basename(source)`` is stored.

    Returns:
        A dict with the keys 'sequence', 'modified_sequence',
        'intensity', 'mask' and 'attr'.
    """
    record = {
        'sequence': pepxml_match['peptide'],
        'modified_sequence': pepxml_match['modified_sequence'],
        # The hit probability doubles as the display intensity.
        'intensity': pepxml_match['probability'],
        'mask': pepxml_match['fpe'],
        'attr': {
            'pepxml_id': pepxml_scan['index'],
            'scan_id': pepxml_scan['start_scan'],
            'charge': pepxml_scan['assumed_charge'],
            'expect': pepxml_match['expect'],
            'modifications': pepxml_match['modifications'],
            'probability': pepxml_match['probability'],
            'missed_cleavages': pepxml_match['num_missed_cleavages'],
            'mass': pepxml_scan['precursor_neutral_mass'],
            'mass_diff': pepxml_match['massdiff'],
            'source': parse.basename(source),
        },
    }
    attributes = record['attr']

    # (attribute name, key in the input mapping, input mapping) for every
    # field that is copied across only if the input actually carries it.
    optional_fields = (
        ('retention_time', 'retention_time_sec', pepxml_scan),
        ('score', 'ionscore', pepxml_match),
        ('homology', 'homologyscore', pepxml_match),
        ('identity', 'identityscore', pepxml_match),
    )
    for attr_name, input_key, container in optional_fields:
        if input_key in container:
            attributes[attr_name] = container[input_key]

    # Rendered as "<matched>/<total>".
    attributes['matched_ions'] = '/'.join([
        str(pepxml_match['num_matched_ions']),
        str(pepxml_match['tot_num_ions']),
    ])
    return record
def make_match(psm, modification_table):
    """Convert one PSM row into a match record.

    Args:
        psm: mapping for one PSM row. Reads 'peptide sequence',
            'base peptide sequence', 'q-value (%)', 'morpheus score',
            'precursor charge', 'precursor mass (da)',
            'precursor mass error (da)', 'precursor m/z', 'filename' and
            'missed cleavages'. The scan id comes from 'scan number' or,
            failing that, 'spectrum number'; the retention time from
            'retention time (min)' or 'retention time (minutes)'. Either
            falls back to '' when neither key is present.
        modification_table: passed through to ``parse_peptide``.

    Returns:
        A dict with the keys 'sequence', 'attr', 'modifications',
        'intensity' (always 1.0) and 'i' (always -1).
    """
    annotated_sequence = psm['peptide sequence']
    parsed_sequence, modifications = parse_peptide(
        annotated_sequence, modification_table)
    base_sequence = psm['base peptide sequence']
    if parsed_sequence != base_sequence:
        logger.debug(
            "Peptide sequences don't match: "
            + " ".join((annotated_sequence, parsed_sequence, base_sequence)))

    q_value = float(psm['q-value (%)'])

    # The scan identifier column is named differently across inputs.
    for scan_key in ('scan number', 'spectrum number'):
        if scan_key in psm:
            scan_id = psm[scan_key]
            break
    else:
        scan_id = ''

    # Likewise for the retention-time column.
    for time_key in ('retention time (min)', 'retention time (minutes)'):
        if time_key in psm:
            retention_time = parse.round_decimal(psm[time_key], 4)
            break
    else:
        retention_time = ''

    attributes = {
        'scan_id': scan_id,
        'retention_time': retention_time,
        'morpheus_score': parse.round_decimal(psm['morpheus score'], 4),
        'charge': int(psm['precursor charge']),
        'mass': parse.round_decimal(psm['precursor mass (da)'], 4),
        'mass_diff': parse.round_decimal(
            psm['precursor mass error (da)'], 4),
        'm/z': parse.round_decimal(psm['precursor m/z'], 4),
        'source': parse.basename(psm['filename']),
        'missed_cleavages': int(psm['missed cleavages']),
        'q_value': q_value,
    }
    match = {
        'sequence': base_sequence,
        'attr': attributes,
        'modifications': [],
        'intensity': 1.0,
        'i': -1,
    }

    if modifications:
        # Masses are rounded in place on the parsed modification dicts.
        for entry in modifications:
            entry['mass'] = parse.round_decimal(entry['mass'], 4)
        match['modifications'] = modifications
        # NOTE(review): modified_sequence is recorded only when the peptide
        # carries modifications -- confirm this nesting is intended.
        # Keeps the part between the first and second '.'.
        attributes['modified_sequence'] = psm['peptide sequence'].split('.')[1]
    return match
def make_peptide(pepxml_peptide, pepxml_scan, source):
    """Assemble a peptide record from a pepXML peptide hit and its scan.

    Args:
        pepxml_peptide: mapping providing 'peptide', 'expect',
            'modifications', 'num_matched_ions', 'tot_num_ions',
            'probability', 'num_missed_cleavages' and 'massdiff'.
        pepxml_scan: mapping providing 'index', 'start_scan',
            'retention_time_sec' and 'precursor_neutral_mass'.
        source: name of the originating file; only the result of
            ``parse.basename(source)`` is stored.

    Returns:
        A dict with the keys 'sequence', 'attr' and 'intensity'; the
        intensity is the hit probability.
    """
    return {
        'sequence': pepxml_peptide['peptide'],
        'attr': {
            'pepxml_id': pepxml_scan['index'],
            'scan_id': pepxml_scan['start_scan'],
            'expect': pepxml_peptide['expect'],
            'retention_time': pepxml_scan['retention_time_sec'],
            'modifications': pepxml_peptide['modifications'],
            'source': parse.basename(source),
            # Rendered as "<matched>/<total>".
            'matched_ions': '/'.join([
                str(pepxml_peptide['num_matched_ions']),
                str(pepxml_peptide['tot_num_ions']),
            ]),
            'probability': pepxml_peptide['probability'],
            'missed_cleavages': pepxml_peptide['num_missed_cleavages'],
            'mass': pepxml_scan['precursor_neutral_mass'],
            'mass_diff': pepxml_peptide['massdiff'],
        },
        'intensity': pepxml_peptide['probability'],
    }
def make_peptide(pepxml_peptide, pepxml_scan, source):
    """Turn a pepXML peptide hit plus its scan into a peptide record.

    Args:
        pepxml_peptide: mapping providing 'peptide', 'expect',
            'modifications', 'num_matched_ions', 'tot_num_ions',
            'probability', 'num_missed_cleavages' and 'massdiff'.
        pepxml_scan: mapping providing 'index', 'start_scan',
            'retention_time_sec' and 'precursor_neutral_mass'.
        source: name of the originating file; only the result of
            ``parse.basename(source)`` is stored.

    Returns:
        A dict with the keys 'sequence', 'attr' and 'intensity'; the
        intensity is the hit probability.
    """
    sequence = pepxml_peptide['peptide']

    # Fields available directly from the scan and the hit.
    attributes = {
        'pepxml_id': pepxml_scan['index'],
        'scan_id': pepxml_scan['start_scan'],
        'expect': pepxml_peptide['expect'],
        'retention_time': pepxml_scan['retention_time_sec'],
        'modifications': pepxml_peptide['modifications'],
        'source': parse.basename(source),
    }

    # Ion counts are stored as a single "<matched>/<total>" string.
    matched = str(pepxml_peptide['num_matched_ions'])
    total = str(pepxml_peptide['tot_num_ions'])
    attributes['matched_ions'] = matched + '/' + total

    attributes.update({
        'probability': pepxml_peptide['probability'],
        'missed_cleavages': pepxml_peptide['num_missed_cleavages'],
        'mass': pepxml_scan['precursor_neutral_mass'],
        'mass_diff': pepxml_peptide['massdiff'],
    })

    return {
        'sequence': sequence,
        'attr': attributes,
        'intensity': pepxml_peptide['probability'],
    }
def get_proteins(protein_groups_fname, psm_fname, modifications_fname=None):
    """Load protein groups and PSMs from TSV files into protein records.

    Each protein group becomes one record keyed by the first seqid found
    in its 'protein description'. Every PSM is attached to the first
    protein group that claims one of the PSM's seqids, provided the base
    peptide sequence occurs in that protein's sequence.

    When the root logger level is DEBUG or lower, the raw tables and the
    resulting structure are dumped next to ``protein_groups_fname`` as
    'peptides.dump', 'protein_groups.dump' and 'proteins.dump'.

    Args:
        protein_groups_fname: path of the protein groups TSV.
        psm_fname: path of the PSM TSV.
        modifications_fname: optional path handed to
            ``read_modification_dict``; when None an empty modification
            table is used.

    Returns:
        dict mapping the first seqid of each protein group to its record
        ('description', 'sequence', 'attr', 'sources').
    """
    dump_dir = os.path.dirname(protein_groups_fname)
    if modifications_fname is not None:
        modification_table = read_modification_dict(modifications_fname)
    else:
        modification_table = {}

    peptides = parse.read_tsv(psm_fname)
    protein_groups = parse.read_tsv(protein_groups_fname)

    is_debug = logger.root.level <= logging.DEBUG
    if is_debug:
        dump = os.path.join(dump_dir, 'peptides.dump')
        logger.debug('Dumping peptides data structure to ' + dump)
        parse.save_data_dict(peptides, dump)
        dump = os.path.join(dump_dir, 'protein_groups.dump')
        logger.debug('Dumping protein_groups data structure to ' + dump)
        parse.save_data_dict(protein_groups, dump)

    proteins = {}
    for i_group, protein_group in enumerate(protein_groups):
        descriptions = protein_group['protein description'].split(' / ')
        seqids = [desc.split()[0] for desc in descriptions]
        for seqid in seqids:
            if seqid in proteins:
                # The seqid is passed as a %-argument, so the message needs
                # a placeholder; without one logging reports a formatting
                # error and the seqid never reaches the log.
                logger.warning(
                    "Different protein groups claim same first seqid %s",
                    seqid)
        proteins[seqids[0]] = {
            'description': descriptions[0],
            'sequence': protein_group['protein sequence'],
            'attr': {
                'coverage': protein_group['protein sequence coverage (%)'],
                'morpheus-score': parse.round_decimal(
                    protein_group['summed morpheus score'], 4),
                'i_group': i_group,
                'other_seqids': seqids[1:],
                'seqid': seqids[0],
            },
            'sources': [{'peptides': []}],
        }

    # Index every protein under its primary and its alternative seqids.
    protein_by_seqid = {}
    for seqid, protein in proteins.items():
        protein_by_seqid[seqid] = protein
        for alt_seqid in protein['attr']['other_seqids']:
            protein_by_seqid[alt_seqid] = protein

    n_peptide = 0
    n_peptide_matched = 0
    for src_peptide in peptides:
        n_peptide += 1
        descriptions = src_peptide['protein description'].split(' / ')
        peptide_seqids = [d.split()[0] for d in descriptions]
        protein = None
        for peptide_seqid in peptide_seqids:
            if peptide_seqid in protein_by_seqid:
                protein = protein_by_seqid[peptide_seqid]
                break
        if protein is None:
            continue

        # Only the modifications are used from the parsed annotation; the
        # sequence itself is taken from the 'base peptide sequence' column.
        _, modifications = parse_peptide(
            src_peptide['peptide sequence'], modification_table)
        peptide_sequence = src_peptide['base peptide sequence']

        # A peptide missing from the protein sequence is skipped with a
        # warning rather than aborting the whole parse with ValueError.
        i = protein['sequence'].find(peptide_sequence)
        if i < 0:
            logger.warning(
                "%s not found in %s",
                peptide_sequence, protein['attr']['seqid'])
            continue
        n_peptide_matched += 1

        peptide = {
            'sequence': peptide_sequence,
            'attr': {
                'scan_id': src_peptide['scan number'],
                'retention_time': parse.round_decimal(
                    src_peptide['retention time (min)'], 4),
                'morpheus_score': parse.round_decimal(
                    src_peptide['morpheus score'], 4),
                'mass': parse.round_decimal(
                    src_peptide['precursor mass (da)'], 4),
                'mass_diff': parse.round_decimal(
                    src_peptide['precursor mass error (da)'], 4),
                'm/z': parse.round_decimal(src_peptide['precursor m/z'], 4),
                'source': parse.basename(src_peptide['filename']),
            },
            # Score normalised by peptide length.
            'intensity':
                src_peptide['morpheus score'] / len(peptide_sequence),
            'i': i,
        }
        if modifications:
            for modification in modifications:
                modification['mass'] = parse.round_decimal(
                    modification['mass'], 4)
            peptide['attr']['modifications'] = modifications
        protein['sources'][0]['peptides'].append(peptide)

    if is_debug:
        dump = os.path.join(dump_dir, 'proteins.dump')
        logger.debug('Dumping proteins data structure to ' + dump)
        parse.save_data_dict(proteins, dump)

    # Report attached PSMs out of all PSMs read (not out of the unmatched).
    logger.info("Assigned {}/{} of PSMs.tsv to protein_groups.tsv".format(
        n_peptide_matched, n_peptide))
    return proteins
def get_proteins(protein_groups_fname, psm_fname, modifications_fname=None):
    """Stream protein groups and PSMs from TSV files into protein records.

    Each protein group becomes one record keyed by the first seqid found
    in its 'protein description'. Every PSM is attached to the first
    protein group that claims one of the PSM's seqids, provided the base
    peptide sequence occurs in that protein's (first) sequence.

    Rows are passed to ``DictListWriter`` instances targeting
    'protein_groups.dump' and 'peptides.dump' next to
    ``protein_groups_fname``; the writers receive the debug flag. When
    the root logger level is DEBUG or lower the final structure is also
    saved as 'proteins.dump'.

    Args:
        protein_groups_fname: path of the protein groups TSV.
        psm_fname: path of the PSM TSV.
        modifications_fname: optional path handed to
            ``read_modification_dict``; when None an empty modification
            table is used.

    Returns:
        dict mapping the first seqid of each protein group to its record
        ('description', 'sequence', 'other_sequences', 'attr', 'sources').
    """
    is_debug = logger.root.level <= logging.DEBUG
    dump_dir = os.path.dirname(protein_groups_fname)

    if modifications_fname is not None:
        modification_table = read_modification_dict(modifications_fname)
    else:
        modification_table = {}

    proteins = {}
    dict_dump_writer = DictListWriter(
        is_debug, os.path.join(dump_dir, 'protein_groups.dump'))
    try:
        for i_group, protein_group in enumerate(
                read_tsv_iter(protein_groups_fname)):
            descriptions = protein_group['protein description'].split(' / ')

            # The coverage cell may hold several values separated by ';'
            # or '/'; only the first one is kept.
            coverage_str = str(protein_group['protein sequence coverage (%)'])
            separator = ';' if ';' in coverage_str else '/'
            coverage = float(get_first(coverage_str, separator))

            seqs = protein_group['protein sequence'].split('/')
            seqids = [desc.split()[0] for desc in descriptions]
            for seqid in seqids:
                if seqid in proteins:
                    # The seqid is a %-argument, so the message needs a
                    # placeholder; without one logging reports a formatting
                    # error and the seqid never reaches the log.
                    logger.warning(
                        "Different protein groups claim same first seqid %s",
                        seqid)
            proteins[seqids[0]] = {
                'description': descriptions[0],
                'sequence': seqs[0],
                'other_sequences': seqs[1:],
                'attr': {
                    'coverage': parse.round_decimal(coverage, 4),
                    'morpheus-score': parse.round_decimal(
                        protein_group['summed morpheus score'], 4),
                    'i_group': i_group,
                    'other_seqids': seqids[1:],
                    'seqid': seqids[0],
                },
                'sources': [{'peptides': []}],
            }
            dict_dump_writer.dump_dict(protein_group)
    finally:
        # Close the writer even if a malformed row raises mid-loop.
        dict_dump_writer.close()

    # Index every protein under its primary and its alternative seqids.
    protein_by_seqid = {}
    for seqid, protein in proteins.items():
        protein_by_seqid[seqid] = protein
        for alt_seqid in protein['attr']['other_seqids']:
            protein_by_seqid[alt_seqid] = protein

    n_peptide = 0
    n_peptide_matched = 0
    dict_dump_writer = DictListWriter(
        is_debug, os.path.join(dump_dir, 'peptides.dump'))
    try:
        for src_peptide in read_tsv_iter(psm_fname):
            dict_dump_writer.dump_dict(src_peptide)
            descriptions = src_peptide['protein description'].split(' / ')
            peptide_seqids = [d.split()[0] for d in descriptions]
            protein = None
            for peptide_seqid in peptide_seqids:
                if peptide_seqid in protein_by_seqid:
                    protein = protein_by_seqid[peptide_seqid]
                    break
            n_peptide += 1
            if protein is None:
                continue

            sequence = protein['sequence']
            extracted_peptide_sequence, modifications = parse_peptide(
                src_peptide['peptide sequence'], modification_table)
            peptide_sequence = src_peptide['base peptide sequence']
            if extracted_peptide_sequence != peptide_sequence:
                logger.warning(
                    "Peptide sequences don't match: "
                    + src_peptide['peptide sequence'] + " "
                    + extracted_peptide_sequence + " " + peptide_sequence)

            i = sequence.find(peptide_sequence)
            if i < 0:
                logger.warning(
                    peptide_sequence + ' not found in '
                    + protein['attr']['seqid'])
                continue
            # Counted only once the peptide is known to be attachable, so
            # the summary below does not report dropped peptides as
            # assigned.
            n_peptide_matched += 1

            q_value = float(src_peptide['q-value (%)'])

            # The scan id and retention-time columns have alternative
            # names; fall back to '' when neither is present.
            if 'scan number' in src_peptide:
                scan_id = src_peptide['scan number']
            elif 'spectrum number' in src_peptide:
                scan_id = src_peptide['spectrum number']
            else:
                scan_id = ''
            if 'retention time (min)' in src_peptide:
                time = parse.round_decimal(
                    src_peptide['retention time (min)'], 4)
            elif 'retention time (minutes)' in src_peptide:
                time = parse.round_decimal(
                    src_peptide['retention time (minutes)'], 4)
            else:
                time = ''

            peptide = {
                'sequence': peptide_sequence,
                'attr': {
                    'scan_id': scan_id,
                    'retention_time': time,
                    'morpheus_score': parse.round_decimal(
                        src_peptide['morpheus score'], 4),
                    'mass': parse.round_decimal(
                        src_peptide['precursor mass (da)'], 4),
                    'mass_diff': parse.round_decimal(
                        src_peptide['precursor mass error (da)'], 4),
                    'm/z': parse.round_decimal(
                        src_peptide['precursor m/z'], 4),
                    'source': parse.basename(src_peptide['filename']),
                    'q_value': q_value,
                },
                # q-value is a percentage; map it onto a 0..1 intensity.
                'intensity': 1.0 - q_value / 100.0,
                'i': i,
            }
            if modifications:
                for modification in modifications:
                    modification['mass'] = parse.round_decimal(
                        modification['mass'], 4)
                peptide['attr']['modifications'] = modifications
            protein['sources'][0]['peptides'].append(peptide)
    finally:
        dict_dump_writer.close()

    if is_debug:
        dump = os.path.join(dump_dir, 'proteins.dump')
        logger.debug('Dumping proteins data structure to ' + dump)
        parse.save_data_dict(proteins, dump)

    logger.info("Assigned {}/{} of PSMs.tsv to protein_groups.tsv".format(
        n_peptide_matched, n_peptide))
    return proteins
def get_proteins(xtandem_fname, n_peak=50, good_expect=1E-8,
                 cutoff_expect=1E-2):
    """Collect proteins and their peptide matches from an X!Tandem file.

    Scans are read with ``read_xtandem``. For every match whose expect
    value does not exceed ``cutoff_expect``, a match record is appended
    to the first source of the protein identified by the match's seqid;
    the protein is created via ``proteins_module.new_protein`` on first
    sight. ``proteins_module.calculate_peptide_positions`` is applied to
    the result before returning.

    Args:
        xtandem_fname: path of the X!Tandem output file.
        n_peak: number of most intense peaks stored with each match.
        good_expect: passed to
            ``proteins_module.calc_minus_log_intensity``.
        cutoff_expect: matches with a larger expect value are skipped;
            also passed to ``calc_minus_log_intensity``.

    Returns:
        dict mapping seqid to protein record.
    """
    proteins = {}
    i_source = 0  # every match goes into the protein's first source
    # Same for every match, so compute it once.
    source_name = parse.basename(xtandem_fname)

    for scan in read_xtandem(xtandem_fname):
        masses = map(float, scan['masses'].split())
        intensities = map(float, scan['intensities'].split())
        # (mass, intensity) pairs, most intense first; ties keep file order.
        ions = sorted(
            zip(masses, intensities), key=lambda ion: ion[1], reverse=True)

        for xtandem_match in scan['matches']:
            expect = xtandem_match['expect']
            if cutoff_expect < expect:
                continue
            intensity = proteins_module.calc_minus_log_intensity(
                expect, good_expect, cutoff_expect)

            seqid = xtandem_match['seqid']
            if seqid not in proteins:
                protein = proteins_module.new_protein(seqid)
                protein.update({
                    'sequence': xtandem_match['sequence'],
                    'description': xtandem_match['description'],
                })
                proteins[seqid] = protein
            source = proteins[seqid]['sources'][i_source]

            match = {
                'sequence': xtandem_match['seq'],
                'intensity': intensity,
                'modifications': [],
                'spectrum': ions[:n_peak],
                'attr': {
                    'scan_id': scan['id'],
                    'charge': scan['charge'],
                    'expect': expect,
                    'missed_cleavages': xtandem_match['missed_cleavages'],
                    'mass': scan['mass'],
                    'source': source_name,
                },
            }

            if xtandem_match['modifications']:
                # 'at' and 'start' are 1-based; subtracting the peptide
                # start from the modification position gives the 0-based
                # index within the peptide.
                i_pep_seq = int(xtandem_match['start']) - 1
                for mod in xtandem_match['modifications']:
                    i_mod_in_full_seq = int(mod['at']) - 1
                    # Residues without a known monoisotopic mass
                    # contribute 0.0.
                    residue_mass = peptidemass.aa_monoisotopic_mass.get(
                        mod['type'], 0.0)
                    match['modifications'].append({
                        'i': i_mod_in_full_seq - i_pep_seq,
                        'mass': mod['modified'] + residue_mass,
                    })
            source['matches'].append(match)

    proteins_module.calculate_peptide_positions(proteins)
    return proteins
def get_proteins(protein_groups_fname, psm_fname, modifications_fname=None):
    """Stream protein groups and PSMs from TSV files into protein records.

    Each protein group becomes one record keyed by the first seqid found
    in its 'protein description'. Every PSM is attached to the first
    protein group that claims one of the PSM's seqids, provided the base
    peptide sequence occurs in that protein's (first) sequence.

    Rows are passed to ``DictListWriter`` instances targeting
    'protein_groups.dump' and 'peptides.dump' next to
    ``protein_groups_fname``; the writers receive the debug flag. When
    the root logger level is DEBUG or lower the final structure is also
    saved as 'proteins.dump'.

    Args:
        protein_groups_fname: path of the protein groups TSV.
        psm_fname: path of the PSM TSV.
        modifications_fname: optional path handed to
            ``read_modification_dict``; when None an empty modification
            table is used.

    Returns:
        dict mapping the first seqid of each protein group to its record
        ('description', 'sequence', 'other_sequences', 'attr', 'sources').
    """
    is_debug = logger.root.level <= logging.DEBUG
    dump_dir = os.path.dirname(protein_groups_fname)

    if modifications_fname is not None:
        modification_table = read_modification_dict(modifications_fname)
    else:
        modification_table = {}

    proteins = {}
    dict_dump_writer = DictListWriter(
        is_debug, os.path.join(dump_dir, 'protein_groups.dump'))
    try:
        for i_group, protein_group in enumerate(
                read_tsv_iter(protein_groups_fname)):
            descriptions = protein_group['protein description'].split(' / ')

            # The coverage cell may hold several values separated by ';'
            # or '/'; only the first one is kept.
            coverage_str = str(protein_group['protein sequence coverage (%)'])
            separator = ';' if ';' in coverage_str else '/'
            coverage = float(get_first(coverage_str, separator))

            seqs = protein_group['protein sequence'].split('/')
            seqids = [desc.split()[0] for desc in descriptions]
            for seqid in seqids:
                if seqid in proteins:
                    # The seqid is a %-argument, so the message needs a
                    # placeholder; without one logging reports a formatting
                    # error and the seqid never reaches the log.
                    logger.warning(
                        "Different protein groups claim same first seqid %s",
                        seqid)
            proteins[seqids[0]] = {
                'description': descriptions[0],
                'sequence': seqs[0],
                'other_sequences': seqs[1:],
                'attr': {
                    'coverage': parse.round_decimal(coverage, 4),
                    'morpheus-score': parse.round_decimal(
                        protein_group['summed morpheus score'], 4),
                    'i_group': i_group,
                    'other_seqids': seqids[1:],
                    'seqid': seqids[0],
                },
                'sources': [{'peptides': []}],
            }
            dict_dump_writer.dump_dict(protein_group)
    finally:
        # Close the writer even if a malformed row raises mid-loop.
        dict_dump_writer.close()

    # Index every protein under its primary and its alternative seqids.
    protein_by_seqid = {}
    for seqid, protein in proteins.items():
        protein_by_seqid[seqid] = protein
        for alt_seqid in protein['attr']['other_seqids']:
            protein_by_seqid[alt_seqid] = protein

    n_peptide = 0
    n_peptide_matched = 0
    dict_dump_writer = DictListWriter(
        is_debug, os.path.join(dump_dir, 'peptides.dump'))
    try:
        for src_peptide in read_tsv_iter(psm_fname):
            dict_dump_writer.dump_dict(src_peptide)
            descriptions = src_peptide['protein description'].split(' / ')
            peptide_seqids = [d.split()[0] for d in descriptions]
            protein = None
            for peptide_seqid in peptide_seqids:
                if peptide_seqid in protein_by_seqid:
                    protein = protein_by_seqid[peptide_seqid]
                    break
            n_peptide += 1
            if protein is None:
                continue

            sequence = protein['sequence']
            extracted_peptide_sequence, modifications = parse_peptide(
                src_peptide['peptide sequence'], modification_table)
            peptide_sequence = src_peptide['base peptide sequence']
            if extracted_peptide_sequence != peptide_sequence:
                logger.warning(
                    "Peptide sequences don't match: "
                    + src_peptide['peptide sequence'] + " "
                    + extracted_peptide_sequence + " " + peptide_sequence)

            i = sequence.find(peptide_sequence)
            if i < 0:
                logger.warning(
                    peptide_sequence + ' not found in '
                    + protein['attr']['seqid'])
                continue
            # Counted only once the peptide is known to be attachable, so
            # the summary below does not report dropped peptides as
            # assigned.
            n_peptide_matched += 1

            q_value = float(src_peptide['q-value (%)'])

            # The scan id and retention-time columns have alternative
            # names; fall back to '' when neither is present.
            if 'scan number' in src_peptide:
                scan_id = src_peptide['scan number']
            elif 'spectrum number' in src_peptide:
                scan_id = src_peptide['spectrum number']
            else:
                scan_id = ''
            if 'retention time (min)' in src_peptide:
                time = parse.round_decimal(
                    src_peptide['retention time (min)'], 4)
            elif 'retention time (minutes)' in src_peptide:
                time = parse.round_decimal(
                    src_peptide['retention time (minutes)'], 4)
            else:
                time = ''

            peptide = {
                'sequence': peptide_sequence,
                'attr': {
                    'scan_id': scan_id,
                    'retention_time': time,
                    'morpheus_score': parse.round_decimal(
                        src_peptide['morpheus score'], 4),
                    'mass': parse.round_decimal(
                        src_peptide['precursor mass (da)'], 4),
                    'mass_diff': parse.round_decimal(
                        src_peptide['precursor mass error (da)'], 4),
                    'm/z': parse.round_decimal(
                        src_peptide['precursor m/z'], 4),
                    'source': parse.basename(src_peptide['filename']),
                    'q_value': q_value,
                },
                # q-value is a percentage; map it onto a 0..1 intensity.
                'intensity': 1.0 - q_value / 100.0,
                'i': i,
            }
            if modifications:
                for modification in modifications:
                    modification['mass'] = parse.round_decimal(
                        modification['mass'], 4)
                peptide['attr']['modifications'] = modifications
            protein['sources'][0]['peptides'].append(peptide)
    finally:
        dict_dump_writer.close()

    if is_debug:
        dump = os.path.join(dump_dir, 'proteins.dump')
        logger.debug('Dumping proteins data structure to ' + dump)
        parse.save_data_dict(proteins, dump)

    logger.info("Assigned {}/{} of PSMs.tsv to protein_groups.tsv".format(
        n_peptide_matched, n_peptide))
    return proteins