def find_shitty_decoys(self):
    """
    Finds and notes decoys that share their sequence with a target PSM.

    Reads every row of ``self.csv_path`` once and, as side effects:
      - adds the integer charge of each row to ``self.observed_charges``
      - increments ``self.counter["decoy"]`` or ``self.counter["target"]``
        to get a quick estimate of how many positive/negative training
        examples can be "claimed"
      - stores all unified sequences that occur as target AND as decoy
        in ``self.shitty_decoy_seqs`` (and prints a warning if any exist)

    The rows are deliberately not sorted by score here: everything that
    is collected (sets and counters) is independent of the row order.
    """
    target_seqs = set()
    decoy_seqs = set()

    # the csv module asks for newline="" when it is handed a file object
    with open(self.csv_path, "r", newline="") as f:
        for row in csv.DictReader(f):
            self.observed_charges.add(int(row["Charge"]))
            if row_is_decoy(row):
                decoy_seqs.add(unify_sequence(row["Sequence"]))
                self.counter["decoy"] += 1
            else:
                target_seqs.add(unify_sequence(row["Sequence"]))
                self.counter["target"] += 1

    self.shitty_decoy_seqs = target_seqs & decoy_seqs
    if self.shitty_decoy_seqs:
        print(
            "Warning! Found {0} sequences that are target AND decoy "
            "(immutable peptides?). These will not be used for training.\n"
            .format(len(self.shitty_decoy_seqs)))
def find_shitty_decoys(self):
    '''
    Finds and notes decoys that share their sequence with a target PSM.

    Reads every row of ``self.csv_path`` once and, as side effects:
      - adds the integer charge of each row to ``self.observed_charges``
      - increments ``self.counter['decoy']`` or ``self.counter['target']``
        to get a quick estimate of how many positive/negative training
        examples can be "claimed"
      - stores all unified sequences that occur as target AND as decoy
        in ``self.shitty_decoy_seqs`` (and prints a warning if any exist)

    The rows are deliberately not sorted by score here: everything that
    is collected (sets and counters) is independent of the row order.
    '''
    target_seqs = set()
    decoy_seqs = set()

    # the csv module asks for newline='' when it is handed a file object
    with open(self.csv_path, 'r', newline='') as f:
        for row in csv.DictReader(f):
            self.observed_charges.add(int(row['Charge']))
            if row_is_decoy(row):
                decoy_seqs.add(unify_sequence(row['Sequence']))
                self.counter['decoy'] += 1
            else:
                target_seqs.add(unify_sequence(row['Sequence']))
                self.counter['target'] += 1

    self.shitty_decoy_seqs = target_seqs & decoy_seqs
    if self.shitty_decoy_seqs:
        print(
            'Warning! Found {0} sequences that are target AND decoy '
            '(immutable peptides?). These will not be used for training.\n'.format(
                len(self.shitty_decoy_seqs)
            )
        )
def get_psm_category(self, row):
    """
    Determines whether a PSM (csv row) should be used as a negative or
    positive training example.

    Side effects: increments ``self.PSM_count``, ``self.decoy_count`` (for
    decoy rows) and the matching entry of ``self.counter``; the first time
    a decoy is not accepted by the FDR rule, ``self.decoy_train_prob`` is
    fixed and a short summary is printed.

    returns a tuple ``(category, psm_FDR)`` where category is
         1  - high-scoring target (positive training example)
         0  - not usable for training (not-high-scoring target, or a
              decoy that was not sampled)
        -1  - decoy (negative training example)
    """
    category = 0  # unknown (mix of true positives and false positives)
    self.PSM_count += 1  # for FDR calculation
    sequence = unify_sequence(row["Sequence"])
    # NOTE(review): the FDR is calculated before self.decoy_count is
    # incremented for the current row - confirm that this is intended.
    psm_FDR = calc_FDR(self.PSM_count, self.decoy_count)

    if row_is_decoy(row):
        self.decoy_count += 1
        if psm_FDR <= 0.25 and sequence not in self.shitty_decoy_seqs:
            category = -1  # decoy (false positive hits)
            self.counter["negative"] += 1
        else:
            if not self.decoy_train_prob:
                # aim for at most twice as many negatives as positives,
                # sampled from the decoys that were not claimed yet
                need_max = self.counter["positive"] * 2
                have = self.counter["negative"]
                still_there = self.counter["decoy"] - have
                if still_there:
                    prob = need_max / still_there
                else:
                    # nothing left to sample from: avoid dividing by zero
                    # and use the limit of the ratio instead
                    prob = 1.0 if need_max else 0.0
                # never let the sampling probability drop below 0.1 %
                self.decoy_train_prob = max(prob, 0.001)
                print()
                print(self.counter)
                print("need max:", need_max)
                print("have:", have)
                print("still_there:", still_there)
                print("probability:", self.decoy_train_prob)
                print()
            if self.decoy_train_prob >= 1.0 or random() <= self.decoy_train_prob:
                category = -1  # decoy (false positive hits)
                self.counter["negative"] += 1
    else:  # row is target
        if psm_FDR <= self["fdr_cutoff"] and sequence not in self.shitty_decoy_seqs:
            category = 1  # high quality target (almost certainly true positives)
            self.counter["positive"] += 1

    if category == 0:
        self.counter["unknown"] += 1
    return (category, psm_FDR)
def get_psm_category(self, row):
    '''
    Determines whether a PSM (csv row) should be used as a negative or
    positive training example.

    Side effects: increments ``self.PSM_count``, ``self.decoy_count`` (for
    decoy rows) and the matching entry of ``self.counter``; the first time
    a decoy is not accepted by the FDR rule, ``self.decoy_train_prob`` is
    fixed and a short summary is printed.

    returns a tuple ``(category, psm_FDR)`` where category is
         1  - high-scoring target (positive training example)
         0  - not usable for training (not-high-scoring target, or a
              decoy that was not sampled)
        -1  - decoy (negative training example)
    '''
    category = 0  # unknown (mix of true positives and false positives)
    self.PSM_count += 1  # for FDR calculation
    sequence = unify_sequence(row['Sequence'])
    # NOTE(review): the FDR is calculated before self.decoy_count is
    # incremented for the current row - confirm that this is intended.
    psm_FDR = calc_FDR(self.PSM_count, self.decoy_count)

    if row_is_decoy(row):
        self.decoy_count += 1
        if psm_FDR <= 0.25 and sequence not in self.shitty_decoy_seqs:
            category = -1  # decoy (false positive hits)
            self.counter['negative'] += 1
        else:
            if not self.decoy_train_prob:
                # aim for at most twice as many negatives as positives,
                # sampled from the decoys that were not claimed yet
                need_max = self.counter['positive'] * 2
                have = self.counter['negative']
                still_there = self.counter['decoy'] - have
                if still_there:
                    prob = need_max / still_there
                else:
                    # nothing left to sample from: avoid dividing by zero
                    # and use the limit of the ratio instead
                    prob = 1.0 if need_max else 0.0
                # never let the sampling probability drop below 0.1 %
                self.decoy_train_prob = max(prob, 0.001)
                print()
                print(self.counter)
                print('need max:', need_max)
                print('have:', have)
                print('still_there:', still_there)
                print('probability:', self.decoy_train_prob)
                print()
            if self.decoy_train_prob >= 1.0 or random() <= self.decoy_train_prob:
                category = -1  # decoy (false positive hits)
                self.counter['negative'] += 1
    else:  # row is target
        if psm_FDR <= self['fdr_cutoff'] and sequence not in self.shitty_decoy_seqs:
            category = 1  # high quality target (almost certainly true positives)
            self.counter['positive'] += 1

    if category == 0:
        self.counter['unknown'] += 1
    return (category, psm_FDR)
def row_to_features(self, row):
    """
    Turns one unified CSV row into a flat list of SVM features
    (numbers only!).

    Layout of the returned list: the fixed features (score, rank, deltCn,
    deltLCn, charge, mass deltas, missed cleavages, enzymatic termini,
    mass, length and the intra-set counts), then one 0/1 flag for every
    entry of self.observed_charges, then the user-specified extra fields.
    """
    sequence = unify_sequence(row["Sequence"])
    charge = field_to_float(row["Charge"])
    score = field_to_bayes_float(row[self.col_for_sorting])
    # the calculated m/z is returned as well but is not used as a feature
    _calc_mz, exp_mz, calc_mass, exp_mass = get_mz_values(row)

    # enzymatic termini: 1 only if every listed flanking residue is tryptic
    pre_aa_field = row["Sequence Pre AA"]
    post_aa_field = row["Sequence Post AA"]
    flanking_pre = set(re.split(self.delim_regex, pre_aa_field))
    flanking_post = set(re.split(self.delim_regex, post_aa_field))
    enzN = int(all(aa in self.tryptic_aas for aa in flanking_pre))
    enzC = int(all(aa in self.tryptic_aas for aa in flanking_post))

    # R/K at any position but the last one counts as a missed cleavage
    n_missed_cleavages = sum(1 for aa in sequence[:-1] if aa in ("R", "K"))
    # flags for 0..5 missed cleavages; 6 or more puts a 2 in the last slot
    missed_cleavages = [0] * 6
    if n_missed_cleavages < len(missed_cleavages):
        missed_cleavages[n_missed_cleavages] = 1
    else:
        missed_cleavages[-1] = 2

    # NOTE(review): the score list is looked up by the stripped title,
    # num_spec further down by the raw title - confirm both keys agree.
    spectrum = row["Spectrum Title"].strip()
    mass = (exp_mz * charge) - (charge - 1) * PROTON
    pep_len = len(sequence)
    delta_mass = calc_mass - exp_mass
    peptide = (sequence, row["Modifications"])
    proteins = self.parse_protein_ids(row["Protein ID"])

    # intra-set counts
    num_pep = self.num_pep[peptide]
    pep_charge_states = len(self.pep_charge_states[peptide])
    seq_mods = len(self.seq_mods[sequence])
    num_spec = len(self.num_spec[row["Spectrum Title"]])
    num_prot = sum(len(self.num_prot[protein]) for protein in proteins)
    pep_site = sum(len(self.pep_site[protein]) for protein in proteins)

    user_specified_features = []
    for feat in self.used_extra_fields:
        if feat == self.col_for_sorting:
            continue
        try:
            extra_value = field_to_float(row[feat])
        except ValueError:
            # NOTE(review): fields that cannot be converted are left out,
            # which shortens the feature list for this row.
            continue
        user_specified_features.append(extra_value)

    charges_of_peptide = self.pep_charge_states[peptide]
    is_shitty = int(sequence in self.shitty_decoy_seqs)

    score_list = sorted(
        set(self.score_list_dict[spectrum]),
        reverse=self["bigger_scores_better"],
    )
    try:
        score_list_scaled = scale_scores(score_list)
        rank = score_list.index(score)
        own_scaled_score = score_list_scaled[rank]
        # fractional difference between current and second best score
        deltLCn = own_scaled_score - score_list_scaled[1]
        # fractional difference between current and worst score
        deltCn = own_scaled_score - score_list_scaled[-1]
    except (ValueError, IndexError, AssertionError):
        # No ranking available (e.g. only one peptide was matched to the
        # spectrum): enter NaN, to be replaced by the column mean later.
        rank = deltLCn = deltCn = np.nan

    features = [
        score,
        rank,
        deltCn,
        deltLCn,
        charge,
        delta_mass,
        abs(delta_mass),
        n_missed_cleavages / pep_len,
        missed_cleavages[0],
        missed_cleavages[1],
        missed_cleavages[2],
        missed_cleavages[3],
        missed_cleavages[4],
        missed_cleavages[5],
        enzN,
        enzC,
        mass,
        pep_len,
        num_pep,
        num_prot,
        pep_site,
        is_shitty,
        pep_charge_states,
        num_spec,
        seq_mods,
    ]
    # one flag per observed charge: was this peptide seen in that state?
    features.extend(
        1 if charge_n in charges_of_peptide else 0
        for charge_n in self.observed_charges
    )
    return features + user_specified_features
def count_intra_set_features(self):
    """
    intra-set features as calculated by Percolator:
      - num_pep:  Number of PSMs for which this is the best scoring peptide.
      - num_prot: Number of times the matched protein matches other PSMs.
      - pep_site: Number of different peptides that match this protein.

    own ideas:
      - pep_charge_states: in how many charge states was the peptide found?
      - seq_mods: in how many mod states was the AA-sequence found?
      - num_spec: Number of times the matched spectrum matches other peptides.

    All counters are (re)created on ``self`` as defaultdicts and filled
    from ``self.csv_path``. Additionally ``self.score_list_dict`` maps
    every spectrum title to the list of its scores and
    ``self.maximum_proteins_per_line`` is raised if a row lists more
    proteins than seen so far. Rows whose unified sequence is in
    ``self.shitty_decoy_seqs`` are ignored.

    Every spectrum is processed, including the last one of the sorted
    file, and charge / modifications are taken from the row that is being
    counted.
    """
    # local import so that this method does not depend on the import block
    from itertools import groupby

    print("Counting intra-set features...")
    self.num_pep = defaultdict(int)
    self.num_prot = defaultdict(set)
    self.pep_site = defaultdict(set)
    self.score_list_dict = defaultdict(list)
    self.pep_charge_states = defaultdict(set)
    self.seq_mods = defaultdict(set)
    self.num_spec = defaultdict(set)

    # the csv module asks for newline="" when it is handed a file object
    with open(self.csv_path, "r", newline="") as f:
        ranked_rows = sorted(
            csv.DictReader(f),
            reverse=self["bigger_scores_better"],
            key=self.sort_by_rank,
        )

    usable_rows = (
        row for row in ranked_rows
        if unify_sequence(row["Sequence"]) not in self.shitty_decoy_seqs
    )
    # groupby yields one group per run of consecutive rows with the same
    # spectrum title, so no group is lost at the end of the file
    for spec_title, group in groupby(
            usable_rows, key=lambda r: r["Spectrum Title"]):
        rows_of_spectrum = list(group)
        self.score_list_dict[spec_title] = [
            field_to_bayes_float(r[self.col_for_sorting])
            for r in rows_of_spectrum
        ]

        for rank, line in enumerate(rows_of_spectrum):
            uni_sequence = unify_sequence(line["Sequence"])
            modifications = line["Modifications"]
            peptide = (uni_sequence, modifications)

            # multiple proteins are split on ";" and the "decoy_" tag is
            # removed, so that targets and decoys share protein names
            proteins = set(
                line["Protein ID"].replace("decoy_", "").split(";"))
            if len(proteins) > self.maximum_proteins_per_line:
                self.maximum_proteins_per_line = len(proteins)

            if rank == 0:
                # this is the 'best' peptide for that spectrum
                self.num_pep[peptide] += 1

            for protein in proteins:
                self.num_prot[protein].add(
                    (spec_title, uni_sequence, modifications))
                self.pep_site[protein].add(peptide)

            # use the row that is being counted ("line"), not a row of
            # the following spectrum
            self.pep_charge_states[peptide].add(int(line["Charge"]))
            self.seq_mods[uni_sequence].add(modifications)
            self.num_spec[spec_title].add(peptide)
def row_to_features(self, row):
    '''
    Turns one unified CSV row into a flat list of SVM features
    (numbers only!).

    Layout of the returned list: the fixed features (score, rank, deltCn,
    deltLCn, charge, mass deltas, missed cleavages, enzymatic termini,
    mass, length and the intra-set counts), then one 0/1 flag for every
    entry of self.observed_charges, then the user-specified extra fields.
    '''
    sequence = unify_sequence(row['Sequence'])
    charge = field_to_float(row['Charge'])
    score = field_to_bayes_float(row[self.col_for_sorting])
    # the calculated m/z is returned as well but is not used as a feature
    _calc_mz, exp_mz, calc_mass, exp_mass = get_mz_values(row)

    # enzymatic termini: 1 only if every listed flanking residue is tryptic
    pre_aa_field = row['Sequence Pre AA']
    post_aa_field = row['Sequence Post AA']
    flanking_pre = set(re.split(self.delim_regex, pre_aa_field))
    flanking_post = set(re.split(self.delim_regex, post_aa_field))
    enzN = int(all(aa in self.tryptic_aas for aa in flanking_pre))
    enzC = int(all(aa in self.tryptic_aas for aa in flanking_post))

    # R/K at any position but the last one counts as a missed cleavage
    n_missed_cleavages = sum(1 for aa in sequence[:-1] if aa in ('R', 'K'))
    # flags for 0..5 missed cleavages; 6 or more puts a 2 in the last slot
    missed_cleavages = [0] * 6
    if n_missed_cleavages < len(missed_cleavages):
        missed_cleavages[n_missed_cleavages] = 1
    else:
        missed_cleavages[-1] = 2

    # NOTE(review): the score list is looked up by the stripped title,
    # num_spec further down by the raw title - confirm both keys agree.
    spectrum = row['Spectrum Title'].strip()
    mass = (exp_mz * charge) - (charge - 1) * PROTON
    pep_len = len(sequence)
    delta_mass = calc_mass - exp_mass
    peptide = (sequence, row['Modifications'])
    proteins = self.parse_protein_ids(row['Protein ID'])

    # intra-set counts
    num_pep = self.num_pep[peptide]
    pep_charge_states = len(self.pep_charge_states[peptide])
    seq_mods = len(self.seq_mods[sequence])
    num_spec = len(self.num_spec[row['Spectrum Title']])
    num_prot = sum(len(self.num_prot[protein]) for protein in proteins)
    pep_site = sum(len(self.pep_site[protein]) for protein in proteins)

    user_specified_features = []
    for feat in self.used_extra_fields:
        if feat == self.col_for_sorting:
            continue
        try:
            extra_value = field_to_float(row[feat])
        except ValueError:
            # NOTE(review): fields that cannot be converted are left out,
            # which shortens the feature list for this row.
            continue
        user_specified_features.append(extra_value)

    charges_of_peptide = self.pep_charge_states[peptide]
    is_shitty = int(sequence in self.shitty_decoy_seqs)

    score_list = sorted(
        set(self.score_list_dict[spectrum]),
        reverse=self['bigger_scores_better']
    )
    try:
        score_list_scaled = scale_scores(score_list)
        rank = score_list.index(score)
        own_scaled_score = score_list_scaled[rank]
        # fractional difference between current and second best score
        deltLCn = own_scaled_score - score_list_scaled[1]
        # fractional difference between current and worst score
        deltCn = own_scaled_score - score_list_scaled[-1]
    except (ValueError, IndexError, AssertionError):
        # No ranking available (e.g. only one peptide was matched to the
        # spectrum): enter NaN, to be replaced by the column mean later.
        rank = deltLCn = deltCn = np.nan

    features = [
        score,
        rank,
        deltCn,
        deltLCn,
        charge,
        delta_mass,
        abs(delta_mass),
        n_missed_cleavages / pep_len,
        missed_cleavages[0],
        missed_cleavages[1],
        missed_cleavages[2],
        missed_cleavages[3],
        missed_cleavages[4],
        missed_cleavages[5],
        enzN,
        enzC,
        mass,
        pep_len,
        num_pep,
        num_prot,
        pep_site,
        is_shitty,
        pep_charge_states,
        num_spec,
        seq_mods,
    ]
    # one flag per observed charge: was this peptide seen in that state?
    features.extend(
        1 if charge_n in charges_of_peptide else 0
        for charge_n in self.observed_charges
    )
    return features + user_specified_features
def count_intra_set_features(self):
    '''
    intra-set features as calculated by Percolator:
      - num_pep:  Number of PSMs for which this is the best scoring peptide.
      - num_prot: Number of times the matched protein matches other PSMs.
      - pep_site: Number of different peptides that match this protein.

    own ideas:
      - pep_charge_states: in how many charge states was the peptide found?
      - seq_mods: in how many mod states was the AA-sequence found?
      - num_spec: Number of times the matched spectrum matches other peptides.

    All counters are (re)created on ``self`` as defaultdicts and filled
    from ``self.csv_path``. Additionally ``self.score_list_dict`` maps
    every spectrum title to the list of its scores and
    ``self.maximum_proteins_per_line`` is raised if a row lists more
    proteins than seen so far. Rows whose unified sequence is in
    ``self.shitty_decoy_seqs`` are ignored.

    Every spectrum is processed, including the last one of the sorted
    file, and charge / modifications are taken from the row that is being
    counted.
    '''
    # local import so that this method does not depend on the import block
    from itertools import groupby

    print('Counting intra-set features...')
    self.num_pep = defaultdict(int)
    self.num_prot = defaultdict(set)
    self.pep_site = defaultdict(set)
    self.score_list_dict = defaultdict(list)
    self.pep_charge_states = defaultdict(set)
    self.seq_mods = defaultdict(set)
    self.num_spec = defaultdict(set)

    # the csv module asks for newline='' when it is handed a file object
    with open(self.csv_path, 'r', newline='') as f:
        ranked_rows = sorted(
            csv.DictReader(f),
            reverse=self['bigger_scores_better'],
            key=self.sort_by_rank
        )

    usable_rows = (
        row for row in ranked_rows
        if unify_sequence(row['Sequence']) not in self.shitty_decoy_seqs
    )
    # groupby yields one group per run of consecutive rows with the same
    # spectrum title, so no group is lost at the end of the file
    for spec_title, group in groupby(
        usable_rows, key=lambda r: r['Spectrum Title']
    ):
        rows_of_spectrum = list(group)
        self.score_list_dict[spec_title] = [
            field_to_bayes_float(r[self.col_for_sorting])
            for r in rows_of_spectrum
        ]

        for rank, line in enumerate(rows_of_spectrum):
            uni_sequence = unify_sequence(line['Sequence'])
            modifications = line['Modifications']
            peptide = (uni_sequence, modifications)

            # multiple proteins are split on ';' and the 'decoy_' tag is
            # removed, so that targets and decoys share protein names
            proteins = set(line['Protein ID'].replace('decoy_', '').split(';'))
            if len(proteins) > self.maximum_proteins_per_line:
                self.maximum_proteins_per_line = len(proteins)

            if rank == 0:
                # this is the 'best' peptide for that spectrum
                self.num_pep[peptide] += 1

            for protein in proteins:
                self.num_prot[protein].add(
                    (spec_title, uni_sequence, modifications)
                )
                self.pep_site[protein].add(peptide)

            # use the row that is being counted ('line'), not a row of
            # the following spectrum
            self.pep_charge_states[peptide].add(int(line['Charge']))
            self.seq_mods[uni_sequence].add(modifications)
            self.num_spec[spec_title].add(peptide)