def find_shitty_decoys(self): """ Finds and notes decoys that share their sequence with a target PSM. Also counts the number of targets and decoys to get a quick estimate of how many positive/negative training examples can be "claimed". """ target_seqs = set() decoy_seqs = set() with open(self.csv_path, "r") as f: reader = csv.DictReader(f) sorted_reader = sorted( reader, reverse=self["bigger_scores_better"], key=lambda d: float(d[self.col_for_sorting]), ) for row in sorted_reader: self.observed_charges.add(int(row["Charge"])) if row_is_decoy(row): decoy_seqs.add(unify_sequence(row["Sequence"])) self.counter["decoy"] += 1 else: target_seqs.add(unify_sequence(row["Sequence"])) self.counter["target"] += 1 self.shitty_decoy_seqs = target_seqs.intersection(decoy_seqs) if len(self.shitty_decoy_seqs) > 0: print( "Warning! Found {0} sequences that are target AND decoy " "(immutable peptides?). These will not be used for training.\n" .format(len(self.shitty_decoy_seqs))) return
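# `find_shitty_decoys` relies on two module-level helpers that are defined
# elsewhere in the file. A minimal sketch of what they could look like; the
# bodies below are hypothetical placeholders, not the module's actual code:
def unify_sequence(seq):
    # Peptides that differ only in Leu/Ile are indistinguishable by MS,
    # so map them to a single canonical form (assumed convention).
    return seq.replace("L", "I")


def row_is_decoy(row):
    # Assumes the unified CSV marks decoy PSMs in an "Is decoy" column.
    return str(row.get("Is decoy", "")).strip().lower() == "true"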
def get_psm_category(self, row): """ Determines whether a PSM (csv row) should be used as a negative or positive training example. returns 1 - high-scoring target (positive training example) 0 - not-high-scoring target (not usable for training) -1 - decoy (negative training example) """ category = 0 # unknown (mix of true positives and false positives) self.PSM_count += 1 # for FDR calculation sequence = unify_sequence(row["Sequence"]) psm_FDR = calc_FDR(self.PSM_count, self.decoy_count) if row_is_decoy(row): self.decoy_count += 1 if psm_FDR <= 0.25 and sequence not in self.shitty_decoy_seqs: category = -1 # decoy (false positive hits) self.counter["negative"] += 1 else: if not self.decoy_train_prob: need_max = self.counter["positive"] * 2 have = self.counter["negative"] still_there = self.counter["decoy"] - have prob = need_max / still_there if prob < 0.001: prob = 0.001 self.decoy_train_prob = prob print() print(self.counter) print("need max:", need_max) print("have:", have) print("still_there:", still_there) print("probability:", self.decoy_train_prob) print() if self.decoy_train_prob >= 1.0 or random( ) <= self.decoy_train_prob: category = -1 # decoy (false positive hits) self.counter["negative"] += 1 else: # row is target if psm_FDR <= self[ "fdr_cutoff"] and sequence not in self.shitty_decoy_seqs: category = 1 # high quality target (almost certainly true positives) self.counter["positive"] += 1 if category == 0: self.counter["unknown"] += 1 return (category, psm_FDR)
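# `get_psm_category` calls a module-level `calc_FDR` helper that is not shown
# here. A minimal sketch, assuming the standard target/decoy estimate in which
# the decoy count approximates the number of false positive target hits:
def calc_FDR(PSM_count, false_positives):
    # Estimated true positives: total PSMs minus the decoys and an equal
    # number of assumed false target hits (hypothetical implementation).
    true_positives = PSM_count - (2 * false_positives)
    if true_positives <= 0:
        return 1.0
    return false_positives / (false_positives + true_positives)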
def collect_data(self): """ parses a unified csv file and collects features from each row """ categories = [] list_of_feature_lists = [] feature_sets = set() with open(self.csv_path, "r") as f: reader = csv.DictReader(f) # collecting some stats for FDR calculation: self.PSM_count = 0 self.decoy_count = 0 if self["dump_svm_matrix"]: self.init_svm_matrix_dump() additional_matrix_info = [] for i, row in enumerate( sorted( reader, reverse=self["bigger_scores_better"], key=lambda d: float(d[self.col_for_sorting]), )): features = self.row_to_features(row) if tuple(features) in feature_sets: continue feature_sets.add(tuple(features)) category, psm_FDR = self.get_psm_category(row) list_of_feature_lists.append(features) categories.append(category) if self["dump_svm_matrix"]: label = -1 if row_is_decoy(row) else 1 sequence = "{0}.{1}#{2}.{3}".format( row["Sequence Pre AA"].strip(), row["Sequence"].strip(), row["Modifications"].strip(), row["Sequence Post AA"].strip(), ) additional_matrix_info.append({ "psm_id": row["Spectrum Title"].strip(), "label": label, "scannr": row["Spectrum Title"].strip().split(".")[-2], "peptide": sequence, "proteins": self.parse_protein_ids(row["Protein ID"]), }) if i % 1000 == 0: score_val = float(row[self.col_for_sorting]) msg = ("Generating feature matrix from input csv " "(line ~{0}) with score {1} and FDR " "{2}".format(i, score_val, psm_FDR)) print(msg, end="\r") # All data points are collected in one big matrix, to make standardization possible print("\nConverting feature matrix to NumPy array...") X_raw = np.array(list_of_feature_lists, dtype=float) print("Replacing empty/NaN values with the mean of each column...") self.nan_replacer = Imputer() self.nan_replacer.fit(X_raw) X_raw = self.nan_replacer.transform(X_raw) # Standardize input matrix to ease machine learning! Scaled data has zero mean and unit variance print("Standardizing input matrix...") self.scaler = SCALER.fit(X_raw) self.X = self.scaler.transform(X_raw) self.categories = np.array(categories) print() if self["dump_svm_matrix"]: print("Dumping SVM matrix to", self["dump_svm_matrix"]) for i, matrix_row in enumerate(self.X): matrix_row_info = additional_matrix_info[i] self.dump_svm_matrix_row( row=list(matrix_row), psm_id=matrix_row_info["psm_id"], label=matrix_row_info["label"], scannr=matrix_row_info["scannr"], peptide=matrix_row_info["peptide"], proteins=matrix_row_info["proteins"], ) print("Dumped SVM matrix to", self["dump_svm_matrix"]) return
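# Hypothetical call order for the methods above, assuming they live on a
# wrapper class (the class name and constructor arguments are placeholders):
#
#     engine = SVMTrainer(csv_path="unified_psms.csv")
#     engine.find_shitty_decoys()   # flag sequences shared by target & decoy
#     engine.collect_data()         # builds engine.X and engine.categories
#
# find_shitty_decoys() must run first, because get_psm_category(), which
# collect_data() calls per row, consults self.shitty_decoy_seqs.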