def score(self, table):
    prepared_table, __ = prepare_data_table(table, score_columns=self.score_columns)

    texp = Experiment(prepared_table)
    score = self.classifier.score(texp, True)
    texp["d_score"] = (score - self.mu) / self.nu

    s_values, q_values = lookup_s_and_q_values_from_error_table(texp["d_score"].values,
                                                                self.error_stat.df)
    texp["m_score"] = q_values
    texp["s_value"] = s_values
    logging.info("mean m_score = %e, std_dev m_score = %e" % (np.mean(q_values),
                                                              np.std(q_values, ddof=1)))
    logging.info("mean s_value = %e, std_dev s_value = %e" % (np.mean(s_values),
                                                              np.std(s_values, ddof=1)))
    texp.add_peak_group_rank()

    df = table.join(texp[["d_score", "m_score", "peak_group_rank"]])

    if CONFIG.get("compute.probabilities"):
        df = self.add_probabilities(df, texp)

    if CONFIG.get("target.compress_results"):
        to_drop = [n for n in df.columns if n.startswith("var_") or n.startswith("main_")]
        df.drop(to_drop, axis=1, inplace=True)

    return df
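# For orientation, a minimal sketch of what `lookup_s_and_q_values_from_error_table`
# could look like given how it is called above. This is an assumption, not the real
# implementation: it presumes `error_stat.df` carries "cutoff", "svalue" and "qvalue"
# columns, and maps each d_score to the row of the first cutoff at or above it
# (clipped at the table ends).

import numpy as np

def lookup_s_and_q_values_from_error_table_sketch(d_scores, error_df):
    cutoffs = error_df["cutoff"].values
    order = np.argsort(cutoffs)                      # searchsorted needs ascending input
    pos = np.searchsorted(cutoffs[order], d_scores)  # insertion point for each d_score
    rows = order[np.clip(pos, 0, len(cutoffs) - 1)]  # map back to original row indices
    return error_df["svalue"].values[rows], error_df["qvalue"].values[rows]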
def learn_and_apply_classifier(self, table):
    prepared_table, score_columns = prepare_data_table(table)

    experiment = Experiment(prepared_table)

    is_test = CONFIG.get("is_test", False)

    if is_test:  # for reliable results
        experiment.df.sort_values("tg_id", ascending=True, inplace=True)

    experiment.log_summary()

    all_test_target_scores = []
    all_test_decoy_scores = []
    ws = []

    neval = CONFIG.get("xeval.num_iter")
    inst = self.semi_supervised_learner
    num_processes = CONFIG.get("num_processes")
    logging.info("start %d cross evals using %d processes" % (neval, num_processes))

    if num_processes == 1:
        for k in range(neval):
            (ttt_scores, ttd_scores, w) = inst.learn_randomized(experiment)
            all_test_target_scores.extend(ttt_scores)
            all_test_decoy_scores.extend(ttd_scores)
            ws.append(w.flatten())
    else:
        pool = multiprocessing.Pool(processes=num_processes)
        # dispatch the cross evaluations in chunks of at most num_processes
        while neval:
            remaining = max(0, neval - num_processes)
            todo = neval - remaining
            neval -= todo
            args = ((inst, "learn_randomized", (experiment,)),) * todo
            res = pool.map(unwrap_self_for_multiprocessing, args)
            top_test_target_scores = [ti for r in res for ti in r[0]]
            top_test_decoy_scores = [ti for r in res for ti in r[1]]
            ws.extend([r[2] for r in res])
            all_test_target_scores.extend(top_test_target_scores)
            all_test_decoy_scores.extend(top_test_decoy_scores)

    logging.info("finished cross evals")

    final_classifier = self.semi_supervised_learner.averaged_learner(ws)

    result, data_for_persistence = self.apply_classifier(final_classifier, experiment,
                                                         all_test_target_scores,
                                                         all_test_decoy_scores, table)
    logging.info("calculated scoring and statistics")
    return result, data_for_persistence + (score_columns,)
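# `pool.map` can only dispatch picklable module-level functions, not bound methods,
# which is why the calls above are routed through `unwrap_self_for_multiprocessing`.
# A minimal sketch of such a trampoline, assuming the (instance, method_name, args)
# tuple layout built above:

def unwrap_self_for_multiprocessing_sketch(arg):
    inst, method_name, args = arg
    # look the method up on the instance inside the worker process and call it
    return getattr(inst, method_name)(*args)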
def apply_loaded_scorer(self, table, loaded_scorer):
    (final_classifier, mu, nu, df_raw_stat, num_null, num_total,
     loaded_score_columns) = loaded_scorer

    prepared_table, __ = prepare_data_table(table, loaded_score_columns=loaded_score_columns)

    experiment = Experiment(prepared_table)

    final_score = final_classifier.score(experiment, True)
    experiment["d_score"] = (final_score - mu) / nu

    scored_table = self.enrich_table_with_results(table, experiment, df_raw_stat)

    trained_weights = final_classifier.get_coefs()

    return (None, None, scored_table), None, None, trained_weights
def apply_loaded_scorer(self, table, loaded_scorer):
    final_classifier, mu, nu, df_raw_stat, loaded_score_columns = loaded_scorer

    prepared_table, __ = prepare_data_table(table, loaded_score_columns=loaded_score_columns)

    experiment = Experiment(prepared_table)

    final_score = final_classifier.score(experiment, True)
    experiment["d_score"] = (final_score - mu) / nu

    scored_table = self.enrich_table_with_results(table, experiment, df_raw_stat)

    return (None, None, scored_table), None
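# The two apply_loaded_scorer variants above expect persistence tuples of different
# lengths (with and without num_null/num_total). If both on-disk layouts can occur,
# a version-tolerant unpacking helper avoids a hard ValueError at unpack time; the
# field names are taken from the code above, the helper itself is an assumption:

def unpack_loaded_scorer_sketch(loaded_scorer):
    if len(loaded_scorer) == 7:
        clf, mu, nu, df_raw_stat, num_null, num_total, score_columns = loaded_scorer
    elif len(loaded_scorer) == 5:
        clf, mu, nu, df_raw_stat, score_columns = loaded_scorer
        num_null = num_total = None  # older layout carries no null-model counts
    else:
        raise ValueError("unexpected scorer tuple with %d fields" % len(loaded_scorer))
    return clf, mu, nu, df_raw_stat, num_null, num_total, score_columns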
def tutor_and_apply_classifier(self, table, p_score=False, loaded_weights=None):
    prepared_table, score_columns = prepare_data_table(table)

    experiment = Experiment(prepared_table)

    is_test = CONFIG.get("is_test", False)

    if is_test:  # for reliable results
        experiment.df.sort_values("tg_id", ascending=True, inplace=True)

    experiment.log_summary()

    all_test_target_scores = []
    all_test_decoy_scores = []
    clfs = []  # collected classifiers (a separate `ws` list would be redundant here)

    train_frac = CONFIG.get("train.fraction")
    neval = CONFIG.get("xeval.num_iter")
    fraction = CONFIG.get("xeval.fraction")
    teacher = self.semi_supervised_teacher  # plays the role of `inst` in learn_and_apply_classifier
    num_processes = CONFIG.get("num_processes")

    # reserve part of the experiment for testing and FDR calculation
    experiment.split_train_test(train_frac, is_test)
    train_exp = experiment
    test_exp = None
    if train_frac < 0.99:
        train_exp, test_exp = experiment.get_train_and_test_peaks()

    xval_type = CONFIG.get("xval.type")
    if xval_type == "split":
        train_exp.set_xval_sets(neval, is_test)
        # bind the result under a different name so the helper function
        # `xval_sets` is not shadowed by its own return value
        xval_set_list = xval_sets(neval, int(fraction * neval + 0.5))

    if loaded_weights is None:
        logging.info("start %d cross evals using %d processes" % (neval, num_processes))

        if num_processes == 1:
            for k in range(neval):
                if xval_type == "split":
                    train_exp.train_on_xval_sets(xval_set_list[k])
                else:
                    train_exp.split_train_test(fraction, is_test)
                (ttt_scores, ttd_scores, clf) = teacher.tutor_randomized(train_exp)
                all_test_target_scores.extend(ttt_scores)
                all_test_decoy_scores.extend(ttd_scores)
                clfs.append(clf)
        else:
            pool = multiprocessing.Pool(processes=num_processes)
            # dispatch the cross evaluations in chunks of at most num_processes
            while neval:
                remaining = max(0, neval - num_processes)
                todo = neval - remaining
                neval -= todo
                args = ((teacher, "tutor_randomized", (train_exp,)),) * todo
                res = pool.map(unwrap_self_for_multiprocessing, args)
                top_test_target_scores = [ti for r in res for ti in r[0]]
                top_test_decoy_scores = [ti for r in res for ti in r[1]]
                clfs.extend([r[2] for r in res])
                all_test_target_scores.extend(top_test_target_scores)
                all_test_decoy_scores.extend(top_test_decoy_scores)
        logging.info("finished cross evals")
    else:
        logging.info("start application of pretrained weights")
        loaded_clf = LinearPredictor(loaded_weights)
        clfs.append(loaded_clf)
        clf_scores = loaded_clf.score(experiment, True)
        experiment.set_and_rerank("classifier_score", clf_scores)
        all_test_target_scores.extend(experiment.get_top_target_peaks()["classifier_score"])
        all_test_decoy_scores.extend(experiment.get_top_decoy_peaks()["classifier_score"])
        logging.info("finished pretrained scoring")

    final_classifier = ConsensusPredictor(clfs)
    # TODO: How to solve this for general (non-linear) predictors?
    # ... maybe just ignore for now
    loaded_weights = final_classifier.get_coefs()

    # per-classifier diagnostics: score and rank-1 flag for every member classifier
    d = {}
    d["tg_id"] = experiment.df.tg_num_id.values
    d["decoy"] = experiment.df.is_decoy.values
    for i in range(len(clfs)):
        s = clfs[i].score(experiment, True)
        experiment.set_and_rerank("classifier_score", s)
        d["clf%d_score" % i] = s.flatten()
        d["clf%d_rank1" % i] = experiment.df.is_top_peak.values
    for c in score_columns:
        d[c] = table[c]

    results, res_dict, data_for_persistence = self.apply_classifier(
        final_classifier, experiment, test_exp, all_test_target_scores,
        all_test_decoy_scores, table, p_score=p_score)
    logging.info("calculated scoring and statistics")
    return results, pd.DataFrame(d), data_for_persistence + (score_columns,), loaded_weights
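# LinearPredictor and ConsensusPredictor are only touched via score() and get_coefs()
# above. A minimal sketch of an interface that would satisfy this call pattern; the
# bodies (and get_feature_matrix) are assumptions, only the method names come from
# the code above:

import numpy as np

class LinearPredictorSketch:
    def __init__(self, weights):
        self.weights = np.asarray(weights, dtype=float).flatten()

    def score(self, experiment, use_main_score=True):
        # linear score: feature matrix times weight vector
        return experiment.get_feature_matrix(use_main_score).dot(self.weights)

    def get_coefs(self):
        return self.weights

class ConsensusPredictorSketch:
    def __init__(self, predictors):
        self.predictors = predictors

    def score(self, experiment, use_main_score=True):
        # average of member scores; for linear members this equals scoring
        # with the averaged weight vector
        return np.mean([p.score(experiment, use_main_score) for p in self.predictors],
                       axis=0)

    def get_coefs(self):
        # only well-defined for linear members, hence the TODO above
        return np.mean([p.get_coefs() for p in self.predictors], axis=0)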
def learn_and_apply_classifier(self, table, p_score=False, loaded_weights=None):
    prepared_table, score_columns = prepare_data_table(table)

    experiment = Experiment(prepared_table)

    is_test = CONFIG.get("is_test", False)

    if is_test:  # for reliable results
        experiment.df.sort_values("tg_id", ascending=True, inplace=True)

    experiment.log_summary()

    inst = self.semi_supervised_learner
    ws = []

    neval = CONFIG.get("xeval.num_iter")
    num_processes = CONFIG.get("num_processes")

    all_test_target_scores = []
    all_test_decoy_scores = []

    if loaded_weights is None:
        logging.info("start %d cross evals using %d processes" % (neval, num_processes))

        if num_processes == 1:
            for k in range(neval):
                (ttt_scores, ttd_scores, w) = inst.learn_randomized(experiment)
                all_test_target_scores.extend(ttt_scores)
                all_test_decoy_scores.extend(ttd_scores)
                ws.append(w.flatten())
        else:
            pool = multiprocessing.Pool(processes=num_processes)
            # dispatch the cross evaluations in chunks of at most num_processes
            while neval:
                remaining = max(0, neval - num_processes)
                todo = neval - remaining
                neval -= todo
                args = ((inst, "learn_randomized", (experiment,)),) * todo
                res = pool.map(unwrap_self_for_multiprocessing, args)
                top_test_target_scores = [ti for r in res for ti in r[0]]
                top_test_decoy_scores = [ti for r in res for ti in r[1]]
                ws.extend([r[2] for r in res])
                all_test_target_scores.extend(top_test_target_scores)
                all_test_decoy_scores.extend(top_test_decoy_scores)
        logging.info("finished cross evals")
    else:
        logging.info("start application of pretrained weights")
        ws.append(loaded_weights.flatten())
        clf_scores = inst.score(experiment, loaded_weights)
        experiment.set_and_rerank("classifier_score", clf_scores)
        all_test_target_scores.extend(experiment.get_top_target_peaks()["classifier_score"])
        all_test_decoy_scores.extend(experiment.get_top_decoy_peaks()["classifier_score"])
        logging.info("finished pretrained scoring")

    final_classifier = self.semi_supervised_learner.averaged_learner(ws)
    loaded_weights = final_classifier.get_parameters()

    result, data_for_persistence = self.apply_classifier(
        final_classifier, experiment, all_test_target_scores,
        all_test_decoy_scores, table, p_score=p_score)
    logging.info("calculated scoring and statistics")
    return result, data_for_persistence + (score_columns,), loaded_weights
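# averaged_learner is where the neval cross-validation weight vectors collapse into
# one classifier. A plausible sketch under the assumption that the learner is linear;
# `learner_factory` and `set_parameters` are hypothetical, the latter mirroring the
# get_parameters() call used above:

import numpy as np

def averaged_learner_sketch(ws, learner_factory):
    avg_w = np.vstack(ws).mean(axis=0)  # element-wise mean of the weight vectors
    learner = learner_factory()
    learner.set_parameters(avg_w)
    return learner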