def _learn_and_apply_out_of_core(self, pathes, delim):
    """Train a classifier on a random subsample of the input files and
    build a lazily evaluated scoring result.

    NOTE(review): a byte-identical definition of this method appears again
    below; in Python the later ``def`` wins, so this copy is shadowed dead
    code and a candidate for removal.
    """
    # fraction of rows to draw from each input file; must lie in (0, 1]
    sampling_rate = CONFIG.get("out_of_core.sampling_rate")
    assert 0 < sampling_rate <= 1.0, "invalid sampling rate value"
    prepared_tables, score_columns = sample_data_tables(pathes, delim, sampling_rate)
    # pool the per-file subsamples into one training table
    prepared_table = pd.concat(prepared_tables)
    experiment = Experiment(prepared_table)
    experiment.log_summary()
    final_classifier, all_test_target_scores, all_test_decoy_scores = self._learn(experiment)
    # presumably the returned object scores the full files on demand
    # (hence "lazy") -- see _build_lazy_result; TODO confirm
    return self._build_lazy_result(pathes, final_classifier, score_columns, experiment, all_test_target_scores, all_test_decoy_scores)
def _learn_and_apply_out_of_core(self, pathes, delim):
    """Train a classifier on a random subsample of the input files and
    build a lazily evaluated scoring result.

    :param pathes: paths of the input data files
    :param delim: column delimiter used when parsing the files
    :returns: whatever ``self._build_lazy_result`` produces
    :raises ValueError: if the configured sampling rate is outside (0, 1]
    """
    sampling_rate = CONFIG.get("out_of_core.sampling_rate")
    # explicit check instead of `assert`: asserts are stripped under
    # `python -O`, which would silently accept an invalid configuration
    if not 0 < sampling_rate <= 1.0:
        raise ValueError("invalid sampling rate value")
    prepared_tables, score_columns = sample_data_tables(pathes, delim,
                                                        sampling_rate)
    # pool the per-file subsamples into a single training table
    prepared_table = pd.concat(prepared_tables)
    experiment = Experiment(prepared_table)
    experiment.log_summary()
    final_classifier, all_test_target_scores, all_test_decoy_scores = \
        self._learn(experiment)
    return self._build_lazy_result(pathes, final_classifier, score_columns,
                                   experiment, all_test_target_scores,
                                   all_test_decoy_scores)
def learn_and_apply_classifier(self, table):
    """Run cross-validated semi-supervised learning on ``table``, average
    the learned weight vectors into a final classifier and apply it.

    Returns the scored result plus the persistence data extended with the
    score column names.
    """
    prepared_table, score_columns = prepare_data_table(table)
    experiment = Experiment(prepared_table)
    if CONFIG.get("is_test", False):  # for reliable results
        experiment.df.sort("tg_id", ascending=True, inplace=True)
    experiment.log_summary()

    target_scores = []
    decoy_scores = []
    weights = []
    neval = CONFIG.get("xeval.num_iter")
    learner = self.semi_supervised_learner
    num_processes = CONFIG.get("num_processes")
    logging.info("start %d cross evals using %d processes" % (neval, num_processes))

    if num_processes == 1:
        # sequential cross evaluation
        for _ in range(neval):
            ttt_scores, ttd_scores, w = learner.learn_randomized(experiment)
            target_scores.extend(ttt_scores)
            decoy_scores.extend(ttd_scores)
            weights.append(w.flatten())
    else:
        # fan the evaluations out over a pool, at most num_processes at a time
        pool = multiprocessing.Pool(processes=num_processes)
        while neval:
            todo = min(neval, num_processes)
            neval -= todo
            args = ((learner, "learn_randomized", (experiment, )), ) * todo
            res = pool.map(unwrap_self_for_multiprocessing, args)
            target_scores.extend(ti for r in res for ti in r[0])
            decoy_scores.extend(ti for r in res for ti in r[1])
            weights.extend(r[2] for r in res)
    logging.info("finished cross evals")

    final_classifier = self.semi_supervised_learner.averaged_learner(weights)
    result, data_for_persistence = self.apply_classifier(
        final_classifier, experiment, target_scores, decoy_scores, table)
    logging.info("calculated scoring and statistics")
    return result, data_for_persistence + (score_columns, )
def _setup_experiment(self, tables):
    """Prepare all input tables, merge them and wrap the result.

    Returns the combined :class:`Experiment` together with the score
    column names discovered during preparation.
    """
    frames, score_columns = prepare_data_tables(tables)
    experiment = Experiment(pd.concat(frames))
    experiment.log_summary()
    return experiment, score_columns
def tutor_and_apply_classifier(self, table, p_score=False, loaded_weights=None):
    """Cross-validate the semi-supervised teacher on ``table`` (or apply
    pretrained weights), combine the fitted classifiers into a consensus
    predictor and score the experiment.

    :param table: input data table, one row per peak
    :param p_score: forwarded to ``apply_classifier``
    :param loaded_weights: optional pretrained linear weights; when given,
        cross evaluation is skipped and a single ``LinearPredictor`` is used
    :returns: (results, per-classifier diagnostics frame,
        persistence data + (score_columns,), final classifier coefficients)
    """
    prepared_table, score_columns = prepare_data_table(table)
    experiment = Experiment(prepared_table)
    is_test = CONFIG.get("is_test", False)
    if is_test:  # for reliable results
        experiment.df.sort("tg_id", ascending=True, inplace=True)
    experiment.log_summary()

    all_test_target_scores = []
    all_test_decoy_scores = []
    clfs = []
    # (removed unused local `ws`; `clfs` carries everything it held)
    train_frac = CONFIG.get("train.fraction")
    neval = CONFIG.get("xeval.num_iter")
    fraction = CONFIG.get("xeval.fraction")
    teacher = self.semi_supervised_teacher
    num_processes = CONFIG.get("num_processes")

    # reserve part of experiment for testing and FDR calc.
    experiment.split_train_test(train_frac, is_test)
    train_exp = experiment
    test_exp = None
    if train_frac < 0.99:
        train_exp, test_exp = experiment.get_train_and_test_peaks()

    xval_type = CONFIG.get("xval.type")
    xval_set_list = None
    if xval_type == "split":
        train_exp.set_xval_sets(neval, is_test)
        # fixed: the original assigned this result to a local named
        # `xval_sets`, shadowing the helper function of the same name and
        # raising UnboundLocalError on the very call that computed it
        xval_set_list = xval_sets(neval, int(fraction * neval + 0.5))

    if loaded_weights is None:  # fixed: `== None` -> `is None`
        logging.info("start %d cross evals using %d processes" % (neval, num_processes))
        if num_processes == 1:
            for k in range(neval):
                if xval_type == "split":
                    train_exp.train_on_xval_sets(xval_set_list[k])
                else:
                    train_exp.split_train_test(fraction, is_test)
                ttt_scores, ttd_scores, clf = teacher.tutor_randomized(train_exp)
                all_test_target_scores.extend(ttt_scores)
                all_test_decoy_scores.extend(ttd_scores)
                clfs.append(clf)
        else:
            # NOTE(review): the multiprocess path ignores xval_type ==
            # "split" and lets tutor_randomized split internally -- confirm
            # this asymmetry with the sequential path is intended
            pool = multiprocessing.Pool(processes=num_processes)
            while neval:
                todo = min(neval, num_processes)
                neval -= todo
                args = ((teacher, "tutor_randomized", (train_exp, )), ) * todo
                res = pool.map(unwrap_self_for_multiprocessing, args)
                all_test_target_scores.extend(ti for r in res for ti in r[0])
                all_test_decoy_scores.extend(ti for r in res for ti in r[1])
                clfs.extend(r[2] for r in res)
        logging.info("finished cross evals")
    else:
        logging.info("start application of pretrained weights")
        loaded_clf = LinearPredictor(loaded_weights)
        clfs.append(loaded_clf)
        clf_scores = loaded_clf.score(experiment, True)
        experiment.set_and_rerank("classifier_score", clf_scores)
        all_test_target_scores.extend(experiment.get_top_target_peaks()["classifier_score"])
        all_test_decoy_scores.extend(experiment.get_top_decoy_peaks()["classifier_score"])
        logging.info("finished pretrained scoring")

    final_classifier = ConsensusPredictor(clfs)
    # TODO: How to solve this for general (non-linear) predictors?
    # ... maybe just ignore for now
    loaded_weights = final_classifier.get_coefs()

    # per-classifier diagnostics: score and top-peak flag of every member
    d = {}
    d["tg_id"] = experiment.df.tg_num_id.values
    d["decoy"] = experiment.df.is_decoy.values
    for i, clf in enumerate(clfs):
        s = clf.score(experiment, True)
        experiment.set_and_rerank("classifier_score", s)
        d["clf%d_score" % i] = s.flatten()
        d["clf%d_rank1" % i] = experiment.df.is_top_peak.values
    for c in score_columns:
        d[c] = table[c]

    results, res_dict, data_for_persistence = self.apply_classifier(
        final_classifier, experiment, test_exp, all_test_target_scores,
        all_test_decoy_scores, table, p_score=p_score)
    logging.info("calculated scoring and statistics")
    return results, pd.DataFrame(d), data_for_persistence + (score_columns,), loaded_weights
def learn_and_apply_classifier(self, table, p_score=False, loaded_weights=None):
    """Learn a linear classifier by cross evaluation (or reuse pretrained
    weights), average the weight vectors and apply the result to ``table``.

    :param table: input data table, one row per peak
    :param p_score: forwarded to ``apply_classifier``
    :param loaded_weights: optional pretrained weight vector; when given,
        cross evaluation is skipped
    :returns: (result, persistence data + (score_columns,), final weights)
    """
    prepared_table, score_columns = prepare_data_table(table)
    experiment = Experiment(prepared_table)
    is_test = CONFIG.get("is_test", False)
    if is_test:  # for reliable results
        experiment.df.sort("tg_id", ascending=True, inplace=True)
    experiment.log_summary()

    inst = self.semi_supervised_learner
    ws = []
    neval = CONFIG.get("xeval.num_iter")
    num_processes = CONFIG.get("num_processes")
    all_test_target_scores = []
    all_test_decoy_scores = []

    if loaded_weights is None:  # fixed: `== None` -> `is None`
        logging.info("start %d cross evals using %d processes" % (neval, num_processes))
        if num_processes == 1:
            for k in range(neval):
                ttt_scores, ttd_scores, w = inst.learn_randomized(experiment)
                all_test_target_scores.extend(ttt_scores)
                all_test_decoy_scores.extend(ttd_scores)
                ws.append(w.flatten())
        else:
            # batches of at most num_processes evaluations per pool.map call
            pool = multiprocessing.Pool(processes=num_processes)
            while neval:
                todo = min(neval, num_processes)
                neval -= todo
                args = ((inst, "learn_randomized", (experiment, )), ) * todo
                res = pool.map(unwrap_self_for_multiprocessing, args)
                all_test_target_scores.extend(ti for r in res for ti in r[0])
                all_test_decoy_scores.extend(ti for r in res for ti in r[1])
                ws.extend(r[2] for r in res)
        logging.info("finished cross evals")
    else:
        logging.info("start application of pretrained weights")
        ws.append(loaded_weights.flatten())
        clf_scores = inst.score(experiment, loaded_weights)
        experiment.set_and_rerank("classifier_score", clf_scores)
        all_test_target_scores.extend(experiment.get_top_target_peaks()["classifier_score"])
        all_test_decoy_scores.extend(experiment.get_top_decoy_peaks()["classifier_score"])
        logging.info("finished pretrained scoring")

    final_classifier = self.semi_supervised_learner.averaged_learner(ws)
    loaded_weights = final_classifier.get_parameters()
    result, data_for_persistence = self.apply_classifier(
        final_classifier, experiment, all_test_target_scores,
        all_test_decoy_scores, table, p_score=p_score)
    logging.info("calculated scoring and statistics")
    return result, data_for_persistence + (score_columns,), loaded_weights