Example #1
    def score(self, table):

        prepared_table, __ = prepare_data_table(table, score_columns=self.score_columns)
        texp = Experiment(prepared_table)
        score = self.classifier.score(texp, True)
        texp["d_score"] = (score - self.mu) / self.nu

        s_values, q_values = lookup_s_and_q_values_from_error_table(texp["d_score"].values,
                                                                    self.error_stat.df)
        texp["m_score"] = q_values
        texp["s_value"] = s_values
        logging.info("mean m_score = %e, std_dev m_score = %e" % (np.mean(q_values),
                                                                  np.std(q_values, ddof=1)))
        logging.info("mean s_value = %e, std_dev s_value = %e" % (np.mean(s_values),
                                                                  np.std(s_values, ddof=1)))
        texp.add_peak_group_rank()

        df = table.join(texp[["d_score", "m_score", "peak_group_rank"]])

        if CONFIG.get("compute.probabilities"):
            df = self.add_probabilities(df, texp)

        if CONFIG.get("target.compress_results"):
            to_drop = [n for n in df.columns if n.startswith("var_") or n.startswith("main_")]
            df.drop(to_drop, axis=1, inplace=True)

        return df
Example #2
    def score(self, table):

        prepared_table, __ = prepare_data_table(
            table, score_columns=self.score_columns)
        texp = Experiment(prepared_table)
        score = self.classifier.score(texp, True)
        texp["d_score"] = (score - self.mu) / self.nu

        s_values, q_values = lookup_s_and_q_values_from_error_table(
            texp["d_score"].values, self.error_stat.df)
        texp["m_score"] = q_values
        texp["s_value"] = s_values
        logging.info("mean m_score = %e, std_dev m_score = %e" %
                     (np.mean(q_values), np.std(q_values, ddof=1)))
        logging.info("mean s_value = %e, std_dev s_value = %e" %
                     (np.mean(s_values), np.std(s_values, ddof=1)))
        texp.add_peak_group_rank()

        df = table.join(texp[["d_score", "m_score", "peak_group_rank"]])

        if CONFIG.get("compute.probabilities"):
            df = self.add_probabilities(df, texp)

        if CONFIG.get("target.compress_results"):
            to_drop = [
                n for n in df.columns
                if n.startswith("var_") or n.startswith("main_")
            ]
            df.drop(to_drop, axis=1, inplace=True)

        return df
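
Examples #1 and #2 are the same method with different line wrapping. As a rough usage sketch (the scorer construction, the input file and its columns are assumptions, not shown above), the method takes the raw feature table and returns it enriched with the computed columns:

    import pandas as pd

    table = pd.read_csv("input_features.tsv", sep="\t")  # hypothetical input
    scored = scorer.score(table)                          # 'scorer' is built elsewhere
    print(scored[["d_score", "m_score", "peak_group_rank"]].head())
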
Example #3
    def learn_and_apply_classifier(self, table):

        prepared_table, score_columns = prepare_data_table(table)

        experiment = Experiment(prepared_table)

        is_test = CONFIG.get("is_test", False)

        if is_test:  # for reliable results
            experiment.df.sort_values("tg_id", ascending=True, inplace=True)

        experiment.log_summary()

        all_test_target_scores = []
        all_test_decoy_scores = []
        ws = []
        neval = CONFIG.get("xeval.num_iter")
        inst = self.semi_supervised_learner
        num_processes = CONFIG.get("num_processes")
        logging.info("start %d cross evals using %d processes" %
                     (neval, num_processes))
        if num_processes == 1:
            for k in range(neval):
                (ttt_scores, ttd_scores, w) = inst.learn_randomized(experiment)
                all_test_target_scores.extend(ttt_scores)
                all_test_decoy_scores.extend(ttd_scores)
                ws.append(w.flatten())
        else:
            pool = multiprocessing.Pool(processes=num_processes)
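            # dispatch the remaining cross-validation iterations to the pool in
            # chunks of at most num_processes jobs, shrinking neval each pass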
            while neval:
                remaining = max(0, neval - num_processes)
                todo = neval - remaining
                neval -= todo
                args = ((inst, "learn_randomized", (experiment, )), ) * todo
                res = pool.map(unwrap_self_for_multiprocessing, args)
                top_test_target_scores = [ti for r in res for ti in r[0]]
                top_test_decoy_scores = [ti for r in res for ti in r[1]]
                ws.extend([r[2] for r in res])
                all_test_target_scores.extend(top_test_target_scores)
                all_test_decoy_scores.extend(top_test_decoy_scores)

        logging.info("finished cross evals")
        final_classifier = self.semi_supervised_learner.averaged_learner(ws)

        result, data_for_persistence = self.apply_classifier(
            final_classifier, experiment, all_test_target_scores,
            all_test_decoy_scores, table)
        logging.info("calculated scoring and statistics")
        return result, data_for_persistence + (score_columns, )
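
Both multiprocessing branches above call unwrap_self_for_multiprocessing, which is not shown in these examples. A minimal sketch of such a helper, inferred from the (instance, method_name, args) tuples built at the call site (the actual implementation may differ):

    def unwrap_self_for_multiprocessing(arg):
        # Pool.map can only pickle module-level functions, so the caller packs
        # (instance, method_name, args) tuples and the worker unwraps them here.
        inst, method_name, args = arg
        return getattr(inst, method_name)(*args)
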
Example #4
    def apply_loaded_scorer(self, table, loaded_scorer):

        final_classifier, mu, nu, df_raw_stat, num_null, num_total, loaded_score_columns = loaded_scorer

        prepared_table, __ = prepare_data_table(table, loaded_score_columns=loaded_score_columns)

        experiment = Experiment(prepared_table)

        final_score = final_classifier.score(experiment, True)
        experiment["d_score"] = (final_score - mu) / nu

        scored_table = self.enrich_table_with_results(table, experiment, df_raw_stat)

        trained_weights = final_classifier.get_coefs()

        return (None, None, scored_table), None, None, trained_weights
Example #5
    def apply_loaded_scorer(self, table, loaded_scorer):

        final_classifier, mu, nu, df_raw_stat, loaded_score_columns = loaded_scorer

        prepared_table, __ = prepare_data_table(
            table, loaded_score_columns=loaded_score_columns)

        experiment = Experiment(prepared_table)

        final_score = final_classifier.score(experiment, True)
        experiment["d_score"] = (final_score - mu) / nu

        scored_table = self.enrich_table_with_results(table, experiment,
                                                      df_raw_stat)

        return (None, None, scored_table), None
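
Examples #4 and #5 unpack loaded_scorer tuples of different lengths, so the persisted format depends on the variant in use. A rough usage sketch for the five-element variant, assuming the tuple was pickled to disk elsewhere (file name and the runner object are illustrative only):

    import pickle

    with open("scorer_state.bin", "rb") as fh:   # hypothetical path
        loaded_scorer = pickle.load(fh)          # (classifier, mu, nu, df_raw_stat, score_columns)

    (_, _, scored_table), _ = runner.apply_loaded_scorer(table, loaded_scorer)
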
Example #6
    def tutor_and_apply_classifier(self, table, p_score=False, loaded_weights=None):

        prepared_table, score_columns = prepare_data_table(table)

        experiment = Experiment(prepared_table)

        is_test = CONFIG.get("is_test", False)

        if is_test:  # for reliable results
            experiment.df.sort_values("tg_id", ascending=True, inplace=True)

        experiment.log_summary()

        all_test_target_scores = []
        all_test_decoy_scores = []
        clfs = []
        ws = []  # are ws and clfs redundant?

        train_frac = CONFIG.get("train.fraction")
        is_test = CONFIG.get("is_test", False)
        neval = CONFIG.get("xeval.num_iter")
        fraction = CONFIG.get("xeval.fraction")
        teacher = self.semi_supervised_teacher  # inst
        num_processes = CONFIG.get("num_processes")

        # reserve part of experiment for testing and FDR calc.
        experiment.split_train_test(train_frac, is_test)
        train_exp = experiment
        test_exp = None
        if train_frac < 0.99:
            train_exp, test_exp = experiment.get_train_and_test_peaks()

        xval_type = CONFIG.get("xval.type")
        if xval_type == "split":
            train_exp.set_xval_sets(neval, is_test)
            # presumably a helper builds the per-iteration cross-validation sets;
            # note that the assignment below shadows the name being called
            xval_sets = xval_sets(neval, int(fraction * neval + 0.5))

        if loaded_weights is None:
            logging.info("start %d cross evals using %d processes" % (neval, num_processes))
            if num_processes == 1:
                for k in range(neval):
                    if xval_type == "split":
                        train_exp.train_on_xval_sets(xval_sets[k])
                    else:
                        train_exp.split_train_test(fraction, is_test)
                    (ttt_scores, ttd_scores, clf) = teacher.tutor_randomized(train_exp)
                    all_test_target_scores.extend(ttt_scores)
                    all_test_decoy_scores.extend(ttd_scores)
                    clfs.append(clf)
            else:
                pool = multiprocessing.Pool(processes=num_processes)
                while neval:
                    remaining = max(0, neval - num_processes)
                    todo = neval - remaining
                    neval -= todo
                    args = ((teacher, "tutor_randomized", (train_exp, )), ) * todo
                    res = pool.map(unwrap_self_for_multiprocessing, args)
                    top_test_target_scores = [ti for r in res for ti in r[0]]
                    top_test_decoy_scores = [ti for r in res for ti in r[1]]
                    clfs.extend([r[2] for r in res])
                    all_test_target_scores.extend(top_test_target_scores)
                    all_test_decoy_scores.extend(top_test_decoy_scores)

            logging.info("finished cross evals")
        else:
            logging.info("start application of pretrained weights")
            loaded_clf = LinearPredictor(loaded_weights)
            clfs.append(loaded_clf)
            clf_scores = loaded_clf.score(experiment, True)
            experiment.set_and_rerank("classifier_score", clf_scores)
            all_test_target_scores.extend(experiment.get_top_target_peaks()["classifier_score"])
            all_test_decoy_scores.extend(experiment.get_top_decoy_peaks()["classifier_score"])
            logging.info("finished pretrained scoring")

        final_classifier = ConsensusPredictor(clfs)
        # TODO: How to solve this for general (non-linear) predictors?
        # ... maybe just ignore for now
        loaded_weights = final_classifier.get_coefs()

        d = {}
        d["tg_id"] = experiment.df.tg_num_id.values
        d["decoy"] = experiment.df.is_decoy.values
        for i in range(len(clfs)):
            s = clfs[i].score(experiment, True)
            experiment.set_and_rerank("classifier_score", s)
            d["clf%d_score" % i] = s.flatten()
            d["clf%d_rank1" % i] = experiment.df.is_top_peak.values

        for c in score_columns:
            d[c] = table[c]

        results, res_dict, data_for_persistence = self.apply_classifier(
            final_classifier, experiment, test_exp, all_test_target_scores,
            all_test_decoy_scores, table, p_score=p_score)
        logging.info("calculated scoring and statistics")
        return results, pd.DataFrame(d), data_for_persistence + (score_columns,), loaded_weights
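
ConsensusPredictor and LinearPredictor are used above but not defined in this example. A hypothetical sketch of a score-averaging consensus wrapper, written only from the calls made above (score(experiment, True) and get_coefs()); the real class may behave differently:

    import numpy as np

    class ConsensusPredictor(object):
        # hypothetical: average the member classifiers' scores and coefficients
        def __init__(self, classifiers):
            self.classifiers = classifiers

        def score(self, experiment, use_main_score):
            scores = [clf.score(experiment, use_main_score) for clf in self.classifiers]
            return np.mean(scores, axis=0)

        def get_coefs(self):
            return np.mean([clf.get_coefs() for clf in self.classifiers], axis=0)
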
Example #7
    def learn_and_apply_classifier(self, table, p_score=False, loaded_weights=None):

        prepared_table, score_columns = prepare_data_table(table)

        experiment = Experiment(prepared_table)

        is_test = CONFIG.get("is_test", False)

        if is_test:  # for reliable results
            experiment.df.sort_values("tg_id", ascending=True, inplace=True)

        experiment.log_summary()

        inst = self.semi_supervised_learner
        ws = []
        neval = CONFIG.get("xeval.num_iter")
        num_processes = CONFIG.get("num_processes")
        all_test_target_scores = []
        all_test_decoy_scores = []

        if loaded_weights is None:
            logging.info("start %d cross evals using %d processes" % (neval, num_processes))
            if num_processes == 1:
                for k in range(neval):
                    (ttt_scores, ttd_scores, w) = inst.learn_randomized(experiment)
                    all_test_target_scores.extend(ttt_scores)
                    all_test_decoy_scores.extend(ttd_scores)
                    ws.append(w.flatten())
            else:
                pool = multiprocessing.Pool(processes=num_processes)
                while neval:
                    remaining = max(0, neval - num_processes)
                    todo = neval - remaining
                    neval -= todo
                    args = ((inst, "learn_randomized", (experiment, )), ) * todo
                    res = pool.map(unwrap_self_for_multiprocessing, args)
                    top_test_target_scores = [ti for r in res for ti in r[0]]
                    top_test_decoy_scores = [ti for r in res for ti in r[1]]
                    ws.extend([r[2] for r in res])
                    all_test_target_scores.extend(top_test_target_scores)
                    all_test_decoy_scores.extend(top_test_decoy_scores)
            logging.info("finished cross evals")

        else:
            logging.info("start application of pretrained weights")
            ws.append(loaded_weights.flatten())
            clf_scores = inst.score(experiment, loaded_weights)
            experiment.set_and_rerank("classifier_score", clf_scores)

            all_test_target_scores.extend(experiment.get_top_target_peaks()["classifier_score"])
            all_test_decoy_scores.extend(experiment.get_top_decoy_peaks()["classifier_score"])
            logging.info("finished pretrained scoring")

        final_classifier = self.semi_supervised_learner.averaged_learner(ws)

        loaded_weights = final_classifier.get_parameters()

        result, data_for_persistence = self.apply_classifier(final_classifier, experiment,
                                                             all_test_target_scores,
                                                             all_test_decoy_scores, table, p_score=p_score)
        logging.info("calculated scoring and statistics")
        return result, data_for_persistence + (score_columns,), loaded_weights
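
A short usage sketch of the two modes of Example #7: training from scratch versus re-applying previously trained weights (the runner object and surrounding setup are assumptions, not shown above):

    # fresh training: cross-validated semi-supervised learning
    result, persistence, weights = runner.learn_and_apply_classifier(table)

    # re-apply the trained weights to another table without re-learning
    result2, _, _ = runner.learn_and_apply_classifier(other_table, loaded_weights=weights)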