Example #1
	def tutor_and_apply_classifier(self, table, p_score=False, loaded_weights=None):

		prepared_table, score_columns = prepare_data_table(table)

		experiment = Experiment(prepared_table)

		is_test = CONFIG.get("is_test", False)

		if is_test:  # deterministic ordering for reproducible results
			experiment.df.sort_values("tg_id", ascending=True, inplace=True)

		experiment.log_summary()

		all_test_target_scores = []
		all_test_decoy_scores = []
		clfs = []
		
		train_frac = CONFIG.get("train.fraction")
		neval = CONFIG.get("xeval.num_iter")
		fraction = CONFIG.get("xeval.fraction")
		teacher = self.semi_supervised_teacher
		num_processes = CONFIG.get("num_processes")
		
		# hold out part of the experiment for testing and FDR calculation
		experiment.split_train_test(train_frac, is_test)
		train_exp = experiment
		test_exp = None
		if train_frac < 0.99:
			train_exp, test_exp = experiment.get_train_and_test_peaks()
		
		xval_type = CONFIG.get("xval.type")
		if xval_type == "split":
			train_exp.set_xval_sets(neval, is_test)
			# result renamed so the local variable does not shadow the
			# xval_sets() helper (calling it would raise UnboundLocalError)
			xval_split_sets = xval_sets(neval, int(fraction * neval + 0.5))
				
		if loaded_weights is None:
			logging.info("start %d cross evals using %d processes" % (neval, num_processes))
			if num_processes == 1:
				for k in range(neval):
					if xval_type == "split":
						train_exp.train_on_xval_sets(xval_split_sets[k])
					else:
						train_exp.split_train_test(fraction, is_test)
					(ttt_scores, ttd_scores, clf) = teacher.tutor_randomized(train_exp)
					all_test_target_scores.extend(ttt_scores)
					all_test_decoy_scores.extend(ttd_scores)
					clfs.append(clf)
			else:
				pool = multiprocessing.Pool(processes=num_processes)
				while neval:
					# dispatch at most num_processes iterations per pool.map round
					remaining = max(0, neval - num_processes)
					todo = neval - remaining
					neval -= todo
					# pool.map passes one picklable argument per task, so each task is
					# packed as an (instance, method_name, args) tuple for the unwrapper
					args = ((teacher, "tutor_randomized", (train_exp, )), ) * todo
					res = pool.map(unwrap_self_for_multiprocessing, args)
					top_test_target_scores = [ti for r in res for ti in r[0]]
					top_test_decoy_scores = [ti for r in res for ti in r[1]]
					clfs.extend([r[2] for r in res])
					all_test_target_scores.extend(top_test_target_scores)
					all_test_decoy_scores.extend(top_test_decoy_scores)

			logging.info("finished cross evals")
		else:
			logging.info("start application of pretrained weights")
			loaded_clf = LinearPredictor(loaded_weights)
			clfs.append(loaded_clf)
			clf_scores = loaded_clf.score(experiment, True)
			experiment.set_and_rerank("classifier_score", clf_scores)
			all_test_target_scores.extend(experiment.get_top_target_peaks()["classifier_score"])
			all_test_decoy_scores.extend(experiment.get_top_decoy_peaks()["classifier_score"])
			logging.info("finished pretrained scoring")
			

		# ConsensusPredictor presumably combines the per-iteration classifiers,
		# e.g. by averaging their coefficients (see get_coefs below)
		final_classifier = ConsensusPredictor(clfs)
		# TODO: How to solve this for general (non-linear) predictors?
		# ... maybe just ignore for now
		loaded_weights = final_classifier.get_coefs()

		# per-classifier scores and top-peak flags, returned below as a DataFrame
		d = {}
		d["tg_id"] = experiment.df.tg_num_id.values
		d["decoy"] = experiment.df.is_decoy.values
		for i in range(len(clfs)):
			s = clfs[i].score(experiment, True)
			experiment.set_and_rerank("classifier_score", s)
			d["clf%d_score" % i] = s.flatten()
			d["clf%d_rank1" % i] = experiment.df.is_top_peak.values

		for c in score_columns:
			d[c] = table[c]

		results, res_dict, data_for_persistence = self.apply_classifier(final_classifier, experiment, test_exp,
															 all_test_target_scores,
															 all_test_decoy_scores, table, p_score=p_score)
		logging.info("calculated scoring and statistics")
		return results, pd.DataFrame(d), data_for_persistence + (score_columns,), loaded_weights
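Both examples hand `pool.map` tuples of the form `(instance, method_name, args)` and delegate to a module-level helper, because bound methods cannot be pickled by `multiprocessing` on Python 2. A minimal sketch of what `unwrap_self_for_multiprocessing` plausibly looks like (the exact signature in the source may differ):

def unwrap_self_for_multiprocessing(arg):
    # pool.map passes a single picklable argument per task, so the caller
    # packs (instance, method_name, args) into one tuple and we unpack it here
    inst, method_name, args = arg
    return getattr(inst, method_name)(*args)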
Example #2
    def learn_and_apply_classifier(self, table, p_score=False, loaded_weights=None):

        prepared_table, score_columns = prepare_data_table(table)

        experiment = Experiment(prepared_table)

        is_test = CONFIG.get("is_test", False)

        if is_test:  # deterministic ordering for reproducible results
            experiment.df.sort_values("tg_id", ascending=True, inplace=True)

        experiment.log_summary()

        inst = self.semi_supervised_learner
        ws = []
        neval = CONFIG.get("xeval.num_iter")
        num_processes = CONFIG.get("num_processes")
        all_test_target_scores = []
        all_test_decoy_scores = []

        if loaded_weights is None:
            logging.info("start %d cross evals using %d processes" % (neval, num_processes))
            if num_processes == 1:
                for k in range(neval):
                    (ttt_scores, ttd_scores, w) = inst.learn_randomized(experiment)
                    all_test_target_scores.extend(ttt_scores)
                    all_test_decoy_scores.extend(ttd_scores)
                    ws.append(w.flatten())
            else:
                pool = multiprocessing.Pool(processes=num_processes)
                while neval:
                    remaining = max(0, neval - num_processes)
                    todo = neval - remaining
                    neval -= todo
                    args = ((inst, "learn_randomized", (experiment, )), ) * todo
                    res = pool.map(unwrap_self_for_multiprocessing, args)
                    top_test_target_scores = [ti for r in res for ti in r[0]]
                    top_test_decoy_scores = [ti for r in res for ti in r[1]]
                    ws.extend([r[2] for r in res])
                    all_test_target_scores.extend(top_test_target_scores)
                    all_test_decoy_scores.extend(top_test_decoy_scores)
            logging.info("finished cross evals")

        else:
            logging.info("start application of pretrained weights")
            ws.append(loaded_weights.flatten())
            clf_scores = inst.score(experiment, loaded_weights)
            experiment.set_and_rerank("classifier_score", clf_scores)

            all_test_target_scores.extend(experiment.get_top_target_peaks()["classifier_score"])
            all_test_decoy_scores.extend(experiment.get_top_decoy_peaks()["classifier_score"])
            logging.info("finished pretrained scoring")

        final_classifier = self.semi_supervised_learner.averaged_learner(ws)

        loaded_weights = final_classifier.get_parameters()

        result, data_for_persistence = self.apply_classifier(final_classifier, experiment,
                                                             all_test_target_scores,
                                                             all_test_decoy_scores, table, p_score=p_score)
        logging.info("calculated scoring and statistics")
        return result, data_for_persistence + (score_columns,), loaded_weights
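Example #2's final classifier is the average of the weight vectors collected over the cross-evaluation iterations. A minimal sketch of what `averaged_learner` plausibly does, assuming a linear model whose weights can be averaged element-wise (`LinearPredictor` is borrowed from Example #1; in the source, the real method lives on the learner class):

import numpy as np

def averaged_learner(ws):
    # ws holds one flattened weight vector per iteration; stacking gives
    # shape (neval, n_features), and the column-wise mean is the consensus
    avg_w = np.vstack(ws).mean(axis=0)
    return LinearPredictor(avg_w)  # hypothetical: wraps weights as a linear scorer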