def test_class_with_data_preproc(self):
    """Test for a simple pipeline with data and a preprocessing step."""
    X, y = load_breast_cancer(return_X_y=True)

    # DESIGN YOUR PIPELINE
    my_pipe = Hyperpipe('basic_svm_pipe',
                        optimizer='grid_search',
                        metrics=['accuracy', 'precision', 'recall', 'balanced_accuracy'],
                        best_config_metric='accuracy',
                        eval_final_performance=False,
                        outer_cv=KFold(n_splits=2),
                        inner_cv=KFold(n_splits=3),
                        verbosity=1,
                        random_seed=42)

    preprocessing = Preprocessing()
    preprocessing += PipelineElement("LabelEncoder")
    my_pipe += preprocessing

    # ADD ELEMENTS TO YOUR PIPELINE
    # first normalize all features
    my_pipe.add(PipelineElement('StandardScaler'))
    # then do feature selection using a PCA
    my_pipe += PipelineElement('PCA',
                               hyperparameters={'n_components': IntegerRange(10, 12)},
                               test_disabled=True)
    # engage and optimize the good old SVM for classification
    my_pipe += PipelineElement('SVC',
                               hyperparameters={'kernel': Categorical(['rbf', 'linear'])},
                               C=2, gamma='scale')

    # NOW TRAIN YOUR PIPELINE
    my_pipe.fit(X, y)

    # serialize the pipeline to JSON, reload it, and check that both round trips agree
    json_transformer = JsonTransformer()
    pipe_json = json_transformer.create_json(my_pipe)
    my_pipe_reload = json_transformer.from_json(pipe_json)
    pipe_json_reload = json_transformer.create_json(my_pipe_reload)
    self.assertEqual(pipe_json, pipe_json_reload)

    my_pipe_reload.fit(X, y)
    self.assertDictEqual(my_pipe.best_config, my_pipe_reload.best_config)
    self.assertDictEqual(elements_to_dict(my_pipe.copy_me()),
                         elements_to_dict(my_pipe_reload.copy_me()))
def test_class_with_data_01(self):
    """Test for a simple pipeline with data."""
    X, y = load_breast_cancer(return_X_y=True)

    # DESIGN YOUR PIPELINE
    my_pipe = Hyperpipe(
        "basic_svm_pipe",
        optimizer="grid_search",
        metrics=["accuracy", "precision", "recall", "balanced_accuracy"],
        best_config_metric="accuracy",
        eval_final_performance=False,
        outer_cv=KFold(n_splits=2),
        inner_cv=KFold(n_splits=3),
        verbosity=1,
        random_seed=42,
    )

    preprocessing = Preprocessing()
    preprocessing += PipelineElement("LabelEncoder")
    my_pipe += preprocessing

    # ADD ELEMENTS TO YOUR PIPELINE
    # first normalize all features
    my_pipe.add(PipelineElement("StandardScaler"))
    # then do feature selection using a PCA
    my_pipe += PipelineElement(
        "PCA",
        hyperparameters={"n_components": IntegerRange(10, 12)},
        test_disabled=True,
    )
    # engage and optimize the good old SVM for classification
    my_pipe += PipelineElement(
        "SVC",
        hyperparameters={"kernel": Categorical(["rbf", "linear"])},
        C=2,
        gamma="scale",
    )

    # NOW TRAIN YOUR PIPELINE
    my_pipe.fit(X, y)

    # serialize to JSON, reload, and check that both round trips agree
    json_transformer = JsonTransformer()
    pipe_json = json_transformer.create_json(my_pipe)
    my_pipe_reload = json_transformer.from_json(pipe_json)
    pipe_json_reload = json_transformer.create_json(my_pipe_reload)
    self.assertEqual(pipe_json, pipe_json_reload)

    my_pipe_reload.fit(X, y)
    self.assertDictEqual(my_pipe.best_config, my_pipe_reload.best_config)
# WE USE THE BREAST CANCER SET FROM SKLEARN
X, y = load_breast_cancer(return_X_y=True)

# DESIGN YOUR PIPELINE
my_pipe = Hyperpipe(
    "basic_svm_pipe",
    optimizer="sk_opt",
    optimizer_params={"n_configurations": 10},
    metrics=["accuracy", "precision", "recall", "balanced_accuracy"],
    best_config_metric="accuracy",
    outer_cv=KFold(n_splits=3),
    inner_cv=KFold(n_splits=3),
    verbosity=1,
    output_settings=OutputSettings(project_folder="./tmp/"),
)

# ADD ELEMENTS TO YOUR PIPELINE
my_pipe.add(PipelineElement("StandardScaler"))
my_pipe += PipelineElement(
    "PhotonMLPClassifier",
    hyperparameters={
        "layer_1": IntegerRange(0, 5),
        "layer_2": IntegerRange(0, 5),
        "layer_3": IntegerRange(0, 5),
    },
)

# NOW TRAIN YOUR PIPELINE
my_pipe.fit(X, y)
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import KFold

from photonai.base import Hyperpipe, PipelineElement, OutputSettings
from photonai.optimization import IntegerRange

# WE USE THE BREAST CANCER SET FROM SKLEARN
X, y = load_breast_cancer(return_X_y=True)

# DESIGN YOUR PIPELINE
my_pipe = Hyperpipe('basic_svm_pipe',
                    optimizer='sk_opt',
                    optimizer_params={'n_configurations': 10},
                    metrics=['accuracy', 'precision', 'recall', 'balanced_accuracy'],
                    best_config_metric='accuracy',
                    outer_cv=KFold(n_splits=3),
                    inner_cv=KFold(n_splits=3),
                    verbosity=1,
                    output_settings=OutputSettings(project_folder='./tmp/'))

# ADD ELEMENTS TO YOUR PIPELINE
my_pipe.add(PipelineElement('StandardScaler'))
my_pipe += PipelineElement('PhotonMLPClassifier',
                           hyperparameters={'layer_1': IntegerRange(1, 5),
                                            'layer_2': IntegerRange(0, 5),
                                            'layer_3': IntegerRange(0, 5)})

# NOW TRAIN YOUR PIPELINE
my_pipe.fit(X, y)
class Smac3IntegrationTest(unittest.TestCase):

    def setUp(self):
        self.time_limit = 60 * 2

        settings = OutputSettings(project_folder="./tmp/")

        self.smac_helper = {"data": None, "initial_runs": None}

        # DESIGN YOUR PIPELINE
        self.pipe = Hyperpipe(
            "basic_svm_pipe",  # the name of your pipeline
            optimizer="smac",  # which optimizer PHOTON shall use
            optimizer_params={
                "wallclock_limit": self.time_limit,
                "smac_helper": self.smac_helper,
                "run_limit": 20,
            },
            metrics=["accuracy"],  # the performance metrics of your interest
            best_config_metric="accuracy",
            inner_cv=KFold(n_splits=3),  # test each configuration three times
            verbosity=0,
            output_settings=settings,
        )

    def simple_classification(self):
        dataset = fetch_olivetti_faces(download_if_missing=True)
        X = dataset["data"]
        y = dataset["target"]
        # self.X, self.y = load_digits(n_class=2, return_X_y=True)
        return X, y

    def test_against_smac(self):
        # PHOTON implementation
        self.pipe.add(PipelineElement("StandardScaler"))
        # then do feature selection using a PCA, specify which values to try in the hyperparameter search
        self.pipe += PipelineElement(
            "PCA", hyperparameters={"n_components": IntegerRange(5, 30)}
        )
        # engage and optimize the good old SVM for classification
        self.pipe += PipelineElement(
            "SVC",
            hyperparameters={
                "kernel": Categorical(["linear", "rbf", "poly", "sigmoid"]),
                "C": FloatRange(0.5, 200),
            },
            gamma="auto",
        )

        self.X, self.y = self.simple_classification()
        self.pipe.fit(self.X, self.y)

        # AUTO ML direct
        # build the configuration space which defines all parameters and their ranges
        cs = ConfigurationSpace()

        # we define a few possible types of SVM kernels and add them as "kernel" to our cs
        n_components = UniformIntegerHyperparameter("PCA__n_components", 5, 30)  # default_value=5
        cs.add_hyperparameter(n_components)
        kernel = CategoricalHyperparameter(
            "SVC__kernel", ["linear", "rbf", "poly", "sigmoid"]
        )  # default_value="linear"
        cs.add_hyperparameter(kernel)
        c = UniformFloatHyperparameter("SVC__C", 0.5, 200)  # default_value=1
        cs.add_hyperparameter(c)

        # Scenario object
        scenario = Scenario(
            {
                "run_obj": "quality",  # we optimize quality (alternatively runtime)
                "runcount-limit": 800,  # maximum function evaluations
                "cs": cs,  # configuration space
                "deterministic": "true",
                "shared_model": "false",  # !!!!
                "wallclock_limit": self.time_limit,
            }
        )

        # Optimize, using a SMAC object
        print("Optimizing! Depending on your machine, this might take a few minutes.")
        smac = SMAC4BO(
            scenario=scenario,
            rng=np.random.RandomState(42),
            tae_runner=self.objective_function,
        )
        self.traurig = smac  # keep a reference to the SMAC object

        incumbent = smac.optimize()

        inc_value = self.objective_function(incumbent)
        print(incumbent)
        print(inc_value)

        runhistory_photon = self.smac_helper["data"].solver.runhistory
        runhistory_original = smac.solver.runhistory

        x_ax = range(
            1,
            min(
                len(runhistory_original.cost_per_config.keys()),
                len(runhistory_photon.cost_per_config.keys()),
            )
            + 1,
        )
        y_ax_original = [runhistory_original.cost_per_config[tmp] for tmp in x_ax]
        y_ax_photon = [runhistory_photon.cost_per_config[tmp] for tmp in x_ax]
        y_ax_original_inc = [min(y_ax_original[:tmp + 1]) for tmp in x_ax]
        y_ax_photon_inc = [min(y_ax_photon[:tmp + 1]) for tmp in x_ax]

        plt.figure(figsize=(10, 7))
        plt.plot(x_ax, y_ax_original, "g", label="Original")
        plt.plot(x_ax, y_ax_photon, "b", label="PHOTON")
        plt.plot(x_ax, y_ax_photon_inc, "r", label="PHOTON Incumbent")
        plt.plot(x_ax, y_ax_original_inc, "k", label="Original Incumbent")
        plt.title("Photon Prove")
        plt.xlabel("X")
        plt.ylabel("Y")
        plt.legend(loc="best")
        plt.show()

        def neighbours(items, fill=None):
            # yield each item together with its left and right neighbour
            before = itertools.chain([fill], items)
            after = itertools.chain(items, [fill])  # You could use itertools.zip_longest() instead.
            next(after)
            for a, b, c in zip(before, items, after):
                yield [value for value in (a, b, c) if value is not fill]

        print("---------------")
        original_pairing = [
            sum(values) / len(values) for values in neighbours(y_ax_original)
        ]
        bias_term = np.mean(
            [
                abs(y_ax_original_inc[t] - y_ax_photon_inc[t])
                for t in range(len(y_ax_photon_inc))
            ]
        )
        photon_pairing = [
            sum(values) / len(values) - bias_term for values in neighbours(y_ax_photon)
        ]

        counter = 0
        for i, x in enumerate(x_ax):
            if abs(original_pairing[i] - photon_pairing[i]) > 0.05:
                counter += 1
        self.assertLessEqual(counter / len(x_ax), 0.15)

    def objective_function(self, cfg):
        cfg = {k: cfg[k] for k in cfg if cfg[k]}

        sc = PipelineElement("StandardScaler", {})
        pca = PipelineElement("PCA", {}, random_state=3)
        svc = PipelineElement("SVC", {}, random_state=3, gamma="auto")
        my_pipe = PhotonPipeline([("StandardScaler", sc), ("PCA", pca), ("SVC", svc)])
        my_pipe.set_params(**cfg)
        metric = cross_val_score(
            my_pipe,
            self.X,
            self.y,
            cv=3,
            scoring=make_scorer(accuracy_score, greater_is_better=True),
        )  # alternatively: scoring=my_pipe.predict
        print("run")
        return 1 - np.mean(metric)
class HyperpipeTests(PhotonBaseTest):
    def setup_hyperpipe(self, output_settings=None):
        if output_settings is None:
            output_settings = OutputSettings(project_folder=self.tmp_folder_path)
        self.hyperpipe = Hyperpipe(
            "god",
            inner_cv=self.inner_cv_object,
            metrics=self.metrics,
            best_config_metric=self.best_config_metric,
            output_settings=output_settings,
        )
        self.hyperpipe += self.ss_pipe_element
        self.hyperpipe += self.pca_pipe_element
        self.hyperpipe.add(self.svc_pipe_element)

    def setUp(self):
        super(HyperpipeTests, self).setUp()
        self.ss_pipe_element = PipelineElement("StandardScaler")
        self.pca_pipe_element = PipelineElement(
            "PCA", {"n_components": [1, 2]}, random_state=42, test_disabled=True
        )
        self.svc_pipe_element = PipelineElement(
            "SVC",
            {"C": [0.1, 1], "kernel": ["linear"]},  # 'rbf', 'sigmoid'
            random_state=42,
        )
        self.inner_cv_object = KFold(n_splits=3)
        self.metrics = ["accuracy", "recall", "precision"]
        self.best_config_metric = "accuracy"
        self.setup_hyperpipe()

        dataset = load_breast_cancer()
        self.__X = dataset.data
        self.__y = dataset.target

    def test_init(self):
        # test that all init parameters can be retrieved via the cleaned up subclasses
        self.assertEqual(self.hyperpipe.name, "god")

        # if no information is given, check for the default parameters,
        # otherwise for the infos given in setUp

        # Cross Validation
        self.assertIsNotNone(self.hyperpipe.cross_validation)
        self.assertEqual(self.hyperpipe.cross_validation.inner_cv, self.inner_cv_object)
        self.assertIsNone(self.hyperpipe.cross_validation.outer_cv)
        self.assertTrue(self.hyperpipe.cross_validation.eval_final_performance)
        self.assertTrue(self.hyperpipe.cross_validation.calculate_metrics_per_fold)
        self.assertFalse(self.hyperpipe.cross_validation.calculate_metrics_across_folds)
        self.assertIsNone(self.hyperpipe.cross_validation.outer_folds)
        self.assertDictEqual(self.hyperpipe.cross_validation.inner_folds, {})

        # Optimization
        self.assertIsNotNone(self.hyperpipe.optimization)
        self.assertListEqual(self.hyperpipe.optimization.metrics, self.metrics)
        self.assertEqual(
            self.hyperpipe.optimization.best_config_metric, self.best_config_metric
        )
        self.assertEqual(self.hyperpipe.optimization.optimizer_input_str, "grid_search")
        self.assertTrue(self.hyperpipe.optimization.maximize_metric)
        self.assertIsNone(self.hyperpipe.optimization.performance_constraints)
        self.assertDictEqual(self.hyperpipe.optimization.optimizer_params, {})

    def test_add(self):
        # assure the pipeline has three elements: scaler, pca and svc
        self.assertEqual(len(self.hyperpipe.elements), 3)
        self.assertIs(self.hyperpipe.elements[0], self.ss_pipe_element)
        self.assertIs(self.hyperpipe.elements[1], self.pca_pipe_element)
        self.assertIs(self.hyperpipe.elements[2], self.svc_pipe_element)
        # todo: assure that no two elements can be added with the same name

        # test add method special cases
        with self.assertRaises(TypeError):
            self.hyperpipe.add(object())

        # assure that preprocessing is identified and set to the extra variable,
        # there is only one preprocessing item
        my_preproc = Preprocessing()
        self.hyperpipe.add(my_preproc)
        self.assertEqual(my_preproc, self.hyperpipe.preprocessing)
        # make sure the element does not end up in the main pipeline
        self.assertTrue(all(item is not my_preproc for item in self.hyperpipe.elements))

        def my_func(X, y, **kwargs):
            return True

        # test adding callback item
        my_call_back_item = CallbackElement("test_element", my_func, "predict")
        self.hyperpipe.add(my_call_back_item)
        self.assertIs(self.hyperpipe.elements[-1], my_call_back_item)

    def test_no_metrics(self):
        # make sure that no metrics means raising an error
        with self.assertRaises(ValueError):
            hyperpipe = Hyperpipe("hp_name", inner_cv=self.inner_cv_object)

        # make sure that PHOTON raises a warning if no best config metric is given
        with self.assertRaises(Warning):
            hyperpipe = Hyperpipe(
                "hp_name",
                inner_cv=self.inner_cv_object,
                metrics=["accuracy", "f1_score"],
            )

    def test_preprocessing(self):
        prepro_pipe = Preprocessing()
        prepro_pipe += PipelineElement.create(
            "dummy", DummyYAndCovariatesTransformer(), {}
        )

        self.hyperpipe += prepro_pipe
        self.hyperpipe.fit(self.__X, self.__y)

        self.assertTrue(np.array_equal(self.__y + 1, self.hyperpipe.data.y))

    def test_estimation_type(self):
        def callback(X, y=None, **kwargs):
            pass

        pipe = Hyperpipe(
            "name", inner_cv=KFold(n_splits=2), best_config_metric="mean_squared_error"
        )

        with self.assertRaises(NotImplementedError):
            pipe += PipelineElement("PCA")
            est_type = pipe.estimation_type

        pipe += PipelineElement("SVC")
        self.assertEqual(pipe.estimation_type, "classifier")

        pipe.elements[-1] = PipelineElement("SVR")
        self.assertEqual(pipe.estimation_type, "regressor")

        with self.assertRaises(NotImplementedError):
            pipe.elements[-1] = CallbackElement("MyCallback", callback)
            est_type = pipe.estimation_type

    def test_copy_me(self):
        self.maxDiff = None
        copy = self.hyperpipe.copy_me()
        copy2 = self.hyperpipe.copy_me()
        self.assertDictEqual(elements_to_dict(copy), elements_to_dict(self.hyperpipe))

        copy_after_fit = self.hyperpipe.fit(self.__X, self.__y).copy_me()
        copy_after_fit = elements_to_dict(copy_after_fit)

        # the current_configs of the elements are not None after calling fit() on a hyperpipe
        # when copying the respective PipelineElement, these current_configs are copied, too
        # this is why we need to delete _pipe and elements before asserting for equality
        copy_after_fit["_pipe"] = None
        copy_after_fit["elements"] = None
        copy = elements_to_dict(copy)
        copy["_pipe"] = None
        copy["elements"] = None
        self.assertDictEqual(copy, copy_after_fit)

        # check if deepcopy worked
        copy2.cross_validation.inner_cv.n_splits = 10
        self.assertEqual(copy2.cross_validation.inner_cv.n_splits, 10)
        self.assertEqual(self.hyperpipe.cross_validation.inner_cv.n_splits, 3)

    def test_save_optimum_pipe(self):
        # todo: test .save() of custom model
        tmp_path = os.path.join(self.tmp_folder_path, "optimum_pipypipe")
        settings = OutputSettings(project_folder=tmp_path, overwrite_results=True)

        my_pipe = Hyperpipe(
            "hyperpipe",
            optimizer="random_grid_search",
            optimizer_params={"n_configurations": 3},
            metrics=["accuracy", "precision", "recall"],
            best_config_metric="f1_score",
            outer_cv=KFold(n_splits=2),
            inner_cv=KFold(n_splits=2),
            verbosity=1,
            output_settings=settings,
        )

        preproc = Preprocessing()
        preproc += PipelineElement("StandardScaler")

        # BRANCH WITH QuantileTransformer AND DecisionTreeClassifier
        tree_qua_branch = Branch("tree_branch")
        tree_qua_branch += PipelineElement("QuantileTransformer")
        tree_qua_branch += PipelineElement(
            "DecisionTreeClassifier",
            {"min_samples_split": IntegerRange(2, 4)},
            criterion="gini",
        )

        # BRANCH WITH MinMaxScaler AND SVC
        svm_mima_branch = Branch("svm_branch")
        svm_mima_branch += PipelineElement("MinMaxScaler")
        svm_mima_branch += PipelineElement(
            "SVC", {"kernel": Categorical(["rbf", "linear"]), "C": 2.0}, gamma="auto"
        )

        # BRANCH WITH StandardScaler AND KNeighborsClassifier
        knn_sta_branch = Branch("neighbour_branch")
        knn_sta_branch += PipelineElement.create("dummy", DummyTransformer(), {})
        knn_sta_branch += PipelineElement("KNeighborsClassifier")

        my_pipe += preproc
        # voting=True would average the result of every branch
        my_pipe += Stack(
            "final_stack", [tree_qua_branch, svm_mima_branch, knn_sta_branch]
        )
        my_pipe += PipelineElement("LogisticRegression", solver="lbfgs")

        my_pipe.fit(self.__X, self.__y)
        model_path = os.path.join(
            my_pipe.output_settings.results_folder, "photon_best_model.photon"
        )
        self.assertTrue(os.path.exists(model_path))

        # now move the optimum pipe to a new folder
        test_folder = os.path.join(
            my_pipe.output_settings.results_folder, "new_test_folder"
        )
        new_model_path = os.path.join(test_folder, "photon_best_model.photon")
        os.makedirs(test_folder)
        shutil.copyfile(model_path, new_model_path)

        # check if load_optimum_pipe also works
        # check if we have the meta information recovered
        loaded_optimum_pipe = Hyperpipe.load_optimum_pipe(new_model_path)
        self.assertIsNotNone(loaded_optimum_pipe._meta_information)
        self.assertIsNotNone(loaded_optimum_pipe._meta_information["photon_version"])

        # check if predictions stay reliably the same
        y_pred_loaded = loaded_optimum_pipe.predict(self.__X)
        y_pred = my_pipe.optimum_pipe.predict(self.__X)
        np.testing.assert_array_equal(y_pred_loaded, y_pred)

    def test_overwrite_result_folder(self):
        """Test for the right handling of the parameter output_settings.overwrite_results."""

        def get_summary_file():
            return os.path.join(
                self.hyperpipe.output_settings.results_folder, "photon_summary.txt"
            )

        # Case 1: default
        output_settings1 = OutputSettings(
            project_folder=self.tmp_folder_path,
            save_output=True,
            overwrite_results=False,
        )
        self.setup_hyperpipe(output_settings1)
        self.hyperpipe.fit(self.__X, self.__y)
        tmp_path = get_summary_file()

        time.sleep(2)

        # again with the same settings
        self.setup_hyperpipe(output_settings1)
        self.hyperpipe.fit(self.__X, self.__y)
        tmp_path2 = get_summary_file()

        # we expect a new output folder each time, with a timestamp
        self.assertNotEqual(tmp_path, tmp_path2)

        # Case 2, overwrite results: all in the same folder
        output_settings2 = OutputSettings(
            project_folder=self.tmp_folder_path,
            save_output=True,
            overwrite_results=True,
        )
        self.setup_hyperpipe(output_settings2)
        self.hyperpipe.fit(self.__X, self.__y)
        tmp_path = get_summary_file()
        tmp_date = os.path.getmtime(tmp_path)

        self.setup_hyperpipe(output_settings2)
        self.hyperpipe.fit(self.__X, self.__y)
        tmp_path2 = get_summary_file()
        tmp_date2 = os.path.getmtime(tmp_path2)

        # same folder, but the summary file is overwritten by the new analysis
        self.assertEqual(tmp_path, tmp_path2)
        self.assertNotEqual(tmp_date, tmp_date2)

        # Case 3: we have a cache folder
        self.hyperpipe.cache_folder = self.cache_folder_path
        shutil.rmtree(self.cache_folder_path, ignore_errors=True)
        self.hyperpipe.fit(self.__X, self.__y)
        self.assertTrue(os.path.exists(self.cache_folder_path))

    def test_random_state(self):
        self.hyperpipe.random_state = 4567
        self.hyperpipe.fit(self.__X, self.__y)
        # assure we spread the word..!
        self.assertEqual(self.hyperpipe.random_state, 4567)
        self.assertEqual(self.hyperpipe._pipe.random_state, 4567)
        self.assertEqual(self.hyperpipe.optimum_pipe.random_state, 4567)
        self.assertEqual(self.hyperpipe._pipe.elements[-1][-1].random_state, 4567)
        self.assertEqual(
            self.hyperpipe._pipe.elements[-1][-1].base_element.random_state, 4567
        )

    def test_dummy_estimator_preparation(self):
        self.hyperpipe.results = MDBHyperpipe()
        self.hyperpipe.results.dummy_estimator = dummy_estimator = MDBDummyResults()

        # one time regressor, one time classifier, one time strange object
        self.hyperpipe.elements = list()
        self.hyperpipe.add(PipelineElement("SVC"))
        dummy_estimator = self.hyperpipe._prepare_dummy_estimator()
        self.assertTrue(isinstance(dummy_estimator, DummyClassifier))

        self.hyperpipe.elements = list()
        self.hyperpipe.add(PipelineElement("SVR"))
        dummy_estimator = self.hyperpipe._prepare_dummy_estimator()
        self.assertTrue(isinstance(dummy_estimator, DummyRegressor))

        with self.assertRaises(NotImplementedError):
            self.hyperpipe.elements = list()
            self.hyperpipe.add(PipelineElement("PCA"))
            dummy_estimator = self.hyperpipe._prepare_dummy_estimator()
            self.assertIsNone(dummy_estimator)

    def setup_crazy_pipe(self):
        # erase all, we need a complex and crazy task
        self.hyperpipe.elements = list()

        nmb_list = list()
        for i in range(5):
            nmb = NeuroBranch(name=str(i), nr_of_processes=i + 3)
            nmb += PipelineElement("SmoothImages")
            nmb_list.append(nmb)

        my_switch = Switch("disabling_test_switch")
        my_switch += nmb_list[0]
        my_switch += nmb_list[1]

        my_stack = Stack("stack_of_branches")
        for i in range(3):
            my_branch = Branch("branch_" + str(i + 2))
            my_branch += PipelineElement("StandardScaler")
            my_branch += nmb_list[i + 2]
            my_stack += my_branch

        self.hyperpipe.add(my_stack)
        self.hyperpipe.add(PipelineElement("StandardScaler"))
        self.hyperpipe.add(my_switch)
        self.hyperpipe.add(PipelineElement("SVC"))
        return nmb_list

    def test_recursive_disabling(self):
        list_of_elements_to_detect = self.setup_crazy_pipe()
        self.hyperpipe._pipe = Branch.prepare_photon_pipe(list_of_elements_to_detect)
        Hyperpipe.disable_multiprocessing_recursively(self.hyperpipe._pipe)
        self.assertTrue(all(i.nr_of_processes == 1 for i in list_of_elements_to_detect))

    def test_recursive_cache_folder_propagation(self):
        list_of_elements = self.setup_crazy_pipe()
        self.hyperpipe._pipe = Branch.prepare_photon_pipe(self.hyperpipe.elements)
        self.hyperpipe.recursive_cache_folder_propagation(
            self.hyperpipe._pipe, self.cache_folder_path, "fold_id_123"
        )
        for i, nmbranch in enumerate(list_of_elements):
            if i > 1:
                start_folder = os.path.join(
                    self.cache_folder_path, "branch_" + nmbranch.name
                )
            else:
                start_folder = self.cache_folder_path
            expected_folder = os.path.join(start_folder, nmbranch.name)
            self.assertEqual(nmbranch.base_element.cache_folder, expected_folder)

    def test_prepare_result_logging(self):
        # test that the results object is given and entails hyperpipe infos
        self.hyperpipe.data.X = self.__X
        self.hyperpipe.data.y = self.__y
        self.hyperpipe._prepare_result_logging(datetime.datetime.now())
        self.assertTrue(isinstance(self.hyperpipe.results, MDBHyperpipe))
        self.assertTrue(isinstance(self.hyperpipe.results_handler, ResultsHandler))
        self.assertTrue(len(self.hyperpipe.results.outer_folds) == 0)

    def test_finalize_optimization(self):
        # this is kind of difficult to test, that's why we fake it
        self.hyperpipe.fit(self.__X, self.__y)

        # reset all infos
        self.hyperpipe.results.dummy_estimator.train = MDBScoreInformation()
        self.hyperpipe.results.dummy_estimator.test = MDBScoreInformation()
        self.hyperpipe.results.metrics_train = {}
        self.hyperpipe.results.metrics_test = {}
        self.hyperpipe.best_config = None
        self.hyperpipe.results.best_config = MDBConfig()
        self.hyperpipe.optimum_pipe = None

        # now generate infos again
        self.hyperpipe._finalize_optimization()

        expected_num_of_metrics = len(self.hyperpipe.optimization.metrics)
        # dummy average values
        self.assertTrue(
            len(self.hyperpipe.results.dummy_estimator.train), expected_num_of_metrics
        )
        self.assertTrue(
            len(self.hyperpipe.results.dummy_estimator.test), expected_num_of_metrics
        )
        # overall average values
        self.assertTrue(
            len(self.hyperpipe.results.metrics_train), 2 * expected_num_of_metrics
        )
        self.assertTrue(
            len(self.hyperpipe.results.metrics_test), 2 * expected_num_of_metrics
        )

        # find best config
        self.assertIsNotNone(self.hyperpipe.best_config)
        self.assertIsNotNone(self.hyperpipe.results.best_config)
        self.assertEqual(
            self.hyperpipe.best_config, self.hyperpipe.results.best_config.config_dict
        )

        # set optimum pipe and params
        # todo: test add preprocessing
        self.assertIsNotNone(self.hyperpipe.optimum_pipe)
        self.assertEqual(
            self.hyperpipe.optimum_pipe.named_steps["SVC"].base_element.C,
            self.hyperpipe.best_config["SVC__C"],
        )

        # save optimum model
        self.assertTrue(
            os.path.isfile(
                os.path.join(
                    self.hyperpipe.output_settings.results_folder,
                    "photon_best_model.photon",
                )
            )
        )

        # backmapping
        # because the pca is test-disabled, we expect the full number of features
        self.assertEqual(
            len(self.hyperpipe.results.best_config_feature_importances[0]),
            self.__X.shape[1],
        )
        backmapped_feature_importances = os.path.join(
            self.hyperpipe.output_settings.results_folder,
            "optimum_pipe_feature_importances_backmapped.csv",
        )
        self.assertTrue(os.path.isfile(backmapped_feature_importances))
        loaded_array = np.loadtxt(
            open(backmapped_feature_importances, "rb"), delimiter=","
        )
        self.assertEqual(loaded_array.shape[0], self.__X.shape[1])

    def test_optimum_pipe_predict_and_predict_proba_and_transform(self):
        # find the best config and test against sklearn
        self.hyperpipe.elements[-1] = PipelineElement(
            "RandomForestClassifier",
            {"n_estimators": IntegerRange(4, 20, step=2)},
            random_state=42,
        )
        self.hyperpipe.fit(self.__X, self.__y)

        # rebuild the best config as a plain sklearn pipeline and compare
        best_config_copy = dict(self.hyperpipe.best_config)
        del best_config_copy["PCA__disabled"]
        if self.hyperpipe.best_config["PCA__disabled"]:
            sk_elements = [
                ("StandardScaler", StandardScaler()),
                ("RandomForestClassifier", RandomForestClassifier(random_state=42)),
            ]
        else:
            sk_elements = [
                ("StandardScaler", StandardScaler()),
                ("PCA", PCA(random_state=42)),
                ("RandomForestClassifier", RandomForestClassifier(random_state=42)),
            ]
        self.sklearn_pipe = SKLPipeline(sk_elements)
        self.sklearn_pipe.set_params(**best_config_copy)
        self.sklearn_pipe.fit(self.__X, self.__y)

        self.assertTrue(
            np.array_equal(
                self.sklearn_pipe.predict(self.__X), self.hyperpipe.predict(self.__X)
            )
        )
        self.assertTrue(
            np.array_equal(
                self.sklearn_pipe.predict_proba(self.__X),
                self.hyperpipe.predict_proba(self.__X),
            )
        )

        # fake transform on the sklearn pipe
        step1 = self.sklearn_pipe.named_steps["StandardScaler"].transform(self.__X)
        if "PCA" in self.sklearn_pipe.named_steps:
            step2 = self.sklearn_pipe.named_steps["PCA"].transform(self.__X)
        else:
            step2 = step1
        self.assertTrue(np.array_equal(step2, self.hyperpipe.transform(self.__X)))
class ResultsHandlerTest(PhotonBaseTest):

    def setUp(self):
        """Set default start settings for all tests."""
        super(ResultsHandlerTest, self).setUp()

        self.files = ['best_config_predictions.csv',
                      'time_monitor.csv',
                      'time_monitor_pie.png',
                      'photon_result_file.p',
                      'photon_summary.txt',
                      'photon_best_model.photon',
                      'optimum_pipe_feature_importances_backmapped.npz',
                      'photon_code.py',
                      'optimizer_history.png']

        self.output_settings = OutputSettings(project_folder=self.tmp_folder_path,
                                              save_output=True)

        self.ss_pipe_element = PipelineElement('StandardScaler')
        self.pca_pipe_element = PipelineElement('PCA',
                                                {'n_components': [1, 2]},
                                                random_state=42)
        self.svc_pipe_element = PipelineElement('SVC',
                                                {'C': [0.1],
                                                 'kernel': ['linear']},  # 'rbf', 'sigmoid'
                                                random_state=42)
        self.inner_cv_object = KFold(n_splits=3)
        self.metrics = ['accuracy', 'recall', 'precision']
        self.best_config_metric = 'accuracy'
        self.hyperpipe = Hyperpipe('god',
                                   inner_cv=self.inner_cv_object,
                                   metrics=self.metrics,
                                   best_config_metric=self.best_config_metric,
                                   outer_cv=KFold(n_splits=2),
                                   output_settings=self.output_settings,
                                   verbosity=1)
        self.hyperpipe += self.ss_pipe_element
        self.hyperpipe += self.pca_pipe_element
        self.hyperpipe.add(self.svc_pipe_element)

        dataset = load_breast_cancer()
        self.__X = dataset.data
        self.__y = dataset.target

        self.hyperpipe.fit(self.__X, self.__y)

    def test_write_convenience_files(self):
        """Output creation testing. Only write if output_settings.save_output == True."""
        for file in self.files:
            self.assertTrue(os.path.isfile(
                os.path.join(self.output_settings.results_folder, file)))

        # correct number of rows in best_config_predictions.csv
        with open(os.path.join(self.output_settings.results_folder,
                               'best_config_predictions.csv')) as f:
            self.assertEqual(
                sum([outer_fold.number_samples_test
                     for outer_fold in self.hyperpipe.results.outer_folds]),
                sum(1 for _ in f) - 1)

        # no output should be written if save_output is False
        shutil.rmtree(self.tmp_folder_path, ignore_errors=True)
        self.output_settings = OutputSettings(project_folder=self.tmp_folder_path,
                                              save_output=False)
        self.hyperpipe.fit(self.__X, self.__y)
        self.assertIsNone(self.output_settings.results_folder)

    def test_readable_time_monitor_csv(self):
        """Test for a readable time_monitor.csv (right count of columns and pandas import)."""
        time_monitor_df = pd.read_csv(
            os.path.join(self.output_settings.results_folder, 'time_monitor.csv'),
            header=[0, 1])
        self.assertIsInstance(time_monitor_df, pd.DataFrame)
        self.assertEqual(len(time_monitor_df.columns), 10)

    def test_summary(self):
        """Check the content of photon_summary.txt against hyperpipe.results."""
        with open(os.path.join(self.output_settings.results_folder,
                               'photon_summary.txt')) as file:
            data = file.read()

        areas = data.split(
            "-------------------------------------------------------------------")

        # first area
        self.assertEqual(areas[0], "\nPHOTON RESULT SUMMARY\n")

        result_dict = {
            "dummy_test": self.hyperpipe.results.dummy_estimator.test,
            "dummy_train": self.hyperpipe.results.dummy_estimator.train,
            "best_config_train": self.hyperpipe.results.metrics_train,
            "best_config_test": self.hyperpipe.results.metrics_test
        }

        outer_fold_traintest = {}
        key_areas_outer_fold = []
        # all outer fold areas
        for i in range(len(self.hyperpipe.results.outer_folds)):
            self.assertEqual(areas[4 + i * 2], '\nOUTER FOLD ' + str(i + 1) + '\n')
            key_areas_outer_fold.append("outer_fold_" + str(i + 1))
            result_dict["outer_fold_" + str(i + 1) + "_train"] = \
                self.hyperpipe.results.outer_folds[i].best_config.best_config_score.training
            outer_fold_traintest["outer_fold_" + str(i + 1) + "_train"] = "TrainValue"
            result_dict["outer_fold_" + str(i + 1) + "_test"] = \
                self.hyperpipe.results.outer_folds[i].best_config.best_config_score.validation
            outer_fold_traintest["outer_fold_" + str(i + 1) + "_test"] = "TestValue"

        # check performance / test-train of dummy and best_config
        key_areas = ["entracee", "name", "dummy", "best_config"]
        splitted_areas = {}

        for num in range(len(key_areas)):
            splitted_areas[key_areas[num]] = areas[num].split("\n")

        index_dict = {}
        for key in key_areas[2:]:
            if [perf for perf in splitted_areas[key] if perf == "TEST:"]:
                index_dict[key + "_test"] = splitted_areas[key].index("TEST:")
                index_dict[key + "_train"] = splitted_areas[key].index("TRAINING:")
            else:
                self.assertTrue(False)

            for data_key in [k for k in list(result_dict.keys()) if key in k]:
                table_str = "\n".join(
                    [splitted_areas[key][index_dict[data_key] + i]
                     for i in [2, 4, 5, 6]])
                table = pd.read_csv(
                    StringIO(table_str.replace(" ", "")),
                    sep="|")[["MetricName", "MEAN", "STD"]].set_index("MetricName")
                for result_metric in result_dict[data_key]:
                    self.assertAlmostEqual(
                        result_metric.value,
                        table[result_metric.operation.split(".")[1]][result_metric.metric_name],
                        4)

        splitted_areas = {}
        for num in range(len(key_areas_outer_fold)):
            splitted_areas[key_areas_outer_fold[num]] = \
                areas[len(key_areas) + 1 + num * 2].split("\n")

        # check all outer folds
        for key_area_outer_fold in key_areas_outer_fold:
            if [perf for perf in splitted_areas[key_area_outer_fold]
                    if perf == "PERFORMANCE:"]:
                index_dict[key_area_outer_fold + "_train"] = \
                    splitted_areas[key_area_outer_fold].index("PERFORMANCE:")
                index_dict[key_area_outer_fold + "_test"] = \
                    index_dict[key_area_outer_fold + "_train"]
            else:
                self.assertTrue(False)

            for data_key in [k for k in list(result_dict.keys())
                             if key_area_outer_fold in k]:
                table_str = "\n".join(
                    [splitted_areas[key_area_outer_fold][index_dict[data_key] + i]
                     for i in [2, 4, 5, 6]])
                table = pd.read_csv(
                    StringIO(table_str.replace(" ", "")),
                    sep="|")[["MetricName", "TrainValue", "TestValue"]].set_index("MetricName")
                for result_metric in result_dict[data_key].metrics.keys():
                    self.assertAlmostEqual(
                        result_dict[data_key].metrics[result_metric],
                        table[outer_fold_traintest[data_key]][result_metric],
                        4)

    def test_save_backmapping(self):
        """Check that the dimension of the feature backmapping equals the input dimension."""
        npzfile = np.load(
            os.path.join(self.output_settings.results_folder,
                         'optimum_pipe_feature_importances_backmapped.npz'))
        self.assertEqual(len(npzfile.files), 1)
        result_data = []
        for file in npzfile.files:
            result_data.append(npzfile[file])
        self.assertEqual(np.shape(self.__X)[1], result_data[0].size)

    # def test_save_backmapping_stack(self):
    #     self.hyperpipe = Hyperpipe('god', inner_cv=self.inner_cv_object,
    #                                metrics=self.metrics,
    #                                best_config_metric=self.best_config_metric,
    #                                outer_cv=KFold(n_splits=2),
    #                                output_settings=self.output_settings,
    #                                verbosity=1)
    #     self.hyperpipe += self.ss_pipe_element
    #     self.stack = Stack("myStack")
    #     self.stack += PipelineElement("MinMaxScaler")
    #     self.stack += self.pca_pipe_element
    #     self.hyperpipe += self.stack
    #     self.hyperpipe.add(self.svc_pipe_element)
    #     self.output_settings.save_output = True
    #     self.hyperpipe.fit(self.__X, self.__y)
    #     picklefile = pickle.load(open(
    #         os.path.join(self.output_settings.results_folder,
    #                      'optimum_pipe_feature_importances_backmapped.p'), "rb"))
    #     self.assertEqual(np.shape(self.__X)[1], len(picklefile[0]))

    def pass_through_plots(self):
        """Test for plot functions. Only a passing test, no quality testing."""
        self.assertIsNone(self.hyperpipe.results.plot_optimizer_history())
        self.assertIsNone(self.hyperpipe.results.plot_true_pred())
        self.assertIsNone(self.hyperpipe.results.plot_confusion_matrix())
        self.assertIsNone(self.hyperpipe.results.plot_roc_curve())

    def test_load_from_file(self):
        X, y = load_breast_cancer(return_X_y=True)
        my_pipe = Hyperpipe('load_results_file_test',
                            metrics=['accuracy'],
                            best_config_metric='accuracy',
                            output_settings=OutputSettings(project_folder='./tmp'))
        my_pipe += PipelineElement("StandardScaler")
        my_pipe += PipelineElement("SVC")
        my_pipe.fit(X, y)

        results_file = os.path.join(my_pipe.output_settings.results_folder,
                                    'photon_result_file.p')
        my_result_handler = ResultsHandler()
        my_result_handler.load_from_file(results_file)
        self.assertIsInstance(my_result_handler.results, MDBHyperpipe)

    def test_get_performance_table(self):
        pass

    def test_load_from_mongodb(self):
        pass
class Smac3IntegrationTest(unittest.TestCase):

    def setUp(self):
        self.s_split = ShuffleSplit(n_splits=3, test_size=0.2, random_state=42)

        self.time_limit = 20

        settings = OutputSettings(project_folder='./tmp/')

        self.smac_helper = {"data": None, "initial_runs": None}

        # Scenario object
        scenario_dict = {"run_obj": "quality",
                         "deterministic": "true",
                         "wallclock_limit": self.time_limit}

        # DESIGN YOUR PIPELINE
        self.pipe = Hyperpipe('basic_svm_pipe',
                              optimizer='smac',
                              optimizer_params={'facade': SMAC4HPO,
                                                'scenario_dict': scenario_dict,
                                                'rng': 42,
                                                'smac_helper': self.smac_helper},
                              metrics=['accuracy'],
                              random_seed=42,
                              best_config_metric='accuracy',
                              inner_cv=self.s_split,
                              verbosity=0,
                              output_settings=settings)

    def simple_classification(self):
        dataset = fetch_olivetti_faces(download_if_missing=True)
        self.X = dataset["data"]
        self.y = dataset["target"]
        return self.X, self.y

    # integration test for a simple pipeline without Switch
    def test_photon_implementation_simple(self):
        # PHOTON implementation
        self.pipe.add(PipelineElement('StandardScaler'))
        self.pipe += PipelineElement(
            'PCA', hyperparameters={'n_components': IntegerRange(5, 30)})
        self.pipe += PipelineElement(
            'SVC',
            hyperparameters={'kernel': Categorical(["rbf", 'poly']),
                             'C': FloatRange(0.5, 200)},
            gamma='auto')

        self.X, self.y = self.simple_classification()
        self.pipe.fit(self.X, self.y)

        # direct AUTO ML implementation
        # build the configuration space which defines all parameters and their ranges
        cs = ConfigurationSpace()
        n_components = UniformIntegerHyperparameter("PCA__n_components", 5, 30)
        cs.add_hyperparameter(n_components)
        kernel = CategoricalHyperparameter("SVC__kernel", ["rbf", 'poly'])
        cs.add_hyperparameter(kernel)
        c = UniformFloatHyperparameter("SVC__C", 0.5, 200)
        cs.add_hyperparameter(c)

        # Scenario object
        scenario = Scenario({"run_obj": "quality",
                             "cs": cs,
                             "deterministic": "true",
                             "wallclock_limit": self.time_limit,
                             "limit_resources": False,
                             'abort_on_first_run_crash': False})

        # Optimize, using SMAC directly
        smac = SMAC4HPO(scenario=scenario, rng=42,
                        tae_runner=self.objective_function_simple)
        _ = smac.optimize()

        runhistory_photon = self.smac_helper["data"].solver.runhistory
        runhistory_original = smac.solver.runhistory

        x_ax = range(1,
                     min(len(runhistory_original._cost_per_config.keys()),
                         len(runhistory_photon._cost_per_config.keys())) + 1)
        y_ax_original = [runhistory_original._cost_per_config[tmp] for tmp in x_ax]
        y_ax_photon = [runhistory_photon._cost_per_config[tmp] for tmp in x_ax]
        y_ax_original_inc = [min(y_ax_original[:tmp + 1]) for tmp in x_ax]
        y_ax_photon_inc = [min(y_ax_photon[:tmp + 1]) for tmp in x_ax]

        plot = False
        if plot:
            plt.figure(figsize=(10, 7))
            plt.plot(x_ax, y_ax_original, 'g', label='Original')
            plt.plot(x_ax, y_ax_photon, 'b', label='PHOTON')
            plt.plot(x_ax, y_ax_photon_inc, 'r', label='PHOTON Incumbent')
            plt.plot(x_ax, y_ax_original_inc, 'k', label='Original Incumbent')
            plt.title('Photon Prove')
            plt.xlabel('X')
            plt.ylabel('Y')
            plt.legend(loc='best')
            plt.savefig("smac.png")

        min_len = min(len(y_ax_original), len(y_ax_photon))
        self.assertLessEqual(
            np.max(np.abs(np.array(y_ax_original[:min_len]) -
                          np.array(y_ax_photon[:min_len]))), 0.01)

    def objective_function_simple(self, cfg):
        cfg = {k: cfg[k] for k in cfg if cfg[k]}
        values = []

        train_indices = list(
            self.pipe.cross_validation.outer_folds.values())[0].train_indices
        self._validation_X, self._validation_y, _ = PhotonDataHelper.split_data(
            self.X, self.y, kwargs=None, indices=train_indices)

        for inner_fold in list(
                self.pipe.cross_validation.inner_folds.values())[0].values():
            sc = PipelineElement("StandardScaler", {})
            pca = PipelineElement("PCA", {}, random_state=42)
            svc = PipelineElement("SVC", {}, random_state=42, gamma='auto')
            my_pipe = PhotonPipeline([('StandardScaler', sc),
                                      ('PCA', pca),
                                      ('SVC', svc)])
            my_pipe.set_params(**cfg)
            my_pipe.fit(self._validation_X[inner_fold.train_indices, :],
                        self._validation_y[inner_fold.train_indices])
            values.append(accuracy_score(
                self._validation_y[inner_fold.test_indices],
                my_pipe.predict(self._validation_X[inner_fold.test_indices, :])))

        return 1 - np.mean(values)

    # integration test for a pipeline with Switch
    def test_photon_implementation_switch(self):
        # PHOTON implementation
        self.pipe.add(PipelineElement('StandardScaler'))
        self.pipe += PipelineElement(
            'PCA', hyperparameters={'n_components': IntegerRange(5, 30)})
        estimator_switch = Switch("Estimator")
        estimator_switch += PipelineElement(
            'SVC',
            hyperparameters={'kernel': Categorical(["rbf", 'poly']),
                             'C': FloatRange(0.5, 200)},
            gamma='auto')
        estimator_switch += PipelineElement(
            'RandomForestClassifier',
            hyperparameters={'criterion': Categorical(['gini', 'entropy']),
                             'min_samples_split': IntegerRange(2, 4)})
        self.pipe += estimator_switch

        self.X, self.y = self.simple_classification()
        self.pipe.fit(self.X, self.y)

        # direct AUTO ML implementation
        # build the configuration space which defines all parameters and their ranges
        cs = ConfigurationSpace()
        n_components = UniformIntegerHyperparameter("PCA__n_components", 5, 30)
        cs.add_hyperparameter(n_components)

        switch = CategoricalHyperparameter("Estimator_switch", ['svc', 'rf'])
        cs.add_hyperparameter(switch)

        kernel = CategoricalHyperparameter("SVC__kernel", ["rbf", 'poly'])
        cs.add_hyperparameter(kernel)
        c = UniformFloatHyperparameter("SVC__C", 0.5, 200)
        cs.add_hyperparameter(c)
        use_svc_kernel = InCondition(child=kernel, parent=switch, values=["svc"])
        use_svc_c = InCondition(child=c, parent=switch, values=["svc"])

        criterion = CategoricalHyperparameter(
            "RandomForestClassifier__criterion", ['gini', 'entropy'])
        cs.add_hyperparameter(criterion)
        minsplit = UniformIntegerHyperparameter(
            "RandomForestClassifier__min_samples_split", 2, 4)
        cs.add_hyperparameter(minsplit)
        use_rf_crit = InCondition(child=criterion, parent=switch, values=["rf"])
        use_rf_minsplit = InCondition(child=minsplit, parent=switch, values=["rf"])

        cs.add_conditions(
            [use_svc_kernel, use_svc_c, use_rf_crit, use_rf_minsplit])

        # Scenario object
        scenario = Scenario({"run_obj": "quality",
                             "cs": cs,
                             "deterministic": "true",
                             "wallclock_limit": self.time_limit,
                             "limit_resources": False,
                             'abort_on_first_run_crash': False})

        # Optimize, using SMAC directly
        smac = SMAC4HPO(scenario=scenario, rng=42,
                        tae_runner=self.objective_function_switch)
        _ = smac.optimize()

        runhistory_photon = self.smac_helper["data"].solver.runhistory
        runhistory_original = smac.solver.runhistory

        x_ax = range(1,
                     min(len(runhistory_original._cost_per_config.keys()),
                         len(runhistory_photon._cost_per_config.keys())) + 1)
        y_ax_original = [runhistory_original._cost_per_config[tmp] for tmp in x_ax]
        y_ax_photon = [runhistory_photon._cost_per_config[tmp] for tmp in x_ax]

        min_len = min(len(y_ax_original), len(y_ax_photon))
        self.assertLessEqual(
            np.max(np.abs(np.array(y_ax_original[:min_len]) -
                          np.array(y_ax_photon[:min_len]))), 0.01)

    def objective_function_switch(self, cfg):
        cfg = {k: cfg[k] for k in cfg if cfg[k]}
        values = []

        train_indices = list(
            self.pipe.cross_validation.outer_folds.values())[0].train_indices
        self._validation_X, self._validation_y, _ = PhotonDataHelper.split_data(
            self.X, self.y, kwargs=None, indices=train_indices)

        switch = cfg["Estimator_switch"]
        del cfg["Estimator_switch"]

        for inner_fold in list(
                self.pipe.cross_validation.inner_folds.values())[0].values():
            sc = PipelineElement("StandardScaler", {})
            pca = PipelineElement("PCA", {}, random_state=42)
            if switch == 'svc':
                est = PipelineElement("SVC", {}, random_state=42, gamma='auto')
                name = 'SVC'
            else:
                est = PipelineElement("RandomForestClassifier", {}, random_state=42)
                name = "RandomForestClassifier"
            my_pipe = PhotonPipeline([('StandardScaler', sc),
                                      ('PCA', pca),
                                      (name, est)])
            my_pipe.set_params(**cfg)
            my_pipe.fit(self._validation_X[inner_fold.train_indices, :],
                        self._validation_y[inner_fold.train_indices])
            values.append(accuracy_score(
                self._validation_y[inner_fold.test_indices],
                my_pipe.predict(self._validation_X[inner_fold.test_indices, :])))

        return 1 - np.mean(values)

    def test_facade(self):
        config_space = ConfigurationSpace()
        n_components = UniformIntegerHyperparameter("PCA__n_components", 5, 30)
        config_space.add_hyperparameter(n_components)
        scenario_dict = {"run_obj": "quality",
                         "deterministic": "true",
                         "cs": config_space,
                         "wallclock_limit": 60}

        # a misspelled facade name must raise a ValueError
        with self.assertRaises(ValueError):
            SMACOptimizer(facade="SMAC4BOO", scenario_dict=scenario_dict)

        # an already instantiated facade must raise a ValueError, too
        with self.assertRaises(ValueError):
            facade = SMAC4BO(scenario=Scenario(scenario_dict))
            SMACOptimizer(facade=facade, scenario_dict=scenario_dict)

        facades = ["SMAC4BO", SMAC4BO, "SMAC4AC", SMAC4AC,
                   "SMAC4HPO", SMAC4HPO, "BOHB4HPO", BOHB4HPO]
        for facade in facades:
            SMACOptimizer(facade=facade, scenario_dict=scenario_dict)
class HyperpipeTests(PhotonBaseTest):

    def setup_hyperpipe(self, output_settings=None):
        if output_settings is None:
            output_settings = OutputSettings(project_folder=self.tmp_folder_path)
        self.hyperpipe = Hyperpipe('god',
                                   inner_cv=self.inner_cv_object,
                                   metrics=self.metrics,
                                   best_config_metric=self.best_config_metric,
                                   output_settings=output_settings,
                                   verbosity=2)
        self.hyperpipe += self.ss_pipe_element
        self.hyperpipe += self.pca_pipe_element
        self.hyperpipe.add(self.svc_pipe_element)

    @classmethod
    def setUpClass(cls) -> None:
        cls.file = __file__
        super(HyperpipeTests, cls).setUpClass()

    def setUp(self):
        super(HyperpipeTests, self).setUp()
        self.ss_pipe_element = PipelineElement('StandardScaler')
        self.pca_pipe_element = PipelineElement('PCA',
                                                {'n_components': [1, 2]},
                                                random_state=42,
                                                test_disabled=True)
        self.svc_pipe_element = PipelineElement('SVC',
                                                {'C': [0.1, 1],
                                                 'kernel': ['linear']},  # 'rbf', 'sigmoid'
                                                random_state=42)
        self.inner_cv_object = KFold(n_splits=3)
        self.metrics = ['accuracy', 'recall', 'precision']
        self.best_config_metric = 'accuracy'
        self.setup_hyperpipe()

        dataset = load_breast_cancer()
        self.__X = dataset.data
        self.__y = dataset.target

    def test_init(self):
        # test that all init parameters can be retrieved via the cleaned up subclasses
        self.assertEqual(self.hyperpipe.name, 'god')

        # if no information is given, check for the default parameters,
        # otherwise for the infos given in setUp

        # Cross Validation
        self.assertIsNotNone(self.hyperpipe.cross_validation)
        self.assertEqual(self.hyperpipe.cross_validation.inner_cv,
                         self.inner_cv_object)
        self.assertIsNone(self.hyperpipe.cross_validation.outer_cv)
        self.assertTrue(self.hyperpipe.cross_validation.eval_final_performance)
        self.assertTrue(
            self.hyperpipe.cross_validation.calculate_metrics_per_fold)
        self.assertFalse(
            self.hyperpipe.cross_validation.calculate_metrics_across_folds)
        self.assertIsNone(self.hyperpipe.cross_validation.outer_folds)
        self.assertDictEqual(self.hyperpipe.cross_validation.inner_folds, {})

        # Optimization
        self.assertIsNotNone(self.hyperpipe.optimization)
        self.assertListEqual(self.hyperpipe.optimization.metrics, self.metrics)
        self.assertEqual(self.hyperpipe.optimization.best_config_metric,
                         self.best_config_metric)
        self.assertEqual(self.hyperpipe.optimization.optimizer_input_str,
                         "grid_search")
        self.assertTrue(self.hyperpipe.optimization.maximize_metric)
        self.assertIsNone(self.hyperpipe.optimization.performance_constraints)
        self.assertDictEqual(self.hyperpipe.optimization.optimizer_params, {})

    def test_add(self):
        # assure the pipeline has three elements: scaler, pca and svc
        self.assertEqual(len(self.hyperpipe.elements), 3)
        self.assertIs(self.hyperpipe.elements[0], self.ss_pipe_element)
        self.assertIs(self.hyperpipe.elements[1], self.pca_pipe_element)
        self.assertIs(self.hyperpipe.elements[2], self.svc_pipe_element)
        # todo: assure that no two elements can be added with the same name

        # test add method special cases
        with self.assertRaises(TypeError):
            self.hyperpipe.add(object())

        # assure that preprocessing is identified and set to the extra variable,
        # there is only one preprocessing item
        my_preproc = Preprocessing()
        self.hyperpipe.add(my_preproc)
        self.assertEqual(my_preproc, self.hyperpipe.preprocessing)
        # make sure the element does not end up in the main pipeline
        self.assertTrue(
            all(item is not my_preproc for item in self.hyperpipe.elements))

        def my_func(X, y, **kwargs):
            return True

        # test adding callback item
        my_call_back_item = CallbackElement('test_element', my_func, 'predict')
        self.hyperpipe.add(my_call_back_item)
        self.assertIs(self.hyperpipe.elements[-1], my_call_back_item)

    def test_sanity(self):
        # make sure that no metrics means raising an error
        with self.assertRaises(ValueError):
            hyperpipe = Hyperpipe("hp_name", inner_cv=self.inner_cv_object)

        # make sure that PHOTON raises a warning if no best config metric is given
        with self.assertRaises(Warning):
            hyperpipe = Hyperpipe("hp_name",
                                  inner_cv=self.inner_cv_object,
                                  metrics=["accuracy", "f1_score"])

        # a list of best config metrics raises a warning as well
        with self.assertRaises(Warning):
            hyperpipe = Hyperpipe("hp_name",
                                  inner_cv=self.inner_cv_object,
                                  best_config_metric=["accuracy", "f1_score"])

        # metrics must be calculated either per fold or across folds
        with self.assertRaises(NotImplementedError):
            hyperpipe = Hyperpipe("hp_name",
                                  inner_cv=self.inner_cv_object,
                                  best_config_metric='accuracy',
                                  metrics=["accuracy"],
                                  calculate_metrics_across_folds=False,
                                  calculate_metrics_per_fold=False)

        # a missing inner_cv raises an AttributeError
        with self.assertRaises(AttributeError):
            hyperpipe = Hyperpipe("hp_name",
                                  best_config_metric='accuracy',
                                  metrics=["accuracy"])

        data = np.random.random((500, 50))
        with self.assertRaises(ValueError):
            targets = np.random.randint(0, 1, (500, 2))
            self.hyperpipe.fit(data, targets)

    def test_hyperpipe_with_custom_metric(self):
        def custom_metric(y_true, y_pred):
            return 99.9

        self.hyperpipe = Hyperpipe('god',
                                   inner_cv=self.inner_cv_object,
                                   metrics=[('custom_metric', custom_metric),
                                            'accuracy'],
                                   best_config_metric=Accuracy,
                                   output_settings=OutputSettings(
                                       project_folder=self.tmp_folder_path))
        self.hyperpipe += self.ss_pipe_element
        self.hyperpipe.add(self.svc_pipe_element)
        self.hyperpipe.fit(self.__X, self.__y)

        self.assertTrue('custom_metric' in self.hyperpipe.results.best_config.
                        best_config_score.validation.metrics)
        self.assertEqual(
            self.hyperpipe.results.best_config.best_config_score.validation.
            metrics['custom_metric'], 99.9)

        expected_num_of_metrics = len(self.hyperpipe.optimization.metrics)
        # one: accuracy, two: custom metric registered as "custom_metric",
        # three: keras metric registered as a function
        self.assertEqual(expected_num_of_metrics, 3)

        # dummy average values
        self.assertTrue(len(self.hyperpipe.results.dummy_estimator.train),
                        expected_num_of_metrics)
        self.assertTrue(len(self.hyperpipe.results.dummy_estimator.test),
                        expected_num_of_metrics)
        # overall average values
        self.assertTrue(len(self.hyperpipe.results.metrics_train),
                        2 * expected_num_of_metrics)
        self.assertTrue(len(self.hyperpipe.results.metrics_test),
                        2 * expected_num_of_metrics)

    def test_preprocessing(self):
        prepro_pipe = Preprocessing()
        prepro_pipe += PipelineElement.create(
            "dummy", DummyYAndCovariatesTransformer(), {})

        self.hyperpipe += prepro_pipe
        self.hyperpipe.fit(self.__X, self.__y)

        self.assertTrue(np.array_equal(self.__y + 1, self.hyperpipe.data.y))

    def test_estimation_type(self):
        def callback(X, y=None, **kwargs):
            pass

        pipe = Hyperpipe('name',
                         inner_cv=KFold(n_splits=2),
                         best_config_metric='mean_squared_error')

        with self.assertRaises(NotImplementedError):
            pipe += PipelineElement('PCA')
            est_type = pipe.estimation_type

        pipe += PipelineElement('SVC')
        self.assertEqual(pipe.estimation_type, 'classifier')

        pipe.elements[-1] = PipelineElement('SVR')
        self.assertEqual(pipe.estimation_type, 'regressor')

        with self.assertRaises(NotImplementedError):
            pipe.elements[-1] = CallbackElement('MyCallback', callback)
            est_type = pipe.estimation_type

    def test_copy_me(self):
        self.maxDiff = None
        copy = self.hyperpipe.copy_me()
        copy2 = self.hyperpipe.copy_me()
        self.assertDictEqual(elements_to_dict(copy),
                             elements_to_dict(self.hyperpipe))

        copy_after_fit = self.hyperpipe.fit(self.__X, self.__y).copy_me()
        copy_after_fit = elements_to_dict(copy_after_fit)

        # the current_configs of the elements are not None after calling fit()
        # on a hyperpipe; when copying the respective PipelineElement, these
        # current_configs are copied, too; this is why we need to delete _pipe
        # and elements before asserting for equality
        copy_after_fit['_pipe'] = None
        copy_after_fit['elements'] = None
        copy = elements_to_dict(copy)
        copy['_pipe'] = None
        copy['elements'] = None
        self.assertDictEqual(copy, copy_after_fit)

        # check if deepcopy worked
        copy2.cross_validation.inner_cv.n_splits = 10
        self.assertEqual(copy2.cross_validation.inner_cv.n_splits, 10)
        self.assertEqual(self.hyperpipe.cross_validation.inner_cv.n_splits, 3)

    def test_save_optimum_pipe(self):
        tmp_path = os.path.join(self.tmp_folder_path, 'optimum_pipypipe')
        settings = OutputSettings(project_folder=tmp_path,
                                  overwrite_results=True)

        my_pipe = Hyperpipe('hyperpipe',
                            optimizer='random_grid_search',
                            optimizer_params={'n_configurations': 3},
                            metrics=['accuracy', 'precision', 'recall'],
                            best_config_metric='f1_score',
                            outer_cv=KFold(n_splits=2),
                            inner_cv=KFold(n_splits=2),
                            verbosity=1,
                            output_settings=settings)

        preproc = Preprocessing()
        preproc += PipelineElement('StandardScaler')

        # BRANCH WITH QuantileTransformer AND DecisionTreeClassifier
        tree_qua_branch = Branch('tree_branch')
        tree_qua_branch += PipelineElement('QuantileTransformer')
        tree_qua_branch += PipelineElement(
            'DecisionTreeClassifier',
            {'min_samples_split': IntegerRange(2, 4)},
            criterion='gini')

        # BRANCH WITH MinMaxScaler AND SVC
        svm_mima_branch = Branch('svm_branch')
        svm_mima_branch += PipelineElement('MinMaxScaler')
        svm_mima_branch += PipelineElement(
            'SVC',
            {'kernel': Categorical(['rbf', 'linear']), 'C': 2.0},
            gamma='auto')

        # BRANCH WITH StandardScaler AND KNeighborsClassifier
        knn_sta_branch = Branch('neighbour_branch')
        knn_sta_branch += PipelineElement.create("dummy", DummyTransformer(), {})
        knn_sta_branch += PipelineElement('KNeighborsClassifier')

        my_pipe += preproc
        # voting=True would average the result of every branch
        my_pipe += Stack('final_stack',
                         [tree_qua_branch, svm_mima_branch, knn_sta_branch])
        my_pipe += PipelineElement('LogisticRegression', solver='lbfgs')

        my_pipe.fit(self.__X, self.__y)
        model_path = os.path.join(my_pipe.output_settings.results_folder,
                                  'photon_best_model.photon')
        self.assertTrue(os.path.exists(model_path))

        # now move the optimum pipe to a new folder
        test_folder = os.path.join(my_pipe.output_settings.results_folder,
                                   'new_test_folder')
        new_model_path = os.path.join(test_folder, 'photon_best_model.photon')
        os.makedirs(test_folder)
        shutil.copyfile(model_path, new_model_path)

        # check if load_optimum_pipe also works
        # check if we have the meta information recovered
        loaded_optimum_pipe = Hyperpipe.load_optimum_pipe(new_model_path)
        self.assertIsNotNone(loaded_optimum_pipe._meta_information)
        self.assertIsNotNone(
            loaded_optimum_pipe._meta_information['photon_version'])

        # check if predictions stay reliably the same
        y_pred_loaded = loaded_optimum_pipe.predict(self.__X)
        y_pred = my_pipe.optimum_pipe.predict(self.__X)
        np.testing.assert_array_equal(y_pred_loaded, y_pred)

    def test_save_optimum_pipe_custom_element(self):
        tmp_path = os.path.join(self.tmp_folder_path, 'optimum_pipypipe')
        settings = OutputSettings(project_folder=tmp_path,
                                  overwrite_results=True)

        my_pipe = Hyperpipe('hyperpipe',
                            optimizer='random_grid_search',
                            optimizer_params={'n_configurations': 1},
                            metrics=['accuracy', 'precision', 'recall'],
                            best_config_metric='f1_score',
                            outer_cv=KFold(n_splits=2),
                            inner_cv=KFold(n_splits=2),
                            verbosity=1,
                            output_settings=settings)
        my_pipe += PipelineElement('KerasDnnClassifier', {},
                                   epochs=1,
                                   hidden_layer_sizes=[5])
        my_pipe.fit(self.__X, self.__y)

        model_path = os.path.join(my_pipe.output_settings.results_folder,
                                  'photon_best_model.photon')
        self.assertTrue(os.path.exists(model_path))

        # check if load_optimum_pipe also works
        # check if we have the meta information recovered
        loaded_optimum_pipe = Hyperpipe.load_optimum_pipe(model_path)
        self.assertIsNotNone(loaded_optimum_pipe._meta_information)

    def test_failure_to_save_optimum_pipe(self):
        tmp_path = os.path.join(self.tmp_folder_path, 'optimum_pipypipe')
        settings = OutputSettings(project_folder=tmp_path,
                                  overwrite_results=True)

        my_pipe = Hyperpipe('hyperpipe',
                            optimizer='random_grid_search',
                            optimizer_params={'n_configurations': 1},
                            metrics=['accuracy', 'precision', 'recall'],
                            best_config_metric='f1_score',
                            outer_cv=KFold(n_splits=2),
                            inner_cv=KFold(n_splits=2),
                            verbosity=1,
                            output_settings=settings)
        my_pipe += PipelineElement('KNeighborsClassifier')
        my_pipe.fit(self.__X, self.__y)

        model_path = os.path.join(my_pipe.output_settings.results_folder,
                                  'photon_best_model_wrong_path.photon')
        with self.assertRaises(FileNotFoundError):
            Hyperpipe.load_optimum_pipe(model_path)

    def test_overwrite_result_folder(self):
        """Test for the right handling of the parameter output_settings.overwrite_results."""

        def get_summary_file():
            return os.path.join(self.hyperpipe.output_settings.results_folder,
                                'photon_summary.txt')

        # Case 1: default
        output_settings1 = OutputSettings(project_folder=self.tmp_folder_path,
                                          save_output=True,
                                          overwrite_results=False)
        self.setup_hyperpipe(output_settings1)
        self.hyperpipe.fit(self.__X, self.__y)
        tmp_path = get_summary_file()

        time.sleep(2)

        # again with the same settings
        self.setup_hyperpipe(output_settings1)
        self.hyperpipe.fit(self.__X, self.__y)
        tmp_path2 = get_summary_file()

        # we expect a new output folder each time, with a timestamp
        self.assertNotEqual(tmp_path, tmp_path2)

        # Case 2, overwrite results: all in the same folder
        output_settings2 = OutputSettings(project_folder=self.tmp_folder_path,
                                          save_output=True,
                                          overwrite_results=True)
        self.setup_hyperpipe(output_settings2)
        self.hyperpipe.fit(self.__X, self.__y)
        tmp_path = get_summary_file()
        tmp_date = os.path.getmtime(tmp_path)

        self.setup_hyperpipe(output_settings2)
        self.hyperpipe.fit(self.__X, self.__y)
        tmp_path2 = get_summary_file()
        tmp_date2 = os.path.getmtime(tmp_path2)

        # same folder, but the summary file is overwritten by the new analysis
        self.assertEqual(tmp_path, tmp_path2)
        self.assertNotEqual(tmp_date, tmp_date2)

        # Case 3: we have a cache folder
        self.hyperpipe.cache_folder = self.cache_folder_path
        shutil.rmtree(self.cache_folder_path, ignore_errors=True)
        self.hyperpipe.fit(self.__X, self.__y)
        self.assertTrue(os.path.exists(self.cache_folder_path))

    def test_random_state(self):
        self.hyperpipe.random_state = 4567
        self.hyperpipe.fit(self.__X, self.__y)
        # assure we spread the word..!
        self.assertEqual(self.hyperpipe.random_state, 4567)
        self.assertEqual(self.hyperpipe._pipe.random_state, 4567)
        self.assertEqual(self.hyperpipe.optimum_pipe.random_state, 4567)
        self.assertEqual(self.hyperpipe._pipe.elements[-1][-1].random_state,
                         4567)
        self.assertEqual(
            self.hyperpipe._pipe.elements[-1][-1].base_element.random_state,
            4567)

    def test_dummy_estimator_preparation(self):
        self.hyperpipe.results = MDBHyperpipe()
        self.hyperpipe.results.dummy_estimator = dummy_estimator = MDBDummyResults()

        # one time regressor, one time classifier, one time strange object
        self.hyperpipe.elements = list()
        self.hyperpipe.add(PipelineElement('SVC'))
        dummy_estimator = self.hyperpipe._prepare_dummy_estimator()
        self.assertTrue(isinstance(dummy_estimator, DummyClassifier))

        self.hyperpipe.elements = list()
        self.hyperpipe.add(PipelineElement('SVR'))
        dummy_estimator = self.hyperpipe._prepare_dummy_estimator()
        self.assertTrue(isinstance(dummy_estimator, DummyRegressor))

        with self.assertRaises(NotImplementedError):
            self.hyperpipe.elements = list()
            self.hyperpipe.add(PipelineElement('PCA'))
            dummy_estimator = self.hyperpipe._prepare_dummy_estimator()
            self.assertIsNone(dummy_estimator)

    def setup_crazy_pipe(self):
        # erase all, we need a complex and crazy task
        self.hyperpipe.elements = list()

        nmb_list = list()
        for i in range(5):
            nmb = ParallelBranch(name=str(i), nr_of_processes=i + 3)
            sp = PipelineElement(
                'PCA', hyperparameters={'n_components': IntegerRange(1, 50)})
            nmb += sp
            nmb_list.append(nmb)

        my_switch = Switch('disabling_test_switch')
        my_switch += nmb_list[0]
        my_switch += nmb_list[1]

        my_stack = Stack('stack_of_branches')
        for i in range(3):
            my_branch = Branch('branch_' + str(i + 2))
            my_branch += PipelineElement('StandardScaler')
            my_branch += nmb_list[i + 2]
            my_stack += my_branch

        self.hyperpipe.add(my_stack)
        self.hyperpipe.add(PipelineElement('StandardScaler'))
        self.hyperpipe.add(my_switch)
        self.hyperpipe.add(PipelineElement('SVC'))
        return nmb_list

    def test_recursive_disabling(self):
        list_of_elements_to_detect = self.setup_crazy_pipe()
        self.hyperpipe._pipe = Branch.prepare_photon_pipe(
            list_of_elements_to_detect)
        Hyperpipe.disable_multiprocessing_recursively(self.hyperpipe._pipe)
        self.assertTrue(
            all(i.nr_of_processes == 1 for i in list_of_elements_to_detect))

    def test_recursive_cache_folder_propagation(self):
        list_of_elements = self.setup_crazy_pipe()
        self.hyperpipe._pipe = Branch.prepare_photon_pipe(
            self.hyperpipe.elements)
        self.hyperpipe.recursive_cache_folder_propagation(
            self.hyperpipe._pipe, self.cache_folder_path, 'fold_id_123')
        for i, nmbranch in enumerate(list_of_elements):
            if i > 1:
                start_folder = os.path.join(self.cache_folder_path,
                                            'branch_' + nmbranch.name)
            else:
                start_folder = self.cache_folder_path
            expected_folder = os.path.join(start_folder, nmbranch.name)
            self.assertEqual(nmbranch.base_element.cache_folder,
                             expected_folder)

    def test_prepare_result_logging(self):
        # test that the results object is given and entails hyperpipe infos
        self.hyperpipe.data.X = self.__X
        self.hyperpipe.data.y = self.__y
        self.hyperpipe._prepare_result_logging(datetime.datetime.now())
        self.assertTrue(isinstance(self.hyperpipe.results, MDBHyperpipe))
        self.assertTrue(
            isinstance(self.hyperpipe.results_handler, ResultsHandler))
        self.assertTrue(len(self.hyperpipe.results.outer_folds) == 0)

    def test_finalize_optimization(self):
        # this is kind of difficult to test, that's why we fake it
        self.hyperpipe.fit(self.__X, self.__y)

        # reset all infos
        self.hyperpipe.results.dummy_estimator.train = MDBScoreInformation()
        self.hyperpipe.results.dummy_estimator.test = MDBScoreInformation()
        self.hyperpipe.results.metrics_train = {}
        self.hyperpipe.results.metrics_test = {}
        self.hyperpipe.best_config = None
        self.hyperpipe.results.best_config = MDBConfig()
        self.hyperpipe.optimum_pipe = None

        # now generate infos again
        self.hyperpipe._finalize_optimization()

        expected_num_of_metrics = len(self.hyperpipe.optimization.metrics)
        # dummy average values
        self.assertTrue(len(self.hyperpipe.results.dummy_estimator.train),
                        expected_num_of_metrics)
        self.assertTrue(len(self.hyperpipe.results.dummy_estimator.test),
                        expected_num_of_metrics)
        # overall average values
        self.assertTrue(len(self.hyperpipe.results.metrics_train),
                        2 * expected_num_of_metrics)
        self.assertTrue(len(self.hyperpipe.results.metrics_test),
                        2 * expected_num_of_metrics)

        # find best config
        self.assertIsNotNone(self.hyperpipe.best_config)
        self.assertIsNotNone(self.hyperpipe.results.best_config)
        self.assertEqual(self.hyperpipe.best_config,
                         self.hyperpipe.results.best_config.config_dict)

        # set optimum pipe and params
        # todo: test add preprocessing
        self.assertIsNotNone(self.hyperpipe.optimum_pipe)
        self.assertEqual(
            self.hyperpipe.optimum_pipe.named_steps["SVC"].base_element.C,
            self.hyperpipe.best_config["SVC__C"])

        # save optimum model
        self.assertTrue(
            os.path.isfile(
                os.path.join(self.hyperpipe.output_settings.results_folder,
                             'photon_best_model.photon')))

        # backmapping
        # because the pca is test-disabled, we expect the full number of features
        self.assertEqual(
            len(self.hyperpipe.results.best_config_feature_importances[0]),
            self.__X.shape[1])
        backmapped_feature_importances = os.path.join(
            self.hyperpipe.output_settings.results_folder,
            'optimum_pipe_feature_importances_backmapped.csv')
        self.assertTrue(os.path.isfile(backmapped_feature_importances))
        loaded_array = np.loadtxt(open(backmapped_feature_importances, 'rb'),
                                  delimiter=",")
        self.assertEqual(loaded_array.shape[0], self.__X.shape[1])

    def test_optimum_pipe_predict_and_predict_proba_and_transform(self):
        # find the best config and test against sklearn
        self.hyperpipe.elements[-1] = PipelineElement(
            'RandomForestClassifier',
            {'n_estimators': IntegerRange(4, 20, step=2)},
            random_state=42)
        self.hyperpipe.fit(self.__X, self.__y)

        # rebuild the best config as a plain sklearn pipeline and compare
        best_config_copy = dict(self.hyperpipe.best_config)
        del best_config_copy["PCA__disabled"]
        if self.hyperpipe.best_config["PCA__disabled"]:
            sk_elements = [('StandardScaler', StandardScaler()),
                           ('RandomForestClassifier',
                            RandomForestClassifier(random_state=42))]
        else:
            sk_elements = [('StandardScaler', StandardScaler()),
                           ('PCA', PCA(random_state=42)),
                           ('RandomForestClassifier',
                            RandomForestClassifier(random_state=42))]
        self.sklearn_pipe = SKLPipeline(sk_elements)
        self.sklearn_pipe.set_params(**best_config_copy)
        self.sklearn_pipe.fit(self.__X, self.__y)

        self.assertTrue(
            np.array_equal(self.sklearn_pipe.predict(self.__X),
                           self.hyperpipe.predict(self.__X)))
        self.assertTrue(
            np.array_equal(self.sklearn_pipe.predict_proba(self.__X),
                           self.hyperpipe.predict_proba(self.__X)))

        # fake transform on the sklearn pipe
        step1 = self.sklearn_pipe.named_steps["StandardScaler"].transform(self.__X)
        if "PCA" in self.sklearn_pipe.named_steps:
            step2 = self.sklearn_pipe.named_steps["PCA"].transform(self.__X)
        else:
            step2 = step1
        self.assertTrue(np.allclose(step2, self.hyperpipe.transform(self.__X)))