def testMaxModelsOnDisc2(ensemble_backend):
    """Extreme max_models_on_disc scenarios: the best models are kept."""
    builder = EnsembleBuilder(
        backend=ensemble_backend,
        dataset_name="TEST",
        task_type=BINARY_CLASSIFICATION,
        metric=roc_auc,
        seed=0,  # important to find the test files
        ensemble_nbest=50,
        max_models_on_disc=10000.0,
    )
    builder.read_preds = {}
    # Fabricate 50 models with increasing score and increasing disc cost.
    for idx in range(50):
        key = 'pred%d' % idx
        builder.read_scores[key] = {
            'ens_score': idx * 10,
            'num_run': idx,
            'loaded': 1,
            "seed": 1,
            "disc_space_cost_mb": 50 * idx,
        }
        builder.read_preds[key] = {Y_ENSEMBLE: True}

    # With a generous budget the three best predictions fit on disc.
    assert builder.get_n_best_preds() == ['pred49', 'pred48', 'pred47']

    # Even with a zero budget at least one model must be kept alive.
    builder.max_models_on_disc = 0.0
    assert builder.get_n_best_preds() == ['pred49']
def testPerformanceRangeThresholdMaxBest(self):
    """Interaction of performance_range_threshold with ensemble_nbest."""
    cases = [
        (0.0, 1, 1),
        (0.0, 1.0, 4),
        (0.1, 2, 2),
        (0.3, 4, 3),
        (0.5, 1, 1),
        (0.6, 10, 2),
        (0.8, 0.5, 1),
        (1, 1.0, 1),
    ]
    for threshold, nbest, expected in cases:
        ensbuilder = EnsembleBuilder(
            backend=self.backend,
            dataset_name="TEST",
            task_type=1,  # Binary Classification
            metric=roc_auc,
            limit=-1,  # not used,
            seed=0,  # important to find the test files
            ensemble_nbest=nbest,
            performance_range_threshold=threshold,
            max_models_on_disc=None,
        )
        # Five fake models 'A'..'E' with strictly increasing scores 1..5.
        ensbuilder.read_preds = {
            name: {'ens_score': score, 'num_run': score, 0: True,
                   'loaded': -1, "seed": 1}
            for score, name in enumerate('ABCDE', start=1)
        }
        self.assertEqual(len(ensbuilder.get_n_best_preds()), expected)
def testFallBackNBest(self):
    """When every read model scores badly, get_n_best_preds must still
    fall back to keeping one model instead of an empty selection."""
    ensbuilder = EnsembleBuilder(backend=self.backend,
                                 dataset_name="TEST",
                                 task_type=1,  # Binary Classification
                                 metric=roc_auc,
                                 limit=-1,  # not used,
                                 seed=0,  # important to find the test files
                                 ensemble_nbest=1
                                 )

    ensbuilder.read_ensemble_preds()

    # Degrade the scores of both read models so neither is "good".
    filename = os.path.join(
        self.backend.temporary_directory,
        ".auto-sklearn/predictions_ensemble/predictions_ensemble_0_2.npy"
    )
    ensbuilder.read_preds[filename]["ens_score"] = -1

    filename = os.path.join(
        self.backend.temporary_directory,
        ".auto-sklearn/predictions_ensemble/predictions_ensemble_0_1.npy"
    )
    ensbuilder.read_preds[filename]["ens_score"] = -1

    sel_keys = ensbuilder.get_n_best_preds()

    fixture = os.path.join(
        self.backend.temporary_directory,
        ".auto-sklearn/predictions_ensemble/predictions_ensemble_0_1.npy"
    )
    # assertEquals is deprecated; use assertEqual.
    self.assertEqual(sel_keys[0], fixture)
def testGetValidTestPreds(self):
    """Valid/test predictions are loaded only for the selected models."""
    ensbuilder = EnsembleBuilder(backend=self.backend,
                                 dataset_name="TEST",
                                 task_type=1,  # Binary Classification
                                 metric=roc_auc,
                                 limit=-1,  # not used,
                                 seed=0,  # important to find the test files
                                 ensemble_nbest=1
                                 )

    ensbuilder.read_ensemble_preds()

    d2 = os.path.join(
        self.backend.temporary_directory,
        ".auto-sklearn/predictions_ensemble/predictions_ensemble_0_2.npy"
    )
    d1 = os.path.join(
        self.backend.temporary_directory,
        ".auto-sklearn/predictions_ensemble/predictions_ensemble_0_1.npy"
    )

    selected = ensbuilder.get_n_best_preds()
    ensbuilder.get_valid_test_preds(selected_keys=selected)

    for part in (Y_VALID, Y_TEST):
        # selected --> valid and test predictions were read
        self.assertIsNotNone(ensbuilder.read_preds[d2][part])
        # not selected --> should still be None
        self.assertIsNone(ensbuilder.read_preds[d1][part])
def testFallBackNBest(self):
    """When every read model scores badly, get_n_best_preds must still
    fall back to keeping one model instead of an empty selection."""
    ensbuilder = EnsembleBuilder(backend=self.backend,
                                 dataset_name="TEST",
                                 task_type=1,  # Binary Classification
                                 metric=roc_auc,
                                 limit=-1,  # not used,
                                 seed=0,  # important to find the test files
                                 ensemble_nbest=1
                                 )

    ensbuilder.read_ensemble_preds()

    # Degrade the scores of both read models so neither is "good".
    filename = os.path.join(
        self.backend.temporary_directory,
        ".auto-sklearn/predictions_ensemble/predictions_ensemble_0_2.npy"
    )
    ensbuilder.read_preds[filename]["ens_score"] = -1

    filename = os.path.join(
        self.backend.temporary_directory,
        ".auto-sklearn/predictions_ensemble/predictions_ensemble_0_1.npy"
    )
    ensbuilder.read_preds[filename]["ens_score"] = -1

    sel_keys = ensbuilder.get_n_best_preds()

    fixture = os.path.join(
        self.backend.temporary_directory,
        ".auto-sklearn/predictions_ensemble/predictions_ensemble_0_1.npy"
    )
    # assertEquals is deprecated; use assertEqual.
    self.assertEqual(sel_keys[0], fixture)
def testNBest(self):
    """ensemble_nbest (count or fraction) and max_models_on_disc bound
    the number of selected models; the best model always ranks first."""
    cases = [
        (1, None, 1),
        (1.0, None, 2),
        (0.1, None, 1),
        (0.9, None, 1),
        (1, 2, 1),
        (2, 1, 1),
    ]
    for nbest, models_on_disc, expected in cases:
        ensbuilder = EnsembleBuilder(
            backend=self.backend,
            dataset_name="TEST",
            task_type=1,  # Binary Classification
            metric=roc_auc,
            limit=-1,  # not used,
            seed=0,  # important to find the test files
            ensemble_nbest=nbest,
            max_models_on_disc=models_on_disc,
        )

        ensbuilder.score_ensemble_preds()
        selected = ensbuilder.get_n_best_preds()

        self.assertEqual(len(selected), expected)

        fixture = os.path.join(
            self.backend.temporary_directory,
            ".auto-sklearn/predictions_ensemble/predictions_ensemble_0_2_0.0.npy"
        )
        self.assertEqual(selected[0], fixture)
def testGetValidTestPreds(self):
    """Only selected models get their valid/test predictions loaded."""
    ensbuilder = EnsembleBuilder(backend=self.backend,
                                 dataset_name="TEST",
                                 task_type=1,  # Binary Classification
                                 metric=roc_auc,
                                 limit=-1,  # not used,
                                 seed=0,  # important to find the test files
                                 ensemble_nbest=1
                                 )

    ensbuilder.read_ensemble_preds()

    d2 = os.path.join(
        self.backend.temporary_directory,
        ".auto-sklearn/predictions_ensemble/predictions_ensemble_0_2.npy"
    )
    d1 = os.path.join(
        self.backend.temporary_directory,
        ".auto-sklearn/predictions_ensemble/predictions_ensemble_0_1.npy"
    )

    chosen = ensbuilder.get_n_best_preds()
    ensbuilder.get_valid_test_preds(selected_keys=chosen)

    for label in (Y_VALID, Y_TEST):
        # selected --> predictions were read
        self.assertIsNotNone(ensbuilder.read_preds[d2][label])
        # not selected --> should still be None
        self.assertIsNone(ensbuilder.read_preds[d1][label])
def testEntireEnsembleBuilder(self):
    """End-to-end: score predictions, select the best, fit an ensemble
    and predict on the valid/test sets.

    Removes the stray debug ``print`` left in the test body.
    """
    ensbuilder = EnsembleBuilder(
        backend=self.backend,
        dataset_name="TEST",
        task_type=1,  # Binary Classification
        metric=roc_auc,
        limit=-1,  # not used,
        seed=0,  # important to find the test files
        ensemble_nbest=2,
    )
    ensbuilder.SAVE2DISC = False

    ensbuilder.score_ensemble_preds()

    d2 = os.path.join(
        self.backend.temporary_directory,
        ".auto-sklearn/predictions_ensemble/predictions_ensemble_0_2_0.0.npy"
    )

    sel_keys = ensbuilder.get_n_best_preds()
    self.assertGreater(len(sel_keys), 0)

    ensemble = ensbuilder.fit_ensemble(selected_keys=sel_keys)

    n_sel_valid, n_sel_test = ensbuilder.get_valid_test_preds(
        selected_keys=sel_keys)

    # both valid and test prediction files are available
    self.assertGreater(len(n_sel_valid), 0)
    self.assertEqual(n_sel_valid, n_sel_test)

    y_valid = ensbuilder.predict(
        set_="valid",
        ensemble=ensemble,
        selected_keys=n_sel_valid,
        n_preds=len(sel_keys),
        index_run=1,
    )
    y_test = ensbuilder.predict(
        set_="test",
        ensemble=ensemble,
        selected_keys=n_sel_test,
        n_preds=len(sel_keys),
        index_run=1,
    )

    # predictions for valid and test are the same
    # --> should result in the same predictions
    np.testing.assert_array_almost_equal(y_valid, y_test)

    # since d2 provides perfect predictions
    # it should get a higher weight
    # so that y_valid should be exactly y_valid_d2
    y_valid_d2 = ensbuilder.read_preds[d2][Y_VALID][:, 1]
    np.testing.assert_array_almost_equal(y_valid, y_valid_d2)
def testEntireEnsembleBuilder(self):
    """End-to-end: read predictions, select the best, fit an ensemble
    and predict on the valid/test sets.

    Removes the stray debug ``print`` left in the test body.
    """
    ensbuilder = EnsembleBuilder(
        backend=self.backend,
        dataset_name="TEST",
        task_type=1,  # Binary Classification
        metric=roc_auc,
        limit=-1,  # not used,
        seed=0,  # important to find the test files
        ensemble_nbest=2,
    )
    ensbuilder.SAVE2DISC = False

    ensbuilder.read_ensemble_preds()

    d2 = os.path.join(
        self.backend.temporary_directory,
        ".auto-sklearn/predictions_ensemble/predictions_ensemble_0_2.npy"
    )

    sel_keys = ensbuilder.get_n_best_preds()
    self.assertGreater(len(sel_keys), 0)

    ensemble = ensbuilder.fit_ensemble(selected_keys=sel_keys)

    n_sel_valid, n_sel_test = ensbuilder.get_valid_test_preds(selected_keys=sel_keys)

    # both valid and test prediction files are available
    self.assertGreater(len(n_sel_valid), 0)
    self.assertEqual(n_sel_valid, n_sel_test)

    y_valid = ensbuilder.predict(
        set_="valid",
        ensemble=ensemble,
        selected_keys=n_sel_valid,
        n_preds=len(sel_keys),
        index_run=1,
    )
    y_test = ensbuilder.predict(
        set_="test",
        ensemble=ensemble,
        selected_keys=n_sel_test,
        n_preds=len(sel_keys),
        index_run=1,
    )

    # predictions for valid and test are the same
    # --> should result in the same predictions
    np.testing.assert_array_almost_equal(y_valid, y_test)

    # since d2 provides perfect predictions
    # it should get a higher weight
    # so that y_valid should be exactly y_valid_d2
    y_valid_d2 = ensbuilder.read_preds[d2][Y_VALID][:, 1]
    np.testing.assert_array_almost_equal(y_valid, y_valid_d2)
def testEntireEnsembleBuilder(ensemble_backend):
    """End-to-end: compute losses, select the best, fit an ensemble
    and predict on the valid/test sets.

    Removes the stray debug ``print`` left in the test body.
    """
    ensbuilder = EnsembleBuilder(
        backend=ensemble_backend,
        dataset_name="TEST",
        task_type=BINARY_CLASSIFICATION,
        metric=roc_auc,
        seed=0,  # important to find the test files
        ensemble_nbest=2,
    )
    ensbuilder.SAVE2DISC = False

    ensbuilder.compute_loss_per_model()

    d2 = os.path.join(
        ensemble_backend.temporary_directory,
        ".auto-sklearn/runs/0_2_0.0/predictions_ensemble_0_2_0.0.npy")

    sel_keys = ensbuilder.get_n_best_preds()
    assert len(sel_keys) > 0

    ensemble = ensbuilder.fit_ensemble(selected_keys=sel_keys)

    n_sel_valid, n_sel_test = ensbuilder.get_valid_test_preds(
        selected_keys=sel_keys)

    # both valid and test prediction files are available
    assert len(n_sel_valid) > 0
    assert n_sel_valid == n_sel_test

    y_valid = ensbuilder.predict(
        set_="valid",
        ensemble=ensemble,
        selected_keys=n_sel_valid,
        n_preds=len(sel_keys),
        index_run=1,
    )
    y_test = ensbuilder.predict(
        set_="test",
        ensemble=ensemble,
        selected_keys=n_sel_test,
        n_preds=len(sel_keys),
        index_run=1,
    )

    # predictions for valid and test are the same
    # --> should result in the same predictions
    np.testing.assert_array_almost_equal(y_valid, y_test)

    # since d2 provides perfect predictions
    # it should get a higher weight
    # so that y_valid should be exactly y_valid_d2
    y_valid_d2 = ensbuilder.read_preds[d2][Y_VALID][:, 1]
    np.testing.assert_array_almost_equal(y_valid, y_valid_d2)
def testPerformanceRangeThresholdMaxBest(ensemble_backend,
                                         performance_range_threshold,
                                         ensemble_nbest, exp):
    """performance_range_threshold and ensemble_nbest jointly bound the
    number of selected models (parametrized fixture supplies the cases)."""
    builder = EnsembleBuilder(
        backend=ensemble_backend,
        dataset_name="TEST",
        task_type=BINARY_CLASSIFICATION,
        metric=roc_auc,
        seed=0,  # important to find the test files
        ensemble_nbest=ensemble_nbest,
        performance_range_threshold=performance_range_threshold,
        max_models_on_disc=None,
    )
    # Five fake models 'A'..'E' with strictly increasing scores 1..5.
    builder.read_scores = {
        name: {'ens_score': rank, 'num_run': rank, 'loaded': -1, "seed": 1}
        for rank, name in enumerate('ABCDE', start=1)
    }
    # Each model claims to have ensemble/valid/test predictions available.
    builder.read_preds = {
        name: dict.fromkeys((Y_ENSEMBLE, Y_VALID, Y_TEST), True)
        for name in builder.read_scores
    }

    assert len(builder.get_n_best_preds()) == exp
def testGetValidTestPreds(self):
    """Only the selected model's valid/test predictions are read; the
    unscored fourth model never appears in read_preds."""
    ensbuilder = EnsembleBuilder(
        backend=self.backend,
        dataset_name="TEST",
        task_type=1,  # Binary Classification
        metric=roc_auc,
        limit=-1,  # not used,
        seed=0,  # important to find the test files
        ensemble_nbest=1)

    ensbuilder.score_ensemble_preds()

    d1 = os.path.join(
        self.backend.temporary_directory,
        ".auto-sklearn/predictions_ensemble/predictions_ensemble_0_1_0.0.npy"
    )
    d2 = os.path.join(
        self.backend.temporary_directory,
        ".auto-sklearn/predictions_ensemble/predictions_ensemble_0_2_0.0.npy"
    )
    d3 = os.path.join(
        self.backend.temporary_directory,
        ".auto-sklearn/predictions_ensemble/predictions_ensemble_0_3_100.0.npy"
    )

    selected = ensbuilder.get_n_best_preds()
    self.assertEqual(len(selected), 1)

    ensbuilder.get_valid_test_preds(selected_keys=selected)

    # Number of read files should be three and
    # predictions_ensemble_0_4_0.0.npy must not be in there
    self.assertEqual(len(ensbuilder.read_preds), 3)
    d4 = os.path.join(
        self.backend.temporary_directory,
        ".auto-sklearn/predictions_ensemble/predictions_ensemble_0_4_0.0.npy"
    )
    self.assertNotIn(d4, ensbuilder.read_preds)

    # not selected --> should still be None
    for unselected in (d1, d3):
        self.assertIsNone(ensbuilder.read_preds[unselected][Y_VALID])
        self.assertIsNone(ensbuilder.read_preds[unselected][Y_TEST])

    # selected --> read valid and test predictions
    self.assertIsNotNone(ensbuilder.read_preds[d2][Y_VALID])
    self.assertIsNotNone(ensbuilder.read_preds[d2][Y_TEST])
def testMaxModelsOnDisc(ensemble_backend, test_case, exp):
    """max_models_on_disc (None / int / float MB budget) limits how many
    models are selected (parametrized fixture supplies the cases)."""
    builder = EnsembleBuilder(
        backend=ensemble_backend,
        dataset_name="TEST",
        task_type=BINARY_CLASSIFICATION,
        metric=roc_auc,
        seed=0,  # important to find the test files
        ensemble_nbest=4,
        max_models_on_disc=test_case,
    )

    # Pretend every file on disc weighs exactly 100 MB.
    with unittest.mock.patch('os.path.getsize') as getsize_mock:
        getsize_mock.return_value = 100 * 1024 * 1024
        builder.score_ensemble_preds()
        assert len(builder.get_n_best_preds()) == exp, test_case
def testGetValidTestPreds(ensemble_backend):
    """Only the selected model's valid/test predictions are loaded;
    unselected models keep None and the unscored run never appears."""
    ensbuilder = EnsembleBuilder(
        backend=ensemble_backend,
        dataset_name="TEST",
        task_type=BINARY_CLASSIFICATION,
        metric=roc_auc,
        seed=0,  # important to find the test files
        ensemble_nbest=1)

    ensbuilder.compute_loss_per_model()

    # d1 is a dummy prediction. d2 and d3 have the same prediction with
    # different name. num_run=2 is selected when doing sorted()
    d1 = os.path.join(
        ensemble_backend.temporary_directory,
        ".auto-sklearn/runs/0_1_0.0/predictions_ensemble_0_1_0.0.npy")
    d2 = os.path.join(
        ensemble_backend.temporary_directory,
        ".auto-sklearn/runs/0_2_0.0/predictions_ensemble_0_2_0.0.npy")
    d3 = os.path.join(
        ensemble_backend.temporary_directory,
        ".auto-sklearn/runs/0_3_100.0/predictions_ensemble_0_3_100.0.npy")

    sel_keys = ensbuilder.get_n_best_preds()
    assert len(sel_keys) == 1
    ensbuilder.get_valid_test_preds(selected_keys=sel_keys)

    # Number of read files should be three and
    # predictions_ensemble_0_4_0.0.npy must not be in there
    assert len(ensbuilder.read_preds) == 3
    assert os.path.join(
        ensemble_backend.temporary_directory,
        ".auto-sklearn/runs/0_4_0.0/predictions_ensemble_0_4_0.0.npy"
    ) not in ensbuilder.read_preds

    # not selected --> should still be None
    assert ensbuilder.read_preds[d1][Y_VALID] is None
    assert ensbuilder.read_preds[d1][Y_TEST] is None
    assert ensbuilder.read_preds[d3][Y_VALID] is None
    assert ensbuilder.read_preds[d3][Y_TEST] is None

    # selected --> read valid and test predictions
    assert ensbuilder.read_preds[d2][Y_VALID] is not None
    assert ensbuilder.read_preds[d2][Y_TEST] is not None
def testNBest(ensemble_backend, ensemble_nbest, max_models_on_disc, exp):
    """ensemble_nbest (count or fraction) and the disc limit together
    select `exp` models (parametrized fixture supplies the cases)."""
    builder = EnsembleBuilder(
        backend=ensemble_backend,
        dataset_name="TEST",
        task_type=BINARY_CLASSIFICATION,
        metric=roc_auc,
        seed=0,  # important to find the test files
        ensemble_nbest=ensemble_nbest,
        max_models_on_disc=max_models_on_disc,
    )

    builder.score_ensemble_preds()
    selected = builder.get_n_best_preds()

    assert len(selected) == exp

    # The run 0_2 prediction file must always rank first.
    best = os.path.join(
        ensemble_backend.temporary_directory,
        ".auto-sklearn/runs/0_2_0.0/predictions_ensemble_0_2_0.0.npy")
    assert selected[0] == best
def testFallBackNBest(ensemble_backend):
    """When every read model scores badly, get_n_best_preds must still
    fall back to keeping exactly one model.

    Removes the stray debug ``print`` calls left in the test body.
    """
    ensbuilder = EnsembleBuilder(
        backend=ensemble_backend,
        dataset_name="TEST",
        task_type=BINARY_CLASSIFICATION,
        metric=roc_auc,
        seed=0,  # important to find the test files
        ensemble_nbest=1)

    ensbuilder.score_ensemble_preds()

    # Degrade the scores of all three read models so none is "good".
    filename = os.path.join(
        ensemble_backend.temporary_directory,
        ".auto-sklearn/runs/0_2_0.0/predictions_ensemble_0_2_0.0.npy")
    ensbuilder.read_scores[filename]["ens_score"] = -1

    filename = os.path.join(
        ensemble_backend.temporary_directory,
        ".auto-sklearn/runs/0_3_100.0/predictions_ensemble_0_3_100.0.npy")
    ensbuilder.read_scores[filename]["ens_score"] = -1

    filename = os.path.join(
        ensemble_backend.temporary_directory,
        ".auto-sklearn/runs/0_1_0.0/predictions_ensemble_0_1_0.0.npy")
    ensbuilder.read_scores[filename]["ens_score"] = -1

    sel_keys = ensbuilder.get_n_best_preds()

    fixture = os.path.join(
        ensemble_backend.temporary_directory,
        ".auto-sklearn/runs/0_1_0.0/predictions_ensemble_0_1_0.0.npy")

    assert len(sel_keys) == 1
    assert sel_keys[0] == fixture
def testMaxModelsOnDisc(self):
    """max_models_on_disc limits selection: None means no reduction, an
    int caps the count, a float is a disc-space budget in MB; and even a
    zero budget must keep at least one model alive."""
    ensemble_nbest = 4
    for (test_case, exp) in [
        # If None, no reduction
        (None, 2),
        # If Int, limit only on exceed
        (4, 2),
        (1, 1),
        # If Float, translate float to # models.
        # below, mock of each file is 100 Mb and
        # 4 files .model and .npy (test/val/pred) exist
        (700.0, 1),
        (800.0, 2),
        (9999.0, 2),
    ]:
        ensbuilder = EnsembleBuilder(
            backend=self.backend,
            dataset_name="TEST",
            task_type=1,  # Binary Classification
            metric=roc_auc,
            limit=-1,  # not used,
            seed=0,  # important to find the test files
            ensemble_nbest=ensemble_nbest,
            max_models_on_disc=test_case,
        )

        # Pretend every file on disc weighs exactly 100 MB.
        with unittest.mock.patch('os.path.getsize') as mock:
            mock.return_value = 100 * 1024 * 1024
            ensbuilder.score_ensemble_preds()
            sel_keys = ensbuilder.get_n_best_preds()
            self.assertEqual(len(sel_keys), exp)

    # Test for Extreme scenarios
    # Make sure that the best predictions are kept
    ensbuilder = EnsembleBuilder(
        backend=self.backend,
        dataset_name="TEST",
        task_type=1,  # Binary Classification
        metric=roc_auc,
        limit=-1,  # not used,
        seed=0,  # important to find the test files
        ensemble_nbest=50,
        max_models_on_disc=10000.0,
    )
    ensbuilder.read_preds = {}
    # 50 fake models with increasing score and increasing disc cost.
    for i in range(50):
        ensbuilder.read_preds['pred' + str(i)] = {
            'ens_score': i * 10,
            'num_run': i,
            0: True,
            'loaded': 1,
            "seed": 1,
            "disc_space_cost_mb": 50 * i,
        }
    sel_keys = ensbuilder.get_n_best_preds()
    self.assertListEqual(['pred49', 'pred48', 'pred47', 'pred46'], sel_keys)

    # Make sure at least one model is kept alive
    ensbuilder.max_models_on_disc = 0.0
    sel_keys = ensbuilder.get_n_best_preds()
    self.assertListEqual(['pred49'], sel_keys)