def __init__(
    self,
    temporary_directory: str,
    output_directory: Optional[str],
    delete_tmp_folder_after_terminate: bool,
    delete_output_folder_after_terminate: bool,
):
    # Check that the names of tmp_dir and output_dir are not the same.
    if temporary_directory == output_directory and temporary_directory is not None:
        raise ValueError("The temporary and the output directory "
                         "must be different.")

    self.delete_tmp_folder_after_terminate = delete_tmp_folder_after_terminate
    self.delete_output_folder_after_terminate = delete_output_folder_after_terminate

    # Attributes to check that directories were created by auto-sklearn.
    self._tmp_dir_created = False
    self._output_dir_created = False

    self._temporary_directory = get_randomized_directory_name(
        temporary_directory=temporary_directory,
    )
    self._output_directory = output_directory
    self.create_directories()

    # This is the first place the logger gets created.
    # We want to make sure any logging from here on uses the correct
    # directory where all files should be created.
    logging.setup_logger(output_dir=self._temporary_directory)
    self._logger = logging.get_logger(__name__)
def __init__(self, autosklearn_tmp_dir, dataset_name, task_type, metric,
             limit, output_dir, ensemble_size=None, ensemble_nbest=None,
             seed=1, shared_mode=False, max_iterations=-1, precision="32",
             low_precision=True):
    super(EnsembleBuilder, self).__init__()

    self.autosklearn_tmp_dir = autosklearn_tmp_dir
    self.dataset_name = dataset_name
    self.task_type = task_type
    self.metric = metric
    self.limit = limit
    self.output_dir = output_dir
    self.ensemble_size = ensemble_size
    self.ensemble_nbest = ensemble_nbest
    self.seed = seed
    self.shared_mode = shared_mode
    self.max_iterations = max_iterations
    self.precision = precision
    self.low_precision = low_precision

    logger_name = 'EnsembleBuilder(%d):%s' % (self.seed, self.dataset_name)
    setup_logger(os.path.join(self.autosklearn_tmp_dir,
                              '%s.log' % str(logger_name)))
    self.logger = get_logger(logger_name)
def _get_logger(self, name):
    logger_name = 'AutoML(%d):%s' % (self._seed, name)
    setup_logger(os.path.join(self._backend.temporary_directory,
                              '%s.log' % str(logger_name)),
                 self.logging_config,
                 )
    return get_logger(logger_name)
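The snippets above share one naming convention: each component builds a logger name of the form 'Component(seed):dataset' and writes to a log file of the same name inside the temporary directory. Below is a minimal, self-contained sketch of that pattern using only the standard library; setup_logger and get_logger here are simplified stand-ins, not the real autosklearn.util.logging_ implementations, and the directory and names are made up for illustration.

import logging
import os


def setup_logger(output_file):
    # Simplified stand-in: route everything to one file; the real helper
    # configures this via a logging config dict.
    logging.basicConfig(filename=output_file, level=logging.DEBUG)


def get_logger(name):
    # Simplified stand-in for autosklearn.util.logging_.get_logger.
    return logging.getLogger(name)


# Hypothetical values for illustration only.
temporary_directory = '/tmp/autosklearn_example'
seed, dataset_name = 1, 'iris'

logger_name = 'EnsembleBuilder(%d):%s' % (seed, dataset_name)
os.makedirs(temporary_directory, exist_ok=True)
setup_logger(os.path.join(temporary_directory, '%s.log' % logger_name))
logger = get_logger(logger_name)
logger.info('per-seed, per-dataset log file is now in place')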
def __init__(self, backend, dataset_name, task_type, metric, limit,
             ensemble_size=None, ensemble_nbest=None, seed=1,
             shared_mode=False, max_iterations=-1, precision="32",
             low_precision=True):
    super(EnsembleBuilder, self).__init__()

    self.backend = backend
    self.dataset_name = dataset_name
    self.task_type = task_type
    self.metric = metric
    self.limit = limit
    self.ensemble_size = ensemble_size
    self.ensemble_nbest = ensemble_nbest
    self.seed = seed
    self.shared_mode = shared_mode
    self.max_iterations = max_iterations
    self.precision = precision
    self.low_precision = low_precision

    logger_name = 'EnsembleBuilder(%d):%s' % (self.seed, self.dataset_name)
    self.logger = get_logger(logger_name)
def __init__(self, temporary_directory, output_directory,
             delete_tmp_folder_after_terminate,
             delete_output_folder_after_terminate,
             shared_mode=False):
    # Check that the names of tmp_dir and output_dir are not the same.
    if temporary_directory == output_directory and temporary_directory is not None:
        raise ValueError("The temporary and the output directory "
                         "must be different.")

    self.delete_tmp_folder_after_terminate = delete_tmp_folder_after_terminate
    self.delete_output_folder_after_terminate = delete_output_folder_after_terminate
    self.shared_mode = shared_mode

    # Attributes to check that directories were created by auto-sklearn.
    self._tmp_dir_created = False
    self._output_dir_created = False

    self.__temporary_directory, self.__output_directory = (
        get_randomized_directory_names(
            temporary_directory=temporary_directory,
            output_directory=output_directory,
        ))

    self._logger = logging.get_logger(__name__)
    self.create_directories()
def test_do_dummy_prediction(self):
    for name in ['401_bac', '31_bac', 'adult', 'cadata']:
        backend_api = self._create_backend('test_do_dummy_prediction')

        dataset = os.path.join(self.test_dir, '..', '.data', name)

        auto = autosklearn.automl.AutoML(
            backend_api, 20, 5,
            initial_configurations_via_metalearning=25)
        setup_logger()
        auto._logger = get_logger('test_do_dummy_predictions')
        auto._backend._make_internals_directory()
        D = load_data(dataset, backend_api)
        auto._backend.save_datamanager(D)
        auto._do_dummy_prediction(D, 1)

        # Ensure that the dummy predictions are not in the current working
        # directory, but in the temporary directory.
        self.assertFalse(
            os.path.exists(os.path.join(os.getcwd(), '.auto-sklearn')))
        self.assertTrue(
            os.path.exists(
                os.path.join(backend_api.temporary_directory,
                             '.auto-sklearn', 'predictions_ensemble',
                             'predictions_ensemble_1_1_0.0.npy')))

        del auto
        self._tearDown(backend_api.temporary_directory)
        self._tearDown(backend_api.output_directory)
def __init__(self, temporary_directory, output_directory,
             delete_tmp_folder_after_terminate,
             delete_output_folder_after_terminate):
    self._prepare_directories(temporary_directory, output_directory)
    self.delete_tmp_folder_after_terminate = delete_tmp_folder_after_terminate
    self.delete_output_folder_after_terminate = delete_output_folder_after_terminate
    self._logger = logging.get_logger(__name__)
    self.create_directories()
def __init__(
    self,
    Datamanager,
    backend,
    configuration=None,
    with_predictions=False,
    all_scoring_functions=False,
    seed=1,
    output_y_test=False,
    num_run=None,
    subsample=None,
):
    self.starttime = time.time()

    self.configuration = configuration
    self.backend = backend

    self.D = Datamanager
    self.X_valid = Datamanager.data.get('X_valid')
    self.X_test = Datamanager.data.get('X_test')

    self.metric = Datamanager.info['metric']
    self.task_type = Datamanager.info['task']
    self.seed = seed

    self.output_y_test = output_y_test
    self.with_predictions = with_predictions
    self.all_scoring_functions = all_scoring_functions

    if self.task_type in REGRESSION_TASKS:
        if not isinstance(self.configuration, Configuration):
            self.model_class = MyDummyRegressor
        else:
            self.model_class = \
                autosklearn.pipeline.regression.SimpleRegressionPipeline
        self.predict_function = self._predict_regression
    else:
        if not isinstance(self.configuration, Configuration):
            self.model_class = MyDummyClassifier
        else:
            self.model_class = \
                autosklearn.pipeline.classification.SimpleClassificationPipeline
        self.predict_function = self._predict_proba

    if num_run is None:
        num_run = 0
    self.num_run = num_run

    self.subsample = subsample

    self.model = self.model_class(self.configuration, self.seed)

    logger_name = '%s(%d):%s' % (self.__class__.__name__.split('.')[-1],
                                 self.seed, self.D.name)
    self.logger = get_logger(logger_name)
def run_smac(tmp_dir, basename, time_for_task, ml_memory_limit,
             data_manager_path, configspace_path, initial_configurations,
             per_run_time_limit, watcher, backend, seed, resampling_strategy,
             resampling_strategy_arguments, shared_mode):
    logger = logging.get_logger(__name__)

    task_name = 'runSmac'
    watcher.start_task(task_name)

    instance_file_path, test_instance_file_path = \
        _write_instance_file(resampling_strategy,
                             resampling_strategy_arguments,
                             data_manager_path, backend, tmp_dir)

    scenario_file_path = _write_scenario_file(time_for_task,
                                              per_run_time_limit,
                                              ml_memory_limit,
                                              tmp_dir,
                                              configspace_path,
                                              instance_file_path,
                                              test_instance_file_path,
                                              basename)

    # = Start SMAC
    time_smac = max(0, time_for_task - watcher.wall_elapsed(basename))
    if time_smac <= 0:
        logger.info('No time left for SMAC')
        return
    logger.info('Start SMAC with %5.2fsec time left' % time_smac)

    initial_challengers = initial_configurations
    if initial_challengers is None:
        initial_challengers = []

    smac_options = {
        'retryTargetAlgorithmRunCount': '0',
        'intensification-percentage': '0.5',
        'num-ei-random': '1000',
        'num-challengers': 100,
        'initial-incumbent': 'DEFAULT',
        'validation': 'false',
    }
    if shared_mode:
        smac_options['shared-model-mode'] = 'true'
        smac_options['shared-model-mode-frequency'] = '300'

    call = ' '.join(
        ['smac', '--numRun', str(seed), '--scenario', scenario_file_path] +
        ['--%s %s' % (opt, smac_options[opt]) for opt in smac_options] +
        initial_challengers,
    )
    proc = submit_call(call, seed, logger)

    watcher.stop_task(task_name)
    return proc
def test_do_dummy_prediction(self):
    datasets = {
        'breast_cancer': BINARY_CLASSIFICATION,
        'wine': MULTICLASS_CLASSIFICATION,
        'diabetes': REGRESSION,
    }

    for name, task in datasets.items():
        backend_api = self._create_backend('test_do_dummy_prediction')

        X_train, Y_train, X_test, Y_test = putil.get_dataset(name)
        datamanager = XYDataManager(
            X_train, Y_train,
            X_test, Y_test,
            task=task,
            dataset_name=name,
            feat_type=None,
        )

        auto = autosklearn.automl.AutoML(
            backend_api, 20, 5,
            initial_configurations_via_metalearning=25,
            metric=accuracy,
        )
        setup_logger()
        auto._logger = get_logger('test_do_dummy_predictions')
        auto._backend.save_datamanager(datamanager)
        D = backend_api.load_datamanager()

        # Check that the data manager is correctly loaded
        self.assertEqual(D.info['task'], datamanager.info['task'])

        auto._do_dummy_prediction(D, 1)

        # Ensure that the dummy predictions are not in the current working
        # directory, but in the temporary directory.
        self.assertFalse(
            os.path.exists(os.path.join(os.getcwd(), '.auto-sklearn')))
        self.assertTrue(
            os.path.exists(
                os.path.join(backend_api.temporary_directory,
                             '.auto-sklearn', 'predictions_ensemble',
                             'predictions_ensemble_1_1_0.0.npy')))

        del auto
        self._tearDown(backend_api.temporary_directory)
        self._tearDown(backend_api.output_directory)
def run_ensemble_builder(tmp_dir, dataset_name, task_type, metric, limit,
                         output_dir, ensemble_size, ensemble_nbest, seed,
                         shared_mode, max_iterations, precision):
    logger = logging.get_logger(__name__)

    if limit <= 0 and (max_iterations is None or max_iterations <= 0):
        # It makes no sense to start building ensembles_statistics
        logger.warning("Not starting ensemble builder because it's not worth "
                       "it.")
        return
    ensemble_script = 'python -m autosklearn.ensemble_selection_script'
    runsolver_exec = 'runsolver'
    delay = 5

    task_type = TASK_TYPES_TO_STRING[task_type]
    metric = METRIC_TO_STRING[metric]

    call = [ensemble_script,
            '--auto-sklearn-tmp-directory', tmp_dir,
            '--basename', dataset_name,
            '--task', task_type,
            '--metric', metric,
            '--limit', str(limit - 5),
            '--output-directory', output_dir,
            '--ensemble-size', str(ensemble_size),
            '--ensemble-nbest', str(ensemble_nbest),
            '--auto-sklearn-seed', str(seed),
            '--max-iterations', str(max_iterations),
            '--precision', str(precision)]
    if shared_mode:
        call.append('--shared-mode')
    call = ' '.join(call)

    # Runsolver misbehaves on negative time limits, so clamp: nonpositive
    # limits become 0 (which runsolver treats as infinity), positive limits
    # are at least one second.
    if limit <= 0:
        limit = 0
    else:
        limit = max(1, limit)

    # Now add the runsolver command
    # runsolver_cmd = "%s --watcher-data /dev/null -W %d" % \
    #     (runsolver_exec, limit)
    runsolver_cmd = '%s --watcher-data /dev/null -W %d -d %d' % \
        (runsolver_exec, limit, delay)
    call = runsolver_cmd + ' ' + call

    proc = submit_call(call, seed, logger, log_dir=tmp_dir)
    return proc
def run_ensemble_builder(tmp_dir, dataset_name, task_type, metric, limit,
                         output_dir, ensemble_size, ensemble_nbest, seed,
                         shared_mode, max_iterations, precision):
    logger = logging.get_logger(__name__)

    if limit <= 0 and (max_iterations is None or max_iterations <= 0):
        # It makes no sense to start building ensembles_statistics
        logger.warning("Not starting ensemble builder because it's not worth "
                       "it.")
        return
    ensemble_script = 'python -m autosklearn.ensemble_selection_script'
    runsolver_exec = 'runsolver'
    delay = 5

    task_type = TASK_TYPES_TO_STRING[task_type]
    metric = METRIC_TO_STRING[metric]

    call = [
        ensemble_script,
        '--auto-sklearn-tmp-directory', tmp_dir,
        '--dataset_name', dataset_name,
        '--task', task_type,
        '--metric', metric,
        '--limit', str(limit - 5),
        '--output-directory', output_dir,
        '--ensemble-size', str(ensemble_size),
        '--ensemble-nbest', str(ensemble_nbest),
        '--auto-sklearn-seed', str(seed),
        '--max-iterations', str(max_iterations),
        '--precision', str(precision),
    ]
    if shared_mode:
        call.append('--shared-mode')
    call = ' '.join(call)

    # Runsolver misbehaves on negative time limits, so clamp: nonpositive
    # limits become 0 (which runsolver treats as infinity), positive limits
    # are at least one second.
    if limit <= 0:
        limit = 0
    else:
        limit = max(1, limit)

    # Now add the runsolver command
    # runsolver_cmd = "%s --watcher-data /dev/null -W %d" % \
    #     (runsolver_exec, limit)
    runsolver_cmd = '%s --watcher-data /dev/null -W %d -d %d' % \
        (runsolver_exec, limit, delay)
    call = runsolver_cmd + ' ' + call

    proc = submit_call(call, seed, logger, log_dir=tmp_dir)
    return proc
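Joining the arguments into a single string works in these two versions because none of the values contain spaces, but keeping the command as an argv list and letting subprocess handle each argument avoids quoting pitfalls. A small sketch of that alternative under the same flags; the paths and values are hypothetical, and the actual launch is left commented out since runsolver may not be on PATH:

import subprocess

# Hypothetical values for illustration only.
runsolver_exec, limit, delay = 'runsolver', 60, 5
ensemble_call = ['python', '-m', 'autosklearn.ensemble_selection_script',
                 '--auto-sklearn-tmp-directory', '/tmp/autosklearn_example']

# Same runsolver flags as the string-based version, but kept as a list so
# subprocess quotes each argument individually.
argv = [runsolver_exec, '--watcher-data', '/dev/null',
        '-W', str(limit), '-d', str(delay)] + ensemble_call
print(' '.join(argv))           # what the string-based version would build
# proc = subprocess.Popen(argv)  # requires runsolver on PATH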
def __init__(self, context):
    self.logger = logging.get_logger(__name__)
    self.context = context

    # Create the temporary directory if it does not yet exist
    try:
        os.makedirs(self.temporary_directory)
    except Exception:
        pass

    # This does not have to exist or be specified
    if self.output_directory is not None:
        if not os.path.exists(self.output_directory):
            raise ValueError("Output directory %s does not exist." %
                             self.output_directory)

    self.internals_directory = os.path.join(self.temporary_directory,
                                            ".auto-sklearn")
    self._make_internals_directory()
def __init__(self, dataset_name, configuration_space, meta_base,
             distance='l1', seed=None, use_features=None,
             distance_kwargs=None):
    self.dataset_name = dataset_name
    self.configuration_space = configuration_space
    self.meta_base = meta_base
    self.distance = distance
    self.seed = seed
    self.use_features = use_features
    self.distance_kwargs = distance_kwargs
    self.kND = None  # For caching, makes things faster...

    self.logger = get_logger(__name__)
def test_exceptions_inside_log_in_smbo(self, smbo_run_mock):
    # Make sure that any exception raised during the AutoML fit by SMAC
    # is properly captured in a log file
    backend_api = self._create_backend('test_exceptions_inside_log')
    self._tearDown(backend_api.temporary_directory)
    self._tearDown(backend_api.output_directory)

    automl = autosklearn.automl.AutoML(
        backend_api, 20, 5,
        metric=accuracy,
    )

    output_file = 'test_exceptions_inside_log.log'
    setup_logger(output_file=output_file)
    logger = get_logger('test_exceptions_inside_log')

    # Create a custom exception to prevent other errors from slipping in
    class MyException(Exception):
        pass

    X_train, Y_train, X_test, Y_test = putil.get_dataset('iris')
    # The first call is on dummy predictor failure
    message = str(np.random.randint(100)) + '_run_smbo'
    smbo_run_mock.side_effect = MyException(message)

    with unittest.mock.patch('autosklearn.automl.AutoML._get_logger') as mock:
        mock.return_value = logger
        with self.assertRaises(MyException):
            automl.fit(
                X_train,
                Y_train,
                task=MULTICLASS_CLASSIFICATION,
            )
        with open(output_file) as f:
            self.assertTrue(message in f.read())

    # Cleanup
    os.unlink(output_file)
    self._tearDown(backend_api.temporary_directory)
    self._tearDown(backend_api.output_directory)
def test_fit_roar(self):
    def get_roar_object_callback(
            scenario_dict,
            seed,
            ta,
            ta_kwargs,
            **kwargs
    ):
        """Random online adaptive racing.

        http://ml.informatik.uni-freiburg.de/papers/11-LION5-SMAC.pdf"""
        scenario = Scenario(scenario_dict)
        return ROAR(
            scenario=scenario,
            rng=seed,
            tae_runner=ta,
            tae_runner_kwargs=ta_kwargs,
        )

    backend_api = self._create_backend('test_fit_roar')

    X_train, Y_train, X_test, Y_test = putil.get_dataset('iris')
    automl = autosklearn.automl.AutoML(
        backend=backend_api,
        time_left_for_this_task=20,
        per_run_time_limit=5,
        initial_configurations_via_metalearning=0,
        get_smac_object_callback=get_roar_object_callback,
        metric=accuracy,
    )
    setup_logger()
    automl._logger = get_logger('test_fit_roar')
    automl.fit(
        X_train, Y_train, task=MULTICLASS_CLASSIFICATION,
    )
    score = automl.score(X_test, Y_test)
    self.assertGreaterEqual(score, 0.8)
    self.assertGreater(self._count_succeses(automl.cv_results_), 0)
    self.assertEqual(automl._task, MULTICLASS_CLASSIFICATION)

    del automl
    self._tearDown(backend_api.temporary_directory)
    self._tearDown(backend_api.output_directory)
def test_do_dummy_prediction(backend, dask_client, datasets):
    name, task = datasets

    X_train, Y_train, X_test, Y_test = putil.get_dataset(name)
    datamanager = XYDataManager(
        X_train, Y_train,
        X_test, Y_test,
        task=task,
        dataset_name=name,
        feat_type=None,
    )

    auto = autosklearn.automl.AutoML(
        backend, 20, 5,
        initial_configurations_via_metalearning=25,
        metric=accuracy,
        dask_client=dask_client,
    )
    setup_logger(backend.temporary_directory)
    auto._logger = get_logger('test_do_dummy_predictions')

    auto._backend.save_datamanager(datamanager)
    D = backend.load_datamanager()

    # Check that the data manager is correctly loaded
    assert D.info['task'] == datamanager.info['task']

    auto._do_dummy_prediction(D, 1)

    # Ensure that the dummy predictions are not in the current working
    # directory, but in the temporary directory.
    assert not os.path.exists(os.path.join(os.getcwd(), '.auto-sklearn'))
    assert os.path.exists(
        os.path.join(backend.temporary_directory, '.auto-sklearn',
                     'runs', '1_1_0.0',
                     'predictions_ensemble_1_1_0.0.npy'))

    del auto
def suggest_via_metalearning(meta_base, dataset_name, metric, task, sparse,
                             num_initial_configurations):
    logger = get_logger('autosklearn.metalearning.mismbo')

    if task == MULTILABEL_CLASSIFICATION:
        task = MULTICLASS_CLASSIFICATION

    task = TASK_TYPES_TO_STRING[task]

    logger.info(task)

    start = time.time()
    ml = MetaLearningOptimizer(
        dataset_name=dataset_name,
        configuration_space=meta_base.configuration_space,
        meta_base=meta_base,
        distance='l1',
        seed=1,
    )
    logger.info('Reading meta-data took %5.2f seconds', time.time() - start)
    runs = ml.metalearning_suggest_all(exclude_double_configurations=True)
    return runs[:num_initial_configurations]
def test_fit_roar(dask_client_single_worker, backend):
    def get_roar_object_callback(
            scenario_dict,
            seed,
            ta,
            ta_kwargs,
            dask_client,
            n_jobs,
            **kwargs
    ):
        """Random online adaptive racing.

        http://ml.informatik.uni-freiburg.de/papers/11-LION5-SMAC.pdf"""
        scenario = Scenario(scenario_dict)
        return ROAR(
            scenario=scenario,
            rng=seed,
            tae_runner=ta,
            tae_runner_kwargs=ta_kwargs,
            dask_client=dask_client,
            n_jobs=n_jobs,
        )

    X_train, Y_train, X_test, Y_test = putil.get_dataset('iris')
    automl = autosklearn.automl.AutoML(
        backend=backend,
        time_left_for_this_task=30,
        per_run_time_limit=5,
        initial_configurations_via_metalearning=0,
        get_smac_object_callback=get_roar_object_callback,
        metric=accuracy,
        dask_client=dask_client_single_worker,
    )
    setup_logger()
    automl._logger = get_logger('test_fit_roar')
    automl.fit(
        X_train, Y_train, task=MULTICLASS_CLASSIFICATION,
    )
    score = automl.score(X_test, Y_test)
    assert score > 0.8
    assert count_succeses(automl.cv_results_) > 0
    assert automl._task == MULTICLASS_CLASSIFICATION

    del automl
def __init__(self, temporary_directory, output_directory,
             delete_tmp_folder_after_terminate,
             delete_output_folder_after_terminate,
             shared_mode=False):
    # Check that the names of tmp_dir and output_dir are not the same.
    if temporary_directory == output_directory \
            and temporary_directory is not None:
        raise ValueError("The temporary and the output directory "
                         "must be different.")

    self.delete_tmp_folder_after_terminate = delete_tmp_folder_after_terminate
    self.delete_output_folder_after_terminate = delete_output_folder_after_terminate
    self.shared_mode = shared_mode

    # Attributes to check that directories were created by auto-sklearn.
    self._tmp_dir_created = False
    self._output_dir_created = False

    self._prepare_directories(temporary_directory, output_directory)
    self._logger = logging.get_logger(__name__)
    self.create_directories()
def test_exceptions_inside_log_in_smbo(smbo_run_mock, backend, dask_client):
    automl = autosklearn.automl.AutoML(
        backend, 20, 5,
        metric=accuracy,
        dask_client=dask_client,
    )

    output_file = 'test_exceptions_inside_log.log'
    setup_logger(output_file=output_file)
    logger = get_logger('test_exceptions_inside_log')

    # Create a custom exception to prevent other errors from slipping in
    class MyException(Exception):
        pass

    X_train, Y_train, X_test, Y_test = putil.get_dataset('iris')
    # The first call is on dummy predictor failure
    message = str(np.random.randint(100)) + '_run_smbo'
    smbo_run_mock.side_effect = MyException(message)

    with unittest.mock.patch('autosklearn.automl.AutoML._get_logger') as mock:
        mock.return_value = logger
        with pytest.raises(MyException):
            automl.fit(
                X_train,
                Y_train,
                task=MULTICLASS_CLASSIFICATION,
            )
        with open(output_file) as f:
            assert message in f.read()

    # Cleanup
    os.unlink(output_file)
def __init__(
    self,
    config_space,
    dataset_name,
    backend,
    total_walltime_limit,
    func_eval_time_limit,
    memory_limit,
    metric,
    watcher,
    n_jobs,
    dask_client: dask.distributed.Client,
    start_num_run=1,
    data_memory_limit=None,
    num_metalearning_cfgs=25,
    config_file=None,
    seed=1,
    metadata_directory=None,
    resampling_strategy='holdout',
    resampling_strategy_args=None,
    include_estimators=None,
    exclude_estimators=None,
    include_preprocessors=None,
    exclude_preprocessors=None,
    disable_file_output=False,
    smac_scenario_args=None,
    get_smac_object_callback=None,
    scoring_functions=None,
    ensemble_callback: typing.Optional[EnsembleBuilderManager] = None,
):
    super(AutoMLSMBO, self).__init__()

    # data related
    self.dataset_name = dataset_name
    self.datamanager = None
    self.metric = metric
    self.task = None
    self.backend = backend

    # the configuration space
    self.config_space = config_space

    # the number of parallel workers/jobs
    self.n_jobs = n_jobs
    self.dask_client = dask_client

    # Evaluation
    self.resampling_strategy = resampling_strategy
    if resampling_strategy_args is None:
        resampling_strategy_args = {}
    self.resampling_strategy_args = resampling_strategy_args

    # and a bunch of useful limits
    self.worst_possible_result = get_cost_of_crash(self.metric)
    self.total_walltime_limit = int(total_walltime_limit)
    self.func_eval_time_limit = int(func_eval_time_limit)
    self.memory_limit = memory_limit
    self.data_memory_limit = data_memory_limit
    self.watcher = watcher
    self.num_metalearning_cfgs = num_metalearning_cfgs
    self.config_file = config_file
    self.seed = seed
    self.metadata_directory = metadata_directory
    self.start_num_run = start_num_run
    self.include_estimators = include_estimators
    self.exclude_estimators = exclude_estimators
    self.include_preprocessors = include_preprocessors
    self.exclude_preprocessors = exclude_preprocessors
    self.disable_file_output = disable_file_output
    self.smac_scenario_args = smac_scenario_args
    self.get_smac_object_callback = get_smac_object_callback
    self.scoring_functions = scoring_functions

    self.ensemble_callback = ensemble_callback

    dataset_name_ = "" if dataset_name is None else dataset_name
    logger_name = '%s(%d):%s' % (self.__class__.__name__, self.seed,
                                 ":" + dataset_name_)
    self.logger = get_logger(logger_name)
def __init__(
    self,
    backend: Backend,
    dataset_name: str,
    task_type: int,
    metric: str,
    limit: int,
    ensemble_size: int = 10,
    ensemble_nbest: int = 100,
    seed: int = 1,
    shared_mode: bool = False,
    max_iterations: int = None,
    precision: str = "32",
    sleep_duration: int = 2,
    memory_limit: int = 1000,
    read_at_most: int = 5,
):
    """
        Constructor

        Parameters
        ----------
        backend: util.backend.Backend
            backend to write and read files
        dataset_name: str
            name of dataset
        task_type: int
            type of ML task
        metric: str
            name of metric to score predictions
        limit: int
            time limit in sec
        ensemble_size: int
            maximal size of ensemble (passed to
            autosklearn.ensemble.ensemble_selection)
        ensemble_nbest: int
            consider only the n best predictions (wrt validation predictions)
        seed: int
            random seed; if set to -1, read files with any seed (e.g., for
            shared model mode)
        shared_mode: bool
            auto-sklearn used shared model mode (aka pSMAC)
        max_iterations: int
            maximal number of iterations to run this script
            (default None --> deactivated)
        precision: ["16", "32", "64", "128"]
            precision of floats to read the predictions
        sleep_duration: int
            duration of sleeping time between two iterations of this script
            (in sec)
        memory_limit: int
            memory limit in mb
        read_at_most: int
            read at most n new prediction files in each iteration
    """

    super(EnsembleBuilder, self).__init__()

    self.backend = backend  # communication with filesystem
    self.dataset_name = dataset_name
    self.task_type = task_type
    self.metric = metric
    self.time_limit = limit  # time limit
    self.ensemble_size = ensemble_size
    # max number of members that will be used for building the ensemble
    self.ensemble_nbest = ensemble_nbest
    self.seed = seed
    self.shared_mode = shared_mode  # pSMAC?
    self.max_iterations = max_iterations
    self.precision = precision
    self.sleep_duration = sleep_duration
    self.memory_limit = memory_limit
    self.read_at_most = read_at_most

    # part of the original training set
    # used to build the ensemble
    self.dir_ensemble = os.path.join(
        self.backend.temporary_directory,
        '.auto-sklearn',
        'predictions_ensemble',
    )
    # validation set (public test set) -- y_true not known
    self.dir_valid = os.path.join(
        self.backend.temporary_directory,
        '.auto-sklearn',
        'predictions_valid',
    )
    # test set (private test set) -- y_true not known
    self.dir_test = os.path.join(
        self.backend.temporary_directory,
        '.auto-sklearn',
        'predictions_test',
    )

    logger_name = 'EnsembleBuilder(%d):%s' % (self.seed, self.dataset_name)
    self.logger = get_logger(logger_name)

    self.start_time = 0
    self.model_fn_re = re.compile(r'_([0-9]*)_([0-9]*)\.npy')

    # already read prediction files
    # {"file name": {
    #     "ens_score": float,
    #     "mtime_ens": str,
    #     "mtime_valid": str,
    #     "mtime_test": str,
    #     "seed": int,
    #     "num_run": int,
    #     Y_ENSEMBLE: np.ndarray,
    #     Y_VALID: np.ndarray,
    #     Y_TEST: np.ndarray,
    # }
    self.read_preds = {}
    self.last_hash = None  # hash of ensemble training data
    self.y_true_ensemble = None
    self.SAVE2DISC = True

    self.validation_performance_ = np.inf
def calculate_all_metafeatures(X, y, categorical, dataset_name,
                               calculate=None, dont_calculate=None,
                               densify_threshold=1000):
    """Calculate all metafeatures."""
    logger = get_logger(__name__)

    helper_functions.clear()
    metafeatures.clear()
    mf_ = dict()

    visited = set()
    to_visit = deque()
    to_visit.extend(metafeatures)

    X_transformed = None
    y_transformed = None

    # TODO calculate the numpy metafeatures after all others to consume less
    # memory
    while len(to_visit) > 0:
        name = to_visit.pop()
        if calculate is not None and name not in calculate:
            continue
        if dont_calculate is not None and name in dont_calculate:
            continue

        if name in npy_metafeatures:
            if X_transformed is None:
                # TODO make sure this is done as efficiently as possible
                # (no copy for sparse matrices because of wrong sparse format)
                sparse = scipy.sparse.issparse(X)

                DPP = DataPreprocessor(categorical_features=categorical,
                                       force_sparse_output=True)
                X_transformed = DPP.fit_transform(X)
                categorical_transformed = [False] * X_transformed.shape[1]

                # Densify the transformed matrix
                if not sparse and scipy.sparse.issparse(X_transformed):
                    bytes_per_float = X_transformed.dtype.itemsize
                    num_elements = (X_transformed.shape[0] *
                                    X_transformed.shape[1])
                    megabytes_required = (num_elements * bytes_per_float
                                          / 1000 / 1000)
                    if megabytes_required < densify_threshold:
                        X_transformed = X_transformed.todense()

                # This is not only important for datasets which are somehow
                # sorted in a strange way, but also prevents lda from failing
                # in some cases.
                # Because this is advanced indexing, a copy of the data is
                # returned!!!
                X_transformed = check_array(X_transformed,
                                            force_all_finite=True,
                                            accept_sparse='csr')
                rs = np.random.RandomState(42)
                indices = np.arange(X_transformed.shape[0])
                rs.shuffle(indices)
                # TODO Shuffle inplace
                X_transformed = X_transformed[indices]
                y_transformed = y[indices]

            X_ = X_transformed
            y_ = y_transformed
            categorical_ = categorical_transformed
        else:
            X_ = X
            y_ = y
            categorical_ = categorical

        dependency = metafeatures.get_dependency(name)
        if dependency is not None:
            is_metafeature = dependency in metafeatures
            is_helper_function = dependency in helper_functions

            if is_metafeature and is_helper_function:
                raise NotImplementedError()
            elif not is_metafeature and not is_helper_function:
                raise ValueError(dependency)
            elif is_metafeature and not metafeatures.is_calculated(dependency):
                to_visit.appendleft(name)
                continue
            elif is_helper_function and not helper_functions.is_calculated(
                    dependency):
                logger.info("%s: Going to calculate: %s", dataset_name,
                            dependency)
                value = helper_functions[dependency](X_, y_, categorical_)
                helper_functions.set_value(dependency, value)
                mf_[dependency] = value

        logger.info("%s: Going to calculate: %s", dataset_name, name)
        value = metafeatures[name](X_, y_, categorical_)
        metafeatures.set_value(name, value)
        mf_[name] = value
        visited.add(name)

    mf_ = DatasetMetafeatures(dataset_name, mf_)
    return mf_
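The while loop above implements a lightweight dependency resolution: when a metafeature's dependency has not been computed yet, the name is pushed back on the left of the deque and revisited later. A stripped-down illustration of that control flow is below; the feature names and compute function are made up, and unlike the original (where helper dependencies are computed inline and metafeature dependencies are already queued), this sketch queues the dependency explicitly:

from collections import deque

# Made-up features; 'skewness' depends on the helper 'moments'.
dependencies = {'skewness': 'moments'}
computed = {}

def compute(name):
    return len(name)  # stand-in for the real metafeature functions

to_visit = deque(['skewness', 'n_samples'])
while to_visit:
    name = to_visit.pop()
    dep = dependencies.get(name)
    if dep is not None and dep not in computed:
        to_visit.appendleft(name)   # revisit after the dependency
        to_visit.append(dep)        # compute the dependency first
        continue
    computed[name] = compute(name)

print(computed)  # {'n_samples': 9, 'moments': 7, 'skewness': 8}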
def __init__(self, backend, queue, metric,
             configuration=None,
             all_scoring_functions=False,
             seed=1,
             output_y_hat_optimization=True,
             num_run=None,
             subsample=None,
             include=None,
             exclude=None,
             disable_file_output=False,
             init_params=None):
    self.starttime = time.time()

    self.configuration = configuration
    self.backend = backend
    self.queue = queue

    self.datamanager = self.backend.load_datamanager()
    self.include = include
    self.exclude = exclude

    self.X_valid = self.datamanager.data.get('X_valid')
    self.y_valid = self.datamanager.data.get('Y_valid')
    self.X_test = self.datamanager.data.get('X_test')
    self.y_test = self.datamanager.data.get('Y_test')

    self.metric = metric
    self.task_type = self.datamanager.info['task']
    self.seed = seed

    self.output_y_hat_optimization = output_y_hat_optimization
    self.all_scoring_functions = all_scoring_functions
    self.disable_file_output = disable_file_output

    if self.task_type in REGRESSION_TASKS:
        if not isinstance(self.configuration, Configuration):
            self.model_class = MyDummyRegressor
        else:
            self.model_class = \
                autosklearn.pipeline.regression.SimpleRegressionPipeline
        self.predict_function = self._predict_regression
    else:
        if not isinstance(self.configuration, Configuration):
            self.model_class = MyDummyClassifier
        else:
            self.model_class = (
                autosklearn.pipeline.classification.
                SimpleClassificationPipeline
            )
        self.predict_function = self._predict_proba

    categorical_mask = []
    for feat in self.datamanager.feat_type:
        if feat.lower() == 'numerical':
            categorical_mask.append(False)
        elif feat.lower() == 'categorical':
            categorical_mask.append(True)
        else:
            raise ValueError(feat)
    if np.sum(categorical_mask) > 0:
        self._init_params = {
            'categorical_encoding:one_hot_encoding:categorical_features':
                categorical_mask
        }
    else:
        self._init_params = {}
    if init_params is not None:
        self._init_params.update(init_params)

    if num_run is None:
        num_run = 0
    self.num_run = num_run

    self.subsample = subsample

    logger_name = '%s(%d):%s' % (self.__class__.__name__.split('.')[-1],
                                 self.seed, self.datamanager.name)
    self.logger = get_logger(logger_name)

    self.Y_optimization = None
    self.Y_actual_train = None
def calculate_all_metafeatures(X, y, categorical, dataset_name,
                               calculate=None, dont_calculate=None,
                               densify_threshold=1000):
    """Calculate all metafeatures."""
    logger = get_logger(__name__)

    helper_functions.clear()
    metafeatures.clear()
    mf_ = dict()

    visited = set()
    to_visit = deque()
    to_visit.extend(metafeatures)

    X_transformed = None
    y_transformed = None

    # TODO calculate the numpy metafeatures after all others to consume less
    # memory
    while len(to_visit) > 0:
        name = to_visit.pop()
        if calculate is not None and name not in calculate:
            continue
        if dont_calculate is not None and name in dont_calculate:
            continue

        if name in npy_metafeatures:
            if X_transformed is None:
                # TODO make sure this is done as efficiently as possible
                # (no copy for sparse matrices because of wrong sparse format)
                sparse = scipy.sparse.issparse(X)

                ohe = OneHotEncoder(categorical_features=categorical,
                                    sparse=True)
                X_transformed = ohe.fit_transform(X)
                imputer = Imputer(strategy='mean', copy=False, dtype=X.dtype)
                X_transformed = imputer.fit_transform(X_transformed)
                standard_scaler = StandardScaler(copy=False)
                X_transformed = standard_scaler.fit_transform(X_transformed)

                # Transform the array which indicates the categorical
                # metafeatures
                number_numerical = np.sum(~np.array(categorical))
                categorical_transformed = \
                    [True] * (X_transformed.shape[1] - number_numerical) + \
                    [False] * number_numerical

                # Densify the transformed matrix
                if not sparse and scipy.sparse.issparse(X_transformed):
                    bytes_per_float = X_transformed.dtype.itemsize
                    num_elements = (X_transformed.shape[0] *
                                    X_transformed.shape[1])
                    megabytes_required = (num_elements * bytes_per_float
                                          / 1000 / 1000)
                    if megabytes_required < densify_threshold:
                        X_transformed = X_transformed.todense()

                # This is not only important for datasets which are somehow
                # sorted in a strange way, but also prevents lda from failing
                # in some cases.
                # Because this is advanced indexing, a copy of the data is
                # returned!!!
                X_transformed = check_array(X_transformed,
                                            force_all_finite=True,
                                            accept_sparse='csr')
                rs = np.random.RandomState(42)
                indices = np.arange(X_transformed.shape[0])
                rs.shuffle(indices)
                # TODO Shuffle inplace
                X_transformed = X_transformed[indices]
                y_transformed = y[indices]

            X_ = X_transformed
            y_ = y_transformed
            categorical_ = categorical_transformed
        else:
            X_ = X
            y_ = y
            categorical_ = categorical

        dependency = metafeatures.get_dependency(name)
        if dependency is not None:
            is_metafeature = dependency in metafeatures
            is_helper_function = dependency in helper_functions

            if is_metafeature and is_helper_function:
                raise NotImplementedError()
            elif not is_metafeature and not is_helper_function:
                raise ValueError(dependency)
            elif is_metafeature and not metafeatures.is_calculated(dependency):
                to_visit.appendleft(name)
                continue
            elif is_helper_function and not helper_functions.is_calculated(
                    dependency):
                logger.info("%s: Going to calculate: %s", dataset_name,
                            dependency)
                value = helper_functions[dependency](X_, y_, categorical_)
                helper_functions.set_value(dependency, value)
                mf_[dependency] = value

        logger.info("%s: Going to calculate: %s", dataset_name, name)
        value = metafeatures[name](X_, y_, categorical_)
        metafeatures.set_value(name, value)
        mf_[name] = value
        visited.add(name)

    mf_ = DatasetMetafeatures(dataset_name, mf_)
    return mf_
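The densification guard in both versions is simple arithmetic: a dense copy is made only if rows x columns x bytes-per-float stays under densify_threshold megabytes. A standalone illustration with made-up shapes:

import numpy as np

densify_threshold = 1000  # megabytes, the default above

def would_densify(n_rows, n_cols, dtype=np.float64):
    bytes_per_float = np.dtype(dtype).itemsize
    megabytes_required = n_rows * n_cols * bytes_per_float / 1000 / 1000
    return megabytes_required < densify_threshold

print(would_densify(100_000, 50))     # 40 MB   -> True, densify
print(would_densify(2_000_000, 100))  # 1600 MB -> False, stay sparse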
def __init__(self):
    self.logger = get_logger(__name__)
def __init__(self, backend, queue, metric,
             configuration=None,
             all_scoring_functions=False,
             seed=1,
             output_y_hat_optimization=True,
             num_run=None,
             include=None,
             exclude=None,
             disable_file_output=False,
             init_params=None,
             budget=None,
             budget_type=None):
    self.starttime = time.time()

    self.configuration = configuration
    self.backend = backend
    self.queue = queue

    self.datamanager = self.backend.load_datamanager()
    self.include = include
    self.exclude = exclude

    self.X_valid = self.datamanager.data.get('X_valid')
    self.y_valid = self.datamanager.data.get('Y_valid')
    self.X_test = self.datamanager.data.get('X_test')
    self.y_test = self.datamanager.data.get('Y_test')

    self.metric = metric
    self.task_type = self.datamanager.info['task']
    self.seed = seed

    self.output_y_hat_optimization = output_y_hat_optimization
    self.all_scoring_functions = all_scoring_functions

    if isinstance(disable_file_output, (bool, list)):
        self.disable_file_output = disable_file_output
    else:
        raise ValueError('disable_file_output should be either a bool or '
                         'a list')

    if self.task_type in REGRESSION_TASKS:
        if not isinstance(self.configuration, Configuration):
            self.model_class = MyDummyRegressor
        else:
            self.model_class = \
                autosklearn.pipeline.regression.SimpleRegressionPipeline
        self.predict_function = self._predict_regression
    else:
        if not isinstance(self.configuration, Configuration):
            self.model_class = MyDummyClassifier
        else:
            self.model_class = \
                autosklearn.pipeline.classification.SimpleClassificationPipeline
        self.predict_function = self._predict_proba

    categorical_mask = []
    for feat in self.datamanager.feat_type:
        if feat.lower() == 'numerical':
            categorical_mask.append(False)
        elif feat.lower() == 'categorical':
            categorical_mask.append(True)
        else:
            raise ValueError(feat)
    if np.sum(categorical_mask) > 0:
        self._init_params = {
            'data_preprocessing:categorical_features': categorical_mask
        }
    else:
        self._init_params = {}
    if init_params is not None:
        self._init_params.update(init_params)

    if num_run is None:
        num_run = 0
    self.num_run = num_run

    logger_name = '%s(%d):%s' % (self.__class__.__name__.split('.')[-1],
                                 self.seed, self.datamanager.name)
    self.logger = get_logger(logger_name)

    self.Y_optimization = None
    self.Y_actual_train = None

    self.budget = budget
    self.budget_type = budget_type
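Both evaluator versions derive a boolean mask from the data manager's feat_type list and only forward it to the pipeline when at least one feature is categorical (the init_params key differs between the two eras). A minimal standalone version of that loop; the feat_type values are illustrative:

import numpy as np

feat_type = ['Numerical', 'Categorical', 'Numerical']  # illustrative

categorical_mask = []
for feat in feat_type:
    if feat.lower() == 'numerical':
        categorical_mask.append(False)
    elif feat.lower() == 'categorical':
        categorical_mask.append(True)
    else:
        raise ValueError(feat)

# Only forward the mask if there is anything categorical to encode.
init_params = (
    {'data_preprocessing:categorical_features': categorical_mask}
    if np.sum(categorical_mask) > 0 else {}
)
print(init_params)  # {'data_preprocessing:categorical_features': [False, True, False]}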
import os
from io import StringIO
import time

import numpy as np

import pyMetaLearn.metafeatures.metafeatures as metafeatures
import pyMetaLearn.optimizers.metalearn_optimizer.metalearner as \
    metalearner
from autosklearn.util import logging_
from autosklearn.constants import *

logger = logging_.get_logger(__name__)


class MetaLearning(object):
    """Right now, pyMetaLearn performs a OneHotEncoding if necessary, but
    it is really not necessary. This object helps to circumvent this by:

    1. calling metafeatures.calculate_all_metafeatures() only for the
       metafeatures which do not need OneHotEncoded data
    2. allowing the caller to then perform a OneHotEncoding
    3. calling metafeatures.calculate_metafeatures_encoded_labels() for all
       other metafeatures which need OneHotEncoded data.
    """

    def __init__(self):
        self._sentinel = "uiaeo"
        self._metafeatures_encoded_labels = None
        self._metafeatures_labels = None
        # Hard-coded list of too-expensive metafeatures!
        self._exclude_metafeatures = set([
def test_fail_if_dummy_prediction_fails(self, ta_run_mock):
    backend_api = self._create_backend('test_fail_if_dummy_prediction_fails')

    X_train, Y_train, X_test, Y_test = putil.get_dataset('iris')
    datamanager = XYDataManager(
        X_train, Y_train,
        X_test, Y_test,
        task=2,
        feat_type=['Numerical' for i in range(X_train.shape[1])],
        dataset_name='iris',
    )

    time_for_this_task = 30
    per_run_time = 10
    auto = autosklearn.automl.AutoML(
        backend_api,
        time_for_this_task,
        per_run_time,
        initial_configurations_via_metalearning=25,
        metric=accuracy,
    )
    setup_logger()
    auto._logger = get_logger('test_fail_if_dummy_prediction_fails')
    auto._backend._make_internals_directory()
    auto._backend.save_datamanager(datamanager)

    # First of all, check that ta.run() is actually called.
    ta_run_mock.return_value = StatusType.SUCCESS, None, None, "test"
    auto._do_dummy_prediction(datamanager, 1)
    ta_run_mock.assert_called_once_with(1, cutoff=time_for_this_task)

    # Case 1. Check that the function raises no error when
    # statustype == success. ta.run() returns status, cost, runtime, and
    # additional info.
    ta_run_mock.return_value = StatusType.SUCCESS, None, None, "test"
    raised = False
    try:
        auto._do_dummy_prediction(datamanager, 1)
    except ValueError:
        raised = True
    self.assertFalse(raised, 'Exception raised')

    # Case 2. Check that if the statustype returned by ta.run() is not
    # success, the function raises an error.
    ta_run_mock.return_value = StatusType.CRASHED, None, None, "test"
    self.assertRaisesRegex(
        ValueError,
        'Dummy prediction failed with run state StatusType.CRASHED '
        'and additional output: test.',
        auto._do_dummy_prediction,
        datamanager, 1,
    )
    ta_run_mock.return_value = StatusType.ABORT, None, None, "test"
    self.assertRaisesRegex(
        ValueError,
        'Dummy prediction failed with run state StatusType.ABORT '
        'and additional output: test.',
        auto._do_dummy_prediction,
        datamanager, 1,
    )
    ta_run_mock.return_value = StatusType.TIMEOUT, None, None, "test"
    self.assertRaisesRegex(
        ValueError,
        'Dummy prediction failed with run state StatusType.TIMEOUT '
        'and additional output: test.',
        auto._do_dummy_prediction,
        datamanager, 1,
    )
    ta_run_mock.return_value = StatusType.MEMOUT, None, None, "test"
    self.assertRaisesRegex(
        ValueError,
        'Dummy prediction failed with run state StatusType.MEMOUT '
        'and additional output: test.',
        auto._do_dummy_prediction,
        datamanager, 1,
    )
    ta_run_mock.return_value = StatusType.CAPPED, None, None, "test"
    self.assertRaisesRegex(
        ValueError,
        'Dummy prediction failed with run state StatusType.CAPPED '
        'and additional output: test.',
        auto._do_dummy_prediction,
        datamanager, 1,
    )

    self._tearDown(backend_api.temporary_directory)
    self._tearDown(backend_api.output_directory)
def test_fail_if_dummy_prediction_fails(ta_run_mock, backend, dask_client):
    X_train, Y_train, X_test, Y_test = putil.get_dataset('iris')
    datamanager = XYDataManager(
        X_train, Y_train,
        X_test, Y_test,
        task=2,
        feat_type=['Numerical' for i in range(X_train.shape[1])],
        dataset_name='iris',
    )

    time_for_this_task = 30
    per_run_time = 10
    auto = autosklearn.automl.AutoML(
        backend,
        time_for_this_task,
        per_run_time,
        initial_configurations_via_metalearning=25,
        metric=accuracy,
        dask_client=dask_client,
    )
    setup_logger()
    auto._logger = get_logger('test_fail_if_dummy_prediction_fails')
    auto._backend._make_internals_directory()
    auto._backend.save_datamanager(datamanager)

    # First of all, check that ta.run() is actually called.
    ta_run_mock.return_value = StatusType.SUCCESS, None, None, {}
    auto._do_dummy_prediction(datamanager, 1)
    ta_run_mock.assert_called_once_with(1, cutoff=time_for_this_task)

    # Case 1. Check that the function raises no error when
    # statustype == success. ta.run() returns status, cost, runtime, and
    # additional info.
    ta_run_mock.return_value = StatusType.SUCCESS, None, None, {}
    raised = False
    try:
        auto._do_dummy_prediction(datamanager, 1)
    except ValueError:
        raised = True
    assert not raised, 'Exception raised'

    # Case 2. Check that if the statustype returned by ta.run() is not
    # success, the function raises an error.
    ta_run_mock.return_value = StatusType.CRASHED, None, None, {}
    with pytest.raises(
        ValueError,
        match='Dummy prediction failed with run state StatusType.CRASHED '
              'and additional output: {}.',
    ):
        auto._do_dummy_prediction(datamanager, 1)

    ta_run_mock.return_value = StatusType.ABORT, None, None, {}
    with pytest.raises(
        ValueError,
        match='Dummy prediction failed with run state StatusType.ABORT '
              'and additional output: {}.',
    ):
        auto._do_dummy_prediction(datamanager, 1)

    ta_run_mock.return_value = StatusType.TIMEOUT, None, None, {}
    with pytest.raises(
        ValueError,
        match='Dummy prediction failed with run state StatusType.TIMEOUT '
              'and additional output: {}.',
    ):
        auto._do_dummy_prediction(datamanager, 1)

    ta_run_mock.return_value = StatusType.MEMOUT, None, None, {}
    with pytest.raises(
        ValueError,
        match='Dummy prediction failed with run state StatusType.MEMOUT '
              'and additional output: {}.',
    ):
        auto._do_dummy_prediction(datamanager, 1)

    ta_run_mock.return_value = StatusType.CAPPED, None, None, {}
    with pytest.raises(
        ValueError,
        match='Dummy prediction failed with run state StatusType.CAPPED '
              'and additional output: {}.',
    ):
        auto._do_dummy_prediction(datamanager, 1)

    ta_run_mock.return_value = StatusType.CRASHED, None, None, {'exitcode': -6}
    with pytest.raises(
        ValueError,
        match='The error suggests that the provided memory limits were too tight.',
    ):
        auto._do_dummy_prediction(datamanager, 1)
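One subtlety in the pytest version above: the match argument of pytest.raises is a regular expression searched against str(exception), so literal '.' characters match any character and a longer message needs care. Escaping the whole expected message is a safer pattern when it should match literally; a small self-contained illustration (the exception and message are made up):

import re

import pytest


def test_match_is_a_regex():
    expected = 'Dummy prediction failed with run state StatusType.CRASHED ' \
               'and additional output: {}.'
    with pytest.raises(ValueError, match=re.escape(expected)):
        raise ValueError(expected)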
def __init__(
    self,
    backend: Backend,
    dataset_name: str,
    task_type: int,
    metric: Scorer,
    limit: int,
    ensemble_size: int = 10,
    ensemble_nbest: int = 100,
    max_models_on_disc: int = 100,
    performance_range_threshold: float = 0,
    seed: int = 1,
    shared_mode: bool = False,
    max_iterations: int = None,
    precision: int = 32,
    sleep_duration: int = 2,
    memory_limit: Optional[int] = 1024,
    read_at_most: int = 5,
    random_state: Optional[Union[int, np.random.RandomState]] = None,
    queue: multiprocessing.Queue = None,
):
    """
        Constructor

        Parameters
        ----------
        backend: util.backend.Backend
            backend to write and read files
        dataset_name: str
            name of dataset
        task_type: int
            type of ML task
        metric: str
            name of metric to score predictions
        limit: int
            time limit in sec
        ensemble_size: int
            maximal size of ensemble (passed to
            autosklearn.ensemble.ensemble_selection)
        ensemble_nbest: int/float
            if int: consider only the n best predictions
            if float: consider only this fraction of the best models
            Both with respect to validation predictions.
            If performance_range_threshold > 0, might return fewer models.
        max_models_on_disc: int
            Defines the maximum number of models that are kept on disc.
            If int, it must be greater than or equal to 1, and dictates the
            max number of models to keep. If float, it will be interpreted
            as the max megabytes of disc space allowed. That is, if the
            number of ensemble candidates requires more disc space than
            this float value, the worst models will be deleted to keep
            within this budget. Models and predictions of the
            worst-performing models will be deleted then. If None, the
            feature is disabled. It defines an upper bound on the models
            that can be used in the ensemble.
        performance_range_threshold: float
            Keep only models that are better than:
                dummy + (best - dummy) * performance_range_threshold
            E.g. dummy=2, best=4, thresh=0.5 --> only consider models with
            score > 3. Will at most return the minimum between
            ensemble_nbest models and max_models_on_disc. Might return
            fewer.
        seed: int
            random seed; if set to -1, read files with any seed (e.g., for
            shared model mode)
        shared_mode: bool
            auto-sklearn used shared model mode (aka pSMAC)
        max_iterations: int
            maximal number of iterations to run this script
            (default None --> deactivated)
        precision: [16, 32, 64, 128]
            precision of floats to read the predictions
        sleep_duration: int
            duration of sleeping time between two iterations of this script
            (in sec)
        memory_limit: Optional[int]
            memory limit in mb. If ``None``, no memory limit is enforced.
        read_at_most: int
            read at most n new prediction files in each iteration
    """

    super(EnsembleBuilder, self).__init__()

    self.backend = backend  # communication with filesystem
    self.dataset_name = dataset_name
    self.task_type = task_type
    self.metric = metric
    self.time_limit = limit  # time limit
    self.ensemble_size = ensemble_size
    self.performance_range_threshold = performance_range_threshold

    if isinstance(ensemble_nbest, numbers.Integral) and ensemble_nbest < 1:
        raise ValueError("Integer ensemble_nbest has to be at least 1: %s" %
                         ensemble_nbest)
    elif not isinstance(ensemble_nbest, numbers.Integral):
        if ensemble_nbest < 0 or ensemble_nbest > 1:
            raise ValueError(
                "Float ensemble_nbest best has to be >= 0 and <= 1: %s" %
                ensemble_nbest)

    self.ensemble_nbest = ensemble_nbest

    # max_models_on_disc can be a float, in which case we need to remember
    # the user-specified megabytes and translate this to a max number of
    # ensemble models. max_resident_models keeps the maximum number of
    # models on disc.
    if max_models_on_disc is not None and max_models_on_disc < 0:
        raise ValueError(
            "max_models_on_disc has to be a positive number or None")
    self.max_models_on_disc = max_models_on_disc
    self.max_resident_models = None

    self.seed = seed
    self.shared_mode = shared_mode  # pSMAC?
    self.max_iterations = max_iterations
    self.precision = precision
    self.sleep_duration = sleep_duration
    self.memory_limit = memory_limit
    self.read_at_most = read_at_most
    self.random_state = check_random_state(random_state)

    # part of the original training set
    # used to build the ensemble
    self.dir_ensemble = os.path.join(
        self.backend.temporary_directory,
        '.auto-sklearn',
        'predictions_ensemble',
    )
    # validation set (public test set) -- y_true not known
    self.dir_valid = os.path.join(
        self.backend.temporary_directory,
        '.auto-sklearn',
        'predictions_valid',
    )
    # test set (private test set) -- y_true not known
    self.dir_test = os.path.join(
        self.backend.temporary_directory,
        '.auto-sklearn',
        'predictions_test',
    )
    self.dir_models = os.path.join(
        self.backend.temporary_directory,
        '.auto-sklearn',
        'models',
    )

    logger_name = 'EnsembleBuilder(%d):%s' % (self.seed, self.dataset_name)
    self.logger = get_logger(logger_name)
    if ensemble_nbest == 1:
        self.logger.debug(
            "Behaviour depends on int/float: %s, %s (ensemble_nbest, type)" %
            (ensemble_nbest, type(ensemble_nbest)))

    self.start_time = 0
    self.model_fn_re = re.compile(
        r'_([0-9]*)_([0-9]*)_([0-9]{1,3}\.[0-9]*)\.npy')

    # already read prediction files
    # {"file name": {
    #     "ens_score": float,
    #     "mtime_ens": str,
    #     "mtime_valid": str,
    #     "mtime_test": str,
    #     "seed": int,
    #     "num_run": int,
    #     "deleted": bool,
    #     Y_ENSEMBLE: np.ndarray,
    #     Y_VALID: np.ndarray,
    #     Y_TEST: np.ndarray,
    # }
    # }
    self.read_preds = {}
    self.last_hash = None  # hash of ensemble training data
    self.y_true_ensemble = None
    self.SAVE2DISC = True

    # hidden feature which can be activated via an environment variable.
    # This keeps all models and predictions which have ever been a
    # candidate. This is necessary to post-hoc compute the whole ensemble
    # building trajectory.
    self._has_been_candidate = set()

    self.validation_performance_ = np.inf

    # Track the ensemble performance
    self.datamanager = self.backend.load_datamanager()
    self.y_valid = self.datamanager.data.get('Y_valid')
    self.y_test = self.datamanager.data.get('Y_test')

    # Support for tracking the performance across time. A Queue is needed
    # to handle multiprocessing, not only internally for pynisher calls,
    # but to return data to the main process. Hence, because we are using
    # three different processes, the below strategy prevents MemoryErrors.
    # That is, without clearly isolating the queue with a manager, we run
    # into a threading MemoryError.
    if queue is None:
        mgr = multiprocessing.Manager()
        mgr.Namespace()
        self.queue = mgr.Queue()
    else:
        self.queue = queue
    self.queue.put([])
    self.queue.get()
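The model_fn_re pattern above encodes the on-disk naming scheme: seed, numerical run id, and budget are embedded in each prediction filename. A standalone check of how the three groups parse out, using the same filename the tests above assert on:

import re

model_fn_re = re.compile(r'_([0-9]*)_([0-9]*)_([0-9]{1,3}\.[0-9]*)\.npy')

match = model_fn_re.search('predictions_ensemble_1_1_0.0.npy')
seed, num_run, budget = match.groups()
print(seed, num_run, budget)  # -> 1 1 0.0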
def test_automl_outputs(self):
    backend_api = self._create_backend('test_automl_outputs')

    X_train, Y_train, X_test, Y_test = putil.get_dataset('iris')
    name = 'iris'
    data_manager_file = os.path.join(
        backend_api.temporary_directory,
        '.auto-sklearn',
        'datamanager.pkl',
    )

    auto = autosklearn.automl.AutoML(
        backend_api, 20, 5,
        initial_configurations_via_metalearning=0,
        seed=100,
        metric=accuracy,
    )
    setup_logger()
    auto._logger = get_logger('test_automl_outputs')
    auto.fit(
        X=X_train,
        y=Y_train,
        X_test=X_test,
        y_test=Y_test,
        dataset_name=name,
        task=MULTICLASS_CLASSIFICATION,
    )

    # pickled data manager (without one hot encoding!)
    with open(data_manager_file, 'rb') as fh:
        D = pickle.load(fh)
        self.assertTrue(np.allclose(D.data['X_train'], X_train))

    # Check that all directories are there
    fixture = ['cv_models', 'true_targets_ensemble.npy',
               'start_time_100', 'datamanager.pkl',
               'predictions_ensemble', 'ensembles', 'predictions_test',
               'models']
    self.assertEqual(sorted(os.listdir(os.path.join(
        backend_api.temporary_directory, '.auto-sklearn'))),
        sorted(fixture))

    # At least one ensemble, one validation, one test prediction and one
    # model and one ensemble
    fixture = os.listdir(os.path.join(backend_api.temporary_directory,
                                      '.auto-sklearn',
                                      'predictions_ensemble'))
    self.assertGreater(len(fixture), 0)

    fixture = glob.glob(os.path.join(backend_api.temporary_directory,
                                     '.auto-sklearn', 'models',
                                     '100.*.model'))
    self.assertGreater(len(fixture), 0)

    fixture = os.listdir(os.path.join(backend_api.temporary_directory,
                                      '.auto-sklearn', 'ensembles'))
    self.assertIn('100.0000000001.ensemble', fixture)

    # Start time
    start_time_file_path = os.path.join(backend_api.temporary_directory,
                                        '.auto-sklearn', "start_time_100")
    with open(start_time_file_path, 'r') as fh:
        start_time = float(fh.read())
    self.assertGreaterEqual(time.time() - start_time, 10)

    del auto
    self._tearDown(backend_api.temporary_directory)
    self._tearDown(backend_api.output_directory)
def test_automl_outputs(backend, dask_client):
    X_train, Y_train, X_test, Y_test = putil.get_dataset('iris')
    name = 'iris'
    data_manager_file = os.path.join(
        backend.temporary_directory,
        '.auto-sklearn',
        'datamanager.pkl',
    )

    auto = autosklearn.automl.AutoML(
        backend, 30, 5,
        initial_configurations_via_metalearning=0,
        seed=100,
        metric=accuracy,
        dask_client=dask_client,
    )
    setup_logger()
    auto._logger = get_logger('test_automl_outputs')
    auto.fit(
        X=X_train,
        y=Y_train,
        X_test=X_test,
        y_test=Y_test,
        dataset_name=name,
        task=MULTICLASS_CLASSIFICATION,
    )

    # Log file path
    log_file_path = glob.glob(os.path.join(
        backend.temporary_directory, 'AutoML*.log'))[0]

    # pickled data manager (without one hot encoding!)
    with open(data_manager_file, 'rb') as fh:
        D = pickle.load(fh)
        assert np.allclose(D.data['X_train'], X_train)

    # Check that all directories are there
    fixture = [
        'true_targets_ensemble.npy',
        'start_time_100',
        'datamanager.pkl',
        'ensemble_read_preds.pkl',
        'ensemble_read_scores.pkl',
        'runs',
        'ensembles',
    ]
    assert sorted(os.listdir(os.path.join(backend.temporary_directory,
                                          '.auto-sklearn'))) == sorted(fixture)

    # At least one ensemble, one validation, one test prediction and one
    # model and one ensemble
    fixture = glob.glob(os.path.join(
        backend.temporary_directory,
        '.auto-sklearn',
        'runs',
        '*',
        'predictions_ensemble*npy',
    ))
    assert len(fixture) > 0

    fixture = glob.glob(os.path.join(backend.temporary_directory,
                                     '.auto-sklearn', 'runs', '*',
                                     '100.*.model'))
    assert len(fixture) > 0

    fixture = os.listdir(os.path.join(backend.temporary_directory,
                                      '.auto-sklearn', 'ensembles'))
    assert '100.0000000000.ensemble' in fixture

    # Start time
    start_time_file_path = os.path.join(backend.temporary_directory,
                                        '.auto-sklearn', "start_time_100")
    with open(start_time_file_path, 'r') as fh:
        start_time = float(fh.read())
    assert time.time() - start_time >= 10, extract_msg_from_log(log_file_path)

    del auto
def __init__(self, config_space, dataset_name,
             backend,
             total_walltime_limit,
             func_eval_time_limit,
             memory_limit,
             metric,
             watcher,
             start_num_run=1,
             data_memory_limit=None,
             num_metalearning_cfgs=25,
             config_file=None,
             seed=1,
             metadata_directory=None,
             resampling_strategy='holdout',
             resampling_strategy_args=None,
             shared_mode=False,
             include_estimators=None,
             exclude_estimators=None,
             include_preprocessors=None,
             exclude_preprocessors=None,
             disable_file_output=False,
             smac_scenario_args=None,
             get_smac_object_callback=None):
    super(AutoMLSMBO, self).__init__()

    # data related
    self.dataset_name = dataset_name
    self.datamanager = None
    self.metric = metric
    self.task = None
    self.backend = backend

    # the configuration space
    self.config_space = config_space

    # Evaluation
    self.resampling_strategy = resampling_strategy
    if resampling_strategy_args is None:
        resampling_strategy_args = {}
    self.resampling_strategy_args = resampling_strategy_args

    # and a bunch of useful limits
    self.total_walltime_limit = int(total_walltime_limit)
    self.func_eval_time_limit = int(func_eval_time_limit)
    self.memory_limit = memory_limit
    self.data_memory_limit = data_memory_limit
    self.watcher = watcher
    self.num_metalearning_cfgs = num_metalearning_cfgs
    self.config_file = config_file
    self.seed = seed
    self.metadata_directory = metadata_directory
    self.start_num_run = start_num_run
    self.shared_mode = shared_mode
    self.include_estimators = include_estimators
    self.exclude_estimators = exclude_estimators
    self.include_preprocessors = include_preprocessors
    self.exclude_preprocessors = exclude_preprocessors
    self.disable_file_output = disable_file_output
    self.smac_scenario_args = smac_scenario_args
    self.get_smac_object_callback = get_smac_object_callback

    dataset_name_ = "" if dataset_name is None else dataset_name
    logger_name = '%s(%d):%s' % (self.__class__.__name__, self.seed,
                                 ":" + dataset_name_)
    self.logger = get_logger(logger_name)