def fit(self, X, y,
        task=MULTICLASS_CLASSIFICATION,
        metric=None,
        feat_type=None,
        dataset_name=None):
    """Validate the inputs, wrap them in an ``XYDataManager`` and call ``_fit``.

    In non-shared mode the backend context directories are deleted first;
    in shared mode the name of an already stored data manager (if one can
    be loaded) replaces ``dataset_name``. The directories are then
    (re-)created. When no name is available the dataset is named after a
    hash of ``X``.

    Raises
    ------
    ValueError
        If ``metric`` is missing or not a ``Scorer``, or if ``feat_type``
        has the wrong length, contains non-strings, or contains a value
        other than ``categorical`` / ``numerical`` (case-insensitive).

    Returns
    -------
    Whatever ``self._fit(data_manager, metric)`` returns.
    """
    if self._shared_mode:
        # An IOError most likely means this is the first call and no data
        # manager has been stored yet; keep the given name in that case.
        try:
            stored_manager = self._backend.load_datamanager()
            dataset_name = stored_manager.name
        except IOError:
            pass
    else:
        self._backend.context.delete_directories()

    self._backend.context.create_directories()

    if dataset_name is None:
        dataset_name = hash_array_or_matrix(X)

    self._backend.save_start_time(self._seed)
    self._stopwatch = StopWatch()
    self._dataset_name = dataset_name
    self._stopwatch.start_task(self._dataset_name)

    self._logger = self._get_logger(dataset_name)

    if metric is None:
        raise ValueError('No metric given.')
    if not isinstance(metric, Scorer):
        raise ValueError('Metric must be instance of '
                         'autosklearn.metrics.Scorer.')

    if feat_type is not None:
        n_features = X.shape[1]
        if len(feat_type) != n_features:
            raise ValueError('Array feat_type does not have same number of '
                             'variables as X has features. %d vs %d.' %
                             (len(feat_type), n_features))
        if not all(isinstance(entry, str) for entry in feat_type):
            raise ValueError('Array feat_type must only contain strings.')
        for entry in feat_type:
            if entry.lower() not in ['categorical', 'numerical']:
                raise ValueError('Only `Categorical` and `Numerical` are '
                                 'valid feature types, you passed `%s`' % entry)

    self._data_memory_limit = None
    loaded_data_manager = XYDataManager(
        X, y,
        task=task,
        feat_type=feat_type,
        dataset_name=dataset_name,
    )
    return self._fit(loaded_data_manager, metric)
def fit(self, X, y,
        task=MULTICLASS_CLASSIFICATION,
        metric='acc_metric',
        feat_type=None,
        dataset_name=None):
    """Prepare the backend, validate ``feat_type`` and hand the data to ``_fit``.

    A metric given as a string is looked up in ``STRING_TO_METRIC``. In
    shared mode the name of a previously stored data manager (if loadable)
    takes precedence over ``dataset_name``; otherwise the backend context
    directories are deleted before being re-created. Without any name the
    dataset is named after ``hash_numpy_array(X)``.

    Raises
    ------
    ValueError
        If ``feat_type`` has a different length than ``X`` has columns,
        contains non-string entries, or contains anything other than
        ``categorical`` / ``numerical`` (case-insensitive).
    """
    if self._shared_mode:
        # Failing with IOError here most likely means that this is the
        # first call and there is no stored data manager yet.
        try:
            stored_manager = self._backend.load_datamanager()
            dataset_name = stored_manager.name
        except IOError:
            pass
    else:
        self._backend.context.delete_directories()

    self._backend.context.create_directories()

    if dataset_name is None:
        dataset_name = hash_numpy_array(X)

    self._backend.save_start_time(self._seed)
    self._stopwatch = StopWatch()
    self._dataset_name = dataset_name
    self._stopwatch.start_task(self._dataset_name)

    self._logger = self._get_logger(dataset_name)

    if isinstance(metric, str):
        metric = STRING_TO_METRIC[metric]

    if feat_type is not None:
        if len(feat_type) != X.shape[1]:
            raise ValueError('Array feat_type does not have same number of '
                             'variables as X has features. %d vs %d.' %
                             (len(feat_type), X.shape[1]))
        if not all(isinstance(kind, str) for kind in feat_type):
            raise ValueError('Array feat_type must only contain strings.')
        for kind in feat_type:
            if kind.lower() not in ['categorical', 'numerical']:
                raise ValueError('Only `Categorical` and `Numerical` are '
                                 'valid feature types, you passed `%s`' % kind)

    self._data_memory_limit = None
    loaded_data_manager = XYDataManager(
        X, y,
        task=task,
        metric=metric,
        feat_type=feat_type,
        dataset_name=dataset_name,
        encode_labels=False,
    )
    return self._fit(loaded_data_manager)
def test_smbo_metalearning_configurations(backend, context, dask_client):
    """Check that ``AutoMLSMBO`` returns 25 metalearning ``Configuration`` objects.

    The configuration space is obtained from ``AutoML.fit`` on iris with
    ``only_return_configuration_space=True``; the data manager is stored
    in the backend so the optimizer can reload it.
    """
    # Get the inputs to the optimizer
    X_train, Y_train, X_test, Y_test = putil.get_dataset('iris')
    config_space = AutoML(
        backend=backend,
        metric=autosklearn.metrics.accuracy,
        time_left_for_this_task=20,
        per_run_time_limit=5,
    ).fit(
        X_train, Y_train,
        task=BINARY_CLASSIFICATION,
        only_return_configuration_space=True,
    )
    watcher = StopWatch()

    # Create an optimizer
    smbo = AutoMLSMBO(
        config_space=config_space,
        dataset_name='iris',
        backend=backend,
        total_walltime_limit=10,
        func_eval_time_limit=5,
        memory_limit=4096,
        metric=autosklearn.metrics.accuracy,
        watcher=watcher,
        n_jobs=1,
        dask_client=dask_client,
        port=logging.handlers.DEFAULT_TCP_LOGGING_PORT,
        start_num_run=1,
        data_memory_limit=None,
        num_metalearning_cfgs=25,
        pynisher_context=context,
    )
    assert smbo.pynisher_context == context

    # Create the inputs to metalearning
    datamanager = XYDataManager(
        X_train, Y_train,
        X_test, Y_test,
        task=BINARY_CLASSIFICATION,
        dataset_name='iris',
        feat_type={i: 'numerical' for i in range(X_train.shape[1])},
    )
    backend.save_datamanager(datamanager)
    smbo.task = BINARY_CLASSIFICATION
    smbo.reset_data_manager()
    metalearning_configurations = smbo.get_metalearning_suggestions()

    # We should have 25 metalearning configurations
    assert len(metalearning_configurations) == 25
    # Asserting the bare list would always pass (a non-empty list is
    # truthy), so every element has to be checked through all().
    assert all(
        isinstance(config, Configuration)
        for config in metalearning_configurations
    )
def test_do_dummy_prediction(backend, dask_client, datasets):
    """Run the dummy prediction and check that its outputs land in the tmp dir.

    Also verifies that the pickled dummy model can be cloned, i.e. that it
    complies with scikit-learn's get/set params protocol.
    """
    dataset, task_id = datasets

    X_train, Y_train, X_test, Y_test = putil.get_dataset(dataset)
    datamanager = XYDataManager(
        X_train, Y_train,
        X_test, Y_test,
        task=task_id,
        dataset_name=dataset,
        feat_type=None,
    )

    automl = autosklearn.automl.AutoML(
        backend, 20, 5,
        initial_configurations_via_metalearning=25,
        metric=accuracy,
        dask_client=dask_client,
    )

    # Replace the logger by a mock so that no real logging is needed
    automl._logger_port = 9020
    automl._logger = unittest.mock.Mock()
    automl._logger.info.return_value = None

    automl._backend.save_datamanager(datamanager)
    reloaded = backend.load_datamanager()

    # The data manager must survive the save/load round trip
    assert reloaded.info['task'] == datamanager.info['task']
    automl._do_dummy_prediction(reloaded, 1)

    # The dummy predictions must not be written to the current working
    # directory, only to the temporary directory.
    cwd_output = os.path.join(os.getcwd(), '.auto-sklearn')
    assert not os.path.exists(cwd_output)
    predictions_path = os.path.join(
        backend.temporary_directory, '.auto-sklearn', 'runs', '1_1_0.0',
        'predictions_ensemble_1_1_0.0.npy')
    assert os.path.exists(predictions_path)

    model_path = os.path.join(
        backend.temporary_directory, '.auto-sklearn', 'runs', '1_1_0.0',
        '1.1.0.0.model')

    # Cloning the stored model checks scikit-learn get/set params support
    assert os.path.exists(model_path)
    with open(model_path, 'rb') as model_file:
        clone(pickle.load(model_file))

    automl._clean_logger()

    del automl
def test_do_dummy_prediction(self):
    """Run the dummy prediction on one dataset per task type.

    For each dataset a fresh backend is created, the data manager is saved
    and reloaded, and the dummy prediction file is expected inside the
    backend's temporary directory (never in the working directory).
    """
    task_by_dataset = {
        'breast_cancer': BINARY_CLASSIFICATION,
        'wine': MULTICLASS_CLASSIFICATION,
        'diabetes': REGRESSION,
    }

    for dataset, task_id in task_by_dataset.items():
        backend_api = self._create_backend('test_do_dummy_prediction')

        X_train, Y_train, X_test, Y_test = putil.get_dataset(dataset)
        datamanager = XYDataManager(
            X_train, Y_train,
            X_test, Y_test,
            task=task_id,
            dataset_name=dataset,
            feat_type=None,
        )

        automl = autosklearn.automl.AutoML(
            backend_api, 20, 5,
            initial_configurations_via_metalearning=25,
            metric=accuracy,
        )
        setup_logger()
        automl._logger = get_logger('test_do_dummy_predictions')
        automl._backend.save_datamanager(datamanager)
        reloaded = backend_api.load_datamanager()

        # The data manager must survive the save/load round trip
        self.assertEqual(reloaded.info['task'], datamanager.info['task'])

        automl._do_dummy_prediction(reloaded, 1)

        # The dummy predictions must not be written to the current working
        # directory, only to the temporary directory.
        cwd_output = os.path.join(os.getcwd(), '.auto-sklearn')
        self.assertFalse(os.path.exists(cwd_output))
        predictions_path = os.path.join(
            backend_api.temporary_directory, '.auto-sklearn',
            'predictions_ensemble', 'predictions_ensemble_1_1_0.0.npy')
        self.assertTrue(os.path.exists(predictions_path))

        del automl

        self._tearDown(backend_api.temporary_directory)
        self._tearDown(backend_api.output_directory)
def test_do_dummy_prediction(dask_client, datasets):
    """Run the dummy prediction with a backend created by ``AutoML`` itself.

    All features are declared numerical; the prediction file is expected
    inside the backend's temporary directory and not in the working
    directory.
    """
    dataset, task_id = datasets

    X_train, Y_train, X_test, Y_test = putil.get_dataset(dataset)
    numerical_features = dict.fromkeys(range(X_train.shape[1]), 'numerical')
    datamanager = XYDataManager(
        X_train, Y_train,
        X_test, Y_test,
        task=task_id,
        dataset_name=dataset,
        feat_type=numerical_features,
    )

    automl = autosklearn.automl.AutoML(
        20, 5,
        initial_configurations_via_metalearning=25,
        metric=accuracy,
        dask_client=dask_client,
        delete_tmp_folder_after_terminate=False,
    )
    automl._backend = automl._create_backend()

    # Replace the logger by a mock so that no real logging is needed
    automl._logger_port = 9020
    automl._logger = unittest.mock.Mock()
    automl._logger.info.return_value = None

    automl._backend.save_datamanager(datamanager)
    reloaded = automl._backend.load_datamanager()

    # The data manager must survive the save/load round trip
    assert reloaded.info['task'] == datamanager.info['task']
    automl._do_dummy_prediction(reloaded, 1)

    # The dummy predictions must not be written to the current working
    # directory, only to the temporary directory.
    cwd_output = os.path.join(os.getcwd(), '.auto-sklearn')
    assert not os.path.exists(cwd_output)
    predictions_path = os.path.join(
        automl._backend.temporary_directory, '.auto-sklearn', 'runs',
        '1_1_0.0', 'predictions_ensemble_1_1_0.0.npy')
    assert os.path.exists(predictions_path)

    automl._clean_logger()

    del automl
def get_abalone_datamanager():
    """Build an ``XYDataManager`` for the OpenML abalone dataset (id 183).

    The feature types come from the dtypes of the data-frame version of
    the dataset (``category`` dtype -> ``Categorical``, everything else ->
    ``Numerical``). The targets are label-encoded and the data is split
    with ``train_test_split(random_state=1)``.
    """
    # https://www.openml.org/d/183
    frame_bunch = sklearn.datasets.fetch_openml(data_id=183, as_frame=True)
    feat_type = []
    for column_dtype in frame_bunch['data'].dtypes:
        if column_dtype.name == 'category':
            feat_type.append('Categorical')
        else:
            feat_type.append('Numerical')

    X, y = sklearn.datasets.fetch_openml(data_id=183, return_X_y=True)
    y = preprocessing.LabelEncoder().fit_transform(y)
    X_train, X_test, y_train, y_test = \
        sklearn.model_selection.train_test_split(X, y, random_state=1)

    return XYDataManager(
        X_train, y_train,
        X_test, y_test,
        MULTICLASS_CLASSIFICATION,
        feat_type,
        'abalone',
    )
def fit(self, X, y,
        task=MULTICLASS_CLASSIFICATION,
        metric='acc_metric',
        feat_type=None,
        dataset_name=None):
    """Validate ``feat_type``, build an ``XYDataManager`` and call ``_fit``.

    Without a ``dataset_name`` the MD5 hex digest of ``X.data`` is used as
    the name. A metric given as a string is looked up in
    ``STRING_TO_METRIC``.

    Raises
    ------
    ValueError
        If ``feat_type`` has a different length than ``X`` has columns,
        contains non-string entries, or contains anything other than
        ``categorical`` / ``numerical`` (case-insensitive).
    """
    if dataset_name is None:
        dataset_name = hashlib.md5(X.data).hexdigest()

    self._backend.save_start_time(self._seed)
    self._stopwatch = StopWatch()
    self._dataset_name = dataset_name
    self._stopwatch.start_task(self._dataset_name)

    self._logger = self._get_logger(dataset_name)

    if isinstance(metric, str):
        metric = STRING_TO_METRIC[metric]

    if feat_type is not None:
        if len(feat_type) != X.shape[1]:
            raise ValueError('Array feat_type does not have same number of '
                             'variables as X has features. %d vs %d.' %
                             (len(feat_type), X.shape[1]))
        if not all(isinstance(kind, str) for kind in feat_type):
            raise ValueError('Array feat_type must only contain strings.')
        for kind in feat_type:
            if kind.lower() not in ['categorical', 'numerical']:
                raise ValueError('Only `Categorical` and `Numerical` are '
                                 'valid feature types, you passed `%s`' % kind)

    self._data_memory_limit = None
    loaded_data_manager = XYDataManager(
        X, y,
        task=task,
        metric=metric,
        feat_type=feat_type,
        dataset_name=dataset_name,
        encode_labels=False,
    )
    return self._fit(loaded_data_manager)
def get_meta_learning_configs(X, y, task_type, dataset_name='default',
                              metric='accuracy', num_cfgs=5):
    """Return the metalearning suggestions of an ``AutoMLSMBO`` for a dataset.

    When ``X`` or ``y`` is ``None`` the data is loaded via
    ``load_data(dataset_name)``. An ``AutoMLSMBO`` is set up with a
    shared-mode backend and the pipeline configuration space; its
    ``reset_data_manager`` is replaced by a no-op and the data manager is
    attached directly before the suggestions are requested.
    """
    if X is None or y is None:
        X, y, _ = load_data(dataset_name)

    backend = create(
        temporary_directory=None,
        output_directory=None,
        delete_tmp_folder_after_terminate=False,
        delete_output_folder_after_terminate=False,
        shared_mode=True,
    )
    data_manager = XYDataManager(X, y, None, None, task_type, None,
                                 dataset_name)

    configuration_space = pipeline.get_configuration_space(
        data_manager.info,
        include_estimators=None,
        exclude_estimators=None,
        include_preprocessors=None,
        exclude_preprocessors=None,
    )

    watcher = StopWatch()
    watcher.start_task(os.path.basename(data_manager.name))

    def reset_data_manager(max_mem=None):
        # Deliberately a no-op: the data manager is assigned directly below.
        pass

    smbo = AutoMLSMBO(
        config_space=configuration_space,
        dataset_name=dataset_name,
        backend=backend,
        total_walltime_limit=1e5,
        func_eval_time_limit=1e5,
        memory_limit=1e5,
        metric=metric,
        watcher=watcher,
        metadata_directory='components/meta_learning/meta_resource',
        num_metalearning_cfgs=num_cfgs,
    )
    smbo.reset_data_manager = reset_data_manager
    smbo.task = task_type
    smbo.datamanager = data_manager

    return smbo.get_metalearning_suggestions()
def fit(self, X, y,
        task=MULTICLASS_CLASSIFICATION,
        metric='acc_metric',
        feat_type=None,
        dataset_name=None):
    """Set up timing and logging, validate ``feat_type`` and call ``_fit``.

    Without a ``dataset_name`` the MD5 hex digest of ``X.data`` is used as
    the name. The logger is named ``AutoML(<seed>):<dataset_name>`` and
    writes to ``<logger name>.log`` inside ``self._tmp_dir``. A metric
    given as a string is looked up in ``STRING_TO_METRIC``.

    Raises
    ------
    ValueError
        If ``feat_type`` has a different length than ``X`` has columns or
        contains entries that are not bools.
    """
    if dataset_name is None:
        dataset_name = hashlib.md5(X.data).hexdigest()

    self._backend.save_start_time(self._seed)
    self._stopwatch = StopWatch()
    self._dataset_name = dataset_name
    self._stopwatch.start_task(self._dataset_name)

    logger_name = 'AutoML(%d):%s' % (self._seed, dataset_name)
    log_file = os.path.join(self._tmp_dir, '%s.log' % str(logger_name))
    setup_logger(log_file)
    self._logger = get_logger(logger_name)

    if isinstance(metric, str):
        metric = STRING_TO_METRIC[metric]

    if feat_type is not None:
        if len(feat_type) != X.shape[1]:
            raise ValueError('Array feat_type does not have same number of '
                             'variables as X has features. %d vs %d.' %
                             (len(feat_type), X.shape[1]))
        if not all(isinstance(flag, bool) for flag in feat_type):
            raise ValueError('Array feat_type must only contain bools.')

    loaded_data_manager = XYDataManager(
        X, y,
        task=task,
        metric=metric,
        feat_type=feat_type,
        dataset_name=dataset_name,
        encode_labels=False,
    )
    return self._fit(loaded_data_manager)
def test_do_dummy_prediction(backend, dask_client, datasets):
    """Run the dummy prediction and check that its output lands in the tmp dir.

    The data manager is saved through the ``AutoML`` backend and reloaded
    through the fixture backend before the dummy prediction is triggered.
    """
    dataset, task_id = datasets

    X_train, Y_train, X_test, Y_test = putil.get_dataset(dataset)
    datamanager = XYDataManager(
        X_train, Y_train,
        X_test, Y_test,
        task=task_id,
        dataset_name=dataset,
        feat_type=None,
    )

    automl = autosklearn.automl.AutoML(
        backend, 20, 5,
        initial_configurations_via_metalearning=25,
        metric=accuracy,
        dask_client=dask_client,
    )
    setup_logger(backend.temporary_directory)
    automl._logger = get_logger('test_do_dummy_predictions')

    automl._backend.save_datamanager(datamanager)
    reloaded = backend.load_datamanager()

    # The data manager must survive the save/load round trip
    assert reloaded.info['task'] == datamanager.info['task']
    automl._do_dummy_prediction(reloaded, 1)

    # The dummy predictions must not be written to the current working
    # directory, only to the temporary directory.
    cwd_output = os.path.join(os.getcwd(), '.auto-sklearn')
    assert not os.path.exists(cwd_output)
    predictions_path = os.path.join(
        backend.temporary_directory, '.auto-sklearn', 'runs', '1_1_0.0',
        'predictions_ensemble_1_1_0.0.npy')
    assert os.path.exists(predictions_path)

    del automl
def set_dataset(self, X, y, X_test=None, y_test=None, feat_type=None):
    """Store the dataset in an ``XYDataManager`` and calculate its metafeatures.

    The inputs are run through ``MetaFeatures.perform_input_checks``. The
    task is looked up in ``self._task_mapping`` by the target type of
    ``y`` and falls back to ``ask_const.REGRESSION`` when the mapping has
    no entry. The dataset name is a hash of ``str(X)``.

    Sets ``self.target_type``, ``self.dataset_name``, ``self.data_manager``
    and ``self.meta_features``.

    Raises
    ------
    ValueError
        If the train and test targets have a different number of
        dimensions, or if ``feat_type`` has the wrong length, contains
        non-string entries, or contains anything other than
        ``categorical`` / ``numerical`` (case-insensitive).
    """
    utils = MetaFeatures()
    X, y = utils.perform_input_checks(X, y)
    if X_test is not None:
        X_test, y_test = utils.perform_input_checks(X_test, y_test)
        if len(y.shape) != len(y_test.shape):
            raise ValueError('Target value shapes do not match: %s vs %s'
                             % (y.shape, y_test.shape))

    if feat_type is not None:
        if len(feat_type) != X.shape[1]:
            raise ValueError('Array feat_type does not have same number of '
                             'variables as X has features. %d vs %d.' %
                             (len(feat_type), X.shape[1]))
        if not all(isinstance(f, str) for f in feat_type):
            raise ValueError('Array feat_type must only contain strings.')
        for ft in feat_type:
            if ft.lower() not in ['categorical', 'numerical']:
                raise ValueError('Only `Categorical` and `Numerical` are '
                                 'valid feature types, you passed `%s`' % ft)

    self.target_type = type_of_target(y)
    task = self._task_mapping.get(self.target_type)
    # Identity comparison: `== None` can be hijacked by a custom __eq__.
    if task is None:
        task = ask_const.REGRESSION

    self.dataset_name = FMLHash().hashValAndReturnString(str(X))
    self.data_manager = XYDataManager(X, y, X_test, y_test, task,
                                      feat_type, self.dataset_name)
    self.meta_features = utils.calculate_metafeatures(
        self.data_manager, self.dataset_name)
def test_fail_if_dummy_prediction_fails(self, ta_run_mock):
    """Check how ``_do_dummy_prediction`` reacts to the status of ``ta.run()``.

    A successful run must not raise; every other status must raise a
    ``ValueError`` that names the status and the additional output.
    """
    backend_api = self._create_backend('test_fail_if_dummy_prediction_fails')

    X_train, Y_train, X_test, Y_test = putil.get_dataset('iris')
    datamanager = XYDataManager(
        X_train, Y_train,
        X_test, Y_test,
        task=2,
        feat_type=['Numerical'] * X_train.shape[1],
        dataset_name='iris',
    )

    time_for_this_task = 30
    per_run_time = 10
    automl = autosklearn.automl.AutoML(
        backend_api,
        time_for_this_task,
        per_run_time,
        initial_configurations_via_metalearning=25,
        metric=accuracy,
    )
    setup_logger()
    automl._logger = get_logger('test_fail_if_dummy_prediction_fails')
    automl._backend._make_internals_directory()
    automl._backend.save_datamanager(datamanager)

    # First of all, check that ta.run() is actually called.
    ta_run_mock.return_value = StatusType.SUCCESS, None, None, "test"
    automl._do_dummy_prediction(datamanager, 1)
    ta_run_mock.assert_called_once_with(1, cutoff=time_for_this_task)

    # Case 1. Check that function raises no error when statustype == success.
    # ta.run() returns status, cost, runtime, and additional info.
    ta_run_mock.return_value = StatusType.SUCCESS, None, None, "test"
    raised = False
    try:
        automl._do_dummy_prediction(datamanager, 1)
    except ValueError:
        raised = True
    self.assertFalse(raised, 'Exception raised')

    # Case 2. Check that if statustype returned by ta.run() != success,
    # the function raises error.
    failure_cases = [
        (StatusType.CRASHED,
         'Dummy prediction failed with run state StatusType.CRASHED '
         'and additional output: test.'),
        (StatusType.ABORT,
         'Dummy prediction failed with run state StatusType.ABORT '
         'and additional output: test.'),
        (StatusType.TIMEOUT,
         'Dummy prediction failed with run state StatusType.TIMEOUT '
         'and additional output: test.'),
        (StatusType.MEMOUT,
         'Dummy prediction failed with run state StatusType.MEMOUT '
         'and additional output: test.'),
        (StatusType.CAPPED,
         'Dummy prediction failed with run state StatusType.CAPPED '
         'and additional output: test.'),
    ]
    for status, expected_message in failure_cases:
        ta_run_mock.return_value = status, None, None, "test"
        self.assertRaisesRegex(
            ValueError,
            expected_message,
            automl._do_dummy_prediction,
            datamanager, 1,
        )

    self._tearDown(backend_api.temporary_directory)
    self._tearDown(backend_api.output_directory)
def fit(
    self,
    X: np.ndarray,
    y: np.ndarray,
    task: int,
    metric: Scorer,
    X_test: Optional[np.ndarray] = None,
    y_test: Optional[np.ndarray] = None,
    feat_type: Optional[List[str]] = None,
    dataset_name: Optional[str] = None,
    only_return_configuration_space: Optional[bool] = False,
    load_models: bool = True,
    incremental_learning: bool = False,
):
    """Validate the inputs, wrap them in an ``XYDataManager`` and call ``_fit``.

    Parameters
    ----------
    X, y
        Training data, handed to ``XYDataManager``.
    task
        Task identifier, handed to ``XYDataManager``.
    metric
        Must be a ``Scorer`` instance.
    X_test, y_test
        Optional test data, handed to ``XYDataManager``.
    feat_type
        Optional list with one string per column of ``X``; each entry
        must be ``categorical`` or ``numerical`` (case-insensitive).
    dataset_name
        Optional name. In shared mode the name of an already stored data
        manager (if loadable) takes precedence; without any name a hash
        of ``X`` is used.
    only_return_configuration_space, load_models, incremental_learning
        Forwarded unchanged to ``_fit``.

    Raises
    ------
    ValueError
        If ``metric`` is missing or not a ``Scorer``, or if ``feat_type``
        has the wrong length, contains non-strings, or contains an
        unsupported value.

    Returns
    -------
    Whatever ``self._fit(...)`` returns.
    """
    if self._shared_mode:
        # If this fails, it's likely that this is the first call to get
        # the data manager
        try:
            D = self._backend.load_datamanager()
            dataset_name = D.name
        except IOError:
            pass

    if dataset_name is None:
        dataset_name = hash_array_or_matrix(X)

    self._backend.save_start_time(self._seed)
    self._stopwatch = StopWatch()
    self._dataset_name = dataset_name
    self._stopwatch.start_task(self._dataset_name)

    self._logger = self._get_logger(dataset_name)

    if metric is None:
        raise ValueError('No metric given.')
    if not isinstance(metric, Scorer):
        raise ValueError('Metric must be instance of '
                         'autosklearn.metrics.Scorer.')

    if feat_type is not None:
        if len(feat_type) != X.shape[1]:
            raise ValueError('Array feat_type does not have same number of '
                             'variables as X has features. %d vs %d.' %
                             (len(feat_type), X.shape[1]))
        # The entries must be strings (not bools), hence List[str] above.
        if not all(isinstance(f, str) for f in feat_type):
            raise ValueError('Array feat_type must only contain strings.')
        for ft in feat_type:
            if ft.lower() not in ['categorical', 'numerical']:
                raise ValueError('Only `Categorical` and `Numerical` are '
                                 'valid feature types, you passed `%s`' % ft)

    self._data_memory_limit = None
    loaded_data_manager = XYDataManager(
        X, y,
        X_test=X_test,
        y_test=y_test,
        task=task,
        feat_type=feat_type,
        dataset_name=dataset_name,
    )

    return self._fit(
        datamanager=loaded_data_manager,
        metric=metric,
        load_models=load_models,
        only_return_configuration_space=only_return_configuration_space,
        incremental_learning=incremental_learning,
    )
def test_fail_if_dummy_prediction_fails(ta_run_mock, backend, dask_client):
    """Check how ``_do_dummy_prediction`` reacts to the status of ``ta.run()``.

    A successful run must not raise; every other status must raise a
    ``ValueError``. A crash with exit code -6 must additionally hint at
    memory limits that are too tight.
    """
    X_train, Y_train, X_test, Y_test = putil.get_dataset('iris')
    datamanager = XYDataManager(
        X_train, Y_train,
        X_test, Y_test,
        task=2,
        feat_type=['Numerical'] * X_train.shape[1],
        dataset_name='iris',
    )

    time_for_this_task = 30
    per_run_time = 10
    automl = autosklearn.automl.AutoML(
        backend,
        time_for_this_task,
        per_run_time,
        initial_configurations_via_metalearning=25,
        metric=accuracy,
        dask_client=dask_client,
    )
    automl._backend._make_internals_directory()
    automl._backend.save_datamanager(datamanager)

    # Replace the logger by a mock so that no real logging is needed
    automl._logger_port = 9020
    automl._logger = unittest.mock.Mock()
    automl._logger.info.return_value = None

    # First of all, check that ta.run() is actually called.
    ta_run_mock.return_value = StatusType.SUCCESS, None, None, {}
    automl._do_dummy_prediction(datamanager, 1)
    ta_run_mock.assert_called_once_with(1, cutoff=time_for_this_task)

    # Case 1. Check that function raises no error when statustype == success.
    # ta.run() returns status, cost, runtime, and additional info.
    ta_run_mock.return_value = StatusType.SUCCESS, None, None, {}
    raised = False
    try:
        automl._do_dummy_prediction(datamanager, 1)
    except ValueError:
        raised = True
    assert not raised, 'Exception raised'

    # Case 2. Check that if statustype returned by ta.run() != success,
    # the function raises error.
    # Each case: (status, additional info, expected message pattern)
    failure_cases = [
        (StatusType.CRASHED, {},
         'Dummy prediction failed with run state StatusType.CRASHED '
         'and additional output: {}.'),
        (StatusType.ABORT, {},
         'Dummy prediction failed with run state StatusType.ABORT '
         'and additional output: {}.'),
        (StatusType.TIMEOUT, {},
         'Dummy prediction failed with run state StatusType.TIMEOUT '
         'and additional output: {}.'),
        (StatusType.MEMOUT, {},
         'Dummy prediction failed with run state StatusType.MEMOUT '
         'and additional output: {}.'),
        (StatusType.CAPPED, {},
         'Dummy prediction failed with run state StatusType.CAPPED '
         'and additional output: {}.'),
        (StatusType.CRASHED, {'exitcode': -6},
         'The error suggests that the provided memory limits '
         'were too tight.'),
    ]
    for status, additional_info, expected_message in failure_cases:
        ta_run_mock.return_value = status, None, None, additional_info
        with pytest.raises(ValueError, match=expected_message):
            automl._do_dummy_prediction(datamanager, 1)
def fit(
    self,
    X: np.ndarray,
    y: np.ndarray,
    task: int,
    X_test: Optional[np.ndarray] = None,
    y_test: Optional[np.ndarray] = None,
    feat_type: Optional[List[str]] = None,
    dataset_name: Optional[str] = None,
    only_return_configuration_space: Optional[bool] = False,
    load_models: bool = True,
):
    """Run the whole pipeline: dummy prediction, ensemble builder and SMAC.

    Steps, in order: reset previously learnt state, validate the metric
    and ``feat_type``, build and store an ``XYDataManager``, run the dummy
    prediction, create the configuration space, start the ensemble
    process, run SMAC, wait for the ensemble process and optionally load
    the models.

    Parameters
    ----------
    X, y
        Training data, handed to ``XYDataManager``.
    task
        Task identifier, handed to ``XYDataManager``.
    X_test, y_test
        Optional test data, handed to ``XYDataManager``.
    feat_type
        Optional list with one string per column of ``X``; each entry
        must be ``categorical`` or ``numerical`` (case-insensitive).
    dataset_name
        Optional name. In shared mode the name of an already stored data
        manager (if loadable) takes precedence; without any name
        ``hash_array_or_matrix(X)`` is used.
    only_return_configuration_space
        If true, return the configuration space right after it has been
        created, before the ensemble builder and SMAC are started.
    load_models
        If true, call ``self._load_models()`` at the end.

    Raises
    ------
    ValueError
        If ``self._metric`` is missing or not a ``Scorer``, if
        ``feat_type`` is invalid, or if no time is left for the ensemble
        builder while ``self._ensemble_size`` is greater than zero.

    Returns
    -------
    ``self.configuration_space`` when ``only_return_configuration_space``
    is true, otherwise ``self``.
    """
    # Reset learnt stuff
    self.models_ = None
    self.cv_models_ = None
    self.ensemble_ = None

    # The metric must exist as of this point
    # It can be provided in the constructor, or automatically
    # defined in the estimator fit call
    if self._metric is None:
        raise ValueError('No metric given.')
    if not isinstance(self._metric, Scorer):
        raise ValueError('Metric must be instance of '
                         'autosklearn.metrics.Scorer.')

    if self._shared_mode:
        # If this fails, it's likely that this is the first call to get
        # the data manager
        try:
            D = self._backend.load_datamanager()
            dataset_name = D.name
        except IOError:
            pass

    if dataset_name is None:
        dataset_name = hash_array_or_matrix(X)

    self._backend.save_start_time(self._seed)
    self._stopwatch = StopWatch()
    self._dataset_name = dataset_name
    # The task named after the dataset measures the total elapsed time;
    # all "time left" computations below are relative to it.
    self._stopwatch.start_task(self._dataset_name)

    self._logger = self._get_logger(dataset_name)

    # feat_type needs one 'categorical'/'numerical' string per column of X
    if feat_type is not None and len(feat_type) != X.shape[1]:
        raise ValueError('Array feat_type does not have same number of '
                         'variables as X has features. %d vs %d.' %
                         (len(feat_type), X.shape[1]))
    if feat_type is not None and not all(
            [isinstance(f, str) for f in feat_type]):
        raise ValueError('Array feat_type must only contain strings.')
    if feat_type is not None:
        for ft in feat_type:
            if ft.lower() not in ['categorical', 'numerical']:
                raise ValueError('Only `Categorical` and `Numerical` are '
                                 'valid feature types, you passed `%s`' % ft)

    datamanager = XYDataManager(
        X, y,
        X_test=X_test,
        y_test=y_test,
        task=task,
        feat_type=feat_type,
        dataset_name=dataset_name,
    )

    self._backend._make_internals_directory()
    # A failure to create the model directories is only tolerated in
    # shared mode; otherwise the error is re-raised.
    try:
        os.makedirs(self._backend.get_model_dir())
    except (OSError, FileExistsError):
        if not self._shared_mode:
            raise
    try:
        os.makedirs(self._backend.get_cv_model_dir())
    except (OSError, FileExistsError):
        if not self._shared_mode:
            raise

    self._task = datamanager.info['task']
    self._label_num = datamanager.info['label_num']

    # == Pickle the data manager to speed up loading
    self._backend.save_datamanager(datamanager)

    time_for_load_data = self._stopwatch.wall_elapsed(self._dataset_name)

    if self._debug_mode:
        self._print_load_time(
            self._dataset_name,
            self._time_for_task,
            time_for_load_data,
            self._logger)

    # == Perform dummy predictions
    num_run = 1
    # if self._resampling_strategy in ['holdout', 'holdout-iterative-fit']:
    num_run = self._do_dummy_prediction(datamanager, num_run)

    # = Create a searchspace
    # Do this before One Hot Encoding to make sure that it creates a
    # search space for a dense classifier even if one hot encoding would
    # make it sparse (tradeoff; if one hot encoding would make it sparse,
    # densifier and truncatedSVD would probably lead to a MemoryError,
    # like this we can't use some of the preprocessing methods in case
    # the data became sparse)
    self.configuration_space, configspace_path = self._create_search_space(
        self._backend.temporary_directory,
        self._backend,
        datamanager,
        include_estimators=self._include_estimators,
        exclude_estimators=self._exclude_estimators,
        include_preprocessors=self._include_preprocessors,
        exclude_preprocessors=self._exclude_preprocessors)
    if only_return_configuration_space:
        return self.configuration_space

    # == RUN ensemble builder
    # Do this before calculating the meta-features to make sure that the
    # dummy predictions are actually included in the ensemble even if
    # calculating the meta-features takes very long
    ensemble_task_name = 'runEnsemble'
    self._stopwatch.start_task(ensemble_task_name)
    elapsed_time = self._stopwatch.wall_elapsed(self._dataset_name)
    time_left_for_ensembles = max(0, self._time_for_task - elapsed_time)
    if time_left_for_ensembles <= 0:
        self._proc_ensemble = None
        # Fit only raises error when ensemble_size is not zero but
        # time_left_for_ensembles is zero.
        if self._ensemble_size > 0:
            raise ValueError("Not starting ensemble builder because there "
                             "is no time left. Try increasing the value "
                             "of time_left_for_this_task.")
    elif self._ensemble_size <= 0:
        self._proc_ensemble = None
        self._logger.info('Not starting ensemble builder because '
                          'ensemble size is <= 0.')
    else:
        self._logger.info(
            'Start Ensemble with %5.2fsec time left' % time_left_for_ensembles)
        self._proc_ensemble = self._get_ensemble_process(
            time_left_for_ensembles)
        self._proc_ensemble.start()

    self._stopwatch.stop_task(ensemble_task_name)

    # kill the datamanager as it will be re-loaded anyways from sub processes
    try:
        del self._datamanager
    except Exception:
        pass

    # => RUN SMAC
    smac_task_name = 'runSMAC'
    self._stopwatch.start_task(smac_task_name)
    elapsed_time = self._stopwatch.wall_elapsed(self._dataset_name)
    time_left_for_smac = max(0, self._time_for_task - elapsed_time)

    if self._logger:
        self._logger.info(
            'Start SMAC with %5.2fsec time left' % time_left_for_smac)
    if time_left_for_smac <= 0:
        self._logger.warning("Not starting SMAC because there is no time "
                             "left.")
        _proc_smac = None
        self._budget_type = None
    else:
        # A single run may never take longer than the time SMAC has in total
        if self._per_run_time_limit is None or \
                self._per_run_time_limit > time_left_for_smac:
            self._logger.warning(
                'Time limit for a single run is higher than total time '
                'limit. Capping the limit for a single run to the total '
                'time given to SMAC (%f)' % time_left_for_smac)
            per_run_time_limit = time_left_for_smac
        else:
            per_run_time_limit = self._per_run_time_limit

        # Make sure that at least 2 models are created for the ensemble process
        num_models = time_left_for_smac // per_run_time_limit
        if num_models < 2:
            per_run_time_limit = time_left_for_smac // 2
            self._logger.warning(
                "Capping the per_run_time_limit to {} to have "
                "time for a least 2 models in each process.".format(
                    per_run_time_limit))

        _proc_smac = AutoMLSMBO(
            config_space=self.configuration_space,
            dataset_name=self._dataset_name,
            backend=self._backend,
            total_walltime_limit=time_left_for_smac,
            func_eval_time_limit=per_run_time_limit,
            memory_limit=self._ml_memory_limit,
            data_memory_limit=self._data_memory_limit,
            watcher=self._stopwatch,
            start_num_run=num_run,
            num_metalearning_cfgs=self._initial_configurations_via_metalearning,
            config_file=configspace_path,
            seed=self._seed,
            metadata_directory=self._metadata_directory,
            metric=self._metric,
            resampling_strategy=self._resampling_strategy,
            resampling_strategy_args=self._resampling_strategy_arguments,
            shared_mode=self._shared_mode,
            include_estimators=self._include_estimators,
            exclude_estimators=self._exclude_estimators,
            include_preprocessors=self._include_preprocessors,
            exclude_preprocessors=self._exclude_preprocessors,
            disable_file_output=self._disable_evaluator_output,
            get_smac_object_callback=self._get_smac_object_callback,
            smac_scenario_args=self._smac_scenario_args,
        )

        try:
            self.runhistory_, self.trajectory_, self._budget_type = \
                _proc_smac.run_smbo()
            trajectory_filename = os.path.join(
                self._backend.get_smac_output_directory_for_run(
                    self._seed),
                'trajectory.json')
            # The third field of every trajectory entry is converted with
            # get_dictionary() before the entry is handed to json.dump.
            saveable_trajectory = \
                [list(entry[:2]) + [entry[2].get_dictionary()] + list(entry[3:])
                 for entry in self.trajectory_]
            with open(trajectory_filename, 'w') as fh:
                json.dump(saveable_trajectory, fh)
        except Exception as e:
            # Log the failure, then let it propagate to the caller
            self._logger.exception(e)
            raise

    # Wait until the ensemble process is finished to avoid shutting down
    # while the ensemble builder tries to access the data
    if self._proc_ensemble is not None and self._ensemble_size > 0:
        self._proc_ensemble.join()

    self._proc_ensemble = None
    if load_models:
        self._load_models()

    return self