def fit(self, X, y,
        task=MULTICLASS_CLASSIFICATION,
        metric=None,
        feat_type=None,
        dataset_name=None):
    if not self._shared_mode:
        self._backend.context.delete_directories()
    else:
        # If this fails, it's likely that this is the first call to get
        # the data manager
        try:
            D = self._backend.load_datamanager()
            dataset_name = D.name
        except IOError:
            pass
    self._backend.context.create_directories()

    if dataset_name is None:
        dataset_name = hash_array_or_matrix(X)

    self._backend.save_start_time(self._seed)
    self._stopwatch = StopWatch()
    self._dataset_name = dataset_name
    self._stopwatch.start_task(self._dataset_name)

    self._logger = self._get_logger(dataset_name)

    if metric is None:
        raise ValueError('No metric given.')
    if not isinstance(metric, Scorer):
        raise ValueError('Metric must be instance of '
                         'autosklearn.metrics.Scorer.')

    if feat_type is not None and len(feat_type) != X.shape[1]:
        raise ValueError('Array feat_type does not have same number of '
                         'variables as X has features. %d vs %d.' %
                         (len(feat_type), X.shape[1]))
    if feat_type is not None and not all(
            [isinstance(f, str) for f in feat_type]):
        raise ValueError('Array feat_type must only contain strings.')
    if feat_type is not None:
        for ft in feat_type:
            if ft.lower() not in ['categorical', 'numerical']:
                raise ValueError('Only `Categorical` and `Numerical` are '
                                 'valid feature types, you passed `%s`' % ft)

    self._data_memory_limit = None
    loaded_data_manager = XYDataManager(X, y,
                                        task=task,
                                        feat_type=feat_type,
                                        dataset_name=dataset_name)

    return self._fit(loaded_data_manager, metric)
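A minimal usage sketch for the Scorer-based fit above. It assumes an already-constructed AutoML instance named automl and uses the built-in autosklearn.metrics.accuracy scorer; the data and names are toy placeholders, not part of the original code.

# Illustrative only: `automl` is assumed to be an already-constructed AutoML
# instance; the data, feature types, and dataset name are placeholders.
import numpy as np
from autosklearn.constants import MULTICLASS_CLASSIFICATION
from autosklearn.metrics import accuracy  # a predefined Scorer instance

X = np.random.rand(100, 4)
y = np.random.randint(0, 3, size=100)

automl.fit(
    X, y,
    task=MULTICLASS_CLASSIFICATION,
    metric=accuracy,                       # must be an autosklearn.metrics.Scorer
    feat_type=['Numerical'] * X.shape[1],  # one entry per column
    dataset_name='toy_multiclass',
)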
def fit(self, X, y, task=MULTICLASS_CLASSIFICATION, metric='acc_metric', feat_type=None, dataset_name=None): if not self._shared_mode: self._backend.context.delete_directories() else: # If this fails, it's likely that this is the first call to get # the data manager try: D = self._backend.load_datamanager() dataset_name = D.name except IOError: pass self._backend.context.create_directories() if dataset_name is None: dataset_name = hash_numpy_array(X) self._backend.save_start_time(self._seed) self._stopwatch = StopWatch() self._dataset_name = dataset_name self._stopwatch.start_task(self._dataset_name) self._logger = self._get_logger(dataset_name) if isinstance(metric, str): metric = STRING_TO_METRIC[metric] if feat_type is not None and len(feat_type) != X.shape[1]: raise ValueError('Array feat_type does not have same number of ' 'variables as X has features. %d vs %d.' % (len(feat_type), X.shape[1])) if feat_type is not None and not all( [isinstance(f, str) for f in feat_type]): raise ValueError('Array feat_type must only contain strings.') if feat_type is not None: for ft in feat_type: if ft.lower() not in ['categorical', 'numerical']: raise ValueError('Only `Categorical` and `Numerical` are ' 'valid feature types, you passed `%s`' % ft) self._data_memory_limit = None loaded_data_manager = XYDataManager(X, y, task=task, metric=metric, feat_type=feat_type, dataset_name=dataset_name, encode_labels=False) return self._fit(loaded_data_manager)
def fit_on_datamanager(self, datamanager, metric):
    self._stopwatch = StopWatch()
    self._backend.save_start_time(self._seed)

    name = os.path.basename(datamanager.name)
    self._stopwatch.start_task(name)
    self._start_task(self._stopwatch, name)
    self._dataset_name = name

    self._logger = self._get_logger(name)
    self._fit(datamanager, metric)
def run(self):
    if self._parser is None:
        raise ValueError('You must invoke run() only via start_automl()')
    self._backend.save_start_time(self._seed)
    self._stopwatch = StopWatch()
    datamanager = get_data_manager(namespace=self._parser)
    self._stopwatch.start_task(datamanager.name)

    self._logger = self._get_logger(datamanager.name)

    self._datamanager = datamanager
    self._dataset_name = datamanager.name
    self._fit(self._datamanager)
def start_automl(self, parser):
    self._backend.save_start_time(self._seed)
    self._stopwatch = StopWatch()
    datamanager = get_data_manager(namespace=parser)
    self._stopwatch.start_task(datamanager.name)

    logger_name = 'AutoML(%d):%s' % (self._seed, datamanager.name)
    setup_logger(os.path.join(self._tmp_dir, '%s.log' % str(logger_name)))
    self._logger = get_logger(logger_name)

    self._datamanager = datamanager
    self._dataset_name = datamanager.name
    self.start()
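The logger wiring above follows a simple convention: the logger name encodes the seed and dataset name, and the log file lands in the temporary directory under that name. A small sketch of that convention in isolation; the helper name is illustrative, and setup_logger/get_logger are assumed to behave as they are used in this section.

import os

def build_automl_logger(tmp_dir, seed, dataset_name):
    # Sketch only: mirrors the naming convention used in start_automl above.
    # Assumes setup_logger(path) configures file logging and get_logger(name)
    # returns the named logger, as used elsewhere in this section.
    logger_name = 'AutoML(%d):%s' % (seed, dataset_name)
    setup_logger(os.path.join(tmp_dir, '%s.log' % logger_name))
    return get_logger(logger_name)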
def test_stopwatch_overhead(self):
    # CPU overhead
    # Note: time.clock() was removed in Python 3.8; time.process_time()
    # is the closest replacement for measuring CPU time here.
    start = time.process_time()
    watch = StopWatch()
    for i in range(1, 100000):
        watch.start_task('task_%d' % i)
        watch.stop_task('task_%d' % i)
    stop = time.process_time()
    dur = stop - start
    cpu_overhead = dur - watch.cpu_sum()
    self.assertLess(cpu_overhead, 1.5)

    # Wall overhead
    start = time.time()
    watch = StopWatch()
    for i in range(1, 100000):
        watch.start_task('task_%d' % i)
        watch.stop_task('task_%d' % i)
    stop = time.time()
    dur = stop - start
    wall_overhead = dur - watch.wall_sum()
    self.assertLess(wall_overhead, 2)
    self.assertLess(cpu_overhead, 2 * wall_overhead)
def fit(self, X, y, task=MULTICLASS_CLASSIFICATION, metric='acc_metric', feat_type=None, dataset_name=None): if dataset_name is None: m = hashlib.md5() m.update(X.data) dataset_name = m.hexdigest() self._backend.save_start_time(self._seed) self._stopwatch = StopWatch() self._dataset_name = dataset_name self._stopwatch.start_task(self._dataset_name) self._logger = self._get_logger(dataset_name) if isinstance(metric, str): metric = STRING_TO_METRIC[metric] if feat_type is not None and len(feat_type) != X.shape[1]: raise ValueError('Array feat_type does not have same number of ' 'variables as X has features. %d vs %d.' % (len(feat_type), X.shape[1])) if feat_type is not None and not all( [isinstance(f, str) for f in feat_type]): raise ValueError('Array feat_type must only contain strings.') if feat_type is not None: for ft in feat_type: if ft.lower() not in ['categorical', 'numerical']: raise ValueError('Only `Categorical` and `Numerical` are ' 'valid feature types, you passed `%s`' % ft) self._data_memory_limit = None loaded_data_manager = XYDataManager(X, y, task=task, metric=metric, feat_type=feat_type, dataset_name=dataset_name, encode_labels=False) return self._fit(loaded_data_manager)
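The md5-based fallback for dataset_name above can be read as a small standalone helper. A sketch, assuming X is a C-contiguous NumPy array whose raw buffer (X.data) is hashed exactly as in the method above; the helper name is hypothetical.

import hashlib
import numpy as np

def default_dataset_name(X: np.ndarray) -> str:
    # Hypothetical helper: hash the array's raw memory buffer to get a stable,
    # content-based name, mirroring the fallback used in fit() above.
    m = hashlib.md5()
    m.update(X.data)
    return m.hexdigest()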
def get_meta_learning_configs(X, y, task_type, dataset_name='default',
                              metric='accuracy', num_cfgs=5):
    if X is None or y is None:
        X, y, _ = load_data(dataset_name)
    backend = create(temporary_directory=None,
                     output_directory=None,
                     delete_tmp_folder_after_terminate=False,
                     delete_output_folder_after_terminate=False,
                     shared_mode=True)
    dm = XYDataManager(X, y, None, None, task_type, None, dataset_name)

    configuration_space = pipeline.get_configuration_space(
        dm.info,
        include_estimators=None,
        exclude_estimators=None,
        include_preprocessors=None,
        exclude_preprocessors=None)

    watcher = StopWatch()
    name = os.path.basename(dm.name)
    watcher.start_task(name)

    def reset_data_manager(max_mem=None):
        pass

    automlsmbo = AutoMLSMBO(
        config_space=configuration_space,
        dataset_name=dataset_name,
        backend=backend,
        total_walltime_limit=1e5,
        func_eval_time_limit=1e5,
        memory_limit=1e5,
        metric=metric,
        watcher=watcher,
        metadata_directory='components/meta_learning/meta_resource',
        num_metalearning_cfgs=num_cfgs)
    automlsmbo.reset_data_manager = reset_data_manager
    automlsmbo.task = task_type
    automlsmbo.datamanager = dm
    configs = automlsmbo.get_metalearning_suggestions()
    return configs
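A hedged call example for get_meta_learning_configs above; the data and task constant are placeholders, and it assumes the metadata directory referenced inside the function exists on disk.

import numpy as np
from autosklearn.constants import MULTICLASS_CLASSIFICATION

# Illustrative inputs; in practice X and y come from your own dataset.
X = np.random.rand(200, 10)
y = np.random.randint(0, 3, size=200)

configs = get_meta_learning_configs(
    X, y,
    task_type=MULTICLASS_CLASSIFICATION,
    dataset_name='toy_dataset',
    metric='accuracy',
    num_cfgs=5,
)
for cfg in configs:
    print(cfg)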
def test_stopwatch_overhead(self):
    # Wall Overhead
    start = time.time()
    cpu_start = time.process_time()
    watch = StopWatch()
    for i in range(1, 1000):
        watch.start_task('task_%d' % i)
        watch.stop_task('task_%d' % i)
    cpu_stop = time.process_time()
    stop = time.time()
    dur = stop - start
    cpu_dur = cpu_stop - cpu_start
    cpu_overhead = cpu_dur - watch.cpu_sum()
    wall_overhead = dur - watch.wall_sum()

    self.assertLess(cpu_overhead, 1)
    self.assertLess(wall_overhead, 1)
    self.assertLess(watch.cpu_sum(), 2 * watch.wall_sum())
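The two tests above exercise the StopWatch API: per-task start/stop plus aggregated wall and CPU sums. A minimal usage sketch using only methods that appear in this section; the import path is an assumption, and exact aggregation semantics follow the StopWatch implementation.

import time
from autosklearn.util import StopWatch  # import path assumed; adjust to where StopWatch lives

watch = StopWatch()
watch.start_task('outer')            # long-running task, left open
watch.start_task('step_1')
time.sleep(0.05)                     # stand-in for real work
watch.stop_task('step_1')

print(watch.get_wall_dur('step_1'))     # duration of a finished task
print(watch.wall_elapsed('outer'))      # elapsed wall time of a still-running task
print(watch.wall_sum(), watch.cpu_sum())  # aggregated wall / CPU time over tasks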
def fit_automl_dataset(self, dataset, metric):
    self._stopwatch = StopWatch()
    self._backend.save_start_time(self._seed)

    name = os.path.basename(dataset)
    self._stopwatch.start_task(name)
    self._start_task(self._stopwatch, name)
    self._dataset_name = name

    self._logger = self._get_logger(name)
    self._logger.debug('======== Reading and converting data ==========')
    # Encoding the labels will be done after the metafeature calculation!
    self._data_memory_limit = float(self._ml_memory_limit) / 3
    loaded_data_manager = CompetitionDataManager(
        dataset, max_memory_in_mb=self._data_memory_limit)
    loaded_data_manager_str = str(loaded_data_manager).split('\n')
    for part in loaded_data_manager_str:
        self._logger.debug(part)

    return self._fit(loaded_data_manager, metric)
def fit(self, X, y, task=MULTICLASS_CLASSIFICATION, metric='acc_metric', feat_type=None, dataset_name=None): if dataset_name is None: m = hashlib.md5() m.update(X.data) dataset_name = m.hexdigest() self._backend.save_start_time(self._seed) self._stopwatch = StopWatch() self._dataset_name = dataset_name self._stopwatch.start_task(self._dataset_name) logger_name = 'AutoML(%d):%s' % (self._seed, dataset_name) setup_logger(os.path.join(self._tmp_dir, '%s.log' % str(logger_name))) self._logger = get_logger(logger_name) if isinstance(metric, str): metric = STRING_TO_METRIC[metric] if feat_type is not None and len(feat_type) != X.shape[1]: raise ValueError('Array feat_type does not have same number of ' 'variables as X has features. %d vs %d.' % (len(feat_type), X.shape[1])) if feat_type is not None and not all( [isinstance(f, bool) for f in feat_type]): raise ValueError('Array feat_type must only contain bools.') loaded_data_manager = XYDataManager(X, y, task=task, metric=metric, feat_type=feat_type, dataset_name=dataset_name, encode_labels=False) return self._fit(loaded_data_manager)
def fit_automl_dataset(self, dataset): self._stopwatch = StopWatch() self._backend.save_start_time(self._seed) name = os.path.basename(dataset) self._stopwatch.start_task(name) self._start_task(self._stopwatch, name) self._dataset_name = name logger_name = 'AutoML(%d):%s' % (self._seed, name) setup_logger(os.path.join(self._tmp_dir, '%s.log' % str(logger_name))) self._logger = get_logger(logger_name) self._logger.debug('======== Reading and converting data ==========') # Encoding the labels will be done after the metafeature calculation! loaded_data_manager = CompetitionDataManager( dataset, encode_labels=False, max_memory_in_mb=float(self._ml_memory_limit) / 3) loaded_data_manager_str = str(loaded_data_manager).split('\n') for part in loaded_data_manager_str: self._logger.debug(part) return self._fit(loaded_data_manager)
def __init__( self, backend, time_left_for_this_task, per_run_time_limit, initial_configurations_via_metalearning=25, ensemble_size=1, ensemble_nbest=1, seed=1, ml_memory_limit=3072, metadata_directory=None, keep_models=True, debug_mode=False, include_estimators=None, exclude_estimators=None, include_preprocessors=None, exclude_preprocessors=None, resampling_strategy='holdout-iterative-fit', resampling_strategy_arguments=None, shared_mode=False, precision=32, disable_evaluator_output=False, get_smac_object_callback=None, smac_scenario_args=None, ): super(AutoML, self).__init__() self._backend = backend #self._tmp_dir = tmp_dir #self._output_dir = output_dir self._time_for_task = time_left_for_this_task self._per_run_time_limit = per_run_time_limit self._initial_configurations_via_metalearning = \ initial_configurations_via_metalearning self._ensemble_size = ensemble_size self._ensemble_nbest = ensemble_nbest self._seed = seed self._ml_memory_limit = ml_memory_limit self._data_memory_limit = None self._metadata_directory = metadata_directory self._keep_models = keep_models self._include_estimators = include_estimators self._exclude_estimators = exclude_estimators self._include_preprocessors = include_preprocessors self._exclude_preprocessors = exclude_preprocessors self._resampling_strategy = resampling_strategy self._resampling_strategy_arguments = resampling_strategy_arguments \ if resampling_strategy_arguments is not None else {} self._shared_mode = shared_mode self.precision = precision self._disable_evaluator_output = disable_evaluator_output self._get_smac_object_callback = get_smac_object_callback self._smac_scenario_args = smac_scenario_args self._datamanager = None self._dataset_name = None self._stopwatch = StopWatch() self._logger = None self._task = None self._metric = None self._label_num = None self._parser = None self.models_ = None self.ensemble_ = None self._can_predict = False self._debug_mode = debug_mode if not isinstance(self._time_for_task, int): raise ValueError("time_left_for_this_task not of type integer, " "but %s" % str(type(self._time_for_task))) if not isinstance(self._per_run_time_limit, int): raise ValueError("per_run_time_limit not of type integer, but %s" % str(type(self._per_run_time_limit)))
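For orientation, a hedged construction example matching the newer __init__ signature above. The backend object is assumed to come from the project's backend factory (not shown here), and all limits are illustrative values rather than canonical defaults.

# Sketch only: `backend` is assumed to be created elsewhere; values are illustrative.
automl = AutoML(
    backend=backend,
    time_left_for_this_task=3600,   # seconds for the whole search
    per_run_time_limit=360,         # seconds per single model fit
    initial_configurations_via_metalearning=25,
    ensemble_size=50,
    ensemble_nbest=50,
    seed=1,
    ml_memory_limit=3072,           # MB per model fit
    resampling_strategy='holdout-iterative-fit',
)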
def main(self): watch = StopWatch() watch.start_task('ensemble_builder') used_time = 0 time_iter = 0 index_run = 0 num_iteration = 0 current_num_models = 0 last_hash = None current_hash = None backend = Backend(self.output_dir, self.autosklearn_tmp_dir) dir_ensemble = os.path.join(self.autosklearn_tmp_dir, '.auto-sklearn', 'predictions_ensemble') dir_valid = os.path.join(self.autosklearn_tmp_dir, '.auto-sklearn', 'predictions_valid') dir_test = os.path.join(self.autosklearn_tmp_dir, '.auto-sklearn', 'predictions_test') paths_ = [dir_ensemble, dir_valid, dir_test] dir_ensemble_list_mtimes = [] self.logger.debug( 'Starting main loop with %f seconds and %d iterations ' 'left.' % (self.limit - used_time, num_iteration)) while used_time < self.limit or (self.max_iterations > 0 and self.max_iterations >= num_iteration): num_iteration += 1 self.logger.debug('Time left: %f', self.limit - used_time) self.logger.debug('Time last ensemble building: %f', time_iter) # Reload the ensemble targets every iteration, important, because cv may # update the ensemble targets in the cause of running auto-sklearn # TODO update cv in order to not need this any more! targets_ensemble = backend.load_targets_ensemble() # Load the predictions from the models exists = [os.path.isdir(dir_) for dir_ in paths_] if not exists[0]: # all(exists): self.logger.debug('Prediction directory %s does not exist!' % dir_ensemble) time.sleep(2) used_time = watch.wall_elapsed('ensemble_builder') continue if self.shared_mode is False: dir_ensemble_list = sorted( glob.glob( os.path.join( dir_ensemble, 'predictions_ensemble_%s_*.npy' % self.seed))) if exists[1]: dir_valid_list = sorted( glob.glob( os.path.join( dir_valid, 'predictions_valid_%s_*.npy' % self.seed))) else: dir_valid_list = [] if exists[2]: dir_test_list = sorted( glob.glob( os.path.join( dir_test, 'predictions_test_%s_*.npy' % self.seed))) else: dir_test_list = [] else: dir_ensemble_list = sorted(os.listdir(dir_ensemble)) dir_valid_list = sorted( os.listdir(dir_valid)) if exists[1] else [] dir_test_list = sorted( os.listdir(dir_test)) if exists[2] else [] # Check the modification times because predictions can be updated # over time! 
old_dir_ensemble_list_mtimes = dir_ensemble_list_mtimes dir_ensemble_list_mtimes = [] for dir_ensemble_file in dir_ensemble_list: if dir_ensemble_file.endswith("/"): dir_ensemble_file = dir_ensemble_file[:-1] basename = os.path.basename(dir_ensemble_file) dir_ensemble_file = os.path.join(dir_ensemble, basename) mtime = os.path.getmtime(dir_ensemble_file) dir_ensemble_list_mtimes.append(mtime) if len(dir_ensemble_list) == 0: self.logger.debug('Directories are empty') time.sleep(2) used_time = watch.wall_elapsed('ensemble_builder') continue if len(dir_ensemble_list) <= current_num_models and \ old_dir_ensemble_list_mtimes == dir_ensemble_list_mtimes: self.logger.debug('Nothing has changed since the last time') time.sleep(2) used_time = watch.wall_elapsed('ensemble_builder') continue watch.start_task('index_run' + str(index_run)) watch.start_task('ensemble_iter_' + str(num_iteration)) # List of num_runs (which are in the filename) which will be included # later include_num_runs = [] backup_num_runs = [] model_and_automl_re = re.compile(r'_([0-9]*)_([0-9]*)\.npy$') if self.ensemble_nbest is not None: # Keeps track of the single scores of each model in our ensemble scores_nbest = [] # The indices of the model that are currently in our ensemble indices_nbest = [] # The names of the models model_names = [] model_names_to_scores = dict() model_idx = 0 for model_name in dir_ensemble_list: if model_name.endswith("/"): model_name = model_name[:-1] basename = os.path.basename(model_name) try: if self.precision is "16": predictions = np.load( os.path.join(dir_ensemble, basename)).astype(dtype=np.float16) elif self.precision is "32": predictions = np.load( os.path.join(dir_ensemble, basename)).astype(dtype=np.float32) elif self.precision is "64": predictions = np.load( os.path.join(dir_ensemble, basename)).astype(dtype=np.float64) else: predictions = np.load( os.path.join(dir_ensemble, basename)) score = calculate_score(targets_ensemble, predictions, self.task_type, self.metric, predictions.shape[1]) except Exception as e: self.logger.warning('Error loading %s: %s', basename, e) score = -1 model_names_to_scores[model_name] = score match = model_and_automl_re.search(model_name) automl_seed = int(match.group(1)) num_run = int(match.group(2)) if self.ensemble_nbest is not None: if score <= 0.001: self.logger.error('Model only predicts at random: ' + model_name + ' has score: ' + str(score)) backup_num_runs.append((automl_seed, num_run)) # If we have less models in our ensemble than ensemble_nbest add # the current model if it is better than random elif len(scores_nbest) < self.ensemble_nbest: scores_nbest.append(score) indices_nbest.append(model_idx) include_num_runs.append((automl_seed, num_run)) model_names.append(model_name) else: # Take the worst performing model in our ensemble so far idx = np.argmin(np.array([scores_nbest])) # If the current model is better than the worst model in # our ensemble replace it by the current model if scores_nbest[idx] < score: self.logger.debug( 'Worst model in our ensemble: %s with ' 'score %f will be replaced by model %s ' 'with score %f', model_names[idx], scores_nbest[idx], model_name, score) # Exclude the old model del scores_nbest[idx] scores_nbest.append(score) del include_num_runs[idx] del indices_nbest[idx] indices_nbest.append(model_idx) include_num_runs.append((automl_seed, num_run)) del model_names[idx] model_names.append(model_name) # Otherwise exclude the current model from the ensemble else: # include_num_runs.append(True) pass else: # Load all predictions 
that are better than random if score <= 0.001: # include_num_runs.append(True) self.logger.error('Model only predicts at random: ' + model_name + ' has score: ' + str(score)) backup_num_runs.append((automl_seed, num_run)) else: include_num_runs.append((automl_seed, num_run)) model_idx += 1 # If there is no model better than random guessing, we have to use # all models which do random guessing if len(include_num_runs) == 0: include_num_runs = backup_num_runs indices_to_model_names = dict() indices_to_run_num = dict() for i, model_name in enumerate(dir_ensemble_list): match = model_and_automl_re.search(model_name) automl_seed = int(match.group(1)) num_run = int(match.group(2)) if (automl_seed, num_run) in include_num_runs: num_indices = len(indices_to_model_names) indices_to_model_names[num_indices] = model_name indices_to_run_num[num_indices] = (automl_seed, num_run) try: all_predictions_train, all_predictions_valid, all_predictions_test =\ self.get_all_predictions(dir_ensemble, dir_ensemble_list, dir_valid, dir_valid_list, dir_test, dir_test_list, include_num_runs, model_and_automl_re, self.precision) except IOError: self.logger.error('Could not load the predictions.') continue if len(include_num_runs) == 0: self.logger.error('All models do just random guessing') time.sleep(2) continue else: ensemble = EnsembleSelection(ensemble_size=self.ensemble_size, task_type=self.task_type, metric=self.metric) try: ensemble.fit(all_predictions_train, targets_ensemble, include_num_runs) self.logger.info(ensemble) except ValueError as e: self.logger.error('Caught ValueError: ' + str(e)) used_time = watch.wall_elapsed('ensemble_builder') time.sleep(2) continue except IndexError as e: self.logger.error('Caught IndexError: ' + str(e)) used_time = watch.wall_elapsed('ensemble_builder') time.sleep(2) continue except Exception as e: self.logger.error('Caught error! %s', str(e)) used_time = watch.wall_elapsed('ensemble_builder') time.sleep(2) continue # Output the score self.logger.info('Training performance: %f' % ensemble.train_score_) self.logger.info( 'Building the ensemble took %f seconds' % watch.wall_elapsed('ensemble_iter_' + str(num_iteration))) # Set this variable here to avoid re-running the ensemble builder # every two seconds in case the ensemble did not change current_num_models = len(dir_ensemble_list) ensemble_predictions = ensemble.predict(all_predictions_train) if sys.version_info[0] == 2: ensemble_predictions.flags.writeable = False current_hash = hash(ensemble_predictions.data) else: current_hash = hash(ensemble_predictions.data.tobytes()) # Only output a new ensemble and new predictions if the output of the # ensemble would actually change! # TODO this is neither safe (collisions, tests only with the ensemble # prediction, but not the ensemble), implement a hash function for # each possible ensemble builder. if last_hash is not None: if current_hash == last_hash: self.logger.info('Ensemble output did not change.') time.sleep(2) continue else: last_hash = current_hash else: last_hash = current_hash # Save the ensemble for later use in the main auto-sklearn module! 
backend.save_ensemble(ensemble, index_run, self.seed) # Save predictions for valid and test data set if len(dir_valid_list) == len(dir_ensemble_list): all_predictions_valid = np.array(all_predictions_valid) ensemble_predictions_valid = ensemble.predict( all_predictions_valid) if self.task_type == BINARY_CLASSIFICATION: ensemble_predictions_valid = ensemble_predictions_valid[:, 1] if self.low_precision: if self.task_type in [ BINARY_CLASSIFICATION, MULTICLASS_CLASSIFICATION, MULTILABEL_CLASSIFICATION ]: ensemble_predictions_valid[ ensemble_predictions_valid < 1e-4] = 0. if self.metric in [BAC_METRIC, F1_METRIC]: bin_array = np.zeros(ensemble_predictions_valid.shape, dtype=np.int32) if (self.task_type != MULTICLASS_CLASSIFICATION) or ( ensemble_predictions_valid.shape[1] == 1): bin_array[ensemble_predictions_valid >= 0.5] = 1 else: sample_num = ensemble_predictions_valid.shape[0] for i in range(sample_num): j = np.argmax(ensemble_predictions_valid[i, :]) bin_array[i, j] = 1 ensemble_predictions_valid = bin_array if self.task_type in CLASSIFICATION_TASKS: if ensemble_predictions_valid.size < (20000 * 20): precision = 3 else: precision = 2 else: if ensemble_predictions_valid.size > 1000000: precision = 4 else: # File size maximally 2.1MB precision = 6 backend.save_predictions_as_txt(ensemble_predictions_valid, 'valid', index_run, prefix=self.dataset_name, precision=precision) else: self.logger.info( 'Could not find as many validation set predictions (%d)' 'as ensemble predictions (%d)!.', len(dir_valid_list), len(dir_ensemble_list)) del all_predictions_valid if len(dir_test_list) == len(dir_ensemble_list): all_predictions_test = np.array(all_predictions_test) ensemble_predictions_test = ensemble.predict( all_predictions_test) if self.task_type == BINARY_CLASSIFICATION: ensemble_predictions_test = ensemble_predictions_test[:, 1] if self.low_precision: if self.task_type in [ BINARY_CLASSIFICATION, MULTICLASS_CLASSIFICATION, MULTILABEL_CLASSIFICATION ]: ensemble_predictions_test[ ensemble_predictions_test < 1e-4] = 0. if self.metric in [BAC_METRIC, F1_METRIC]: bin_array = np.zeros(ensemble_predictions_test.shape, dtype=np.int32) if (self.task_type != MULTICLASS_CLASSIFICATION) or ( ensemble_predictions_test.shape[1] == 1): bin_array[ensemble_predictions_test >= 0.5] = 1 else: sample_num = ensemble_predictions_test.shape[0] for i in range(sample_num): j = np.argmax(ensemble_predictions_test[i, :]) bin_array[i, j] = 1 ensemble_predictions_test = bin_array if self.task_type in CLASSIFICATION_TASKS: if ensemble_predictions_test.size < (20000 * 20): precision = 3 else: precision = 2 else: if ensemble_predictions_test.size > 1000000: precision = 4 else: precision = 6 backend.save_predictions_as_txt(ensemble_predictions_test, 'test', index_run, prefix=self.dataset_name, precision=precision) else: self.logger.info( 'Could not find as many test set predictions (%d) as ' 'ensemble predictions (%d)!', len(dir_test_list), len(dir_ensemble_list)) del all_predictions_test current_num_models = len(dir_ensemble_list) watch.stop_task('index_run' + str(index_run)) time_iter = watch.get_wall_dur('index_run' + str(index_run)) used_time = watch.wall_elapsed('ensemble_builder') index_run += 1 return
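The nbest bookkeeping inside the loop above (keep at most ensemble_nbest models and replace the current worst whenever a better-scoring one appears) can be isolated into a small helper. A sketch of that selection logic only, with hypothetical names; it deliberately ignores the seed/num_run bookkeeping of the real builder.

import numpy as np

def update_nbest(scores_nbest, names_nbest, score, name, nbest):
    # Hypothetical helper: keep at most `nbest` (score, name) pairs, replacing
    # the current worst entry whenever a better-scoring model shows up.
    # Mirrors the selection logic of the ensemble builder above.
    if len(scores_nbest) < nbest:
        scores_nbest.append(score)
        names_nbest.append(name)
    else:
        idx = int(np.argmin(scores_nbest))
        if scores_nbest[idx] < score:
            del scores_nbest[idx]
            del names_nbest[idx]
            scores_nbest.append(score)
            names_nbest.append(name)
    return scores_nbest, names_nbest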
def __init__(self, tmp_dir, output_dir, time_left_for_this_task, per_run_time_limit, log_dir=None, initial_configurations_via_metalearning=25, ensemble_size=1, ensemble_nbest=1, seed=1, ml_memory_limit=3000, metadata_directory=None, queue=None, keep_models=True, debug_mode=False, include_estimators=None, include_preprocessors=None, resampling_strategy='holdout-iterative-fit', resampling_strategy_arguments=None, delete_tmp_folder_after_terminate=False, delete_output_folder_after_terminate=False, shared_mode=False, precision=32, max_iter_smac=None, acquisition_function='EI'): super(AutoML, self).__init__() self._tmp_dir = tmp_dir self._output_dir = output_dir self._time_for_task = time_left_for_this_task self._per_run_time_limit = per_run_time_limit self._log_dir = log_dir if log_dir is not None else self._tmp_dir self._initial_configurations_via_metalearning = \ initial_configurations_via_metalearning self._ensemble_size = ensemble_size self._ensemble_nbest = ensemble_nbest self._seed = seed self._ml_memory_limit = ml_memory_limit self._data_memory_limit = None self._metadata_directory = metadata_directory self._queue = queue self._keep_models = keep_models self._include_estimators = include_estimators self._include_preprocessors = include_preprocessors self._resampling_strategy = resampling_strategy self._resampling_strategy_arguments = resampling_strategy_arguments self._max_iter_smac = max_iter_smac self.delete_tmp_folder_after_terminate = \ delete_tmp_folder_after_terminate self.delete_output_folder_after_terminate = \ delete_output_folder_after_terminate self._shared_mode = shared_mode self.precision = precision self.acquisition_function = acquisition_function self._datamanager = None self._dataset_name = None self._stopwatch = StopWatch() self._logger = None self._task = None self._metric = None self._label_num = None self._parser = None self.models_ = None self.ensemble_ = None self._can_predict = False self._debug_mode = debug_mode if not isinstance(self._time_for_task, int): raise ValueError("time_left_for_this_task not of type integer, " "but %s" % str(type(self._time_for_task))) if not isinstance(self._per_run_time_limit, int): raise ValueError("per_run_time_limit not of type integer, but %s" % str(type(self._per_run_time_limit))) # After assignging and checking variables... self._backend = Backend(self._output_dir, self._tmp_dir)
def fit(
    self,
    X: np.ndarray,
    y: np.ndarray,
    task: int,
    metric: Scorer,
    X_test: Optional[np.ndarray] = None,
    y_test: Optional[np.ndarray] = None,
    feat_type: Optional[List[str]] = None,
    dataset_name: Optional[str] = None,
    only_return_configuration_space: Optional[bool] = False,
    load_models: bool = True,
    incremental_learning: bool = False,
):
    if self._shared_mode:
        # If this fails, it's likely that this is the first call to get
        # the data manager
        try:
            D = self._backend.load_datamanager()
            dataset_name = D.name
        except IOError:
            pass

    if dataset_name is None:
        dataset_name = hash_array_or_matrix(X)

    self._backend.save_start_time(self._seed)
    self._stopwatch = StopWatch()
    self._dataset_name = dataset_name
    self._stopwatch.start_task(self._dataset_name)

    self._logger = self._get_logger(dataset_name)

    if metric is None:
        raise ValueError('No metric given.')
    if not isinstance(metric, Scorer):
        raise ValueError('Metric must be instance of '
                         'autosklearn.metrics.Scorer.')

    if feat_type is not None and len(feat_type) != X.shape[1]:
        raise ValueError('Array feat_type does not have same number of '
                         'variables as X has features. %d vs %d.' %
                         (len(feat_type), X.shape[1]))
    if feat_type is not None and not all(
            [isinstance(f, str) for f in feat_type]):
        raise ValueError('Array feat_type must only contain strings.')
    if feat_type is not None:
        for ft in feat_type:
            if ft.lower() not in ['categorical', 'numerical']:
                raise ValueError('Only `Categorical` and `Numerical` are '
                                 'valid feature types, you passed `%s`' % ft)

    self._data_memory_limit = None
    loaded_data_manager = XYDataManager(
        X, y,
        X_test=X_test,
        y_test=y_test,
        task=task,
        feat_type=feat_type,
        dataset_name=dataset_name,
    )

    return self._fit(
        datamanager=loaded_data_manager,
        metric=metric,
        load_models=load_models,
        only_return_configuration_space=only_return_configuration_space,
        incremental_learning=incremental_learning)
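A usage sketch for the typed fit above, analogous to the earlier example but passing a held-out test split. Again, automl and the data are placeholders; the scorer and task constant are the library's predefined ones.

import numpy as np
from autosklearn.constants import BINARY_CLASSIFICATION
from autosklearn.metrics import roc_auc  # a predefined Scorer instance

# Illustrative only: `automl` is an already-constructed AutoML instance.
X = np.random.rand(200, 8)
y = np.random.randint(0, 2, size=200)
X_test = np.random.rand(50, 8)
y_test = np.random.randint(0, 2, size=50)

automl.fit(
    X, y,
    task=BINARY_CLASSIFICATION,
    metric=roc_auc,          # any autosklearn.metrics.Scorer
    X_test=X_test,
    y_test=y_test,
    dataset_name='toy_binary',
    load_models=True,
)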
def main(autosklearn_tmp_dir, dataset_name, task_type, metric, limit, output_dir, ensemble_size=None, ensemble_nbest=None, seed=1, shared_mode=False, max_iterations=-1, precision="32"): watch = StopWatch() watch.start_task('ensemble_builder') used_time = 0 time_iter = 0 index_run = 0 num_iteration = 0 current_num_models = 0 backend = Backend(output_dir, autosklearn_tmp_dir) dir_ensemble = os.path.join(autosklearn_tmp_dir, '.auto-sklearn', 'predictions_ensemble') dir_valid = os.path.join(autosklearn_tmp_dir, '.auto-sklearn', 'predictions_valid') dir_test = os.path.join(autosklearn_tmp_dir, '.auto-sklearn', 'predictions_test') paths_ = [dir_ensemble, dir_valid, dir_test] dir_ensemble_list_mtimes = [] while used_time < limit or (max_iterations > 0 and max_iterations >= num_iteration): num_iteration += 1 logger.debug('Time left: %f', limit - used_time) logger.debug('Time last iteration: %f', time_iter) # Reload the ensemble targets every iteration, important, because cv may # update the ensemble targets in the cause of running auto-sklearn # TODO update cv in order to not need this any more! targets_ensemble = backend.load_targets_ensemble() # Load the predictions from the models exists = [os.path.isdir(dir_) for dir_ in paths_] if not exists[0]: # all(exists): logger.debug('Prediction directory %s does not exist!' % dir_ensemble) time.sleep(2) used_time = watch.wall_elapsed('ensemble_builder') continue if shared_mode is False: dir_ensemble_list = sorted(glob.glob(os.path.join( dir_ensemble, 'predictions_ensemble_%s_*.npy' % seed))) if exists[1]: dir_valid_list = sorted(glob.glob(os.path.join( dir_valid, 'predictions_valid_%s_*.npy' % seed))) else: dir_valid_list = [] if exists[2]: dir_test_list = sorted(glob.glob(os.path.join( dir_test, 'predictions_test_%s_*.npy' % seed))) else: dir_test_list = [] else: dir_ensemble_list = sorted(os.listdir(dir_ensemble)) dir_valid_list = sorted(os.listdir(dir_valid)) if exists[1] else [] dir_test_list = sorted(os.listdir(dir_test)) if exists[2] else [] # Check the modification times because predictions can be updated # over time! 
old_dir_ensemble_list_mtimes = dir_ensemble_list_mtimes dir_ensemble_list_mtimes = [] for dir_ensemble_file in dir_ensemble_list: if dir_ensemble_file.endswith("/"): dir_ensemble_file = dir_ensemble_file[:-1] basename = os.path.basename(dir_ensemble_file) dir_ensemble_file = os.path.join(dir_ensemble, basename) mtime = os.path.getmtime(dir_ensemble_file) dir_ensemble_list_mtimes.append(mtime) if len(dir_ensemble_list) == 0: logger.debug('Directories are empty') time.sleep(2) used_time = watch.wall_elapsed('ensemble_builder') continue if len(dir_ensemble_list) <= current_num_models and \ old_dir_ensemble_list_mtimes == dir_ensemble_list_mtimes: logger.debug('Nothing has changed since the last time') time.sleep(2) used_time = watch.wall_elapsed('ensemble_builder') continue watch.start_task('ensemble_iter_' + str(index_run)) # List of num_runs (which are in the filename) which will be included # later include_num_runs = [] backup_num_runs = [] model_and_automl_re = re.compile(r'_([0-9]*)_([0-9]*)\.npy$') if ensemble_nbest is not None: # Keeps track of the single scores of each model in our ensemble scores_nbest = [] # The indices of the model that are currently in our ensemble indices_nbest = [] # The names of the models model_names = [] model_names_to_scores = dict() model_idx = 0 for model_name in dir_ensemble_list: if model_name.endswith("/"): model_name = model_name[:-1] basename = os.path.basename(model_name) if precision is "16": predictions = np.load(os.path.join(dir_ensemble, basename)).astype(dtype=np.float16) elif precision is "32": predictions = np.load(os.path.join(dir_ensemble, basename)).astype(dtype=np.float32) elif precision is "64": predictions = np.load(os.path.join(dir_ensemble, basename)).astype(dtype=np.float64) else: predictions = np.load(os.path.join(dir_ensemble, basename)) try: score = calculate_score(targets_ensemble, predictions, task_type, metric, predictions.shape[1]) except: score = -1 model_names_to_scores[model_name] = score match = model_and_automl_re.search(model_name) automl_seed = int(match.group(1)) num_run = int(match.group(2)) if ensemble_nbest is not None: if score <= 0.001: logger.error('Model only predicts at random: ' + model_name + ' has score: ' + str(score)) backup_num_runs.append((automl_seed, num_run)) # If we have less models in our ensemble than ensemble_nbest add # the current model if it is better than random elif len(scores_nbest) < ensemble_nbest: scores_nbest.append(score) indices_nbest.append(model_idx) include_num_runs.append((automl_seed, num_run)) model_names.append(model_name) else: # Take the worst performing model in our ensemble so far idx = np.argmin(np.array([scores_nbest])) # If the current model is better than the worst model in # our ensemble replace it by the current model if scores_nbest[idx] < score: logger.debug('Worst model in our ensemble: %s with ' 'score %f will be replaced by model %s ' 'with score %f', model_names[idx], scores_nbest[idx], model_name, score) # Exclude the old model del scores_nbest[idx] scores_nbest.append(score) del include_num_runs[idx] del indices_nbest[idx] indices_nbest.append(model_idx) include_num_runs.append((automl_seed, num_run)) del model_names[idx] model_names.append(model_name) # Otherwise exclude the current model from the ensemble else: # include_num_runs.append(True) pass else: # Load all predictions that are better than random if score <= 0.001: # include_num_runs.append(True) logger.error('Model only predicts at random: ' + model_name + ' has score: ' + str(score)) 
backup_num_runs.append((automl_seed, num_run)) else: include_num_runs.append((automl_seed, num_run)) model_idx += 1 # If there is no model better than random guessing, we have to use # all models which do random guessing if len(include_num_runs) == 0: include_num_runs = backup_num_runs indices_to_model_names = dict() indices_to_run_num = dict() for i, model_name in enumerate(dir_ensemble_list): match = model_and_automl_re.search(model_name) automl_seed = int(match.group(1)) num_run = int(match.group(2)) if (automl_seed, num_run) in include_num_runs: num_indices = len(indices_to_model_names) indices_to_model_names[num_indices] = model_name indices_to_run_num[num_indices] = (automl_seed, num_run) try: all_predictions_train, all_predictions_valid, all_predictions_test =\ get_all_predictions(dir_ensemble, dir_ensemble_list, dir_valid, dir_valid_list, dir_test, dir_test_list, include_num_runs, model_and_automl_re, precision) except IOError: logger.error('Could not load the predictions.') continue if len(include_num_runs) == 0: logger.error('All models do just random guessing') time.sleep(2) continue else: ensemble = EnsembleSelection(ensemble_size=ensemble_size, task_type=task_type, metric=metric) try: ensemble.fit(all_predictions_train, targets_ensemble, include_num_runs) logger.info(ensemble) except ValueError as e: logger.error('Caught ValueError: ' + str(e)) used_time = watch.wall_elapsed('ensemble_builder') time.sleep(2) continue except IndexError as e: logger.error('Caught IndexError: ' + str(e)) used_time = watch.wall_elapsed('ensemble_builder') time.sleep(2) continue except Exception as e: logger.error('Caught error! %s', e.message) used_time = watch.wall_elapsed('ensemble_builder') time.sleep(2) continue # Output the score logger.info('Training performance: %f' % ensemble.train_score_) # Save the ensemble for later use in the main auto-sklearn module! backend.save_ensemble(ensemble, index_run, seed) # Save predictions for valid and test data set if len(dir_valid_list) == len(dir_ensemble_list): all_predictions_valid = np.array(all_predictions_valid) ensemble_predictions_valid = ensemble.predict(all_predictions_valid) backend.save_predictions_as_txt(ensemble_predictions_valid, 'valid', index_run, prefix=dataset_name) else: logger.info('Could not find as many validation set predictions (%d)' 'as ensemble predictions (%d)!.', len(dir_valid_list), len(dir_ensemble_list)) del all_predictions_valid if len(dir_test_list) == len(dir_ensemble_list): all_predictions_test = np.array(all_predictions_test) ensemble_predictions_test = ensemble.predict(all_predictions_test) backend.save_predictions_as_txt(ensemble_predictions_test, 'test', index_run, prefix=dataset_name) else: logger.info('Could not find as many test set predictions (%d) as ' 'ensemble predictions (%d)!', len(dir_test_list), len(dir_ensemble_list)) del all_predictions_test current_num_models = len(dir_ensemble_list) watch.stop_task('ensemble_iter_' + str(index_run)) time_iter = watch.get_wall_dur('ensemble_iter_' + str(index_run)) used_time = watch.wall_elapsed('ensemble_builder') index_run += 1 return
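Both ensemble builders parse the automl seed and run number out of the prediction filenames with the same regular expression. A small sketch of that parsing in isolation; the example filename is illustrative.

import re

model_and_automl_re = re.compile(r'_([0-9]*)_([0-9]*)\.npy$')

filename = 'predictions_ensemble_1_0003.npy'  # illustrative name
match = model_and_automl_re.search(filename)
automl_seed = int(match.group(1))  # -> 1
num_run = int(match.group(2))      # -> 3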
def __init__(self, tmp_dir, output_dir, time_left_for_this_task, per_run_time_limit, log_dir=None, initial_configurations_via_metalearning=25, ensemble_size=1, ensemble_nbest=1, seed=1, ml_memory_limit=3000, metadata_directory=None, queue=None, keep_models=True, debug_mode=False, include_estimators=None, include_preprocessors=None, resampling_strategy='holdout-iterative-fit', resampling_strategy_arguments=None, delete_tmp_folder_after_terminate=False, delete_output_folder_after_terminate=False, shared_mode=False, precision=32): super(AutoML, self).__init__() self._tmp_dir = tmp_dir self._output_dir = output_dir self._time_for_task = time_left_for_this_task self._per_run_time_limit = per_run_time_limit self._log_dir = log_dir if log_dir is not None else self._tmp_dir self._initial_configurations_via_metalearning = \ initial_configurations_via_metalearning self._ensemble_size = ensemble_size self._ensemble_nbest = ensemble_nbest self._seed = seed self._ml_memory_limit = ml_memory_limit self._metadata_directory = metadata_directory self._queue = queue self._keep_models = keep_models self._include_estimators = include_estimators self._include_preprocessors = include_preprocessors self._resampling_strategy = resampling_strategy self._resampling_strategy_arguments = resampling_strategy_arguments self.delete_tmp_folder_after_terminate = \ delete_tmp_folder_after_terminate self.delete_output_folder_after_terminate = \ delete_output_folder_after_terminate self._shared_mode = shared_mode self.precision = precision self._datamanager = None self._dataset_name = None self._stopwatch = StopWatch() self._logger = None self._task = None self._metric = None self._label_num = None self.models_ = None self.ensemble_ = None self._can_predict = False self._debug_mode = debug_mode self._backend = Backend(self._output_dir, self._tmp_dir)