def execute(
        self,
        config,
        dataset=None,
        training_set=None,
        validation_set=None,
        test_set=None,
        training_set_metadata=None,
        data_format=None,
        experiment_name="hyperopt",
        model_name="run",
        # model_load_path=None,
        # model_resume_path=None,
        skip_save_training_description=False,
        skip_save_training_statistics=False,
        skip_save_model=False,
        skip_save_progress=False,
        skip_save_log=False,
        skip_save_processed_input=False,
        skip_save_unprocessed_output=False,
        skip_save_predictions=False,
        skip_save_eval_stats=False,
        output_directory="results",
        gpus=None,
        gpu_memory_limit=None,
        allow_parallel_threads=True,
        use_horovod=None,
        random_seed=default_random_seed,
        debug=False,
        **kwargs
):
    ctx = multiprocessing.get_context('spawn')

    if gpus is None:
        gpus = get_available_gpus_cuda_string()

    if gpus is not None:
        num_available_cpus = ctx.cpu_count()
        if self.num_workers > num_available_cpus:
            logger.warning(
                "WARNING: num_workers={}, num_available_cpus={}. "
                "To avoid bottlenecks, set num_workers to be less than "
                "or equal to the number of available CPUs.".format(
                    self.num_workers, num_available_cpus)
            )

        if isinstance(gpus, int):
            gpus = str(gpus)
        gpus = gpus.strip()
        gpu_ids = gpus.split(",")
        num_gpus = len(gpu_ids)

        available_gpu_memory_list = get_available_gpu_memory()
        gpu_ids_meta = {}

        if num_gpus < self.num_workers:
            # More workers than GPUs: each worker gets a fraction of one GPU.
            fraction = (num_gpus / self.num_workers) - self.epsilon
            for gpu_id in gpu_ids:
                available_gpu_memory = available_gpu_memory_list[int(gpu_id)]
                required_gpu_memory = fraction * available_gpu_memory

                if gpu_memory_limit is None:
                    logger.warning(
                        'WARNING: Setting gpu_memory_limit to {} '
                        'as the available gpus are {} '
                        'and the num of workers is {} '
                        'and the available gpu memory for gpu_id '
                        '{} is {}'.format(
                            required_gpu_memory, num_gpus,
                            self.num_workers, gpu_id, available_gpu_memory)
                    )
                    new_gpu_memory_limit = required_gpu_memory - (
                        self.TF_REQUIRED_MEMORY_PER_WORKER * self.num_workers
                    )
                else:
                    new_gpu_memory_limit = gpu_memory_limit
                    if new_gpu_memory_limit > available_gpu_memory:
                        logger.warning(
                            'WARNING: Setting gpu_memory_limit to available gpu '
                            'memory {} minus an epsilon as the value specified is '
                            'greater than available gpu memory.'.format(
                                available_gpu_memory)
                        )
                        new_gpu_memory_limit = \
                            available_gpu_memory - self.epsilon_memory

                    if required_gpu_memory < new_gpu_memory_limit:
                        if required_gpu_memory > 0.5 * available_gpu_memory:
                            if available_gpu_memory != new_gpu_memory_limit:
                                logger.warning(
                                    'WARNING: Setting gpu_memory_limit to available gpu '
                                    'memory {} minus an epsilon as the gpus would be '
                                    'underutilized for the parallel processes '
                                    'otherwise.'.format(available_gpu_memory)
                                )
                                new_gpu_memory_limit = \
                                    available_gpu_memory - self.epsilon_memory
                        else:
                            logger.warning(
                                'WARNING: Setting gpu_memory_limit to {} '
                                'as the available gpus are {} and the num of workers '
                                'are {} and the available gpu memory for gpu_id '
                                '{} is {}'.format(
                                    required_gpu_memory, num_gpus,
                                    self.num_workers, gpu_id, available_gpu_memory)
                            )
                            new_gpu_memory_limit = required_gpu_memory
                    else:
                        logger.warning(
                            'WARNING: gpu_memory_limit could be increased to {} '
                            'as the available gpus are {} and the num of workers '
                            'are {} and the available gpu memory for gpu_id '
                            '{} is {}'.format(
                                required_gpu_memory, num_gpus,
                                self.num_workers, gpu_id, available_gpu_memory)
                        )

                process_per_gpu = int(
                    available_gpu_memory / new_gpu_memory_limit)
                gpu_ids_meta[gpu_id] = {
                    "gpu_memory_limit": new_gpu_memory_limit,
                    "process_per_gpu": process_per_gpu
                }
        else:
            # At least one GPU per worker: no memory cap is required.
            for gpu_id in gpu_ids:
                gpu_ids_meta[gpu_id] = {
                    "gpu_memory_limit": gpu_memory_limit,
                    "process_per_gpu": 1
                }

        manager = ctx.Manager()
        self.queue = manager.Queue()

        # Each queue entry reserves one worker slot on a specific GPU.
        for gpu_id in gpu_ids:
            process_per_gpu = gpu_ids_meta[gpu_id]["process_per_gpu"]
            gpu_memory_limit = gpu_ids_meta[gpu_id]["gpu_memory_limit"]
            for _ in range(process_per_gpu):
                gpu_id_meta = {
                    "gpu_id": gpu_id,
                    "gpu_memory_limit": gpu_memory_limit
                }
                self.queue.put(gpu_id_meta)

    pool = ctx.Pool(self.num_workers, ParallelExecutor.init_worker)
    try:
        hyperopt_results = []
        trials = 0
        while not self.hyperopt_sampler.finished():
            sampled_parameters = self.hyperopt_sampler.sample_batch()

            hyperopt_parameters = []
            for i, parameters in enumerate(sampled_parameters):
                modified_config = substitute_parameters(
                    copy.deepcopy(config), parameters)

                trial_id = trials + i
                hyperopt_parameters.append(
                    dict(
                        parameters=parameters,
                        config=modified_config,
                        eval_split=self.split,
                        dataset=dataset,
                        training_set=training_set,
                        validation_set=validation_set,
                        test_set=test_set,
                        training_set_metadata=training_set_metadata,
                        data_format=data_format,
                        experiment_name=f'{experiment_name}_{trial_id}',
                        model_name=model_name,
                        # model_load_path=model_load_path,
                        # model_resume_path=model_resume_path,
                        skip_save_training_description=skip_save_training_description,
                        skip_save_training_statistics=skip_save_training_statistics,
                        skip_save_model=skip_save_model,
                        skip_save_progress=skip_save_progress,
                        skip_save_log=skip_save_log,
                        skip_save_processed_input=skip_save_processed_input,
                        skip_save_unprocessed_output=skip_save_unprocessed_output,
                        skip_save_predictions=skip_save_predictions,
                        skip_save_eval_stats=skip_save_eval_stats,
                        output_directory=output_directory,
                        gpus=gpus,
                        gpu_memory_limit=gpu_memory_limit,
                        allow_parallel_threads=allow_parallel_threads,
                        use_horovod=use_horovod,
                        random_seed=random_seed,
                        debug=debug,
                    )
                )
            trials += len(sampled_parameters)

            if gpus is not None:
                batch_results = pool.map(self._run_experiment_gpu,
                                         hyperopt_parameters)
            else:
                batch_results = pool.map(self._run_experiment,
                                         hyperopt_parameters)

            self.hyperopt_sampler.update_batch(
                (result["parameters"], result["metric_score"])
                for result in batch_results)

            hyperopt_results.extend(batch_results)
    finally:
        pool.close()
        pool.join()

    hyperopt_results = self.sort_hyperopt_results(hyperopt_results)
    return hyperopt_results
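# Illustrative sketch (not part of the executor above): how the GPU-sharing
# arithmetic in ParallelExecutor.execute plays out for a concrete case. The
# helper name `plan_gpu_sharing` and the numbers are hypothetical, and the
# epsilon / TF_REQUIRED_MEMORY_PER_WORKER corrections are omitted for brevity;
# only the fraction and process_per_gpu computation is mirrored.
def plan_gpu_sharing(num_gpus, num_workers, available_gpu_memory):
    """Return (gpu_memory_limit, process_per_gpu) for one GPU."""
    if num_gpus >= num_workers:
        # At least one GPU per worker: no memory cap, one process per GPU,
        # mirroring the gpu_memory_limit passthrough branch above.
        return None, 1
    fraction = num_gpus / num_workers                  # share of one GPU per worker
    gpu_memory_limit = fraction * available_gpu_memory
    process_per_gpu = int(available_gpu_memory / gpu_memory_limit)
    return gpu_memory_limit, process_per_gpu


# Example: 2 GPUs with 16 GB each shared by 8 workers -> each worker is capped
# at roughly 4 GB and each GPU hosts 4 worker processes.
# >>> plan_gpu_sharing(2, 8, 16 * 1024 ** 3)
# (4294967296.0, 4)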
def execute(
        self,
        config,
        dataset=None,
        training_set=None,
        validation_set=None,
        test_set=None,
        training_set_metadata=None,
        data_format=None,
        experiment_name="hyperopt",
        model_name="run",
        # model_load_path=None,
        # model_resume_path=None,
        skip_save_training_description=False,
        skip_save_training_statistics=False,
        skip_save_model=False,
        skip_save_progress=False,
        skip_save_log=False,
        skip_save_processed_input=True,
        skip_save_unprocessed_output=False,
        skip_save_predictions=False,
        skip_save_eval_stats=False,
        output_directory="results",
        gpus=None,
        gpu_memory_limit=None,
        allow_parallel_threads=True,
        callbacks=None,
        backend=None,
        random_seed=default_random_seed,
        debug=False,
        **kwargs,
) -> RayTuneResults:
    if isinstance(dataset, str) and not has_remote_protocol(dataset) \
            and not os.path.isabs(dataset):
        dataset = os.path.abspath(dataset)

    if isinstance(backend, str):
        backend = initialize_backend(backend)

    if gpus is not None:
        raise ValueError(
            "Parameter `gpus` is not supported when using Ray Tune. "
            "Configure GPU resources with Ray and set `gpu_resources_per_trial` "
            "in your hyperopt config.")

    if gpu_memory_limit is None and 0 < self._gpu_resources_per_trial_non_none < 1:
        # Enforce fractional GPU utilization
        gpu_memory_limit = self.gpu_resources_per_trial

    hyperopt_dict = dict(
        config=config,
        dataset=dataset,
        training_set=training_set,
        validation_set=validation_set,
        test_set=test_set,
        training_set_metadata=training_set_metadata,
        data_format=data_format,
        experiment_name=experiment_name,
        model_name=model_name,
        # model_load_path=model_load_path,
        # model_resume_path=model_resume_path,
        eval_split=self.split,
        skip_save_training_description=skip_save_training_description,
        skip_save_training_statistics=skip_save_training_statistics,
        skip_save_model=skip_save_model,
        skip_save_progress=skip_save_progress,
        skip_save_log=skip_save_log,
        skip_save_processed_input=skip_save_processed_input,
        skip_save_unprocessed_output=skip_save_unprocessed_output,
        skip_save_predictions=skip_save_predictions,
        skip_save_eval_stats=skip_save_eval_stats,
        output_directory=output_directory,
        gpus=gpus,
        gpu_memory_limit=gpu_memory_limit,
        allow_parallel_threads=allow_parallel_threads,
        callbacks=callbacks,
        backend=backend,
        random_seed=random_seed,
        debug=debug,
    )

    mode = "min" if self.goal != MAXIMIZE else "max"
    metric = "metric_score"
    if self.search_alg_dict is not None:
        if TYPE not in self.search_alg_dict:
            logger.warning(
                "WARNING: Set the `type` param for `search_alg` "
                "to use Tune's search algorithms.")
            search_alg = None
        else:
            search_alg_type = self.search_alg_dict.pop(TYPE)
            search_alg = tune.create_searcher(
                search_alg_type, metric=metric, mode=mode,
                **self.search_alg_dict)
    else:
        search_alg = None

    if self.max_concurrent_trials:
        assert (
            self.max_concurrent_trials > 0
        ), f"`max_concurrent_trials` must be greater than 0, got {self.max_concurrent_trials}"
        if isinstance(search_alg, BasicVariantGenerator) or search_alg is None:
            search_alg = BasicVariantGenerator(
                max_concurrent=self.max_concurrent_trials)
        elif isinstance(search_alg, ConcurrencyLimiter):
            raise ValueError(
                "You have specified `max_concurrent_trials`, but the search "
                "algorithm is already a `ConcurrencyLimiter`. To fix this, "
                "set `max_concurrent_trials=None`.")
        else:
            search_alg = ConcurrencyLimiter(
                search_alg, max_concurrent=self.max_concurrent_trials)

    resources_per_trial = {
        "cpu": self._cpu_resources_per_trial_non_none,
        "gpu": self._gpu_resources_per_trial_non_none,
    }

    def run_experiment_trial(config, local_hyperopt_dict, checkpoint_dir=None):
        return self._run_experiment(
            config, checkpoint_dir, local_hyperopt_dict, self.decode_ctx,
            _is_ray_backend(backend))

    tune_config = {}
    tune_callbacks = []
    for callback in callbacks or []:
        run_experiment_trial, tune_config = callback.prepare_ray_tune(
            run_experiment_trial,
            tune_config,
            tune_callbacks,
        )

    if _is_ray_backend(backend):
        # we can't set Trial actor's CPUs to 0 so we just go very low
        resources_per_trial = PlacementGroupFactory(
            [{"CPU": 0.001}]
            + ([{"CPU": 1, "GPU": 1}] * self._gpu_resources_per_trial_non_none)
            if self._gpu_resources_per_trial_non_none
            else [{"CPU": 0.001}]
            + [{"CPU": 1}] * self._cpu_resources_per_trial_non_none
        )

    if has_remote_protocol(output_directory):
        run_experiment_trial = tune.durable(run_experiment_trial)
        self.sync_config = tune.SyncConfig(
            sync_to_driver=False, upload_dir=output_directory)
        output_directory = None
    elif self.kubernetes_namespace:
        from ray.tune.integration.kubernetes import NamespacedKubernetesSyncer
        self.sync_config = tune.SyncConfig(
            sync_to_driver=NamespacedKubernetesSyncer(self.kubernetes_namespace))

    run_experiment_trial_params = tune.with_parameters(
        run_experiment_trial, local_hyperopt_dict=hyperopt_dict)
    register_trainable(
        f"trainable_func_f{hash_dict(config).decode('ascii')}",
        run_experiment_trial_params)

    analysis = tune.run(
        f"trainable_func_f{hash_dict(config).decode('ascii')}",
        config={
            **self.search_space,
            **tune_config,
        },
        scheduler=self.scheduler,
        search_alg=search_alg,
        num_samples=self.num_samples,
        keep_checkpoints_num=1,
        resources_per_trial=resources_per_trial,
        time_budget_s=self.time_budget_s,
        sync_config=self.sync_config,
        local_dir=output_directory,
        metric=metric,
        mode=mode,
        trial_name_creator=lambda trial: f"trial_{trial.trial_id}",
        trial_dirname_creator=lambda trial: f"trial_{trial.trial_id}",
        callbacks=tune_callbacks,
    )

    ordered_trials = analysis.results_df.sort_values(
        "metric_score", ascending=self.goal != MAXIMIZE)

    # Catch nans in edge case where the trial doesn't complete
    temp_ordered_trials = []
    for kwargs in ordered_trials.to_dict(orient="records"):
        for key in ["parameters", "training_stats", "eval_stats"]:
            if isinstance(kwargs[key], float):
                kwargs[key] = {}
        temp_ordered_trials.append(kwargs)

    ordered_trials = [
        TrialResults.from_dict(load_json_values(kwargs))
        for kwargs in temp_ordered_trials
    ]

    return RayTuneResults(
        ordered_trials=ordered_trials, experiment_analysis=analysis)
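# Illustrative sketch (not part of the executor above): the register/run pattern
# used by the Ray Tune executor, reduced to its essentials. Assumes a Ray 1.x-style
# Tune API (tune.report, register_trainable, tune.run with metric/mode), which is
# what the code above targets; the trainable name, the `fixed_settings` payload,
# and the toy objective are hypothetical stand-ins for hyperopt_dict and
# _run_experiment.
from ray import tune
from ray.tune import register_trainable


def _trial(config, fixed_settings, checkpoint_dir=None):
    # Stand-in for _run_experiment: evaluate the sampled `config` together with
    # the fixed settings injected via tune.with_parameters, then report the
    # score that Tune should optimize under the "metric_score" key.
    score = (config["lr"] - 0.01) ** 2 + fixed_settings["offset"]
    tune.report(metric_score=score)


register_trainable(
    "trainable_func_example",
    tune.with_parameters(_trial, fixed_settings={"offset": 0.0}),
)

analysis = tune.run(
    "trainable_func_example",
    config={"lr": tune.loguniform(1e-4, 1e-1)},
    num_samples=4,
    metric="metric_score",
    mode="min",
)
# analysis.results_df can then be sorted on "metric_score", as done above.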
def execute(
        self,
        config,
        dataset=None,
        training_set=None,
        validation_set=None,
        test_set=None,
        training_set_metadata=None,
        data_format=None,
        experiment_name="hyperopt",
        model_name="run",
        # model_load_path=None,
        # model_resume_path=None,
        skip_save_training_description=False,
        skip_save_training_statistics=False,
        skip_save_model=False,
        skip_save_progress=False,
        skip_save_log=False,
        skip_save_processed_input=True,
        skip_save_unprocessed_output=False,
        skip_save_predictions=False,
        skip_save_eval_stats=False,
        output_directory="results",
        gpus=None,
        gpu_memory_limit=None,
        allow_parallel_threads=True,
        use_horovod=None,
        random_seed=default_random_seed,
        debug=False,
        **kwargs
):
    if isinstance(dataset, str) and not os.path.isabs(dataset):
        dataset = os.path.abspath(dataset)

    hyperopt_dict = dict(
        config=config,
        dataset=dataset,
        training_set=training_set,
        validation_set=validation_set,
        test_set=test_set,
        training_set_metadata=training_set_metadata,
        data_format=data_format,
        experiment_name=experiment_name,
        model_name=model_name,
        # model_load_path=model_load_path,
        # model_resume_path=model_resume_path,
        eval_split=self.split,
        skip_save_training_description=skip_save_training_description,
        skip_save_training_statistics=skip_save_training_statistics,
        skip_save_model=skip_save_model,
        skip_save_progress=skip_save_progress,
        skip_save_log=skip_save_log,
        skip_save_processed_input=skip_save_processed_input,
        skip_save_unprocessed_output=skip_save_unprocessed_output,
        skip_save_predictions=skip_save_predictions,
        skip_save_eval_stats=skip_save_eval_stats,
        output_directory=output_directory,
        gpus=gpus,
        gpu_memory_limit=gpu_memory_limit,
        allow_parallel_threads=allow_parallel_threads,
        use_horovod=use_horovod,
        random_seed=random_seed,
        debug=debug,
    )

    if self.search_alg_dict is not None:
        if TYPE not in self.search_alg_dict:
            logger.warning(
                "WARNING: Set the `type` param for `search_alg` "
                "to use Tune's search algorithms.")
            search_alg = None
        else:
            mode = "min" if self.goal != MAXIMIZE else "max"
            search_alg_type = self.search_alg_dict.pop(TYPE)
            search_alg = tune.create_searcher(
                search_alg_type, metric="metric_score", mode=mode,
                **self.search_alg_dict)
    else:
        search_alg = None

    sync_config = None
    if self.kubernetes_namespace:
        from ray.tune.integration.kubernetes import NamespacedKubernetesSyncer
        sync_config = tune.SyncConfig(
            sync_to_driver=NamespacedKubernetesSyncer(self.kubernetes_namespace))

    analysis = tune.run(
        tune.with_parameters(self._run_experiment,
                             hyperopt_dict=hyperopt_dict),
        config=self.search_space,
        search_alg=search_alg,
        num_samples=self.num_samples,
        resources_per_trial={
            "cpu": self.cpu_resources_per_trial or 1,
            "gpu": self.gpu_resources_per_trial or 0,
        },
        queue_trials=True,
        sync_config=sync_config,
    )

    hyperopt_results = analysis.results_df.sort_values(
        "metric_score", ascending=self.goal != MAXIMIZE)

    return hyperopt_results.to_dict(orient="records")
def execute(
        self,
        config,
        dataset=None,
        training_set=None,
        validation_set=None,
        test_set=None,
        training_set_metadata=None,
        data_format=None,
        experiment_name="hyperopt",
        model_name="run",
        # model_load_path=None,
        # model_resume_path=None,
        skip_save_training_description=False,
        skip_save_training_statistics=False,
        skip_save_model=False,
        skip_save_progress=False,
        skip_save_log=False,
        skip_save_processed_input=True,
        skip_save_unprocessed_output=False,
        skip_save_predictions=False,
        skip_save_eval_stats=False,
        output_directory="results",
        gpus=None,
        gpu_memory_limit=None,
        allow_parallel_threads=True,
        backend=None,
        random_seed=default_random_seed,
        debug=False,
        **kwargs
):
    if isinstance(dataset, str) and not os.path.isabs(dataset):
        dataset = os.path.abspath(dataset)

    if gpus is not None:
        raise ValueError(
            "Parameter `gpus` is not supported when using Ray Tune. "
            "Configure GPU resources with Ray and set `gpu_resources_per_trial` "
            "in your hyperopt config.")

    hyperopt_dict = dict(
        config=config,
        dataset=dataset,
        training_set=training_set,
        validation_set=validation_set,
        test_set=test_set,
        training_set_metadata=training_set_metadata,
        data_format=data_format,
        experiment_name=experiment_name,
        model_name=model_name,
        # model_load_path=model_load_path,
        # model_resume_path=model_resume_path,
        eval_split=self.split,
        skip_save_training_description=skip_save_training_description,
        skip_save_training_statistics=skip_save_training_statistics,
        skip_save_model=skip_save_model,
        skip_save_progress=skip_save_progress,
        skip_save_log=skip_save_log,
        skip_save_processed_input=skip_save_processed_input,
        skip_save_unprocessed_output=skip_save_unprocessed_output,
        skip_save_predictions=skip_save_predictions,
        skip_save_eval_stats=skip_save_eval_stats,
        output_directory=output_directory,
        gpus=gpus,
        gpu_memory_limit=gpu_memory_limit,
        allow_parallel_threads=allow_parallel_threads,
        backend=backend,
        random_seed=random_seed,
        debug=debug,
    )

    mode = "min" if self.goal != MAXIMIZE else "max"
    metric = "metric_score"
    if self.search_alg_dict is not None:
        if TYPE not in self.search_alg_dict:
            logger.warning(
                "WARNING: Set the `type` param for `search_alg` "
                "to use Tune's search algorithms.")
            search_alg = None
        else:
            search_alg_type = self.search_alg_dict.pop(TYPE)
            search_alg = tune.create_searcher(
                search_alg_type, metric=metric, mode=mode,
                **self.search_alg_dict)
    else:
        search_alg = None

    sync_config = None
    if self.kubernetes_namespace:
        from ray.tune.integration.kubernetes import NamespacedKubernetesSyncer
        sync_config = tune.SyncConfig(
            sync_to_driver=NamespacedKubernetesSyncer(self.kubernetes_namespace))

    resources_per_trial = {
        "cpu": self.cpu_resources_per_trial or 1,
        "gpu": self.gpu_resources_per_trial or 0,
    }

    def run_experiment_trial(config, checkpoint_dir=None):
        return self._run_experiment(
            config, checkpoint_dir, hyperopt_dict, self.decode_ctx)

    register_trainable(
        f"trainable_func_f{hash_dict(config)}", run_experiment_trial)

    analysis = tune.run(
        f"trainable_func_f{hash_dict(config)}",
        config=self.search_space,
        scheduler=self.scheduler,
        search_alg=search_alg,
        num_samples=self.num_samples,
        resources_per_trial=resources_per_trial,
        queue_trials=True,
        sync_config=sync_config,
        local_dir=output_directory,
        metric=metric,
        mode=mode,
        trial_name_creator=lambda trial: f"trial_{trial.trial_id}",
        trial_dirname_creator=lambda trial: f"trial_{trial.trial_id}",
    )

    hyperopt_results = analysis.results_df.sort_values(
        "metric_score", ascending=self.goal != MAXIMIZE)

    return hyperopt_results.to_dict(orient="records")