def __getstate__(self):
    """Memento generator for Trial.

    Sets RUNNING trials to PENDING, and flushes the result logger.
    Note this can only occur if the trial holds a PERSISTENT checkpoint.
    """
    assert self.checkpoint.storage == Checkpoint.PERSISTENT, (
        "Checkpoint must not be in-memory.")
    state = self.__dict__.copy()
    state["resources"] = resources_to_json(self.resources)

    for key in self._nonjson_fields:
        state[key] = binary_to_hex(cloudpickle.dumps(state.get(key)))

    state["runner"] = None
    state["result_logger"] = None
    # Avoid waiting for events that will never occur on resume.
    state["resuming_from"] = None
    state["saving_to"] = None
    if self.result_logger:
        self.result_logger.flush(sync_down=False)
        state["__logger_started__"] = True
    else:
        state["__logger_started__"] = False

    return copy.deepcopy(state)

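# The inverse operation is not part of this excerpt. Below is a minimal,
# illustrative __setstate__ sketch (an assumption, not the original source)
# showing how the memento above would be restored: it undoes the
# hex/cloudpickle encoding and rebuilds the Resources object.
# `hex_to_binary` is assumed to be the inverse of `binary_to_hex`, and
# `init_logger()` is a hypothetical re-initialization hook.
def __setstate__(self, state):
    logger_started = state.pop("__logger_started__")
    state["resources"] = json_to_resources(state["resources"])
    # `_nonjson_fields` itself was stored as a plain list in the state dict.
    for key in state["_nonjson_fields"]:
        state[key] = cloudpickle.loads(hex_to_binary(state[key]))
    self.__dict__.update(state)
    if logger_started:
        self.init_logger()  # hypothetical: re-create the flushed logger
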
def get_experiments(run_args, arg_parser: ArgumentParser = None):
    if run_args.config_file:
        with open(run_args.config_file) as f:
            experiments = yaml.safe_load(f)
    else:
        experiments = {
            run_args.experiment_name: {  # i.e. log to ~/ray_results/default
                "run": run_args.run,
                "checkpoint_freq": run_args.checkpoint_freq,
                "keep_checkpoints_num": run_args.keep_checkpoints_num,
                "checkpoint_score_attr": run_args.checkpoint_score_attr,
                "local_dir": run_args.local_dir,
                "resources_per_trial": (
                    run_args.resources_per_trial and
                    resources_to_json(run_args.resources_per_trial)),
                "stop": run_args.stop,
                "config": dict(run_args.config, env=run_args.env),
                "restore": run_args.restore,
                "num_samples": run_args.num_samples,
                "upload_dir": run_args.upload_dir,
            }
        }

    if arg_parser is not None:
        for exp in experiments.values():
            if not exp.get("run"):
                arg_parser.error("the following arguments are required: --run")
            # Note: the config dict stores the environment under "env", so we
            # validate that key (not "envs") here.
            if not exp.get("env") and not exp.get("config", {}).get("env"):
                arg_parser.error("the following arguments are required: --env")

    return experiments

def run(args, parser):
    if args.config_file:
        with open(args.config_file) as f:
            experiments = yaml.safe_load(f)
    else:
        # Note: keep this in sync with tune/config_parser.py
        experiments = {
            args.experiment_name: {  # i.e. log to ~/ray_results/default
                "run": args.run,
                "checkpoint_freq": args.checkpoint_freq,
                "keep_checkpoints_num": args.keep_checkpoints_num,
                "checkpoint_score_attr": args.checkpoint_score_attr,
                "local_dir": args.local_dir,
                "resources_per_trial": (
                    args.resources_per_trial and
                    resources_to_json(args.resources_per_trial)),
                "stop": args.stop,
                "config": dict(args.config, env=args.env),
                "restore": args.restore,
                "num_samples": args.num_samples,
                "upload_dir": args.upload_dir,
            }
        }

    for exp in experiments.values():
        if not exp.get("run"):
            parser.error("the following arguments are required: --run")
        if not exp.get("env") and not exp.get("config", {}).get("env"):
            parser.error("the following arguments are required: --env")
        if args.eager:
            exp["config"]["eager"] = True
        if args.trace:
            if not exp["config"].get("eager"):
                raise ValueError("Must enable --eager to enable tracing.")
            exp["config"]["eager_tracing"] = True

    if args.ray_num_nodes:
        cluster = Cluster()
        for _ in range(args.ray_num_nodes):
            cluster.add_node(
                num_cpus=args.ray_num_cpus or 1,
                num_gpus=args.ray_num_gpus or 0,
                object_store_memory=args.ray_object_store_memory,
                memory=args.ray_memory,
                redis_max_memory=args.ray_redis_max_memory)
        ray.init(address=cluster.address)
    else:
        ray.init(
            address=args.ray_address,
            object_store_memory=args.ray_object_store_memory,
            memory=args.ray_memory,
            redis_max_memory=args.ray_redis_max_memory,
            num_cpus=args.ray_num_cpus,
            num_gpus=args.ray_num_gpus)

    run_experiments(
        experiments,
        scheduler=_make_scheduler(args),
        queue_trials=args.queue_trials,
        resume=args.resume)

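# Example invocations of the entry point above (illustrative; flag names are
# inferred from the argparse wiring visible in the function, and the YAML
# path is hypothetical):
#
#   python train.py --run PPO --env CartPole-v0 \
#       --stop '{"training_iteration": 100}'
#   python train.py --config-file experiments/cartpole-ppo.yaml
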
def __getstate__(self):
    """Memento generator for Trial.

    Sets RUNNING trials to PENDING.
    Note this can only occur if the trial holds a PERSISTENT checkpoint.
    """
    assert self.checkpoint.storage == Checkpoint.PERSISTENT, (
        "Checkpoint must not be in-memory.")
    state = self.__dict__.copy()
    state["resources"] = resources_to_json(self.resources)

    for key in self._nonjson_fields:
        state[key] = binary_to_hex(cloudpickle.dumps(state.get(key)))

    state["runner"] = None
    state["location"] = Location()
    # Avoid waiting for events that will never occur on resume.
    state["resuming_from"] = None
    state["saving_to"] = None

    return copy.deepcopy(state)

def __getstate__(self):
    """Memento generator for Trial.

    Sets RUNNING trials to PENDING, and flushes the result logger.
    Note this can only occur if the trial holds a DISK checkpoint.
    """
    assert self._checkpoint.storage == Checkpoint.DISK, (
        "Checkpoint must not be in-memory.")
    state = self.__dict__.copy()
    state["resources"] = resources_to_json(self.resources)

    for key in self._nonjson_fields:
        state[key] = binary_to_hex(cloudpickle.dumps(state.get(key)))

    state["runner"] = None
    state["result_logger"] = None
    if self.result_logger:
        self.result_logger.flush()
        state["__logger_started__"] = True
    else:
        state["__logger_started__"] = False

    return copy.deepcopy(state)

def __getstate__(self):
    """Memento generator for Trial.

    Sets RUNNING trials to PENDING.
    Note this can only occur if the trial holds a PERSISTENT checkpoint.
    """
    state = self.__dict__.copy()
    state["resources"] = resources_to_json(self.resources)

    for key in self._nonjson_fields:
        state[key] = binary_to_hex(cloudpickle.dumps(state.get(key)))

    state["runner"] = None
    state["location"] = Location()
    # Avoid waiting for events that will never occur on resume.
    state["restoring_from"] = None
    state["saving_to"] = None
    state["_state_json"] = None
    state["_state_valid"] = False
    state["_default_result_or_future"] = None

    return copy.deepcopy(state)

def run(args, parser):
    if args.config_file:
        with open(args.config_file) as f:
            experiments = yaml.safe_load(f)
    else:
        # Note: keep this in sync with tune/config_parser.py
        experiments = {
            args.experiment_name: {  # i.e. log to ~/ray_results/default
                "run": args.run,
                "checkpoint_freq": args.checkpoint_freq,
                "keep_checkpoints_num": args.keep_checkpoints_num,
                "checkpoint_score_attr": args.checkpoint_score_attr,
                "local_dir": args.local_dir,
                "resources_per_trial": (
                    args.resources_per_trial and
                    resources_to_json(args.resources_per_trial)),
                "stop": args.stop,
                "config": dict(args.config, env=args.env),
                "restore": args.restore,
                "num_samples": args.num_samples,
                "upload_dir": args.upload_dir,
            }
        }

    verbose = 1
    for exp in experiments.values():
        # Bazel makes it hard to find files specified in `args` (and `data`).
        # Look for them here.
        # NOTE: Some of our yaml files don't have a `config` section.
        if exp.get("config", {}).get("input") and \
                not os.path.exists(exp["config"]["input"]):
            # This script runs in the ray/rllib dir.
            rllib_dir = Path(__file__).parent
            input_file = rllib_dir.absolute().joinpath(exp["config"]["input"])
            exp["config"]["input"] = str(input_file)

        if not exp.get("run"):
            parser.error("the following arguments are required: --run")
        if not exp.get("env") and not exp.get("config", {}).get("env"):
            parser.error("the following arguments are required: --env")
        if args.eager:
            exp["config"]["eager"] = True
        if args.torch:
            exp["config"]["use_pytorch"] = True
        if args.v:
            exp["config"]["log_level"] = "INFO"
            verbose = 2
        if args.vv:
            exp["config"]["log_level"] = "DEBUG"
            verbose = 3
        if args.trace:
            if not exp["config"].get("eager"):
                raise ValueError("Must enable --eager to enable tracing.")
            exp["config"]["eager_tracing"] = True

        ### Add Custom Callbacks
        exp["config"]["callbacks"] = CustomCallbacks

    if args.ray_num_nodes:
        cluster = Cluster()
        for _ in range(args.ray_num_nodes):
            cluster.add_node(
                num_cpus=args.ray_num_cpus or 1,
                num_gpus=args.ray_num_gpus or 0,
                object_store_memory=args.ray_object_store_memory,
                memory=args.ray_memory,
                redis_max_memory=args.ray_redis_max_memory)
        ray.init(address=cluster.address)
    else:
        ray.init(
            address=args.ray_address,
            object_store_memory=args.ray_object_store_memory,
            memory=args.ray_memory,
            redis_max_memory=args.ray_redis_max_memory,
            num_cpus=args.ray_num_cpus,
            num_gpus=args.ray_num_gpus)

    # NOTE: attach custom loggers.
    for exp in experiments.values():
        exp["loggers"] = make_loggers(args)

    # Launch training.
    run_experiments(
        experiments,
        scheduler=_make_scheduler(args),
        queue_trials=args.queue_trials,
        resume=args.resume,
        verbose=verbose,
        concurrent=True)

def testSerialization(self):
    original = Resources(1, 0, 0, 1, custom_resources={"a": 1, "b": 2})
    jsoned = resources_to_json(original)
    new_resource = json_to_resources(jsoned)
    self.assertEqual(original, new_resource)

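# For reference, a plausible shape of the JSON form produced above (the field
# names are an assumption based on the positional Resources constructor call
# in the test, not confirmed by this excerpt):
#
#   {"cpu": 1, "gpu": 0, "extra_cpu": 0, "extra_gpu": 1,
#    "custom_resources": {"a": 1, "b": 2}}
#
# The round trip matters because experiment configs serialize
# `resources_per_trial` into YAML/JSON and must rebuild an equivalent
# Resources object on restore.
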
def run(args, parser):
    if args.config_file:
        with open(args.config_file) as f:
            experiments = yaml.safe_load(f)
    else:
        # Note: keep this in sync with tune/config_parser.py
        experiments = {
            args.experiment_name: {  # i.e. log to ~/ray_results/default
                "run": args.run,
                "checkpoint_freq": args.checkpoint_freq,
                "checkpoint_at_end": args.checkpoint_at_end,
                "keep_checkpoints_num": args.keep_checkpoints_num,
                "checkpoint_score_attr": args.checkpoint_score_attr,
                "local_dir": args.local_dir,
                "resources_per_trial": (
                    args.resources_per_trial and
                    resources_to_json(args.resources_per_trial)),
                "stop": args.stop,
                "config": dict(args.config, env=args.env),
                "restore": args.restore,
                "num_samples": args.num_samples,
                "sync_config": {
                    "upload_dir": args.upload_dir,
                },
            }
        }

    # Ray UI.
    if args.no_ray_ui:
        deprecation_warning(old="--no-ray-ui", new="--ray-ui", error=False)
        args.ray_ui = False

    verbose = 1
    for exp in experiments.values():
        # Bazel makes it hard to find files specified in `args` (and `data`).
        # Look for them here.
        # NOTE: Some of our yaml files don't have a `config` section.
        input_ = exp.get("config", {}).get("input")
        if input_ and input_ != "sampler":
            # This script runs in the ray/rllib dir.
            rllib_dir = Path(__file__).parent

            def patch_path(path):
                if isinstance(path, list):
                    return [patch_path(i) for i in path]
                elif isinstance(path, dict):
                    return {
                        patch_path(k): patch_path(v)
                        for k, v in path.items()
                    }
                elif isinstance(path, str):
                    if os.path.exists(path):
                        return path
                    else:
                        abs_path = str(rllib_dir.absolute().joinpath(path))
                        return abs_path if os.path.exists(abs_path) else path
                else:
                    return path

            exp["config"]["input"] = patch_path(input_)

        if not exp.get("run"):
            parser.error("the following arguments are required: --run")
        if not exp.get("env") and not exp.get("config", {}).get("env"):
            parser.error("the following arguments are required: --env")

        if args.torch:
            deprecation_warning("--torch", "--framework=torch")
            exp["config"]["framework"] = "torch"
        elif args.eager:
            deprecation_warning("--eager", "--framework=[tf2|tfe]")
            exp["config"]["framework"] = "tfe"
        elif args.framework is not None:
            exp["config"]["framework"] = args.framework

        if args.trace:
            if exp["config"]["framework"] not in ["tf2", "tfe"]:
                raise ValueError("Must enable --eager to enable tracing.")
            exp["config"]["eager_tracing"] = True
        if args.v:
            exp["config"]["log_level"] = "INFO"
            verbose = 3  # Print details on trial result.
        if args.vv:
            exp["config"]["log_level"] = "DEBUG"
            verbose = 3  # Print details on trial result.

    if args.ray_num_nodes:
        # Import this only here so that train.py also works with
        # older versions (and user doesn't use `--ray-num-nodes`).
        from ray.cluster_utils import Cluster
        cluster = Cluster()
        for _ in range(args.ray_num_nodes):
            cluster.add_node(
                num_cpus=args.ray_num_cpus or 1,
                num_gpus=args.ray_num_gpus or 0,
                object_store_memory=args.ray_object_store_memory,
            )
        ray.init(address=cluster.address)
    else:
        ray.init(
            include_dashboard=args.ray_ui,
            address=args.ray_address,
            object_store_memory=args.ray_object_store_memory,
            num_cpus=args.ray_num_cpus,
            num_gpus=args.ray_num_gpus,
            local_mode=args.local_mode,
        )

    if IS_NOTEBOOK:
        progress_reporter = JupyterNotebookReporter(
            overwrite=verbose >= 3, print_intermediate_tables=verbose >= 1)
    else:
        progress_reporter = CLIReporter(
            print_intermediate_tables=verbose >= 1)

    run_experiments(
        experiments,
        scheduler=create_scheduler(args.scheduler, **args.scheduler_config),
        resume=args.resume,
        verbose=verbose,
        progress_reporter=progress_reporter,
        concurrent=True,
    )

    ray.shutdown()

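# Illustrative shape of a config file accepted by the YAML branch above: a
# mapping from experiment name to the same keys the args-built dict uses.
# All names and values below are made-up examples, not from the original
# source:
#
#   cartpole-ppo:
#       run: PPO
#       env: CartPole-v0
#       stop:
#           timesteps_total: 100000
#       config:
#           num_workers: 2
#           framework: tf2
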
def get_experiment_definition(self):
    if self.args.config_file:
        with open(self.args.config_file) as f:
            experiments = yaml.safe_load(f)
        exp_name_list = list(experiments.keys())
        assert len(exp_name_list) == 1
        # Overwrite the experiment name so SageMaker can recognize it.
        experiments["training"] = experiments.pop(exp_name_list[0])
    else:
        experiments = {
            self.args.experiment_name: {  # i.e. log to ~/ray_results/default
                "run": self.args.run,
                "checkpoint_freq": self.args.checkpoint_freq,
                "keep_checkpoints_num": self.args.keep_checkpoints_num,
                "checkpoint_score_attr": self.args.checkpoint_score_attr,
                "local_dir": self.args.local_dir,
                "resources_per_trial": (
                    self.args.resources_per_trial and
                    resources_to_json(self.args.resources_per_trial)),
                "stop": self.args.stop,
                "config": dict(self.args.config, env=self.args.env),
                "restore": self.args.restore,
                "num_samples": self.args.num_samples,
                "upload_dir": self.args.upload_dir,
            }
        }

    verbose = 1
    for exp in experiments.values():
        # Bazel makes it hard to find files specified in `args` (and `data`).
        # Look for them here.
        # NOTE: Some of our yaml files don't have a `config` section.
        if exp.get("config", {}).get("input") and \
                not os.path.exists(exp["config"]["input"]):
            # This script runs in the ray/rllib dir.
            rllib_dir = Path(__file__).parent
            input_file = rllib_dir.absolute().joinpath(
                exp["config"]["input"])
            exp["config"]["input"] = str(input_file)

        if not exp.get("run"):
            raise ValueError("The following arguments are required: run")
        if not exp.get("env") and not exp.get("config", {}).get("env"):
            raise ValueError("The following arguments are required: env")

        if self.args.eager:
            exp["config"]["eager"] = True
        if self.args.torch:
            exp["config"]["use_pytorch"] = True
        if self.args.v:
            exp["config"]["log_level"] = "INFO"
            verbose = 2
        if self.args.vv:
            exp["config"]["log_level"] = "DEBUG"
            verbose = 3
        if self.args.trace:
            if not exp["config"].get("eager"):
                raise ValueError("Must enable --eager to enable tracing.")
            exp["config"]["eager_tracing"] = True

        ### Add Custom Callbacks
        exp["config"]["callbacks"] = CustomCallbacks

    return experiments, self.args, verbose

def run(args, parser):
    if args.config_file:
        with open(args.config_file) as f:
            config_experiments = yaml.safe_load(f)
        experiments = config_experiments
    else:
        if args.algo is not None:
            args.experiment = args.algo
        if args.experiment:
            config_file = os.path.join('config', f'{args.experiment}.yaml')
            with open(config_file) as f:
                config_dict = yaml.safe_load(f)
        else:
            config_dict = {args.name: {}}

        if args.debug:
            args.env = 'MineRLRandomDebug-v0'

        experiments = {}
        for experiment_name, experiment_settings in config_dict.items():
            config = dict(args.config, env=args.env)

            # TODO: implement
            if args.mode == 'offline':
                config.update(
                    dict(
                        explore=False,
                        input=args.data_path,
                        input_evaluation=['simulation'],
                    ))
            elif args.mode == 'mixed':
                config.update(
                    dict(
                        input={
                            args.data_path: args.mixing_ratio,
                            'sample': (1 - args.mixing_ratio)
                        },
                        input_evaluation=['simulation'],
                    ))

            if 'time_total_s' not in args.stop:
                # The MineRL competition training time limit is 4 days; here
                # we limit training to 2 days, minus an hour for evaluation.
                args.stop['time_total_s'] = int(2 * 24 * 60 * 60 - 3600)
            if 'info/num_steps_sampled' not in args.stop:
                # The MineRL competition environment sample limit is
                # 8 million steps.
                args.stop['info/num_steps_sampled'] = 8000000
            if args.checkpoint_freq is None:
                args.checkpoint_freq = 1000
            if args.checkpoint_at_end is None:
                args.checkpoint_at_end = True
            if args.checkpoint_score_attr is None:
                args.checkpoint_score_attr = 'episode_reward_mean'

            # Note: keep this in sync with tune/config_parser.py
            settings_from_args = {
                # i.e. log to ~/ray_results/default
                "run": args.run,
                "checkpoint_freq": args.checkpoint_freq,
                "checkpoint_at_end": args.checkpoint_at_end,
                "keep_checkpoints_num": args.keep_checkpoints_num,
                "checkpoint_score_attr": args.checkpoint_score_attr,
                "local_dir": args.local_dir,
                "resources_per_trial": (
                    args.resources_per_trial and
                    resources_to_json(args.resources_per_trial)),
                "stop": args.stop,
                "config": config,
                "restore": args.restore,
                "num_samples": args.num_samples,
                "upload_dir": args.upload_dir,
            }
            # Overwrite the settings from arguments with those in the
            # experiment config file.
            settings = merge_dicts(settings_from_args, experiment_settings)
            experiments.update({experiment_name: settings})

    if any('MineRL' in setting['config']['env']
           for setting in experiments.values()):
        import envs
        envs.register(
            discrete=args.discrete,
            num_actions=args.num_actions,
            data_dir=args.data_dir)

    print('\nArguments:')
    pprint.pprint(args)
    print('\nExperiment config:')
    pprint.pprint(experiments)
    print()

    verbose = 1
    for exp in experiments.values():
        # Bazel makes it hard to find files specified in `args` (and `data`).
        # Look for them here.
        # NOTE: Some of our yaml files don't have a `config` section.
        if exp.get("config", {}).get("input") and \
                not os.path.exists(exp["config"]["input"]):
            # This script runs in the ray/rllib dir.
            rllib_dir = Path(__file__).parent
            input_file = rllib_dir.absolute().joinpath(exp["config"]["input"])
            exp["config"]["input"] = str(input_file)

        if not exp.get("run"):
            parser.error("the following arguments are required: --run")
        if not exp.get("env") and not exp.get("config", {}).get("env"):
            parser.error("the following arguments are required: --env")

        if 'framework' not in exp['config']:
            if args.eager:
                exp["config"]["framework"] = "tfe"
            elif args.torch:
                exp["config"]["framework"] = "torch"
            else:
                exp["config"]["framework"] = "tf"
        if args.v:
            exp["config"]["log_level"] = "INFO"
            verbose = 2
        if args.vv:
            exp["config"]["log_level"] = "DEBUG"
            verbose = 3
        if args.trace:
            if exp["config"]["framework"] != "tfe":
                raise ValueError("Must enable --eager to enable tracing.")
            exp["config"]["eager_tracing"] = True

    if args.ray_num_nodes:
        cluster = Cluster()
        for _ in range(args.ray_num_nodes):
            cluster.add_node(
                num_cpus=args.ray_num_cpus or 1,
                num_gpus=args.ray_num_gpus or 0,
                object_store_memory=args.ray_object_store_memory,
                memory=args.ray_memory,
                redis_max_memory=args.ray_redis_max_memory)
        ray.init(address=cluster.address)
    else:
        ray.init(
            include_dashboard=not args.no_ray_ui,
            address=args.ray_address,
            object_store_memory=args.ray_object_store_memory,
            memory=args.ray_memory,
            redis_max_memory=args.ray_redis_max_memory,
            num_cpus=args.ray_num_cpus,
            num_gpus=args.ray_num_gpus,
            local_mode=args.local_mode)

    run_experiments(
        experiments,
        scheduler=_make_scheduler(args),
        queue_trials=args.queue_trials,
        resume=args.resume,
        verbose=verbose,
        concurrent=True)

    ray.shutdown()

def run(args, parser):
    if args.config_file:
        with open(args.config_file) as f:
            experiments = yaml.safe_load(f)
    else:
        # Note: keep this in sync with tune/config_parser.py
        experiments = {
            args.experiment_name: {  # i.e. log to ~/ray_results/default
                "run": args.run,
                "env": "HuskyPickAndPlace-v1",
                "checkpoint_freq": 100,  # args.checkpoint_freq,
                "keep_checkpoints_num": args.keep_checkpoints_num,
                "checkpoint_score_attr": args.checkpoint_score_attr,
                "local_dir": args.local_dir,
                "resources_per_trial": (
                    args.resources_per_trial and
                    resources_to_json(args.resources_per_trial)),
                # "stop": args.stop,
                # 10M; "episode_reward_mean": 18.0
                "stop": {"timesteps_total": 4000000},
                # "config": {dict(args.config, env=args.env)},
                "config": {
                    "num_workers": 10,
                    "ignore_worker_failures": True,
                    # "seed": 789,
                    "callbacks": {
                        "on_episode_start": on_episode_start,
                        "on_episode_step": on_episode_step,
                        "on_episode_end": on_episode_end,
                    },
                },
                "restore": args.restore,
                "num_samples": args.num_samples,
                "upload_dir": args.upload_dir,
            }
        }

    # verbose = 1
    # for exp in experiments.values():
    #     if not exp.get("run"):
    #         parser.error("the following arguments are required: --run")
    #     if not exp.get("env") and not exp.get("config", {}).get("env"):
    #         parser.error("the following arguments are required: --env")
    #     if args.eager:
    #         exp["config"]["eager"] = True
    #     if args.v:
    #         exp["config"]["log_level"] = "INFO"
    #         verbose = 2
    #     if args.vv:
    #         exp["config"]["log_level"] = "DEBUG"
    #         verbose = 3
    #     if args.trace:
    #         if not exp["config"].get("eager"):
    #             raise ValueError("Must enable --eager to enable tracing.")
    #         exp["config"]["eager_tracing"] = True

    # if args.ray_num_nodes:
    #     cluster = Cluster()
    #     for _ in range(args.ray_num_nodes):
    #         cluster.add_node(
    #             num_cpus=args.ray_num_cpus or 1,
    #             num_gpus=args.ray_num_gpus or 0,
    #             object_store_memory=args.ray_object_store_memory,
    #             memory=args.ray_memory,
    #             redis_max_memory=args.ray_redis_max_memory)
    #     ray.init(address=cluster.address)
    # else:
    ray.init(
        address=args.ray_address,
        object_store_memory=args.ray_object_store_memory,
        memory=args.ray_memory,
        redis_max_memory=args.ray_redis_max_memory,
        num_cpus=args.ray_num_cpus,
        num_gpus=args.ray_num_gpus)

    run_experiments(
        experiments,
        scheduler=_make_scheduler(args),
        queue_trials=args.queue_trials,
        resume=args.resume,
        verbose=2,
    )

def run(args, parser):
    if args.config_file:
        with open(args.config_file) as f:
            experiments = yaml.safe_load(f)
    else:
        # Note: keep this in sync with tune/config_parser.py
        experiments = {
            args.experiment_name: {  # i.e. log to ~/ray_results/default
                "run": args.run,
                "checkpoint_freq": args.checkpoint_freq,
                "checkpoint_at_end": args.checkpoint_at_end,
                "keep_checkpoints_num": args.keep_checkpoints_num,
                "checkpoint_score_attr": args.checkpoint_score_attr,
                "local_dir": args.local_dir,
                "resources_per_trial": (
                    args.resources_per_trial and
                    resources_to_json(args.resources_per_trial)),
                "stop": args.stop,
                "config": dict(args.config, env=args.env),
                "restore": args.restore,
                "num_samples": args.num_samples,
                "upload_dir": args.upload_dir,
            }
        }

    verbose = 1
    for exp in experiments.values():
        # Bazel makes it hard to find files specified in `args` (and `data`).
        # Look for them here.
        # NOTE: Some of our yaml files don't have a `config` section.
        if exp.get("config", {}).get("input") and \
                not os.path.exists(exp["config"]["input"]):
            # This script runs in the ray/rllib dir.
            rllib_dir = Path(__file__).parent
            input_file = rllib_dir.absolute().joinpath(exp["config"]["input"])
            exp["config"]["input"] = str(input_file)

        if not exp.get("run"):
            parser.error("the following arguments are required: --run")
        if not exp.get("env") and not exp.get("config", {}).get("env"):
            parser.error("the following arguments are required: --env")

        if args.torch:
            exp["config"]["framework"] = "torch"
        elif args.eager:
            exp["config"]["framework"] = "tfe"

        if args.trace:
            if exp["config"]["framework"] not in ["tf2", "tfe"]:
                raise ValueError("Must enable --eager to enable tracing.")
            exp["config"]["eager_tracing"] = True
        if args.v:
            exp["config"]["log_level"] = "INFO"
            verbose = 3  # Print details on trial result.
        if args.vv:
            exp["config"]["log_level"] = "DEBUG"
            verbose = 3  # Print details on trial result.

    if args.ray_num_nodes:
        cluster = Cluster()
        for _ in range(args.ray_num_nodes):
            cluster.add_node(
                num_cpus=args.ray_num_cpus or 1,
                num_gpus=args.ray_num_gpus or 0,
                object_store_memory=args.ray_object_store_memory)
        ray.init(address=cluster.address)
    else:
        ray.init(
            include_dashboard=not args.no_ray_ui,
            address=args.ray_address,
            object_store_memory=args.ray_object_store_memory,
            num_cpus=args.ray_num_cpus,
            num_gpus=args.ray_num_gpus,
            local_mode=args.local_mode)

    if IS_NOTEBOOK:
        progress_reporter = JupyterNotebookReporter(
            overwrite=verbose >= 3, print_intermediate_tables=verbose >= 1)
    else:
        progress_reporter = CLIReporter(
            print_intermediate_tables=verbose >= 1)

    run_experiments(
        experiments,
        scheduler=create_scheduler(args.scheduler, **args.scheduler_config),
        resume=args.resume,
        queue_trials=args.queue_trials,
        verbose=verbose,
        progress_reporter=progress_reporter,
        concurrent=True)

    ray.shutdown()

def run(args, parser):
    if args.config_file:
        with open(args.config_file) as f:
            experiments = yaml.safe_load(f)
    else:
        # Note: keep this in sync with tune/config_parser.py
        experiments = {
            args.experiment_name: {  # i.e. log to ~/ray_results/default
                "run": args.run,
                "checkpoint_freq": args.checkpoint_freq,
                "keep_checkpoints_num": args.keep_checkpoints_num,
                "checkpoint_score_attr": args.checkpoint_score_attr,
                "local_dir": args.local_dir,
                "resources_per_trial": (
                    args.resources_per_trial and
                    resources_to_json(args.resources_per_trial)),
                "stop": args.stop,
                "config": dict(args.config, env=args.env),
                "restore": args.restore,
                "num_samples": args.num_samples,
                "upload_dir": args.upload_dir,
            }
        }

    verbose = 1
    webui_host = "localhost"
    for exp in experiments.values():
        # Bazel makes it hard to find files specified in `args` (and `data`).
        # Look for them here.
        # NOTE: Some of our yaml files don't have a `config` section.
        if exp.get("config", {}).get("input"):
            if not isinstance(exp.get("config", {}).get("input"), dict):
                if not os.path.exists(exp["config"]["input"]):
                    # This script runs in the ray/rllib dir.
                    rllib_dir = Path(__file__).parent
                    input_file = rllib_dir.absolute().joinpath(
                        exp["config"]["input"])
                    exp["config"]["input"] = str(input_file)

        if not exp.get("run"):
            parser.error("the following arguments are required: --run")
        if not exp.get("env") and not exp.get("config", {}).get("env"):
            parser.error("the following arguments are required: --env")
        if args.eager:
            exp["config"]["eager"] = True
        if args.torch:
            exp["config"]["use_pytorch"] = True
        if args.v:
            exp["config"]["log_level"] = "INFO"
            verbose = 2
        if args.vv:
            exp["config"]["log_level"] = "DEBUG"
            verbose = 3
        if args.trace:
            if not exp["config"].get("eager"):
                raise ValueError("Must enable --eager to enable tracing.")
            exp["config"]["eager_tracing"] = True
        if args.bind_all:
            webui_host = "0.0.0.0"
        if args.log_flatland_stats:
            exp['config']['callbacks'] = {
                'on_episode_end': on_episode_end,
            }

        if args.eval:
            eval_configs = get_eval_config(
                exp['config'].get('env_config', {}).get(
                    'eval_generator', "default"))
            eval_seed = eval_configs.get('evaluation_config', {}).get(
                'env_config', {}).get('seed')

            # Add the evaluation config to the current config.
            exp['config'] = merge_dicts(exp['config'], eval_configs)
            if exp['config'].get('evaluation_config'):
                exp['config']['evaluation_config']['env_config'] = \
                    exp['config'].get('env_config')
                eval_env_config = exp['config']['evaluation_config'].get(
                    'env_config')
                if eval_seed and eval_env_config:
                    # We override the env seed from the evaluation config.
                    eval_env_config['seed'] = eval_seed

                # Remove any wandb-related configs.
                if eval_env_config:
                    if eval_env_config.get('wandb'):
                        del eval_env_config['wandb']

                # Remove any wandb-related configs.
                if exp['config']['evaluation_config'].get('wandb'):
                    del exp['config']['evaluation_config']['wandb']

        if args.config_file:
            # TODO: should be in exp['config'] directly
            exp['config']['env_config']['yaml_config'] = args.config_file
        exp['loggers'] = [WandbLogger, TBXLogger]

    if args.ray_num_nodes:
        cluster = Cluster()
        for _ in range(args.ray_num_nodes):
            cluster.add_node(
                num_cpus=args.ray_num_cpus or 1,
                num_gpus=args.ray_num_gpus or 0,
                object_store_memory=args.ray_object_store_memory,
                memory=args.ray_memory,
                redis_max_memory=args.ray_redis_max_memory)
        ray.init(address=cluster.address)
    else:
        ray.init(
            address=args.ray_address,
            object_store_memory=args.ray_object_store_memory,
            memory=args.ray_memory,
            redis_max_memory=args.ray_redis_max_memory,
            num_cpus=args.ray_num_cpus,
            num_gpus=args.ray_num_gpus,
            webui_host=webui_host)

    run_experiments(
        experiments,
        scheduler=_make_scheduler(args),
        queue_trials=args.queue_trials,
        resume=args.resume,
        verbose=verbose,
        concurrent=True)

def run(args, parser):
    if args.config_file:
        with open(args.config_file) as f:
            experiments = yaml.safe_load(f)

        # Add callbacks for self-defined metrics, and save successful
        # transitions from RL agents.
        experiment_name = next(iter(experiments))
        experiments[experiment_name]["config"]["optimizer"][
            "robot_demo_path"] = dir_path
        experiments[experiment_name]["config"]["callbacks"] = {
            "on_episode_start": on_episode_start,
            "on_episode_step": on_episode_step,
            "on_episode_end": on_episode_end,
            "on_sample_end": on_sample_end,
            "on_train_result": on_train_result,
            "on_postprocess_traj": on_postprocess_traj,
        }
    else:
        # Note: keep this in sync with tune/config_parser.py
        experiments = {
            args.experiment_name: {  # i.e. log to ~/ray_results/default
                "run": args.run,
                "checkpoint_freq": args.checkpoint_freq,
                "keep_checkpoints_num": args.keep_checkpoints_num,
                "checkpoint_score_attr": args.checkpoint_score_attr,
                "local_dir": args.local_dir,
                "resources_per_trial": (
                    args.resources_per_trial and
                    resources_to_json(args.resources_per_trial)),
                "stop": args.stop,
                "config": dict(args.config, env=args.env),
                "restore": args.restore,
                "num_samples": args.num_samples,
                "upload_dir": args.upload_dir,
            }
        }

    for exp in experiments.values():
        if not exp.get("run"):
            parser.error("the following arguments are required: --run")
        if not exp.get("env") and not exp.get("config", {}).get("env"):
            parser.error("the following arguments are required: --env")
        if args.eager:
            exp["config"]["eager"] = True
        if args.trace:
            if not exp["config"].get("eager"):
                raise ValueError("Must enable --eager to enable tracing.")
            exp["config"]["eager_tracing"] = True

    # Passing `log_to_driver=False` to ray.init() would disable the logging;
    # see https://github.com/ray-project/ray/issues/5048
    if args.ray_num_nodes:
        cluster = Cluster()
        for _ in range(args.ray_num_nodes):
            cluster.add_node(
                num_cpus=args.ray_num_cpus or 1,
                num_gpus=args.ray_num_gpus or 0,
                object_store_memory=args.ray_object_store_memory,
                memory=args.ray_memory,
                redis_max_memory=args.ray_redis_max_memory)
        ray.init(address=cluster.address)  # , log_to_driver=False
    else:
        ray.init(
            address=args.ray_address,
            object_store_memory=args.ray_object_store_memory,
            memory=args.ray_memory,
            redis_max_memory=args.ray_redis_max_memory,
            num_cpus=args.ray_num_cpus,
            num_gpus=args.ray_num_gpus)  # , log_to_driver=False

    run_experiments(
        experiments,
        scheduler=_make_scheduler(args),
        queue_trials=args.queue_trials,
        resume=args.resume)

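# The dict-based callbacks wired in several run() variants above follow the
# legacy (pre-1.0) RLlib callbacks convention: each entry maps a hook name to
# a plain function taking a single `info` dict. A minimal illustrative sketch
# (the metric name is a made-up example, not from the original source):
def on_episode_end(info):
    episode = info["episode"]
    # Record a custom per-episode metric; RLlib averages entries of
    # `custom_metrics` into the training result.
    episode.custom_metrics["episode_len"] = episode.length
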