def run_experiment(config, trainable):
    """
    Run a single tune experiment in parallel as a "remote" function.

    :param config: The experiment configuration
    :type config: dict
    :param trainable: tune.Trainable class with your experiment
    :type trainable: :class:`ray.tune.Trainable`
    """
    # Stop criteria. Default to total number of iterations/epochs
    stop_criteria = {"training_iteration": config.get("iterations")}
    stop_criteria.update(config.get("stop", {}))

    tune.run(
        trainable,
        name=config["name"],
        local_dir=config["path"],
        stop=stop_criteria,
        config=config,
        num_samples=config.get("repetitions", 1),
        search_alg=config.get("search_alg", None),
        scheduler=config.get("scheduler", AsyncHyperBandScheduler(
            reward_attr="mean_accuracy",
            time_attr="training_iteration",
            brackets=2,
            grace_period=max(1, int(config.get("iterations", 10) / 10)),
            reduction_factor=3,
            max_t=config.get("iterations", 10),
        )),
        trial_name_creator=tune.function(trial_name_string),
        trial_executor=config.get("trial_executor", None),
        checkpoint_at_end=config.get("checkpoint_at_end", False),
        checkpoint_freq=config.get("checkpoint_freq", 0),
        upload_dir=config.get("upload_dir", None),
        sync_function=config.get("sync_function", None),
        resume=config.get("resume", False),
        reuse_actors=config.get("reuse_actors", False),
        verbose=config.get("verbose", 0),
        resources_per_trial={
            "cpu": config.get("cpu_percentage", 1.0),
            "gpu": config.get("gpu_percentage", 1.0),
        },
        # Added parameters to allow monitoring through the REST API:
        # with_server=True,
        # server_port=4321,
    )
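# For context, a minimal sketch of how the helper above might be invoked.
# `MyTrainable` and the config values here are hypothetical; only the config
# keys match the ones read by run_experiment().
import ray
from ray import tune


class MyTrainable(tune.Trainable):
    """Hypothetical Trainable used only to illustrate the call."""

    def _setup(self, config):
        self.lr = config.get("lr", 1e-3)

    def _train(self):
        # Report the metric the default AsyncHyperBandScheduler optimizes.
        return {"mean_accuracy": 0.5}


ray.init()
run_experiment(
    config={
        "name": "example_experiment",
        "path": "~/ray_results",   # becomes local_dir
        "iterations": 100,         # default stop criterion
        "repetitions": 4,          # becomes num_samples
        "checkpoint_freq": 10,
        "gpu_percentage": 0.5,     # half a GPU per trial
    },
    trainable=MyTrainable,
)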
def run_experiment(config, trainable):
    """
    Run a single tune experiment in parallel as a "remote" function.

    :param config: The experiment configuration
    :type config: dict
    :param trainable: tune.Trainable class with your experiment
    :type trainable: :class:`ray.tune.Trainable`
    """
    # Stop criteria. Default to total number of iterations/epochs
    stop_criteria = {"training_iteration": config.get("iterations")}
    stop_criteria.update(config.get("stop", {}))

    tune.run(
        trainable,
        name=config["name"],
        local_dir=config["path"],
        stop=stop_criteria,
        config=config,
        num_samples=config.get("repetitions", 1),
        search_alg=config.get("search_alg", None),
        scheduler=config.get("scheduler", MedianStoppingRule(
            time_attr="training_iteration",
            reward_attr="noise_accuracy",
            min_samples_required=3,
            grace_period=20,
            verbose=False,
        )),
        trial_name_creator=tune.function(trial_name_string),
        trial_executor=config.get("trial_executor", None),
        checkpoint_at_end=config.get("checkpoint_at_end", False),
        checkpoint_freq=config.get("checkpoint_freq", 0),
        upload_dir=config.get("upload_dir", None),
        sync_function=config.get("sync_function", None),
        resume=config.get("resume", False),
        reuse_actors=config.get("reuse_actors", False),
        verbose=config.get("verbose", 0),
        resources_per_trial={
            # With lots of trials, 0.5 (two trials per GPU) seems optimal;
            # if num trials <= num GPUs, 1.0 is better.
            "cpu": 1,
            "gpu": config.get("gpu_percentage", 0.5),
        },
    )
def testGetTrialsWithFunction(self):
    runner, client = self.basicSetup()
    test_trial = Trial(
        "__fake",
        trial_id="function_trial",
        stopping_criterion={"training_iteration": 3},
        config={
            "callbacks": {
                "on_episode_start": tune.function(lambda x: None)
            }
        })
    runner.add_trial(test_trial)

    for i in range(3):
        runner.step()
    all_trials = client.get_all_trials()["trials"]
    self.assertEqual(len(all_trials), 3)

    client.get_trial("function_trial")
    runner.step()
    self.assertEqual(len(all_trials), 3)
        path = os.path.join(checkpoint_dir, "checkpoint")
        with open(path, "w") as f:
            f.write(json.dumps({"timestep": self.timestep}))
        return path

    def _restore(self, checkpoint_path):
        with open(checkpoint_path) as f:
            self.timestep = json.loads(f.read())["timestep"]


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--smoke-test", action="store_true", help="Finish quickly for testing")
    args, _ = parser.parse_known_args()

    ray.init()
    exp = Experiment(
        name="hyperband_test",
        run=MyTrainableClass,
        num_samples=1,
        trial_name_creator=tune.function(trial_str_creator),
        loggers=[TestLogger],
        stop={"training_iteration": 1 if args.smoke_test else 99999},
        config={
            "width": tune.sample_from(
                lambda spec: 10 + int(90 * random.random())),
            "height": tune.sample_from(lambda spec: int(100 * random.random()))
        })
    trials = run_experiments(exp)
"num_gpus": 0.2 if args.gpu else 0, "num_workers": args.num_workers, "sgd_minibatch_size": 100 if args.fast else 1000, "sample_batch_size": 200 if args.fast else 5000, "train_batch_size": 1000 if args.fast else 15000, "batch_mode": "complete_episodes", "observation_filter": "NoFilter", "num_envs_per_worker": 8, "model": { "custom_model": "mask", "fcnet_hiddens": [512, 512], }, "vf_share_layers": True, "entropy_coeff": 0.01, "callbacks": { "on_episode_end": tune.function(on_episode_end), }, "env_config": { "zero_obs": False, "dump_dir": args.dump_dir, "partition_mode": args.partition_mode, "reward_shape": args.reward_shape, "max_depth": 100 if args.fast else 500, "max_actions": 1000 if args.fast else 15000, "depth_weight": args.depth_weight, "rules": grid_search(args.rules), }, }, }, })
    def _save(self, checkpoint_dir):
        path = os.path.join(checkpoint_dir, "checkpoint")
        with open(path, "w") as f:
            f.write(json.dumps({"timestep": self.timestep}))
        return path

    def _restore(self, checkpoint_path):
        with open(checkpoint_path) as f:
            self.timestep = json.loads(f.read())["timestep"]


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--smoke-test", action="store_true", help="Finish quickly for testing")
    args, _ = parser.parse_known_args()

    trials = run(
        MyTrainableClass,
        name="hyperband_test",
        num_samples=5,
        trial_name_creator=tune.function(trial_str_creator),
        loggers=[TestLogger],
        stop={"training_iteration": 1 if args.smoke_test else 99999},
        config={
            "width": tune.sample_from(
                lambda spec: 10 + int(90 * random.random())),
            "height": tune.sample_from(lambda spec: int(100 * random.random()))
        })
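# The examples above pass trial_str_creator (and elsewhere trial_name_string)
# to trial_name_creator without showing its definition. A plausible sketch,
# assuming the Trial object exposes `trainable_name` and `trial_id` as it does
# in Ray's own examples:
def trial_str_creator(trial):
    """Build a short, human-readable trial name."""
    return "{}_{}".format(trial.trainable_name, trial.trial_id)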
eval_config = {
    # Evaluate every `evaluation_interval` training iterations.
    "evaluation_interval": 1,
    "evaluation_config": {
        "explore": False,
        "exploration_fraction": 0,
        "exploration_final_eps": 0,
        "evaluation_num_episodes": 10,
        "horizon": 100,
        "env_config": {
            # Hack: lets the Ray on_episode_end() callback detect whether it is
            # running in evaluation or training mode, so it can write the
            # evaluation stats separately.
            "dummy_eval": True,
            "transition_noise": 0
            if "state_space_type" in env_config["env_config"]
            and env_config["env_config"]["state_space_type"] == "discrete"
            else tune.function(lambda a: a.normal(0, 0)),
            "reward_noise": tune.function(lambda a: a.normal(0, 0)),
            "action_loss_weight": 0.0,
        },
    },
}

value_tuples = []
for config_type, config_dict in var_configs.items():
    for key in config_dict:
        assert isinstance(var_configs[config_type][key], list), (
            "var_configs should be a dict of dicts with lists as the leaf "
            "values so that each configuration option can take multiple "
            "possible values")
        value_tuples.append(var_configs[config_type][key])

cartesian_product_configs = list(itertools.product(*value_tuples))
}, "model": { "fcnet_hiddens": [layer_width for i in range(num_layers)], # "custom_preprocessor": "ohe", "custom_options": {}, # extra options to pass to your preprocessor "fcnet_activation": "tanh", "use_lstm": False, "max_seq_len": 20, "lstm_cell_size": 256, "lstm_use_prev_action_reward": False, }, "callbacks": { # "on_episode_start": tune.function(on_episode_start), # "on_episode_step": tune.function(on_episode_step), "on_episode_end": tune.function(on_episode_end), # "on_sample_end": tune.function(on_sample_end), "on_train_result": tune.function(on_train_result), # "on_postprocess_traj": tune.function(on_postprocess_traj), }, "evaluation_interval": 1, # I think this every x training_iterations "evaluation_config": { #'seed': 0, #seed "exploration_fraction": 0, "exploration_final_eps": 0, "batch_mode": "complete_episodes", 'horizon': 100, "env_config": { "dummy_eval": True, #hack } },
def setup_exps_rllib(flow_params, n_cpus, n_rollouts, policy_graphs=None, policy_mapping_fn=None, policies_to_train=None, flags=None): from ray import tune from ray.tune.registry import register_env try: from ray.rllib.agents.agent import get_agent_class except ImportError: from ray.rllib.agents.registry import get_agent_class import torch horizon = flow_params['env'].horizon if flags.algorithm.lower() == "ppo": alg_run = "PPO" agent_cls = get_agent_class(alg_run) config = deepcopy(agent_cls._default_config) config["num_workers"] = n_cpus config["horizon"] = horizon config['sgd_minibatch_size'] = 64 config["clip_param"] = 0.2 #Exploration config['exploration_config']["type"] = "GaussianNoise" config['exploration_config']["initial_scale"] = 1.0 config['exploration_config']["final_scale"] = 0.02 config['exploration_config']["scale_timesteps"] = 1000000 config['exploration_config']["random_timesteps"] = 1000 config['exploration_config']["stddev"] = 0.1 #common config config['framework'] = 'torch' config['callbacks'] = { "on_episode_end": None, "on_episode_start": None, "on_episode_step": None, "on_postprocess_traj": None, "on_sample_end": None, "on_train_result": None } # config["opt_type"]= "adam" for impala and APPO, default is SGD # TrainOneStep class call SGD -->execution_plan function can have policy update function print("cuda is available: ", torch.cuda.is_available()) print('Beginning training.') print("==========================================") print("running algorithm: ", alg_run) # "Framework: ", "torch" # save the flow params for replay flow_json = json.dumps(flow_params, cls=FlowParamsEncoder, sort_keys=True, indent=4) config['env_config']['flow_params'] = flow_json config['env_config']['run'] = alg_run # multiagent configuration if policy_graphs is not None: print("policy_graphs", policy_graphs) config['multiagent'].update({'policies': policy_graphs}) if policy_mapping_fn is not None: config['multiagent'].update( {'policy_mapping_fn': tune.function(policy_mapping_fn)}) if policies_to_train is not None: config['multiagent'].update({'policies_to_train': policies_to_train}) create_env, gym_name = make_create_env(params=flow_params) # Register as rllib env register_env(gym_name, create_env) return alg_run, gym_name, config
}, "gamma": random.choice([0.5, 0.8, 0.9, 0.95, 0.99]), } return (PPOPolicyGraph, obs_space, act_space, config) # Setup PPO with an ensemble of `num_policies` different policy graphs policy_graphs = { "policy_{}".format(i): gen_policy(i) for i in range(args.num_policies) } policy_ids = list(policy_graphs.keys()) run_experiments({ "test": { "run": "PPO", "env": "multi_cartpole", "stop": { "training_iteration": args.num_iters }, "config": { "simple_optimizer": True, "multiagent": { "policy_graphs": policy_graphs, "policy_mapping_fn": tune.function(lambda agent_id: random.choice(policy_ids)), }, }, } })
            if iter % 500 == 0:
                trainer.save("saved_models/multi-carla/" + args.model_arch)
            pprint(results)
    else:
        config = {
            "env": "dm-" + env_name,
            "log_level": "DEBUG",
            "multiagent": {
                "policy_graphs": {
                    "def_policy": (VTracePolicyGraph,
                                   Box(0.0, 255.0, shape=(84, 84, 3)),
                                   Discrete(9), {
                                       "gamma": 0.99
                                   })
                },
                "policy_mapping_fn": tune.function(lambda agent_id: "def_policy"),
            },
            "env_config": env_actor_configs,
            "num_workers": args.num_workers,
            "num_envs_per_worker": args.envs_per_worker,
            "sample_batch_size": args.sample_bs_per_worker,
            "train_batch_size": args.train_bs
        }

        experiment_spec = tune.Experiment(
            "multi-carla/" + args.model_arch,
            "IMPALA",
            # timesteps_total is initialized with None (not 0), which causes
            # issues, so stop on timesteps_since_restore instead:
            # stop={"timesteps_total": args.num_steps},
            stop={"timesteps_since_restore": args.num_steps},
            config=config,
def main(): # for the user defined module in code_dir, need to be imported in functions # sys.path.insert(0, code_dir) # import parameters # import basic_src.io_function as io_function # import workflow.whole_procedure as whole_procedure # from utility.eva_report_to_tables import read_accuracy_multi_reports loc_dir = "./ray_results" # tune_name = "tune_traning_para_tesia" # tune_name = "tune_backbone_para_tesia" # tune_name = "tune_backbone_largeBatchS_tesia" tune_name = "tune_backbone_para_tesia_v2" file_folders = io_function.get_file_list_by_pattern( os.path.join(loc_dir, tune_name), '*') # if len(file_folders) > 1: # b_resume = True # else: # b_resume = False # try to resume after when through all (some failed), they always complain: # "Trials did not complete", incomplete_trials, so don't resume. file_folders = io_function.get_file_list_by_pattern( os.path.join(loc_dir, tune_name), '*') if len(file_folders) > 1: b_resume = True else: b_resume = False # max_failures = 2, # stop = tune.function(stop_function), analysis = tune.run( training_function, # set gpu as 2 (can divide batch size), cpu 24, making it one run one trial each time. resources_per_trial={ "gpu": 2, "cpu": 24 }, # use three GPUs, 12 CPUs on tesia # "cpu": 14, don't limit cpu, eval.py will not use all local_dir=loc_dir, name=tune_name, # fail_fast=True, # Stopping after the first failure log_to_file=("stdout.log", "stderr.log"), #Redirecting stdout and stderr to files trial_name_creator=tune.function(trial_name_string), trial_dirname_creator=tune.function(trial_dir_string), resume=b_resume, config={ "lr": tune.grid_search([0.007, 0.014, 0.021, 0.28]), # ,0.007, 0.014, 0.028,0.056 "iter_num": tune.grid_search([30000]), # , 60000,90000, "batch_size": tune.grid_search([8, 16, 32, 48, 96]), # 8,16,32 16, 32, 64, 128 "backbone": tune.grid_search(backbones), "buffer_size": tune.grid_search([300]), # 600 "training_data_per": tune.grid_search([0.9]), #, 0.8 "data_augmentation": tune.grid_search(['blur,crop,bright,contrast,noise']), 'data_aug_ignore_classes': tune.grid_search(['class_0']) } # config={ # "lr": tune.grid_search([0.014]), # ,0.007, 0.014, 0.028,0.056 # "iter_num": tune.grid_search([30000]), # , 60000,90000 # "batch_size": tune.grid_search([8]), # 16, 32, 64, 128 # "backbone": tune.grid_search(backbones), # "buffer_size": tune.grid_search([300]), # "training_data_per": tune.grid_search([0.9]), # "data_augmentation": tune.grid_search(['scale, bright, contrast, noise']), # 'data_aug_ignore_classes':tune.grid_search(['class_0','']) # } ) print("Best config: ", analysis.get_best_config(metric="overall_miou", mode="max")) # Get a dataframe for analyzing trial results. df = analysis.results_df output_file = 'training_miou_ray_tune_%s.xlsx' % ( datetime.now().strftime("%Y%m%d_%H%M%S")) with pd.ExcelWriter(output_file) as writer: df.to_excel(writer) # , sheet_name='accuracy table' # set format # workbook = writer.book # format = workbook.add_format({'num_format': '#0.000'}) # acc_talbe_sheet = writer.sheets['accuracy table'] # acc_talbe_sheet.set_column('G:I',None,format) print('write trial results to %s' % output_file)
def setup(env, hparams, algorithm, train_batch_size, num_cpus, num_gpus, num_agents, use_gpus_for_workers=False, use_gpu_for_driver=False, num_workers_per_device=1): if env == 'harvest': def env_creator(_): return HarvestEnv(num_agents=num_agents) single_env = HarvestEnv() else: def env_creator(_): return CleanupEnv(num_agents=num_agents) single_env = CleanupEnv() env_name = env + "_env" register_env(env_name, env_creator) obs_space = single_env.observation_space act_space = single_env.action_space # Each policy can have a different configuration (including custom model) def gen_policy(): return (PPOPolicyGraph, obs_space, act_space, {}) # Setup PPO with an ensemble of `num_policies` different policy graphs policy_graphs = {} for i in range(num_agents): policy_graphs['agent-' + str(i)] = gen_policy() def policy_mapping_fn(agent_id): return agent_id # register the custom model model_name = "conv_to_fc_net" ModelCatalog.register_custom_model(model_name, ConvToFCNet) agent_cls = get_agent_class(algorithm) config = agent_cls._default_config.copy() # information for replay config['env_config']['func_create'] = tune.function(env_creator) config['env_config']['env_name'] = env_name config['env_config']['run'] = algorithm # Calculate device configurations gpus_for_driver = int(use_gpu_for_driver) cpus_for_driver = 1 - gpus_for_driver if use_gpus_for_workers: spare_gpus = (num_gpus - gpus_for_driver) num_workers = int(spare_gpus * num_workers_per_device) num_gpus_per_worker = spare_gpus / num_workers num_cpus_per_worker = 0 else: spare_cpus = (num_cpus - cpus_for_driver) num_workers = int(spare_cpus * num_workers_per_device) num_gpus_per_worker = 0 num_cpus_per_worker = spare_cpus / num_workers # hyperparams config.update({ "train_batch_size": train_batch_size, "horizon": 1000, "lr_schedule": [[0, hparams['lr_init']], [20000000, hparams['lr_final']]], "num_workers": num_workers, "num_gpus": gpus_for_driver, # The number of GPUs for the driver "num_cpus_for_driver": cpus_for_driver, "num_gpus_per_worker": num_gpus_per_worker, # Can be a fraction "num_cpus_per_worker": num_cpus_per_worker, # Can be a fraction "entropy_coeff": hparams['entropy_coeff'], "multiagent": { "policy_graphs": policy_graphs, "policy_mapping_fn": tune.function(policy_mapping_fn), }, "model": { "custom_model": "conv_to_fc_net", "use_lstm": True, "lstm_cell_size": 128 } }) return algorithm, env_name, config
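# A worked example of the device bookkeeping in setup() above, tracing the
# CPU-only-workers branch with illustrative values:
#   num_cpus=8, num_gpus=2, use_gpus_for_workers=False,
#   use_gpu_for_driver=True, num_workers_per_device=1
#
#   gpus_for_driver     = int(True)                  -> 1
#   cpus_for_driver     = 1 - gpus_for_driver        -> 0
#   spare_cpus          = num_cpus - cpus_for_driver -> 8
#   num_workers         = int(spare_cpus * 1)        -> 8 rollout workers
#   num_gpus_per_worker = 0                             (workers stay on CPU)
#   num_cpus_per_worker = spare_cpus / num_workers   -> 1.0 CPU each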
    obs_space = test_env.observation_space
    act_space = test_env.action_space

    def gen_policy():
        return (PPOPolicyGraph, obs_space, act_space, {})

    # Setup PG with an ensemble of `num_policies` different policy graphs
    policy_graphs = {'av': gen_policy(), 'adversary': gen_policy()}

    def policy_mapping_fn(agent_id):
        return agent_id

    config.update({
        'multiagent': {
            'policy_graphs': policy_graphs,
            'policy_mapping_fn': tune.function(policy_mapping_fn)
        }
    })

    exp_tag = {
        "run": alg_run,
        "env": env_name,
        "config": {
            **config
        },
        "checkpoint_freq": 10,
        "max_failures": 999,
        "stop": {
            "training_iteration": 500
        },
        "num_samples": 1,
    policy_graphs = {
        a_id: gen_policy()
        for a_id in env_actor_configs["actors"].keys()
    }

    run_experiments({
        "MA-PPO-SSUI3CCARLA": {
            "run": "PPO",
            "env": env_name,
            "stop": {
                "training_iteration": args.num_iters
            },
            "config": {
                "log_level": "DEBUG",
                "num_sgd_iter": 10,
                "multiagent": {
                    "policy_graphs": policy_graphs,
                    "policy_mapping_fn": tune.function(lambda agent_id: agent_id),
                },
                "num_workers": args.num_workers,
                "num_envs_per_worker": args.envs_per_worker,
                "sample_batch_size": args.sample_bs_per_worker,
                "train_batch_size": args.train_bs
            },
            "checkpoint_freq": 500,
            "checkpoint_at_end": True,
        }
    })
def on_episode_end(info):
    # print(info)
    episode = info['episode']
    # trainer = info['trainer']
    base_env = info['env']
    episode.custom_metrics['ego_starting_distance'] = (
        base_env.get_unwrapped()[0].process.ego_starting_distance)


num_worker_cpus = 11
tune.run(
    train,
    name='curriculum_test_1',
    trial_name_creator=tune.function(
        lambda trial: 'adaptive_2_.05delta_50target_10deadzone'),
    config={
        'num_gpus': 1,
        'num_workers': num_worker_cpus,
        'num_cpus_per_worker': 1,
        'num_gpus_per_worker': 1.0 / num_worker_cpus,
        'sample_batch_size': 200,
        'train_batch_size': int(2 * 60.0 / .05),
        'batch_mode': 'truncate_episodes',  # or 'complete_episodes'
        'timesteps_per_iteration': int(2 * 60 / .05),
        'sgd_minibatch_size': 128,
        # 'shuffle_sequences': True,
        'num_sgd_iter': 30,
        'gamma': 0.99999,
        'lr': 0.0001,
def train_model_on_task(self, task, task_viz, exp_dir, use_ray, use_ray_logging, grace_period, num_hp_samplings, local_mode, redis_address, lca_n, **training_params): logger.info("Training dashboard: {}".format(get_env_url(task_viz))) t_id = task['id'] trainable = self.get_trainable(use_ray_logging=use_ray_logging) past_tasks = training_params.pop('past_tasks') normalize = training_params.pop('normalize') augment_data = training_params.pop('augment_data') transformations = [] if augment_data: transformations.extend([ transforms.ToPILImage(), transforms.RandomHorizontalFlip(), transforms.RandomCrop(32, 4), transforms.ToTensor() ]) t_trans = [[] for _ in range(len(task['split_names']))] t_trans[0] = transformations datasets = trainable._load_datasets(task, task['loss_fn'], past_tasks, t_trans, normalize) train_loader, eval_loaders = get_classic_dataloaders( datasets, training_params.pop('batch_sizes')) model = self.get_model(task_id=t_id, x_dim=task['x_dim'], n_classes=task['n_classes'], descriptor=task['descriptor'], dataset=eval_loaders[:2]) if use_ray: if not ray.is_initialized(): ray.init(address=redis_address) scheduler = None training_params['loss_fn'] = tune.function( training_params['loss_fn']) training_params['optim_func'] = tune.function(self.optim_func) init_model_path = os.path.join(exp_dir, 'model_initializations') model_file_name = '{}_init.pth'.format(training_params['name']) model_path = os.path.join(init_model_path, model_file_name) torch.save(model, model_path) training_params['model_path'] = model_path config = { **self.get_search_space(), 'training-params': training_params } if use_ray_logging: stop_condition = { 'training_iteration': training_params['n_it_max'] } checkpoint_at_end = False keep_checkpoints_num = 1 checkpoint_score_attr = 'min-Val nll' else: stop_condition = None # loggers = [JsonLogger, MyCSVLogger] checkpoint_at_end = False keep_checkpoints_num = None checkpoint_score_attr = None trainable = rename_class(trainable, training_params['name']) experiment = Experiment( name=training_params['name'], run=trainable, stop=stop_condition, config=config, resources_per_trial=self.ray_resources, num_samples=num_hp_samplings, local_dir=exp_dir, loggers=(JsonLogger, CSVLogger), checkpoint_at_end=checkpoint_at_end, keep_checkpoints_num=keep_checkpoints_num, checkpoint_score_attr=checkpoint_score_attr) analysis = tune.run( experiment, scheduler=scheduler, verbose=1, raise_on_failed_trial=True, # max_failures=-1, # with_server=True, # server_port=4321 ) os.remove(model_path) logger.info("Training dashboard: {}".format(get_env_url(task_viz))) all_trials = {t.logdir: t for t in analysis.trials} best_logdir = analysis.get_best_logdir('Val nll', 'min') best_trial = all_trials[best_logdir] # picked_metric = 'accuracy_0' # metric_names = {s: '{} {}'.format(s, picked_metric) for s in # ['Train', 'Val', 'Test']} logger.info('Best trial: {}'.format(best_trial)) best_res = best_trial.checkpoint.result best_point = (best_res['training_iteration'], best_res['Val nll']) # y_keys = ['mean_loss' if use_ray_logging else 'Val nll', 'train_loss'] y_keys = ['Val nll', 'Train nll'] epoch_key = 'training_epoch' it_key = 'training_iteration' plot_res_dataframe(analysis, training_params['name'], best_point, task_viz, epoch_key, it_key, y_keys) if 'entropy' in next(iter(analysis.trial_dataframes.values())): plot_res_dataframe(analysis, training_params['name'], None, task_viz, epoch_key, it_key, ['entropy']) best_model = self.get_model(task_id=t_id) 
best_model.load_state_dict(torch.load(best_trial.checkpoint.value)) train_accs = analysis.trial_dataframes[best_logdir][ 'Train accuracy_0'] best_t = best_res['training_iteration'] t = best_trial.last_result['training_iteration'] else: search_space = self.get_search_space() rand_config = list(generate_variants(search_space))[0][1] learner_params = rand_config.pop('learner-params', {}) optim_params = rand_config.pop('optim') split_optims = training_params.pop('split_optims') if hasattr(model, 'set_h_params'): model.set_h_params(**learner_params) if hasattr(model, 'train_loader_wrapper'): train_loader = model.train_loader_wrapper(train_loader) loss_fn = task['loss_fn'] if hasattr(model, 'loss_wrapper'): loss_fn = model.loss_wrapper(task['loss_fn']) prepare_batch = _prepare_batch if hasattr(model, 'prepare_batch_wrapper'): prepare_batch = model.prepare_batch_wrapper( prepare_batch, t_id) optim_fact = partial(set_optim_params, optim_func=self.optim_func, optim_params=optim_params, split_optims=split_optims) if hasattr(model, 'train_func'): f = model.train_func t, metrics, b_state_dict = f(train_loader=train_loader, eval_loaders=eval_loaders, optim_fact=optim_fact, loss_fn=loss_fn, split_names=task['split_names'], viz=task_viz, prepare_batch=prepare_batch, **training_params) else: optim = optim_fact(model=model) t, metrics, b_state_dict = train( model=model, train_loader=train_loader, eval_loaders=eval_loaders, optimizer=optim, loss_fn=loss_fn, split_names=task['split_names'], viz=task_viz, prepare_batch=prepare_batch, **training_params) train_accs = metrics['Train accuracy_0'] best_t = b_state_dict['iter'] if 'training_archs' in metrics: plot_trajectory(model.ssn.graph, metrics['training_archs'], model.ssn.stochastic_node_ids, task_viz) weights = model.arch_sampler().squeeze() archs = model.ssn.get_top_archs(weights, 5) list_top_archs(archs, task_viz) list_arch_scores(self.arch_scores[t_id], task_viz) update_summary(self.arch_scores[t_id], task_viz, 'scores') if len(train_accs) > lca_n: lca_accs = [] for i in range(lca_n + 1): if i in train_accs: lca_accs.append(train_accs[i]) else: logger.warning( 'Missing step for {}/{} for lca computation'.format( i, lca_n)) lca = np.mean(lca_accs) else: lca = np.float('nan') stats = {} start = time.time() # train_idx = task['split_names'].index('Train') # train_path = task['data_path'][train_idx] # train_dataset = _load_datasets([train_path])[0] train_dataset = _load_datasets(task, 'Train')[0] stats.update( self.finish_task(train_dataset, t_id, task_viz, path='drawings')) stats['duration'] = { 'iterations': t, 'finish': time.time() - start, 'best_iterations': best_t } stats['params'] = { 'total': self.n_params(t_id), 'new': self.new_params(t_id) } stats['lca'] = lca return stats
def setup_exps_rllib(flow_params, n_cpus, n_rollouts, policy_graphs=None, policy_mapping_fn=None, policies_to_train=None, flags=None): from ray import tune from ray.tune.registry import register_env try: from ray.rllib.agents.agent import get_agent_class except ImportError: from ray.rllib.agents.registry import get_agent_class import torch horizon = flow_params['env'].horizon if flags.algorithm.lower() == "ppo": alg_run = "PPO" agent_cls = get_agent_class(alg_run) config = deepcopy(agent_cls._default_config) config["num_workers"] = n_cpus #config['num_gpus'] = 0.25 config["horizon"] = horizon #config["train_batch_size"] = horizon * n_rollouts if flags.exp_config== 'singleagent_ring': config["gamma"] = 0.99 # discount rate config["use_gae"] = True # truncated config["lambda"] = 0.99 # truncated value config["kl_target"] = 0.02 # d_target config["num_sgd_iter"] = 15 config["sgd_minibatch_size"] = 1024 config['lr'] = 5e-7 config["clip_param"] = 0.2 config["rollout_fragment_length"] = 3000 config['train_batch_size'] = 3000 elif flags.exp_config=='singleagent_figure_eight': config["gamma"] = 0.99 # discount rate config["use_gae"] = True # truncated config["lambda"] = 1.0 # truncated value 0.97 config["kl_target"] = 0.01 # d_target 0.02 config["num_sgd_iter"] = 30 config['sgd_minibatch_size'] = 64 config["clip_param"] = 0.2 config['lr'] = 1e-05 config['train_batch_size'] = 512 #deep network config['model']['fcnet_hiddens'] = [64, 64] #Exploration config['exploration_config']["type"] = "GaussianNoise" config['exploration_config']["initial_scale"] = 1.0 config['exploration_config']["final_scale"] = 0.05 config['exploration_config']["scale_timesteps"] = 1000000 config['exploration_config']["random_timesteps"] = 1000 config['exploration_config']["stddev"] = 0.1 elif flags.exp_config=='singleagent_merge': config["gamma"] = 0.99 # discount rate config["use_gae"] = True # truncated config["lambda"] = 0.97 # truncated value 0.97 config["kl_target"] = 0.02 # d_target 0.02 config["num_sgd_iter"] = 30 config["sgd_minibatch_size"] = 64 config['lr'] = 1e-7 config["clip_param"] = 0.2 config["train_batch_size"] = 256 # deep network config['model']['fcnet_hiddens'] = [64, 64] # Exploration config['exploration_config']["type"] = "GaussianNoise" config['exploration_config']["initial_scale"] = 1.0 config['exploration_config']["final_scale"] = 0.05 config['exploration_config']["scale_timesteps"] = 3000000 config['exploration_config']["random_timesteps"] = 1000 config['exploration_config']["stddev"] = 0.1 elif flags.algorithm.lower() == "ddpg": from ray.rllib.agents.ddpg.ddpg import DEFAULT_CONFIG alg_run = "DDPG" agent_cls = get_agent_class(alg_run) config = deepcopy(agent_cls._default_config) config["num_workers"] = n_cpus #config["train_batch_size"] = horizon * n_rollouts # model if flags.exp_config == 'singleagent_ring': config['n_step'] = 1 config['actor_hiddens'] = [64, 64] config['actor_lr'] = 0.0001 # in article 'ddpg' config['critic_lr'] = 0.0001 config['critic_hiddens'] = [64, 64] config['gamma'] = 0.99 del config['model']['fcnet_hiddens'] del config['model']['fcnet_activation'] config['lr'] = 0.0001 # exploration config['exploration_config']['final_scale'] = 0.05 config['exploration_config']['scale_timesteps'] = 900000 config['exploration_config']['ou_base_scale'] = 0.1 config['exploration_config']['ou_theta'] = 0.15 config['exploration_config']['ou_sigma'] = 0.2 # optimization config['tau'] = 0.001 config['l2_reg'] = 1e-6 config['train_batch_size'] = 64 config['learning_starts'] = 3000 # evaluation # 
config['evaluation_interval'] = 5 config['buffer_size'] = 300000 # 3e5 config['timesteps_per_iteration'] = 3000 config['prioritized_replay'] = False #config["prioritized_replay_beta_annealing_timesteps"]=2200000 #config['final_prioritized_replay_beta']=0.01 elif flags.exp_config=='singleagent_figure_eight': config['n_step'] = 1 config['actor_hiddens'] = [64, 64] config['actor_lr'] = 0.00001 # in article 'ddpg' config['critic_lr'] = 0.0001 config['critic_hiddens'] = [64, 64] config['gamma'] = 0.99 # config['model']['fcnet_hiddens'] = [256, 256] config['lr'] = 1e-5 #exploration config['exploration_config']['final_scale'] = 0.02 config['exploration_config']['scale_timesteps'] = 1500000 config['exploration_config']["initial_scale"] = 1.0 config['exploration_config']["random_timesteps"] = 1000 config['exploration_config']["stddev"] = 0.1 del config['exploration_config']['ou_base_scale'] del config['exploration_config']['ou_theta'] del config['exploration_config']['ou_sigma'] config['exploration_config']['type'] = 'GaussianNoise' # optimization config['tau'] = 0.001 config['l2_reg'] = 1e-6 config['train_batch_size'] = 256 config['learning_starts'] = 3000 config['target_network_update_freq'] = 100000 # evaluation config['timesteps_per_iteration'] = 3000 #config['evaluation_interval'] = 5 config['buffer_size'] = 300000 config["prioritized_replay_beta_annealing_timesteps"] = 100000 config['prioritized_replay'] = True else:# merge config['n_step'] = 1 config['actor_hiddens'] = [32, 32] config['actor_lr'] = 0.00001 # in article 'ddpg' config['critic_lr'] = 0.0001 config['critic_hiddens'] = [32, 32] config['gamma'] = 0.99 config['lr'] = 1e-5 # exploration config['exploration_config']['final_scale'] = 0.02 config['exploration_config']['scale_timesteps'] = 2100000 config['exploration_config']['ou_base_scale'] = 0.1 config['exploration_config']['ou_theta'] = 0.15 config['exploration_config']['ou_sigma'] = 0.2 # optimization config['tau'] = 0.001 config['l2_reg'] = 1e-6 config['train_batch_size'] = 128 config['learning_starts'] = 3000 config['target_network_update_freq'] = 3000 # evaluation #config['evaluation_interval'] = 5 config['buffer_size'] = 300000 #3e5 config['timesteps_per_iteration'] = 3000 config['prioritized_replay'] = False elif flags.algorithm.lower() == "td3": from ray.rllib.agents.ddpg.td3 import TD3Trainer alg_run = "TD3" agent_cls = get_agent_class(alg_run) config = deepcopy(agent_cls._default_config) config["num_workers"] = n_cpus #config["train_batch_size"] = horizon * n_rollouts # model if flags.exp_config== 'singleagent_ring': config['n_step'] = 1 config['actor_hiddens'] = [64, 64] config['actor_lr'] = 0.00001 config['critic_lr'] = 0.0001 config['critic_hiddens'] = [64, 64] config['gamma'] = 0.99 config['lr'] = 0.00001 # TD3 config['twin_q'] = True config['policy_delay'] = 2 config['smooth_target_policy'] = True config['target_noise'] = 0.1 # default 0.2 config['target_noise_clip'] = 0.5 # Policy Optimizer # config['optimizer'] = 'Adam' # exploration config['exploration_config']['final_scale'] = 0.05 # default 1 config['exploration_config']['scale_timesteps'] = 1500000 # 900000 # default 1 config['exploration_config']["initial_scale"] = 1.0 config['exploration_config']["random_timesteps"] = 1000 # default 10000 config['exploration_config']["stddev"] = 0.1 config['exploration_config']['type'] = 'GaussianNoise' # optimization config['tau'] = 0.001 # best; fix config['l2_reg'] = 0 config['train_batch_size'] = 128 # default 100; best 128 config['learning_starts'] = 10000 
config['use_huber'] = False # evaluation # config['evaluation_interval'] = 5 config['buffer_size'] = 300000 # default 1000000 config['timesteps_per_iteration'] = 3000 config['prioritized_replay'] = False config['worker_side_prioritization'] = False config['use_state_preprocessor'] = False elif flags.exp_config == 'singleagent_figure_eight': # TD3 config['twin_q'] = True config['policy_delay'] = 2 config['smooth_target_policy'] = True config['target_noise'] = 0.2 # default 0.2 config['target_noise_clip'] = 0.5 # model config['n_step'] = 1 config['actor_hiddens'] = [256, 256] config['actor_lr'] = 0.000001 config['critic_lr'] = 0.00001 config['critic_hiddens'] = [256, 256] config['gamma'] = 0.99 config['lr'] = 0.000001 # config['model']['fcnet_hiddens'] = [64, 64] # exploration config['exploration_config']['type'] = 'GaussianNoise' config['exploration_config']['final_scale'] = 0.05 # default 1 config['exploration_config']['scale_timesteps'] = 9000000 # 900000 # default 1 config['exploration_config']["initial_scale"] = 1 config['exploration_config']["random_timesteps"] = 1000 # default 10000 config['exploration_config']["stddev"] = 0.1 # optimization config['tau'] = 0.001 # best; fix config['l2_reg'] = 1e-6 config['train_batch_size'] = 128 # default 100; best 128 config['learning_starts'] = 10000 config['use_huber'] = False config['target_network_update_freq'] = 50000 # evaluation # config['evaluation_interval'] = 5 config['buffer_size'] = 300000 # default 1000000 config['timesteps_per_iteration'] = 3000 config['prioritized_replay'] = False config['worker_side_prioritization'] = False config['use_state_preprocessor'] = False elif flags.exp_config == 'singleagent_merge': # TD3 config['twin_q'] = True config['policy_delay'] = 2 config['smooth_target_policy'] = True config['target_noise'] = 0.2 # default 0.2 config['target_noise_clip'] = 0.5 # model config['n_step'] = 1 config['actor_hiddens'] = [64, 64] config['actor_lr'] = 0.000001 config['critic_lr'] = 0.00001 config['critic_hiddens'] = [64, 64] config['gamma'] = 0.99 config['lr'] = 0.000001 # config['model']['fcnet_hiddens'] = [64, 64] # exploration config['exploration_config']['type'] = 'GaussianNoise' config['exploration_config']['final_scale'] = 0.05 # default 1 config['exploration_config']['scale_timesteps'] = 10000000 # 900000 # default 1 config['exploration_config']["initial_scale"] = 1 config['exploration_config']["random_timesteps"] = 1000 # default 10000 config['exploration_config']["stddev"] = 0.1 # optimization config['tau'] = 0.001 # best; fix config['l2_reg'] = 1e-6 config['train_batch_size'] = 128 # default 100; best 128 config['learning_starts'] = 10000 config['use_huber'] = False config['target_network_update_freq'] = 1 # evaluation # config['evaluation_interval'] = 5 config['buffer_size'] = 300000 # default 1000000 config['timesteps_per_iteration'] = 5000 config['prioritized_replay'] = False config['worker_side_prioritization'] = False config['use_state_preprocessor'] = False #common config config['framework']='torch' config['callbacks'] = { "on_episode_end": None, "on_episode_start": None, "on_episode_step": None, "on_postprocess_traj": None, "on_sample_end": None, "on_train_result": None } # config["opt_type"]= "adam" for impala and APPO, default is SGD # TrainOneStep class call SGD -->execution_plan function can have policy update function print("cuda is available: ", torch.cuda.is_available()) print('Beginning training.') print("==========================================") print("running algorithm: ", alg_run) # 
"Framework: ", "torch" # save the flow params for replay flow_json = json.dumps( flow_params, cls=FlowParamsEncoder, sort_keys=True, indent=4) config['env_config']['flow_params'] = flow_json config['env_config']['run'] = alg_run # multiagent configuration if policy_graphs is not None: print("policy_graphs", policy_graphs) config['multiagent'].update({'policies': policy_graphs}) if policy_mapping_fn is not None: config['multiagent'].update( {'policy_mapping_fn': tune.function(policy_mapping_fn)}) if policies_to_train is not None: config['multiagent'].update({'policies_to_train': policies_to_train}) create_env, gym_name = make_create_env(params=flow_params) # Register as rllib env register_env(gym_name, create_env) return alg_run, gym_name, config
def setup_exps(flow_params): """Create the relevant components of a multiagent RLlib experiment. Parameters ---------- flow_params : dict input flow-parameters Returns ------- str name of the training algorithm str name of the gym environment to be trained dict training configuration parameters """ alg_run = 'PPO' agent_cls = get_agent_class(alg_run) config = agent_cls._default_config.copy() config['num_workers'] = N_CPUS config['train_batch_size'] = HORIZON * N_ROLLOUTS config['sgd_minibatch_size'] = 4096 #config['simple_optimizer'] = True config['gamma'] = 0.998 # discount rate config['model'].update({'fcnet_hiddens': [100, 50, 25]}) #config['lr'] = tune.grid_search([5e-4, 1e-4]) config['lr_schedule'] = [ [0, 1e-4], [2000000, 5e-5], ] config['horizon'] = HORIZON config['clip_actions'] = False config['observation_filter'] = 'NoFilter' config["use_gae"] = True config["lambda"] = 0.95 config["shuffle_sequences"] = True config["vf_clip_param"] = 1e8 config["num_sgd_iter"] = 10 #config["kl_target"] = 0.003 config["kl_coeff"] = 0.01 config["entropy_coeff"] = 0.001 config["clip_param"] = 0.2 config["grad_clip"] = None config["use_critic"] = True config["vf_share_layers"] = True config["vf_loss_coeff"] = 0.5 # save the flow params for replay flow_json = json.dumps(flow_params, cls=FlowParamsEncoder, sort_keys=True, indent=4) config['env_config']['flow_params'] = flow_json config['env_config']['run'] = alg_run create_env, env_name = make_create_env(params=flow_params, version=0) # register as rllib env register_env(env_name, create_env) # multiagent configuration temp_env = create_env() policy_graphs = { 'av': (PPOTFPolicy, temp_env.observation_space, temp_env.action_space, {}) } def policy_mapping_fn(_): return 'av' config.update({ 'multiagent': { 'policies': policy_graphs, 'policy_mapping_fn': tune.function(policy_mapping_fn), 'policies_to_train': ['av'] } }) return alg_run, env_name, config
def setup(algorithm, train_batch_size, num_cpus, num_gpus, num_agents, use_gpus_for_workers=False, use_gpu_for_driver=False, num_workers_per_device=1): def env_creator(_): return MatrixEnv(matrix_game) single_env = MatrixEnv(matrix_game) env_name = "{}_adv".format(matrix_game) register_env(env_name, env_creator) obs_space = single_env.observation_space act_space = single_env.action_space # Each policy can have a different configuration (including custom model) def gen_policy(): return (PPOPolicyGraph, obs_space, act_space, {}) # Setup PPO with an ensemble of `num_policies` different policy graphs policy_graphs = {} for i in range(num_agents): policy_graphs['agent-' + str(i)] = gen_policy() def policy_mapping_fn(agent_id): return agent_id # register the custom model model_name = "matrix_fc_net" ModelCatalog.register_custom_model(model_name, FCNet) agent_cls = get_agent_class(algorithm) config = agent_cls._default_config.copy() # information for replay config['env_config']['func_create'] = tune.function(env_creator) config['env_config']['env_name'] = env_name config['env_config']['run'] = algorithm # Calculate device configurations gpus_for_driver = int(use_gpu_for_driver) cpus_for_driver = 1 - gpus_for_driver if use_gpus_for_workers: spare_gpus = (num_gpus - gpus_for_driver) num_workers = int(spare_gpus * num_workers_per_device) num_gpus_per_worker = spare_gpus / num_workers num_cpus_per_worker = 0 else: spare_cpus = (num_cpus - cpus_for_driver) num_workers = int(spare_cpus * num_workers_per_device) num_gpus_per_worker = 0 num_cpus_per_worker = spare_cpus / num_workers # hyperparams config.update({ "train_batch_size": 30000, "horizon": 100, "lr": 0.001, "num_workers": num_workers, "num_gpus": gpus_for_driver, # The number of GPUs for the driver "num_cpus_for_driver": cpus_for_driver, "num_gpus_per_worker": num_gpus_per_worker, # Can be a fraction "num_cpus_per_worker": num_cpus_per_worker, # Can be a fraction #"entropy_coeff": hparams['entropy_coeff'], "multiagent": { "policy_graphs": policy_graphs, "policy_mapping_fn": tune.function(policy_mapping_fn), }, "model": { "custom_model": "matrix_fc_net", "use_lstm": True, "lstm_cell_size": 128 }, "callbacks": { "on_episode_start": tune.function(on_episode_start), "on_episode_step": tune.function(on_episode_step), "on_episode_end": tune.function(on_episode_end) } }) return algorithm, env_name, config
def setup_exps_rllib(flow_params, n_cpus, n_rollouts, policy_graphs=None, policy_mapping_fn=None, policies_to_train=None): """Return the relevant components of an RLlib experiment. Parameters ---------- flow_params : dict flow-specific parameters (see flow/utils/registry.py) n_cpus : int number of CPUs to run the experiment over n_rollouts : int number of rollouts per training iteration policy_graphs : dict, optional TODO policy_mapping_fn : function, optional TODO policies_to_train : list of str, optional TODO Returns ------- str name of the training algorithm str name of the gym environment to be trained dict training configuration parameters """ from ray import tune from ray.tune.registry import register_env try: from ray.rllib.agents.agent import get_agent_class except ImportError: from ray.rllib.agents.registry import get_agent_class horizon = flow_params['env'].horizon alg_run = "PPO" agent_cls = get_agent_class(alg_run) config = deepcopy(agent_cls._default_config) config["num_workers"] = n_cpus config["train_batch_size"] = horizon * n_rollouts config["gamma"] = 0.999 # discount rate config["model"].update({"fcnet_hiddens": [32, 32, 32]}) config["use_gae"] = True config["lambda"] = 0.97 config["kl_target"] = 0.02 config["num_sgd_iter"] = 10 config["horizon"] = horizon # save the flow params for replay flow_json = json.dumps(flow_params, cls=FlowParamsEncoder, sort_keys=True, indent=4) config['env_config']['flow_params'] = flow_json config['env_config']['run'] = alg_run # multiagent configuration if policy_graphs is not None: print("policy_graphs", policy_graphs) config['multiagent'].update({'policies': policy_graphs}) if policy_mapping_fn is not None: config['multiagent'].update( {'policy_mapping_fn': tune.function(policy_mapping_fn)}) if policies_to_train is not None: config['multiagent'].update({'policies_to_train': policies_to_train}) create_env, gym_name = make_create_env(params=flow_params) # Register as rllib env register_env(gym_name, create_env) return alg_run, gym_name, config
"lr": 0.0005, "adam_epsilon": 0.0015, "schedule_max_timesteps": 10**7, "exploration_final_eps": 0.02, "exploration_fraction": 0.1, "buffer_size": 10**5, "target_network_update_freq": 50000, "sample_batch_size": 16, "train_batch_size": 64, "observation_filter": "MeanStdFilter", "num_workers": 2, "num_envs_per_worker": 16, "num_cpus_per_worker": 2, "num_cpus_for_driver": 1, "num_gpus": 0, "multiagent": { "policy_graphs": policy_graphs, "policy_mapping_fn": tune.function(policy_mapping_fn) }, "model": { "fcnet_activation": "tanh", "fcnet_hiddens": [32, 32], }, #"callbacks": { #"on_episode_start": tune.function(on_episode_start), #"on_episode_step": tune.function(on_episode_step), #"on_episode_end": tune.function(on_episode_end), #}, }, )
"asynchyperband_EC_dqn_v2_1": { "run": 'DQN', "env": 'ECglass-v2', "stop": { "timesteps_total": 876000, # "training_iteration": 1 if args.smoke_test else 99999 }, "num_samples": 5, "resources_per_trial": { "cpu": 0.8, "gpu": 0.2 }, "config": { "hiddens": tune.grid_search([[1024, 512], [2048, 1024]]), "callbacks": { "on_episode_start": tune.function(on_episode_start), "on_episode_step": tune.function(on_episode_step), "on_episode_end": tune.function(on_episode_end), }, "learning_starts": 64, "buffer_size": 1000000, "exploration_fraction": 1, "train_batch_size": tune.grid_search([250, 200]), "gamma": 0, "exploration_final_eps": tune.grid_search([0.03]), #"exploration_final_eps": 0.05, #"num_workers": 2, "lr": tune.grid_search([0.000001, 0.00005]), "target_network_update_freq": tune.grid_search( [16000, 18000]), "timesteps_per_iteration": 8760,
def on_episode_end(info):
    # print(info)
    episode = info['episode']
    # trainer = info['trainer']
    base_env = info['env']
    episode.custom_metrics['ego_starting_distance'] = (
        base_env.get_unwrapped()[0].process.ego_starting_distance)


num_worker_cpus = 11
tune.run(
    train,
    name='curriculum_test_1',
    trial_name_creator=tune.function(
        lambda trial: 'adaptive_20m_0-1delta_75target'),
    config={
        'num_gpus': 1,
        'num_workers': num_worker_cpus,
        'num_cpus_per_worker': 1,
        'num_gpus_per_worker': 1.0 / num_worker_cpus,
        'sample_batch_size': 200,
        'train_batch_size': int(2 * 60.0 / .05),
        'batch_mode': 'truncate_episodes',  # or 'complete_episodes'
        'timesteps_per_iteration': int(2 * 60 / .05),
        'sgd_minibatch_size': 128,
        # 'shuffle_sequences': True,
        'num_sgd_iter': 30,
        'gamma': 0.99999,
        'lr': 0.0001,
single_env = SimpleMultiAgentEnv(env_config={"scenario_name": args.scenario_name})

# Policy Mapping
policies = {
    agent: (None, single_env.observation_space[agent],
            single_env.action_space[agent], {
                "observation_spaces": single_env.observation_space,
                "action_spaces": single_env.action_space,
                "agent_id": agent
            })
    for agent in single_env.agent_ids
}

# Start training
ray.init()
tune.run(
    MADDPGTrainer,
    stop={
        "timesteps_total": 1000000,
    },
    config={
        "env": "simple_multiagent",
        "env_config": {
            "scenario_name": args.scenario_name,
            "time_limit": 100
        },
        "multiagent": {
            "policies": policies,
            "policy_mapping_fn": tune.function(lambda agent_id: agent_id),
        },
        # "observation_filter": "NoFilter",
    })
def setup_exps(flow_params, evaluate=False): """Create the relevant components of a multiagent RLlib experiment. Parameters ---------- flow_params : dict input flow-parameters Returns ------- str name of the training algorithm str name of the gym environment to be trained dict training configuration parameters """ alg_run = 'PPO' agent_cls = get_agent_class(alg_run) config = agent_cls._default_config.copy() config['num_workers'] = N_CPUS config['train_batch_size'] = HORIZON * N_ROLLOUTS config['gamma'] = 0.999 # discount rate config['model'].update({'fcnet_hiddens': [256, 256]}) config['lr'] = 2e-5 config['clip_actions'] = False config['observation_filter'] = 'NoFilter' config['simple_optimizer'] = True config['horizon'] = HORIZON # save the flow params for replay flow_json = json.dumps(flow_params, cls=FlowParamsEncoder, sort_keys=True, indent=4) config['env_config']['flow_params'] = flow_json config['env_config']['run'] = alg_run flow_params['env'].evaluate = evaluate create_env, env_name = make_create_env(params=flow_params, version=0) # register as rllib env register_env(env_name, create_env) # multiagent configuration temp_env = create_env() policy_graphs = { 'av': (PPOPolicyGraph, temp_env.observation_space, temp_env.action_space, {}) } def policy_mapping_fn(_): return 'av' config.update({ 'multiagent': { 'policy_graphs': policy_graphs, 'policy_mapping_fn': tune.function(policy_mapping_fn), 'policies_to_train': ['av'] } }) return alg_run, env_name, config
"model": { "fcnet_hiddens": [256, 256, 256, 256], "custom_preprocessor": "ohe", "custom_options": {}, # extra options to pass to your preprocessor "fcnet_activation": 'tanh', "use_lstm": True, "max_seq_len": delay + sequence_length, "lstm_cell_size": lstm_cell_size, "lstm_use_prev_action_reward": lstm_use_prev_action_reward, }, "callbacks": { # "on_episode_start": tune.function(on_episode_start), # "on_episode_step": tune.function(on_episode_step), # "on_episode_end": tune.function(on_episode_end), # "on_sample_end": tune.function(on_sample_end), "on_train_result": tune.function(on_train_result), # "on_postprocess_traj": tune.function(on_postprocess_traj), }, # "evaluation_config": { #'seed': 0, #seed # "exploration_fraction": 0, # "exploration_final_eps": 0 # }, # "output": return_hack_writer, # "output_compress_columns": [], }, #return_trials=True # add trials = tune.run( above ) # ag.train() end = time.time()
def _init(self, config, env_creator): self._validate_config() # Update effective batch size to include n-step adjusted_batch_size = max(config["sample_batch_size"], config.get("n_step", 1)) config["sample_batch_size"] = adjusted_batch_size self.exploration0 = self._make_exploration_schedule(-1) self.explorations = [ self._make_exploration_schedule(i) for i in range(config["num_workers"]) ] for k in self._optimizer_shared_configs: if self._name != "DQN" and k in [ "schedule_max_timesteps", "beta_annealing_fraction", "final_prioritized_replay_beta" ]: # only Rainbow needs annealing prioritized_replay_beta continue if k not in config["optimizer"]: config["optimizer"][k] = config[k] if config.get("parameter_noise", False): if config["callbacks"]["on_episode_start"]: start_callback = config["callbacks"]["on_episode_start"] else: start_callback = None def on_episode_start(info): # as a callback function to sample and pose parameter space # noise on the parameters of network policies = info["policy"] for pi in policies.values(): pi.add_parameter_noise() if start_callback: start_callback(info) config["callbacks"]["on_episode_start"] = tune.function( on_episode_start) if config["callbacks"]["on_episode_end"]: end_callback = config["callbacks"]["on_episode_end"] else: end_callback = None def on_episode_end(info): # as a callback function to monitor the distance # between noisy policy and original policy policies = info["policy"] episode = info["episode"] episode.custom_metrics["policy_distance"] = policies[ DEFAULT_POLICY_ID].pi_distance if end_callback: end_callback(info) config["callbacks"]["on_episode_end"] = tune.function( on_episode_end) self.local_evaluator = self.make_local_evaluator( env_creator, self._policy_graph) if config["evaluation_interval"]: self.evaluation_ev = self.make_local_evaluator( env_creator, self._policy_graph, extra_config={ "batch_mode": "complete_episodes", "batch_steps": 1, }) self.evaluation_metrics = self._evaluate() def create_remote_evaluators(): return self.make_remote_evaluators(env_creator, self._policy_graph, config["num_workers"]) if config["optimizer_class"] != "AsyncReplayOptimizer": self.remote_evaluators = create_remote_evaluators() else: # Hack to workaround https://github.com/ray-project/ray/issues/2541 self.remote_evaluators = None self.optimizer = getattr(optimizers, config["optimizer_class"])( self.local_evaluator, self.remote_evaluators, **config["optimizer"]) # Create the remote evaluators *after* the replay actors if self.remote_evaluators is None: self.remote_evaluators = create_remote_evaluators() self.optimizer._set_evaluators(self.remote_evaluators) self.last_target_update_ts = 0 self.num_target_updates = 0
def _init(self): self._validate_config() # Update effective batch size to include n-step adjusted_batch_size = max(self.config["sample_batch_size"], self.config.get("n_step", 1)) self.config["sample_batch_size"] = adjusted_batch_size self.exploration0 = self._make_exploration_schedule(-1) self.explorations = [ self._make_exploration_schedule(i) for i in range(self.config["num_workers"]) ] for k in self._optimizer_shared_configs: if self._agent_name != "DQN" and k in [ "schedule_max_timesteps", "beta_annealing_fraction", "final_prioritized_replay_beta" ]: # only Rainbow needs annealing prioritized_replay_beta continue if k not in self.config["optimizer"]: self.config["optimizer"][k] = self.config[k] if self.config.get("parameter_noise", False): if self.config["callbacks"]["on_episode_start"]: start_callback = self.config["callbacks"]["on_episode_start"] else: start_callback = None def on_episode_start(info): # as a callback function to sample and pose parameter space # noise on the parameters of network policies = info["policy"] for pi in policies.values(): pi.add_parameter_noise() if start_callback: start_callback(info) self.config["callbacks"]["on_episode_start"] = tune.function( on_episode_start) if self.config["callbacks"]["on_episode_end"]: end_callback = self.config["callbacks"]["on_episode_end"] else: end_callback = None def on_episode_end(info): # as a callback function to monitor the distance # between noisy policy and original policy policies = info["policy"] episode = info["episode"] episode.custom_metrics["policy_distance"] = policies[ "default"].pi_distance if end_callback: end_callback(info) self.config["callbacks"]["on_episode_end"] = tune.function( on_episode_end) self.local_evaluator = self.make_local_evaluator( self.env_creator, self._policy_graph) if self.config["evaluation_interval"]: self.evaluation_ev = self.make_local_evaluator( self.env_creator, self._policy_graph, extra_config={ "batch_mode": "complete_episodes", "batch_steps": 1, }) self.evaluation_metrics = self._evaluate() def create_remote_evaluators(): return self.make_remote_evaluators(self.env_creator, self._policy_graph, self.config["num_workers"]) if self.config["optimizer_class"] != "AsyncReplayOptimizer": self.remote_evaluators = create_remote_evaluators() else: # Hack to workaround https://github.com/ray-project/ray/issues/2541 self.remote_evaluators = None self.optimizer = getattr(optimizers, self.config["optimizer_class"])( self.local_evaluator, self.remote_evaluators, self.config["optimizer"]) # Create the remote evaluators *after* the replay actors if self.remote_evaluators is None: self.remote_evaluators = create_remote_evaluators() self.optimizer._set_evaluators(self.remote_evaluators) self.last_target_update_ts = 0 self.num_target_updates = 0
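# The wrapping logic above chains a user-supplied on_episode_start callback
# with the parameter-noise resampling. A minimal sketch of a user config that
# relies on this; my_episode_start is a hypothetical logging callback.
def my_episode_start(info):
    # Still invoked after the agent resamples parameter noise, because
    # _init() wraps and chains it.
    print("episode started on env", info["env"])


config = {
    "parameter_noise": True,
    "callbacks": {
        "on_episode_start": tune.function(my_episode_start),
        "on_episode_end": None,
    },
}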
model_config = {}

eval_config = {
    # Evaluate every `evaluation_interval` training iterations.
    "evaluation_interval": 1,
    "evaluation_config": {
        "explore": False,
        "evaluation_num_episodes": 10,
        "horizon": 100,
        "env_config": {
            # Hack: lets the Ray on_episode_end() callback detect whether it is
            # running in evaluation or training mode, so it can write the
            # evaluation stats separately.
            "dummy_eval": True,
            "transition_noise": 0
            if "state_space_type" in env_config["env_config"]
            and env_config["env_config"]["state_space_type"] == "discrete"
            else tune.function(lambda a: a.normal(0, 0)),
            "reward_noise": tune.function(lambda a: a.normal(0, 0)),
            "action_loss_weight": 0.0,
        },
    },
}

value_tuples = []
for config_type, config_dict in var_configs.items():
    for key in config_dict:
        assert isinstance(var_configs[config_type][key], list), (
            "var_configs should be a dict of dicts with lists as the leaf "
            "values so that each configuration option can take multiple "
            "possible values")
        value_tuples.append(var_configs[config_type][key])
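# For completeness, the step that usually follows this loop (shown explicitly
# in the earlier variant of this snippet): expand the collected value lists
# into one configuration tuple per combination.
import itertools

cartesian_product_configs = list(itertools.product(*value_tuples))
print("Total number of configurations:", len(cartesian_product_configs))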
def setup_exps_PPO(flow_params): """ Experiment setup with PPO using RLlib. Parameters ---------- flow_params : dictionary of flow parameters Returns ------- str name of the training algorithm str name of the gym environment to be trained dict training configuration parameters """ alg_run = 'PPO' agent_cls = get_agent_class(alg_run) config = agent_cls._default_config.copy() config["num_workers"] = min(N_CPUS, N_ROLLOUTS) config['train_batch_size'] = HORIZON * N_ROLLOUTS config['simple_optimizer'] = True config['gamma'] = 0.999 # discount rate config['model'].update({'fcnet_hiddens': [32, 32]}) config['lr'] = tune.grid_search([1e-5, 1e-4, 1e-3]) config['horizon'] = HORIZON config['clip_actions'] = False # FIXME(ev) temporary ray bug config['observation_filter'] = 'NoFilter' # save the flow params for replay flow_json = json.dumps( flow_params, cls=FlowParamsEncoder, sort_keys=True, indent=4) config['env_config']['flow_params'] = flow_json config['env_config']['run'] = alg_run create_env, env_name = make_create_env(params=flow_params, version=0) # Register as rllib env register_env(env_name, create_env) test_env = create_env() obs_space = test_env.observation_space act_space = test_env.action_space def gen_policy(): return (PPOPolicyGraph, obs_space, act_space, {}) # Setup PG with a single policy graph for all agents policy_graphs = {'av': gen_policy()} def policy_mapping_fn(_): return 'av' config.update({ 'multiagent': { 'policy_graphs': policy_graphs, 'policy_mapping_fn': tune.function(policy_mapping_fn), 'policies_to_train': ['av'] } }) return alg_run, env_name, config
}, "gamma": random.choice([0.95, 0.99]), } return (PPOPolicyGraph, obs_space, act_space, config) # Setup PPO with an ensemble of `num_policies` different policy graphs policy_graphs = { "policy_{}".format(i): gen_policy(i) for i in range(args.num_policies) } policy_ids = list(policy_graphs.keys()) run_experiments({ "test": { "run": "PPO", "env": "multi_cartpole", "stop": { "training_iteration": args.num_iters }, "config": { "log_level": "DEBUG", "num_sgd_iter": 10, "multiagent": { "policy_graphs": policy_graphs, "policy_mapping_fn": tune.function( lambda agent_id: random.choice(policy_ids)), }, }, } })
def __init__(self, result_dir, checkpoint_num=150, algo='PPO'):
    checkpoint_num = str(checkpoint_num)

    # config = get_rllib_config(result_dir)
    # pkl = get_rllib_pkl(result_dir)
    # create_env, env_name = make_create_env(params=flow_params, version=0)

    # Register as rllib env
    register_env('test', create_env)

    obs_space = Box(low=0., high=1, shape=(94, ), dtype=np.float32)
    act_space = Box(low=-1, high=1, shape=(2, ), dtype=np.float32)
    adv_action_space = Box(low=0., high=1, shape=(22, ), dtype=np.float32)

    def gen_policy_agent():
        return (None, obs_space, act_space, {})

    def gen_policy_adversary():
        return (None, obs_space, adv_action_space, {})  # <-- old

    # Set up an ensemble of two policy graphs: one for the autonomous
    # vehicle and one for the action adversary
    policy_graphs = {
        'av': gen_policy_agent(),
        'action_adversary': gen_policy_adversary()
    }

    def policy_mapping_fn(agent_id):
        return agent_id

    policy_ids = list(policy_graphs.keys())

    config = ppo.DEFAULT_CONFIG.copy()
    config['model'].update({'fcnet_hiddens': [100, 50, 25]})
    config["observation_filter"] = "NoFilter"
    config['simple_optimizer'] = True
    config.update({
        'multiagent': {
            'policy_graphs': policy_graphs,
            'policy_mapping_fn': tune.function(policy_mapping_fn)
        }
    })

    # check if we have a multiagent scenario, but in a
    # backwards-compatible way
    # if config.get('multiagent', {}).get('policy_graphs', {}):
    #     multiagent = True
    #     config['multiagent'] = pkl['multiagent']
    # else:
    #     multiagent = False

    # Run on only one CPU for rendering purposes
    config['num_workers'] = 0

    # flow_params = get_flow_params(config)

    # # Create and register a gym+rllib env
    # create_env, env_name = make_create_env(
    #     params=flow_params, version=0, render=False)
    # register_env(env_name, create_env)

    # Determine agent and checkpoint
    agent_cls = get_agent_class(algo)

    # create the agent that will be used to compute the actions
    self.agent = agent_cls(env='test', config=config)
    # agent = agent_cls(config=config)
    checkpoint = result_dir + '/checkpoint_' + checkpoint_num
    checkpoint = checkpoint + '/checkpoint-' + checkpoint_num
    self.agent.restore(checkpoint)

    multiagent = True
    if multiagent:
        rets = {}
        # map the agent id to its policy
        self.policy_map_fn = config['multiagent']['policy_mapping_fn']
        for key in config['multiagent']['policy_graphs'].keys():
            rets[key] = []
    else:
        rets = []
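# A hedged rollout sketch (assumption, not part of the original class)
# showing how the restored agent and `policy_map_fn` above are typically
# used to compute per-agent actions; `env`, `num_steps`, and the method
# name `rollout_once` are placeholders, and the method would live on the
# same class as __init__ above.
def rollout_once(self, env, num_steps=100):
    state = env.reset()
    for _ in range(num_steps):
        # query the policy assigned to each agent id for its action
        actions = {
            agent_id: self.agent.compute_action(
                obs, policy_id=self.policy_map_fn(agent_id))
            for agent_id, obs in state.items()
        }
        state, rewards, done, _ = env.step(actions)
        if done.get('__all__', False):
            break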
def setup_exps_rllib(flow_params,
                     n_cpus,
                     n_rollouts,
                     policy_graphs=None,
                     policy_mapping_fn=None,
                     policies_to_train=None,
                     flags=None):
    from ray import tune
    from ray.tune.registry import register_env
    try:
        from ray.rllib.agents.agent import get_agent_class
    except ImportError:
        from ray.rllib.agents.registry import get_agent_class
    import torch

    horizon = flow_params['env'].horizon

    from ray.rllib.agents.ddpg.ddpg import DEFAULT_CONFIG
    alg_run = "DDPG"
    agent_cls = get_agent_class(alg_run)
    config = deepcopy(agent_cls._default_config)

    config["num_workers"] = 1

    # model
    config['n_step'] = 1
    config['actor_hiddens'] = [64, 64]
    config['actor_lr'] = 0.0001  # value from the DDPG paper
    config['critic_lr'] = 0.0001
    config['critic_hiddens'] = [64, 64]
    config['gamma'] = 0.99
    config['model']['fcnet_hiddens'] = [64, 64]
    config['lr'] = 1e-5

    # exploration
    config['exploration_config']['final_scale'] = 0.05
    config['exploration_config']['scale_timesteps'] = 1500000
    config['exploration_config']['ou_base_scale'] = 0.1
    config['exploration_config']['ou_theta'] = 0.15
    config['exploration_config']['ou_sigma'] = 0.2

    # optimization
    config['tau'] = 0.001
    config['l2_reg'] = 1e-6
    config['train_batch_size'] = 64
    config['learning_starts'] = 3000

    # evaluation
    # config['evaluation_interval'] = 5
    config['buffer_size'] = 300000  # 3e5
    config['timesteps_per_iteration'] = 3000
    config['prioritized_replay'] = False

    # common config
    config['framework'] = 'torch'
    config['callbacks'] = {
        "on_episode_end": None,
        "on_episode_start": None,
        "on_episode_step": None,
        "on_postprocess_traj": None,
        "on_sample_end": None,
        "on_train_result": None
    }
    # config["opt_type"] = "adam"  # for IMPALA and APPO; the default is SGD
    # TrainOneStep calls SGD; the execution_plan function can include a
    # policy-update step.

    print("cuda is available: ", torch.cuda.is_available())
    print('Beginning training.')
    print("==========================================")
    print("running algorithm: ", alg_run)  # "Framework: ", "torch"

    # save the flow params for replay
    flow_json = json.dumps(flow_params,
                           cls=FlowParamsEncoder,
                           sort_keys=True,
                           indent=4)
    config['env_config']['flow_params'] = flow_json
    config['env_config']['run'] = alg_run

    # multiagent configuration
    if policy_graphs is not None:
        print("policy_graphs", policy_graphs)
        config['multiagent'].update({'policies': policy_graphs})
    if policy_mapping_fn is not None:
        config['multiagent'].update(
            {'policy_mapping_fn': tune.function(policy_mapping_fn)})
    if policies_to_train is not None:
        config['multiagent'].update({'policies_to_train': policies_to_train})

    create_env, gym_name = make_create_env(params=flow_params)

    # Register as rllib env
    register_env(gym_name, create_env)
    return alg_run, gym_name, config
def setup_exps_rllib(flow_params,
                     n_cpus,
                     n_rollouts,
                     policy_graphs=None,
                     policy_mapping_fn=None,
                     policies_to_train=None,
                     flags=None):
    from ray import tune
    from ray.tune.registry import register_env
    try:
        from ray.rllib.agents.agent import get_agent_class
    except ImportError:
        from ray.rllib.agents.registry import get_agent_class
    import torch

    # bmil edit: scale the reward terms according to the --safety flag
    safety = float(flags.safety)
    if safety < 0 or safety > 2:
        raise ValueError('--safety option out of range')
    rate = safety - 1
    if safety > 1:
        flow_params['initial'].reward_params['simple_lc_penalty'] *= (1 + rate)
        flow_params['initial'].reward_params['rl_action_penalty'] *= (
            1 + 0.2 * rate)
    elif safety < 1:
        flow_params['initial'].reward_params['rl_mean_speed'] *= (
            1 - 0.05 * rate)
        flow_params['initial'].reward_params['unsafe_penalty'] *= (1 + rate)
        flow_params['initial'].reward_params['dc3_penalty'] *= (1 + rate)

    horizon = flow_params['env'].horizon

    alg_run = "PPO"
    agent_cls = get_agent_class(alg_run)
    config = deepcopy(agent_cls._default_config)

    config["num_workers"] = n_cpus
    config["horizon"] = horizon
    config["num_gpus"] = 1

    config["gamma"] = 0.99  # discount rate
    config["use_gae"] = True  # use Generalized Advantage Estimation
    config["lambda"] = 0.99  # GAE lambda (truncation parameter)
    config["kl_target"] = 0.02  # d_target
    config["num_sgd_iter"] = 15
    config["sgd_minibatch_size"] = 512
    # config['lr'] = 5e-7
    config['lr'] = 1e-6
    config["clip_param"] = 0.2
    config['train_batch_size'] = 3000
    config['rollout_fragment_length'] = 3000

    # common config
    config['framework'] = 'torch'
    config['callbacks'] = {
        "on_episode_end": None,
        "on_episode_start": None,
        "on_episode_step": None,
        "on_postprocess_traj": None,
        "on_sample_end": None,
        "on_train_result": None
    }
    # config["opt_type"] = "adam"  # for IMPALA and APPO; the default is SGD
    # TrainOneStep calls SGD; the execution_plan function can include a
    # policy-update step.

    print("cuda is available: ", torch.cuda.is_available())
    print('Beginning training.')
    print("==========================================")
    print("running algorithm: ", alg_run)  # "Framework: ", "torch"

    # save the flow params for replay
    flow_json = json.dumps(flow_params,
                           cls=FlowParamsEncoder,
                           sort_keys=True,
                           indent=4)
    config['env_config']['flow_params'] = flow_json
    config['env_config']['run'] = alg_run

    # multiagent configuration
    if policy_graphs is not None:
        print("policy_graphs", policy_graphs)
        config['multiagent'].update({'policies': policy_graphs})
    if policy_mapping_fn is not None:
        config['multiagent'].update(
            {'policy_mapping_fn': tune.function(policy_mapping_fn)})
    if policies_to_train is not None:
        config['multiagent'].update({'policies_to_train': policies_to_train})

    create_env, gym_name = make_create_env(params=flow_params)

    # Register as rllib env
    register_env(gym_name, create_env)
    return alg_run, gym_name, config
if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("--num-iters", type=int, default=2000) args = parser.parse_args() ray.init() trials = tune.run_experiments({ "test": { "env": "CartPole-v0", "run": "PG", "stop": { "training_iteration": args.num_iters, }, "config": { "callbacks": { "on_episode_start": tune.function(on_episode_start), "on_episode_step": tune.function(on_episode_step), "on_episode_end": tune.function(on_episode_end), "on_sample_end": tune.function(on_sample_end), "on_train_result": tune.function(on_train_result), }, }, } }) # verify custom metrics for integration tests custom_metrics = trials[0].last_result["custom_metrics"] print(custom_metrics) assert "pole_angle_mean" in custom_metrics assert "pole_angle_min" in custom_metrics assert "pole_angle_max" in custom_metrics
def tune_hyperparameters(model_type: str, experiment_group: str,
                         experiment_name: str):
    ray_num_cpus = 4
    num_cpus_per_process = 1
    num_gpus_per_process = 0.5
    ray.init(num_cpus=ray_num_cpus, ignore_reinit_error=True,
             include_webui=False)

    tuning_config_dir = root_dir('configs/%s/hp_tuning' % model_type)
    models_dir = root_dir('training/%s/hp_tuning/%s/%s' %
                          (model_type, experiment_group, experiment_name))
    ray_results_dir = root_dir('ray_results/%s' % experiment_group)

    # read the base config
    with open(os.path.join(tuning_config_dir, 'config.yaml')) as f:
        base_config = yaml.safe_load(f)

    # read the mutations config
    with open(os.path.join(tuning_config_dir, 'mutations.yaml')) as f:
        mutations_grid = yaml.safe_load(f)

    # get mutated configs
    mutations = get_mutations(mutations_grid)

    # use only a fraction of the GPU per process
    session_config = None
    if num_gpus_per_process < 1:
        session_config = tf.ConfigProto()
        session_config.gpu_options.per_process_gpu_memory_fraction = \
            num_gpus_per_process

    def tune_fn(tune_config, reporter):
        mutation = tune_config['mutation']

        # apply the mutation to the base config
        config = mutate_config(base_config, mutation)

        # get the model's directory
        model_dir = os.path.join(models_dir, generate_mutation_name(mutation))

        # save the config file to the model's directory
        write_model_config(model_dir, yaml.safe_dump(config))

        # train the model
        model_builder = create_builder(model_type, config)
        train(model_builder, model_dir, reporter, session_config)

    configuration = tune.Experiment(
        experiment_name,
        run=tune_fn,
        local_dir=ray_results_dir,
        config={
            'mutation': tune.grid_search(mutations),
        },
        trial_name_creator=tune.function(
            lambda trial: generate_mutation_name(trial.config['mutation'])),
        resources_per_trial={
            'cpu': num_cpus_per_process,
            'gpu': num_gpus_per_process,
        },
    )

    tune.run_experiments(configuration)
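# A hedged sketch (assumption, not the project's actual helper) of what
# `get_mutations` could look like: it expands a {param: [values, ...]} grid
# loaded from mutations.yaml into one flat dict per combination, which is
# then passed to tune.grid_search above.
import itertools


def get_mutations(mutations_grid):
    keys = sorted(mutations_grid.keys())
    combos = itertools.product(*(mutations_grid[k] for k in keys))
    return [dict(zip(keys, combo)) for combo in combos]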