def train_one_iteration(
        iter_id,
        exp_name,
        init_yaml_path,
        config,
        stop_criterion,
        num_seeds=1,
        num_gpus=0,
        test_mode=False
):
    """Train one PPO agent per seed, chaining each onto the agent roster.

    For every seed, runs ``tune.run("PPO", ...)`` starting from the agent
    name -> checkpoint mapping stored at the current YAML path, then clones
    the metadata of the most recently added agent, points it at the freshly
    produced checkpoint, and writes the extended mapping to
    ``post_agent_ppo.yaml``.  The next seed (and the next outer iteration)
    reads that file, so training accumulates across seeds.

    Args:
        iter_id (int): index of the outer iteration; embedded in the
            per-seed experiment name.
        exp_name (str): base experiment name; must not already contain
            "seed" or "iter" since those tags are appended here.
        init_yaml_path (str): existing YAML file with the agent
            name -> checkpoint-info mapping.
        config (dict): base trainer config; deep-copied per seed.
        stop_criterion (dict): ``stop`` argument forwarded to ``tune.run``.
        num_seeds (int): number of seeds trained sequentially.
        num_gpus (int): GPU count handed to ``initialize_ray``.
        test_mode (bool): enables verbose output and Ray local mode.
    """
    assert isinstance(iter_id, int)
    assert isinstance(exp_name, str)
    assert isinstance(stop_criterion, dict)
    assert isinstance(init_yaml_path, str)
    assert osp.exists(init_yaml_path)
    # The seed/iter tags are appended below; a base name that already
    # contains them would make the bookkeeping ambiguous.
    assert 'seed' not in exp_name, exp_name
    assert 'iter' not in exp_name, exp_name

    # Fix: call get_local_dir() once (the original evaluated it twice, for
    # the truth test and again for the value).  Use the osp alias
    # consistently instead of mixing it with os.path.
    local_dir = get_local_dir() or "~/ray_results"
    local_dir = osp.expanduser(local_dir)
    save_path = osp.join(local_dir, exp_name)
    current_yaml_path = init_yaml_path

    for i in range(num_seeds):
        input_exp_name = exp_name + "_seed{}_iter{}".format(i, iter_id)
        tmp_config = copy.deepcopy(config)
        tmp_config.update(seed=i)
        # Each seed starts from the roster produced by the previous seed
        # (or the initial roster, for seed 0).
        tmp_config['env_config']['yaml_path'] = current_yaml_path
        initialize_ray(
            num_gpus=num_gpus, test_mode=test_mode, local_mode=test_mode
        )
        tune.run(
            "PPO",
            name=input_exp_name,
            verbose=2 if test_mode else 1,
            local_dir=save_path,
            checkpoint_freq=10,
            checkpoint_at_end=True,
            stop=stop_criterion,
            config=tmp_config
        )

        # Clone the metadata of the last agent in the roster and re-point
        # it at the checkpoint that this run just produced.
        name_ckpt_mapping = read_yaml(current_yaml_path)
        ckpt_path = _search_ckpt(save_path, input_exp_name)
        last_ckpt_dict = copy.deepcopy(list(name_ckpt_mapping.values())[-1])
        assert isinstance(last_ckpt_dict, dict), last_ckpt_dict
        assert 'path' in last_ckpt_dict, last_ckpt_dict
        last_ckpt_dict.update(path=ckpt_path)
        print("Finish the current last_ckpt_dict: ", last_ckpt_dict)
        name_ckpt_mapping[input_exp_name] = last_ckpt_dict

        # Persist the extended roster; subsequent seeds/iterations read it.
        current_yaml_path = osp.join(save_path, "post_agent_ppo.yaml")
        out = save_yaml(name_ckpt_mapping, current_yaml_path)
        assert out == current_yaml_path
def test_marl_individual_ppo(extra_config, local_mode=True, test_mode=True):
    """Smoke-test independent multi-agent PPO (one policy per agent id).

    Builds an 8-agent wrapper around a single-agent env, assigns every
    agent its own identically-specced PPO policy, merges any caller
    overrides from ``extra_config``, and launches ``tune.run``.
    """
    num_gpus = 0
    exp_name = "test_marl_individual_ppo"
    env_name = "BipedalWalker-v2"
    num_iters = 50
    num_agents = 8

    initialize_ray(
        test_mode=test_mode, num_gpus=num_gpus, local_mode=local_mode
    )

    # Probe the underlying env once to grab the observation/action spaces
    # that every per-agent policy shares.
    probe_env = get_env_maker(env_name)()
    shared_policy_spec = (
        None, probe_env.observation_space, probe_env.action_space, {}
    )
    policy_names = ["ppo_agent{}".format(idx) for idx in range(num_agents)]

    def policy_mapping_fn(aid):
        # print("input aid: ", aid)
        return aid

    config = {
        "env": MultiAgentEnvWrapper,
        "env_config": {
            "env_name": env_name,
            "agent_ids": policy_names
        },
        "log_level": "DEBUG",
        "num_gpus": num_gpus,
        "multiagent": {
            # Agent id and policy id coincide, so the mapping is identity.
            "policies": {
                name: shared_policy_spec for name in policy_names
            },
            "policy_mapping_fn": policy_mapping_fn,
        },
    }
    if isinstance(extra_config, dict):
        config.update(extra_config)

    tune.run(
        "PPO",
        local_dir=get_local_dir(),
        name=exp_name,
        checkpoint_at_end=True,
        checkpoint_freq=10,
        stop={"training_iteration": num_iters},
        config=config,
    )
# NOTE(review): this chunk starts inside a trainer-config dict literal whose
# opening brace is outside this view; the entries below are the tail of that
# dict (resource sizing, multi-agent wiring, seeding), followed by the
# per-algorithm overrides and the tune launch.
"log_level": "DEBUG" if args.test_mode else "ERROR",
# "num_gpus": 0.45,
"num_gpus": 1,
"num_cpus_per_worker": 2,
"num_cpus_for_driver": 1,
"num_envs_per_worker": 16,
"sample_batch_size": 256,
"multiagent": {
    # One independent policy per agent id; the agent id doubles as the
    # policy id, so the mapping function is the identity.
    "policies": {i: default_policy for i in policy_names},
    "policy_mapping_fn": lambda aid: aid,
},
"callbacks": {
    "on_train_result": on_train_result
},
"num_sgd_iter": 10,
# Grid-search over seeds when num_seeds != 0; otherwise run once with
# the fixed seed 0.
"seed": tune.grid_search(list(range(args.num_seeds)))
if args.num_seeds != 0 else 0
}
# Layer the algorithm-specific overrides on top of the shared config,
# then launch the run selected by run_name.
config.update(run_specify_config[run_name])
tune.run(
    run_dict[run_name],
    local_dir=get_local_dir(),
    name=exp_name,
    checkpoint_at_end=True,
    checkpoint_freq=10,
    stop=run_specify_stop[run_name],
    config=config,
)