def modify_conf_for_lvl1_training(hp_lvl1, env_config, rllib_config_lvl1,
                                  lvl0_checkpoints):
    lvl0_policy_idx = 1
    lvl1_policy_idx = 0
    lvl0_policy_id = env_config["players_ids"][lvl0_policy_idx]
    lvl1_policy_id = env_config["players_ids"][lvl1_policy_idx]

    # Use a simple DQN as lvl1 agent (instead of amTFT with nested DQN)
    rllib_config_lvl1["multiagent"]["policies"][lvl1_policy_id] = (
        DQNTorchPolicy,
        hp_lvl1["env"](env_config).OBSERVATION_SPACE,
        hp_lvl1["env"].ACTION_SPACE,
        {})
    rllib_config_lvl1["callbacks"] = amTFT.get_amTFTCallBacks(
        additionnal_callbacks=[
            log.get_logging_callbacks_class(),
            postprocessing.OverwriteRewardWtWelfareCallback,
            population.PopulationOfIdenticalAlgoCallBacks
        ])

    l1br_configuration_helper = lvl1_best_response.L1BRConfigurationHelper(
        rllib_config_lvl1, lvl0_policy_id, lvl1_policy_id)
    l1br_configuration_helper.define_exp(
        use_n_lvl0_agents_in_each_population=hp_lvl1["n_seeds_lvl0"] //
        hp_lvl1["n_seeds_lvl1"],
        train_n_lvl1_agents=hp_lvl1["n_seeds_lvl1"],
        lvl0_checkpoints=lvl0_checkpoints)
    rllib_config_lvl1 = \
        l1br_configuration_helper.prepare_config_for_lvl1_training()

    # rllib_config_lvl1["multiagent"]["policies"][lvl0_policy_id][3][
    #     "explore"] = False
    rllib_config_lvl1["multiagent"]["policies"][lvl0_policy_id][3][
        "working_state"] = "eval_amtft"

    return rllib_config_lvl1
def main(debug):
    ray.init(num_cpus=os.cpu_count(), num_gpus=0)

    stop = {"episodes_total": 10 if debug else 400}

    env_config = {
        "max_steps": 10,
        "players_ids": ["player_row", "player_col"],
    }

    policies = {
        env_config["players_ids"][0]: (None,
                                       IteratedBoSAndPD.OBSERVATION_SPACE,
                                       IteratedBoSAndPD.ACTION_SPACE,
                                       {}),
        env_config["players_ids"][1]: (None,
                                       IteratedBoSAndPD.OBSERVATION_SPACE,
                                       IteratedBoSAndPD.ACTION_SPACE,
                                       {})
    }

    rllib_config = {
        "env": IteratedBoSAndPD,
        "env_config": env_config,
        "num_gpus": 0,
        "num_workers": 1,
        "multiagent": {
            "policies": policies,
            "policy_mapping_fn": (lambda agent_id: agent_id),
        },
        "framework": "torch",
        "gamma": 0.5,
        "callbacks": miscellaneous.merge_callbacks(
            log.get_logging_callbacks_class(),
            postprocessing.OverwriteRewardWtWelfareCallback),
    }

    MyPGTorchPolicy = PGTorchPolicy.with_updates(
        postprocess_fn=miscellaneous.merge_policy_postprocessing_fn(
            postprocessing.get_postprocessing_welfare_function(
                add_inequity_aversion_welfare=True,
                inequity_aversion_beta=1.0,
                inequity_aversion_alpha=0.0,
                inequity_aversion_gamma=1.0,
                inequity_aversion_lambda=0.5),
            pg_torch_policy.post_process_advantages))
    MyPGTrainer = PGTrainer.with_updates(
        default_policy=MyPGTorchPolicy, get_policy_class=None)

    tune_analysis = tune.run(MyPGTrainer,
                             stop=stop,
                             checkpoint_freq=10,
                             config=rllib_config)
    ray.shutdown()
    return tune_analysis
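# Hypothetical entry-point sketch (an assumption, not code from the original
# file): the script presumably exposes a main guard; a minimal version,
# assuming the `main` function defined above.
if __name__ == "__main__":
    debug_mode = False
    main(debug_mode)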
def main(debug, stop_iters=2000, tf=False):
    train_n_replicates = 1 if debug else 1
    seeds = miscellaneous.get_random_seeds(train_n_replicates)
    exp_name, _ = log.log_in_current_day_dir("PPO_AsymCG")

    ray.init()

    stop = {
        "training_iteration": 2 if debug else stop_iters,
    }

    env_config = {
        "players_ids": ["player_red", "player_blue"],
        "max_steps": 20,
        "grid_size": 3,
        "get_additional_info": True,
    }

    rllib_config = {
        "env": AsymCoinGame,
        "env_config": env_config,
        "multiagent": {
            "policies": {
                env_config["players_ids"][0]: (
                    None,
                    AsymCoinGame(env_config).OBSERVATION_SPACE,
                    AsymCoinGame.ACTION_SPACE,
                    {}),
                env_config["players_ids"][1]: (
                    None,
                    AsymCoinGame(env_config).OBSERVATION_SPACE,
                    AsymCoinGame.ACTION_SPACE,
                    {}),
            },
            "policy_mapping_fn": lambda agent_id: agent_id,
        },
        # Size of batches collected from each worker.
        "rollout_fragment_length": 20,
        # Number of timesteps collected for each SGD round. This defines the
        # size of each SGD epoch.
        "train_batch_size": 512,
        "model": {
            "dim": env_config["grid_size"],
            # [Channel, [Kernel, Kernel], Stride]
            "conv_filters": [[16, [3, 3], 1], [32, [3, 3], 1]]
        },
        "lr": 5e-3,
        "seed": tune.grid_search(seeds),
        "callbacks": log.get_logging_callbacks_class(),
        "num_gpus": int(os.environ.get("RLLIB_NUM_GPUS", "0")),
        "framework": "tf" if tf else "torch",
    }

    tune_analysis = tune.run(PPOTrainer,
                             config=rllib_config,
                             stop=stop,
                             checkpoint_freq=0,
                             checkpoint_at_end=True,
                             name=exp_name)
    ray.shutdown()
    return tune_analysis
def get_rllib_config(seeds, debug=False, stop_iters=200, tf=False):
    stop_config = {
        "training_iteration": 2 if debug else stop_iters,
    }

    env_config = {
        "players_ids": ["player_row", "player_col"],
        "max_steps": 20,
        "get_additional_info": True,
    }

    rllib_config = {
        "env": IteratedPrisonersDilemma,
        "env_config": env_config,
        "multiagent": {
            "policies": {
                env_config["players_ids"][0]: (
                    None,
                    IteratedPrisonersDilemma.OBSERVATION_SPACE,
                    IteratedPrisonersDilemma.ACTION_SPACE,
                    {}
                ),
                env_config["players_ids"][1]: (
                    None,
                    IteratedPrisonersDilemma.OBSERVATION_SPACE,
                    IteratedPrisonersDilemma.ACTION_SPACE,
                    {}
                ),
            },
            "policy_mapping_fn": lambda agent_id: agent_id,
        },
        "seed": tune.grid_search(seeds),
        "callbacks": log.get_logging_callbacks_class(),
        "num_gpus": int(os.environ.get("RLLIB_NUM_GPUS", "0")),
        "framework": "tf" if tf else "torch",
    }

    return rllib_config, stop_config
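# Hypothetical usage sketch (an assumption, not code from the original file):
# feeding the IPD config to Ray Tune with a trainer of your choice, e.g. the
# PPOTrainer used in the coin-game example above; the seed list is a
# placeholder.
rllib_config, stop_config = get_rllib_config(seeds=[0], debug=False)
tune_analysis = tune.run(PPOTrainer,
                         config=rllib_config,
                         stop=stop_config,
                         checkpoint_at_end=True)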
def _modify_policy_to_use_welfare(rllib_config, welfare):
    MyCoopDQNTorchPolicy = augmented_dqn.MyDQNTorchPolicy.with_updates(
        postprocess_fn=miscellaneous.merge_policy_postprocessing_fn(
            postprocessing.welfares_postprocessing_fn(),
            postprocess_nstep_and_prio,
        ))

    policies = rllib_config["multiagent"]["policies"]
    new_policies = {}
    for policies_id, policy_tuple in policies.items():
        new_policies[policies_id] = list(policy_tuple)
        new_policies[policies_id][0] = MyCoopDQNTorchPolicy
        if welfare == postprocessing.WELFARE_UTILITARIAN:
            new_policies[policies_id][3].update(
                {postprocessing.ADD_UTILITARIAN_WELFARE: True})
        elif welfare == postprocessing.WELFARE_INEQUITY_AVERSION:
            add_ia_w = True
            ia_alpha = 0.0
            ia_beta = 0.5
            ia_gamma = 0.96
            ia_lambda = 0.96
            inequity_aversion_parameters = (
                add_ia_w,
                ia_alpha,
                ia_beta,
                ia_gamma,
                ia_lambda,
            )
            new_policies[policies_id][3].update({
                postprocessing.ADD_INEQUITY_AVERSION_WELFARE:
                    inequity_aversion_parameters
            })

    rllib_config["multiagent"]["policies"] = new_policies
    rllib_config["callbacks"] = callbacks.merge_callbacks(
        log.get_logging_callbacks_class(),
        postprocessing.OverwriteRewardWtWelfareCallback,
    )

    return rllib_config
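# Hypothetical usage sketch (an assumption, not code from the original file):
# applying the welfare switch to a previously built RLlib config before
# training; `base_rllib_config` is a placeholder for the output of one of the
# get_rllib_config helpers in this repo.
utilitarian_config = _modify_policy_to_use_welfare(
    copy.deepcopy(base_rllib_config), postprocessing.WELFARE_UTILITARIAN)
inequity_averse_config = _modify_policy_to_use_welfare(
    copy.deepcopy(base_rllib_config),
    postprocessing.WELFARE_INEQUITY_AVERSION)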
def modify_conf_for_lvl1_training(hp_lvl1, env_config, rllib_config_lvl1,
                                  lvl0_checkpoints):
    lvl0_policy_idx = 1
    lvl1_policy_idx = 0
    lvl0_policy_id = env_config["players_ids"][lvl0_policy_idx]
    lvl1_policy_id = env_config["players_ids"][lvl1_policy_idx]

    # Use a simple DQN as lvl1 agent (instead of amTFT with nested DQN)
    rllib_config_lvl1["multiagent"]["policies"][lvl1_policy_id] = (
        DQNTorchPolicy,
        hp_lvl1["env_class"](env_config).OBSERVATION_SPACE,
        hp_lvl1["env_class"].ACTION_SPACE,
        {},
    )
    rllib_config_lvl1["callbacks"] = callbacks.merge_callbacks(
        amTFT.AmTFTCallbacks,
        log.get_logging_callbacks_class(log_full_epi=False,
                                        log_full_epi_interval=100),
    )

    l1br_configuration_helper = lvl1_best_response.L1BRConfigurationHelper(
        rllib_config_lvl1, lvl0_policy_id, lvl1_policy_id)
    l1br_configuration_helper.define_exp(
        use_n_lvl0_agents_in_each_population=hp_lvl1["n_seeds_lvl0"] //
        hp_lvl1["n_seeds_lvl1"],
        train_n_lvl1_agents=hp_lvl1["n_seeds_lvl1"],
        lvl0_checkpoints=lvl0_checkpoints,
    )
    rllib_config_lvl1 = (
        l1br_configuration_helper.prepare_config_for_lvl1_training())

    rllib_config_lvl1["multiagent"]["policies"][lvl0_policy_id][3][
        "working_state"] = "eval_amtft"

    return rllib_config_lvl1
def get_rllib_config(hp: dict, lvl1_idx: list, lvl1_training: bool):
    assert lvl1_training

    tune_config, _, env_config = get_tune_config(hp=hp)
    tune_config["seed"] = 2020

    stop = {"episodes_total": hp["n_epi"]}

    after_init_fn = functools.partial(
        miscellaneous.sequence_of_fn_wt_same_args,
        function_list=[restore.after_init_load_policy_checkpoint, after_init],
    )

    def sgd_optimizer_dqn(policy, config) -> "torch.optim.Optimizer":
        return torch.optim.SGD(
            policy.q_func_vars,
            lr=policy.cur_lr,
            momentum=config["sgd_momentum"],
        )

    MyDQNTorchPolicy = DQNTorchPolicy.with_updates(
        stats_fn=log.augment_stats_fn_wt_additionnal_logs(build_q_stats),
        optimizer_fn=sgd_optimizer_dqn,
        after_init=after_init_fn,
    )

    if tune_config["env_class"] in (
            IteratedPrisonersDilemma,
            IteratedBoS,
            IteratedAsymChicken,
            IteratedAsymBoS,
    ):
        env_config.update({
            "max_steps": hp["n_steps_per_epi"],
        })
    elif tune_config["env_class"] in (
            VectorizedCoinGame,
            AsymVectorizedCoinGame,
    ):
        env_config.update({
            "max_steps": hp["n_steps_per_epi"],
            "batch_size": 1,
        })
    else:
        raise ValueError()

    tune_config["TuneTrainerClass"] = hp["tune_class"]
    tune_config["env_config"] = env_config

    policies = {}
    for policy_idx, policy_id in enumerate(env_config["players_ids"]):
        if policy_idx not in lvl1_idx:
            policies[policy_id] = (
                policy.get_tune_policy_class(DQNTorchPolicy),
                tune_config["env_class"](env_config).OBSERVATION_SPACE,
                tune_config["env_class"].ACTION_SPACE,
                {
                    "sgd_momentum": hp["sgd_momentum"],
                    "tune_config": tune_config,
                },
            )
        else:
            policies[policy_id] = (
                MyDQNTorchPolicy,
                tune_config["env_class"](env_config).OBSERVATION_SPACE,
                tune_config["env_class"].ACTION_SPACE,
                {"sgd_momentum": hp["sgd_momentum"]},
            )

    rllib_config = {
        "env": tune_config["env_class"],
        "env_config": env_config,
        "multiagent": {
            "policies": policies,
            "policy_mapping_fn": lambda agent_id: agent_id,
        },

        # === DQN Models ===
        # Minimum env steps to optimize for per train call. This value does
        # not affect learning, only the length of iterations.
        "timesteps_per_iteration": hp["n_steps_per_epi"],
        # Update the target network every `target_network_update_freq` steps.
        "target_network_update_freq": hp["n_steps_per_epi"],

        # === Replay buffer ===
        # Size of the replay buffer. Note that if async_updates is set, then
        # each worker will have a replay buffer of this size.
        "buffer_size": int(hp["n_steps_per_epi"] * hp["n_epi"]) // 4,
        # Whether to use dueling dqn
        "dueling": False,
        # Dense-layer setup for each of the advantage branch and the value
        # branch in a dueling architecture.
        "hiddens": [64],
        # Whether to use double dqn
        "double_q": True,
        # If True prioritized replay buffer will be used.
        "prioritized_replay": False,
        "model": {
            # Number of hidden layers for fully connected net
            "fcnet_hiddens": [64],
            # Nonlinearity for fully connected net (tanh, relu)
            "fcnet_activation": "relu",
        },

        "gamma": hp["gamma"],
        "min_iter_time_s": 3.0,
        # Can't restore stuff with search
        # "seed": hp["seed"],
        "seed": tune.grid_search(
            hp["lvl1_seeds"] if lvl1_training else hp["lvl0_seeds"]),

        # "evaluation_num_episodes": 100,
        # "evaluation_interval": hparams["n_epi"],

        # === Optimization ===
        # Learning rate for adam optimizer
        "lr": hp["base_lr"],
        # Learning rate schedule
        "lr_schedule": [
            (0, hp["base_lr"]),
            (int(hp["n_steps_per_epi"] * hp["n_epi"]), hp["base_lr"] / 1e9),
        ],
        # Adam epsilon hyper parameter
        # "adam_epsilon": 1e-8,
        # If not None, clip gradients during optimization at this value
        "grad_clip": 1,
        # How many steps of the model to sample before learning starts.
"learning_starts": int(hp["n_steps_per_epi"] * hp["bs_epi_mul"]), # Update the replay buffer with this many samples at once. Note that # this setting applies per-worker if num_workers > 1. "rollout_fragment_length": hp["n_steps_per_epi"], # Size of a batch sampled from replay buffer for training. Note that # if async_updates is set, then each worker returns gradients for a # batch of this size. "train_batch_size": int(hp["n_steps_per_epi"] * hp["bs_epi_mul"]), # === Exploration Settings === # Default exploration behavior, iff `explore`=None is passed into # compute_action(s). # Set to False for no exploration behavior (e.g., for evaluation). "explore": True, # Provide a dict specifying the Exploration object's config. "exploration_config": { # The Exploration class to use. In the simplest case, # this is the name (str) of any class present in the # `rllib.utils.exploration` package. # You can also provide the python class directly or # the full location of your class (e.g. # "ray.rllib.utils.exploration.epsilon_greedy.EpsilonGreedy"). "type": exploration.SoftQSchedule, # Add constructor kwargs here (if any). "temperature_schedule": hp["temperature_schedule"] or PiecewiseSchedule( endpoints=[ (0, 10.0), (int(hp["n_steps_per_epi"] * hp["n_epi"] * 0.33), 1.0), (int(hp["n_steps_per_epi"] * hp["n_epi"] * 0.66), 0.1), ], outside_value=0.1, framework="torch", ), }, # General config "framework": "torch", # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0. "num_gpus": int(os.environ.get("RLLIB_NUM_GPUS", "0")), # LE supports only 1 worker only # otherwise it would be mixing several opponents trajectories "num_workers": 0, # LE supports only 1 env per worker # only otherwise several episodes would be played at the same time "num_envs_per_worker": 1, # Callbacks that will be run during various phases of training. See the # `DefaultCallbacks` class and # `examples/custom_metrics_and_callbacks.py` # for more usage information. "callbacks": callbacks.merge_callbacks( log.get_logging_callbacks_class(), callbacks.PolicyCallbacks # population.PopulationOfIdenticalAlgoCallBacks ), "log_level": "INFO", } if "CoinGame" in hp["env_name"]: rllib_config["model"] = { "dim": env_config["grid_size"], # [Channel, [Kernel, Kernel], Stride]] "conv_filters": [[16, [3, 3], 1], [32, [3, 3], 1]], } return stop, env_config, rllib_config
def _generate_eval_config(tune_hp, debug):
    rllib_hp = copy.deepcopy(tune_hp)
    rllib_hp["seed"] = 2020
    rllib_hp["num_episodes"] = 1 if debug else 100
    tune_config, stop, env_config = _get_tune_config(rllib_hp,
                                                     stop_on_epi_number=True)
    rllib_hp["env_class"] = tune_config["env_class"]

    if "CoinGame" in tune_config["env_name"]:
        env_config["batch_size"] = 1
        tune_config["TuneTrainerClass"] = train_cg_tune_class_API.LOLAPGCG
    else:
        tune_config["TuneTrainerClass"] = LOLAPGMatrice

    rllib_config_eval = {
        "env": rllib_hp["env_class"],
        "env_config": env_config,
        "multiagent": {
            "policies": {
                env_config["players_ids"][0]: (
                    # The default policy is DQN defined in DQNTrainer
                    # but we overwrite it to use the LE policy
                    policy.get_tune_policy_class(DQNTorchPolicy),
                    rllib_hp["env_class"](env_config).OBSERVATION_SPACE,
                    rllib_hp["env_class"].ACTION_SPACE,
                    {"tune_config": tune_config},
                ),
                env_config["players_ids"][1]: (
                    policy.get_tune_policy_class(DQNTorchPolicy),
                    rllib_hp["env_class"](env_config).OBSERVATION_SPACE,
                    rllib_hp["env_class"].ACTION_SPACE,
                    {"tune_config": tune_config},
                ),
            },
            "policy_mapping_fn": lambda agent_id: agent_id,
            "policies_to_train": ["None"],
        },
        "seed": rllib_hp["seed"],
        "min_iter_time_s": 3.0,
        "callbacks": log.get_logging_callbacks_class(log_full_epi=True),
    }

    policies_to_load = copy.deepcopy(env_config["players_ids"])

    if "CoinGame" in rllib_hp["env_name"]:
        trainable_class = train_cg_tune_class_API.LOLAPGCG
        rllib_config_eval["model"] = {
            "dim": env_config["grid_size"],
            # [Channel, [Kernel, Kernel], Stride]
            "conv_filters": [[16, [3, 3], 1], [32, [3, 3], 1]],
        }
    else:
        trainable_class = LOLAPGMatrice

    return (
        rllib_hp,
        rllib_config_eval,
        policies_to_load,
        trainable_class,
        stop,
        env_config,
    )
def _get_rllib_configs(hp, env_class=None):
    stop_config = {
        "episodes_total": 2 if hp["debug"] else hp["n_epi"],
    }

    env_config = {
        "players_ids": ["player_red", "player_blue"],
        "max_steps": hp["n_steps_per_epi"],
        "grid_size": 3,
        "get_additional_info": True,
    }

    env_class = coin_game.CoinGame if env_class is None else env_class
    rllib_config = {
        "env": env_class,
        "env_config": env_config,
        "multiagent": {
            "policies": {
                env_config["players_ids"][0]: (
                    augmented_dqn.MyDQNTorchPolicy,
                    env_class(env_config).OBSERVATION_SPACE,
                    env_class.ACTION_SPACE,
                    {}),
                env_config["players_ids"][1]: (
                    augmented_dqn.MyDQNTorchPolicy,
                    env_class(env_config).OBSERVATION_SPACE,
                    env_class.ACTION_SPACE,
                    {}),
            },
            "policy_mapping_fn": lambda agent_id: agent_id,
        },

        # === DQN Models ===
        # Update the target network every `target_network_update_freq` steps.
        "target_network_update_freq": tune.sample_from(
            lambda spec: int(spec.config["env_config"]["max_steps"] * 30)),

        # === Replay buffer ===
        # Size of the replay buffer. Note that if async_updates is set, then
        # each worker will have a replay buffer of this size.
        "buffer_size": tune.sample_from(
            lambda spec: int(spec.config["env_config"]["max_steps"] *
                             spec.stop["episodes_total"] * hp["buf_frac"])),
        # Whether to use dueling dqn
        "dueling": False,
        # Whether to use double dqn
        "double_q": True,
        # If True prioritized replay buffer will be used.
        "prioritized_replay": False,

        # Size of batches collected from each worker.
        "rollout_fragment_length": tune.sample_from(
            lambda spec: spec.config["env_config"]["max_steps"]),
        "training_intensity": 10,
        # Size of a batch sampled from replay buffer for training. Note that
        # if async_updates is set, then each worker returns gradients for a
        # batch of this size.
        "train_batch_size": tune.sample_from(
            lambda spec: int(spec.config["env_config"]["max_steps"] *
                             hp["bs_epi_mul"])),
        "batch_mode": "complete_episodes",

        # === Exploration Settings ===
        # Default exploration behavior, iff `explore`=None is passed into
        # compute_action(s).
        # Set to False for no exploration behavior (e.g., for evaluation).
        "explore": True,
        # Provide a dict specifying the Exploration object's config.
        "exploration_config": {
            # The Exploration class to use. In the simplest case,
            # this is the name (str) of any class present in the
            # `rllib.utils.exploration` package.
            # You can also provide the python class directly or
            # the full location of your class (e.g.
            # "ray.rllib.utils.exploration.epsilon_greedy.EpsilonGreedy").
            "type": exploration.SoftQSchedule,
            # Add constructor kwargs here (if any).
            "temperature_schedule": tune.sample_from(
                lambda spec: PiecewiseSchedule(
                    endpoints=[
                        (0, 2.0),
                        (int(spec.config["env_config"]["max_steps"] *
                             spec.stop["episodes_total"] * 0.20), 0.5),
                        (int(spec.config["env_config"]["max_steps"] *
                             spec.stop["episodes_total"] * 0.60),
                         hp["last_exploration_temp_value"])],
                    outside_value=hp["last_exploration_temp_value"],
                    framework="torch")),
        },
"model": { "dim": env_config["grid_size"], # [Channel, [Kernel, Kernel], Stride]] "conv_filters": [[16, [3, 3], 1], [32, [3, 3], 1]] }, "gamma": 0.96, "optimizer": {"sgd_momentum": 0.9, }, "lr": 0.1, "lr_schedule": tune.sample_from( lambda spec: [ (0, 0.0), (int(spec.config["env_config"]["max_steps"] * spec.stop["episodes_total"] * 0.05), spec.config.lr), (int(spec.config["env_config"]["max_steps"] * spec.stop["episodes_total"]), spec.config.lr / 1e9) ] ), "seed": tune.grid_search(hp["seeds"]), "callbacks": log.get_logging_callbacks_class(), "framework": "torch", "logger_config": { "wandb": { "project": "DQN_CG", "group": hp["exp_name"], "api_key_file": os.path.join(os.path.dirname(__file__), "../../../api_key_wandb"), "log_config": True }, }, } return rllib_config, stop_config
def get_rllib_config(hp, welfare_fn):
    stop = {
        "episodes_total": hp["n_epi"],
    }

    env_config = get_env_config(hp)
    policies = get_policies(hp, env_config, welfare_fn)

    selected_seeds = hp["seeds"][:hp["train_n_replicates"]]
    hp["seeds"] = hp["seeds"][hp["train_n_replicates"]:]

    trainer_config_update = {
        "env": hp["env"],
        "env_config": env_config,
        "multiagent": {
            "policies": policies,
            "policy_mapping_fn": lambda agent_id: agent_id,
        },

        "gamma": hp["gamma"],
        "min_iter_time_s": hp["min_iter_time_s"],
        "seed": tune.grid_search(selected_seeds),

        # === Optimization ===
        # Learning rate for adam optimizer
        "lr": hp["base_lr"],
        # Learning rate schedule
        "lr_schedule": [
            (0, hp["base_lr"]),
            (int(hp["n_steps_per_epi"] * hp["n_epi"]), hp["base_lr"] / 1e9)],
        # Adam epsilon hyper parameter
        # "adam_epsilon": 1e-8,
        # If not None, clip gradients during optimization at this value
        "grad_clip": 1,
        # Update the replay buffer with this many samples at once. Note that
        # this setting applies per-worker if num_workers > 1.
        "rollout_fragment_length": hp["n_steps_per_epi"],
        # Size of a batch sampled from replay buffer for training. Note that
        # if async_updates is set, then each worker returns gradients for a
        # batch of this size.
        "train_batch_size": int(hp["n_steps_per_epi"] * hp["bs_epi_mul"]),
        # Minimum env steps to optimize for per train call. This value does
        # not affect learning, only the length of iterations.
        "timesteps_per_iteration": hp["n_steps_per_epi"],

        # General config
        "framework": "torch",
        # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0.
        "num_gpus": int(os.environ.get("RLLIB_NUM_GPUS", "0")),
        # LE supports only 1 worker,
        # otherwise it would mix several opponents' trajectories.
        "num_workers": 0,
        # LE supports only 1 env per worker,
        # otherwise several episodes would be played at the same time.
        "num_envs_per_worker": 1,
        # Callbacks that will be run during various phases of training. See
        # the `DefaultCallbacks` class and
        # `examples/custom_metrics_and_callbacks.py` for more usage
        # information.
        "callbacks": amTFT.get_amTFTCallBacks(additionnal_callbacks=[
            log.get_logging_callbacks_class(),
            # This only overwrites the reward that is used for training,
            # not the one reported in the metrics.
            postprocessing.OverwriteRewardWtWelfareCallback
        ]),
        # "log_level": "INFO",
    }

    trainer_config_update.update({
        # === DQN Models ===
        # Update the target network every `target_network_update_freq` steps.
        "target_network_update_freq": hp["n_steps_per_epi"],

        # === Replay buffer ===
        # Size of the replay buffer. Note that if async_updates is set, then
        # each worker will have a replay buffer of this size.
        "buffer_size": int(hp["n_steps_per_epi"] * hp["n_epi"]) // 4,
        # Whether to use dueling dqn
        "dueling": False,
        # Dense-layer setup for each of the advantage branch and the value
        # branch in a dueling architecture.
        "hiddens": hp["hiddens"],
        # Whether to use double dqn
        "double_q": True,
        # If True prioritized replay buffer will be used.
        "prioritized_replay": False,
        "model": {
            # Number of hidden layers for fully connected net
            "fcnet_hiddens": hp["hiddens"],
            # Nonlinearity for fully connected net (tanh, relu)
            "fcnet_activation": "relu",
        },
        # How many steps of the model to sample before learning starts.
        "learning_starts": int(hp["n_steps_per_epi"] * hp["bs_epi_mul"]),

        # === Exploration Settings ===
        # Default exploration behavior, iff `explore`=None is passed into
        # compute_action(s).
        # Set to False for no exploration behavior (e.g., for evaluation).
        "explore": True,
        # Provide a dict specifying the Exploration object's config.
"exploration_config": { # The Exploration class to use. In the simplest case, this is the name # (str) of any class present in the `rllib.utils.exploration` package. # You can also provide the python class directly or the full location # of your class (e.g. "ray.rllib.utils.exploration.epsilon_greedy. # EpsilonGreedy"). "type": exploration.SoftQSchedule, # Add constructor kwargs here (if any). "temperature_schedule": hp["temperature_schedule"] or PiecewiseSchedule(endpoints=[ (0, 10.0), (int(hp["n_steps_per_epi"] * hp["n_epi"] * 0.33), 1.0), (int(hp["n_steps_per_epi"] * hp["n_epi"] * 0.66), 0.1) ], outside_value=0.1, framework="torch"), }, }) if hp["env"] in [coin_game.CoinGame, coin_game.AsymCoinGame]: trainer_config_update["model"] = { "dim": env_config["grid_size"], "conv_filters": [[16, [3, 3], 1], [32, [3, 3], 1]], # [Channel, [Kernel, Kernel], Stride]] } return stop, env_config, trainer_config_update
def get_rllib_config(hp, welfare_fn, eval=False):
    stop = {
        "episodes_total": hp["n_epi"],
    }

    env_config = get_env_config(hp)
    policies = get_policies(hp, env_config, welfare_fn, eval)

    selected_seeds = hp["seeds"][:hp["train_n_replicates"]]
    hp["seeds"] = hp["seeds"][hp["train_n_replicates"]:]

    rllib_config = {
        "env": hp["env_class"],
        "env_config": env_config,
        "multiagent": {
            "policies": policies,
            "policy_mapping_fn": lambda agent_id: agent_id,
            # When replay_mode=lockstep, RLlib will replay all the agent
            # transitions at a particular timestep together in a batch.
            # This allows the policy to implement differentiable shared
            # computations between agents it controls at that timestep.
            # When replay_mode=independent,
            # transitions are replayed independently per policy.
            # "replay_mode": "lockstep",
            "observation_fn": amTFT.observation_fn,
        },

        "gamma": hp["gamma"],
        "seed": tune.grid_search(selected_seeds),

        # === Optimization ===
        # Learning rate for adam optimizer
        "lr": hp["base_lr"],
        # Learning rate schedule
        "lr_schedule": hp["lr_schedule"],
        # If not None, clip gradients during optimization at this value
        "grad_clip": 1,
        # Update the replay buffer with this many samples at once. Note that
        # this setting applies per-worker if num_workers > 1.
        "rollout_fragment_length": hp["n_steps_per_epi"],
        # Size of a batch sampled from replay buffer for training. Note that
        # if async_updates is set, then each worker returns gradients for a
        # batch of this size.
        "train_batch_size": int(hp["n_steps_per_epi"] * hp["bs_epi_mul"]),
        "training_intensity": hp["training_intensity"],
        # Minimum env steps to optimize for per train call. This value does
        # not affect learning, only the length of iterations.
        "timesteps_per_iteration": hp["n_steps_per_epi"]
        if hp["debug"] else
        int(hp["n_steps_per_epi"] * hp["n_epi"] / hp["log_n_points"]),
        "min_iter_time_s": 0.0,

        # General config
        "framework": "torch",
        # LE supports only 1 worker,
        # otherwise it would mix several opponents' trajectories.
        "num_workers": 0,
        # LE supports only 1 env per worker,
        # otherwise several episodes would be played at the same time.
        "num_envs_per_worker": 1,
        # Callbacks that will be run during various phases of training. See
        # the `DefaultCallbacks` class and
        # `examples/custom_metrics_and_callbacks.py` for more usage
        # information.
        "callbacks": callbacks.merge_callbacks(
            amTFT.AmTFTCallbacks,
            log.get_logging_callbacks_class(log_full_epi=True,
                                            log_full_epi_interval=100),
        ),
        "logger_config": {
            "wandb": {
                "project": "amTFT",
                "group": hp["exp_name"],
                "api_key_file": os.path.join(os.path.dirname(__file__),
                                             "../../../api_key_wandb"),
                "log_config": True,
            },
        },

        # === DQN Models ===
        # Update the target network every `target_network_update_freq` steps.
        "target_network_update_freq": hp["target_network_update_freq"],

        # === Replay buffer ===
        # Size of the replay buffer. Note that if async_updates is set, then
        # each worker will have a replay buffer of this size.
        "buffer_size": max(
            int(hp["n_steps_per_epi"] * hp["n_epi"] * hp["buf_frac"]), 5),
        # Whether to use dueling dqn
        "dueling": True,
        # Dense-layer setup for each of the advantage branch and the value
        # branch in a dueling architecture.
        "hiddens": hp["hiddens"],
        # Whether to use double dqn
        "double_q": True,
        # If True prioritized replay buffer will be used.
"prioritized_replay": False, "model": { # Number of hidden layers for fully connected net "fcnet_hiddens": hp["hiddens"], # Nonlinearity for fully connected net (tanh, relu) "fcnet_activation": "relu", }, # How many steps of the model to sample before learning starts. "learning_starts": int(hp["n_steps_per_epi"] * hp["bs_epi_mul"]), # === Exploration Settings === # Default exploration behavior, iff `explore`=None is passed into # compute_action(s). # Set to False for no exploration behavior (e.g., for evaluation). "explore": True, # Provide a dict specifying the Exploration object's config. "exploration_config": { # The Exploration class to use. In the simplest case, # this is the name (str) of any class present in the # `rllib.utils.exploration` package. # You can also provide the python class directly or # the full location of your class (e.g. # "ray.rllib.utils.exploration.epsilon_greedy. # EpsilonGreedy"). "type": exploration.SoftQSchedule, # Add constructor kwargs here (if any). "temperature_schedule": hp["temperature_schedule"], }, } if "CoinGame" in hp["env_name"]: rllib_config["model"] = { "dim": env_config["grid_size"], "conv_filters": [[16, [3, 3], 1], [32, [3, 3], 1]], # [Channel, [Kernel, Kernel], Stride]] } return stop, env_config, rllib_config
def get_rllib_config(hp: dict):
    stop = {
        "episodes_total": hp["n_epi"],  # 4000 steps in 200 epi
    }

    env_config = {
        "players_ids": ["player_row", "player_col"],
        "max_steps": hp["n_steps_per_epi"],
    }

    MyDQNTorchPolicy = DQNTorchPolicy.with_updates(
        optimizer_fn=sgd_optimizer_dqn,
        stats_fn=log.stats_fn_wt_additionnal_logs(build_q_stats))

    ltft_config = merge_dicts(
        LTFT_DEFAULT_CONFIG_UPDATE,
        {
            "sgd_momentum": 0.9,
            'nested_policies': [
                # Here the trainer needs to be a DQNTrainer,
                # to provide the config for the 3 DQNTorchPolicy
                {"Policy_class": MyDQNTorchPolicy, "config_update": {}},
                {"Policy_class": MyDQNTorchPolicy, "config_update": {}},
                {"Policy_class": MyDQNTorchPolicy, "config_update": {}},
                {"Policy_class": SPLTorchPolicy.with_updates(
                    optimizer_fn=sgd_optimizer_spl),
                 "config_update": {
                     "learn_action": True,
                     "learn_reward": False,
                     "sgd_momentum": 0.75,
                     "explore": False,
                     "timesteps_per_iteration": hp["n_steps_per_epi"],
                     # === Optimization ===
                     # Learning rate for adam optimizer
                     "lr": hp["base_lr"] * hp["spl_lr_mul"],
                     # Learning rate schedule
                     "lr_schedule": [
                         (0, hp["base_lr"] * hp["spl_lr_mul"]),
                         (int(hp["n_steps_per_epi"] * hp["n_epi"]),
                          hp["base_lr"] / 1e9)],
                     "loss_fn": torch.nn.CrossEntropyLoss(
                         weight=None,
                         size_average=None,
                         ignore_index=-100,
                         reduce=None,
                         reduction='mean')
                 }},
            ],
        }
    )

    MyUncertainIPD = add_RewardUncertaintyEnvClassWrapper(
        IteratedPrisonersDilemma,
        reward_uncertainty_std=0.1)

    rllib_config = {
        "env": MyUncertainIPD,
        "env_config": env_config,
        "multiagent": {
            "policies": {
                "player_row": (
                    # The default policy is DQNTorchPolicy defined in
                    # DQNTrainer, but we overwrite it to use the LTFT policy
                    LTFT,
                    IteratedPrisonersDilemma.OBSERVATION_SPACE,
                    IteratedPrisonersDilemma.ACTION_SPACE,
                    copy.deepcopy(ltft_config)),
                "player_col": (
                    LTFT,
                    IteratedPrisonersDilemma.OBSERVATION_SPACE,
                    IteratedPrisonersDilemma.ACTION_SPACE,
                    copy.deepcopy(ltft_config)),
            },
            "policy_mapping_fn": lambda agent_id: agent_id,
        },

        # === DQN Models ===
        # Minimum env steps to optimize for per train call. This value does
        # not affect learning, only the length of iterations.
        "timesteps_per_iteration": hp["n_steps_per_epi"],
        # Update the target network every `target_network_update_freq` steps.
        "target_network_update_freq": hp["n_steps_per_epi"],

        # === Replay buffer ===
        # Size of the replay buffer. Note that if async_updates is set, then
        # each worker will have a replay buffer of this size.
        "buffer_size": int(hp["n_steps_per_epi"] * hp["n_epi"]),
        # Whether to use dueling dqn
        "dueling": False,
        # Dense-layer setup for each of the advantage branch and the value
        # branch in a dueling architecture.
        "hiddens": [4],
        # Whether to use double dqn
        "double_q": True,
        # If True prioritized replay buffer will be used.
        "prioritized_replay": False,
        "model": {
            # Number of hidden layers for fully connected net
            "fcnet_hiddens": [4, 2],
            # Nonlinearity for fully connected net (tanh, relu)
            "fcnet_activation": "relu",
        },

        "gamma": 0.5,
        "min_iter_time_s": 0.33,
        "seed": tune.grid_search(hp["seeds"]),

        # === Optimization ===
        # Learning rate for adam optimizer
        "lr": hp["base_lr"],
        # Learning rate schedule
        "lr_schedule": [
            (0, hp["base_lr"]),
            (int(hp["n_steps_per_epi"] * hp["n_epi"]), hp["base_lr"] / 1e9)],
        # Adam epsilon hyper parameter
        # "adam_epsilon": 1e-8,
        # If not None, clip gradients during optimization at this value
        "grad_clip": 1,
        # How many steps of the model to sample before learning starts.
        "learning_starts": int(hp["n_steps_per_epi"] * hp["bs_epi_mul"]),
        # Update the replay buffer with this many samples at once.
        # Note that this setting applies per-worker if num_workers > 1.
        "rollout_fragment_length": hp["n_steps_per_epi"],
        # Size of a batch sampled from replay buffer for training. Note that
        # if async_updates is set, then each worker returns gradients for a
        # batch of this size.
        "train_batch_size": int(hp["n_steps_per_epi"] * hp["bs_epi_mul"]),

        # === Exploration Settings ===
        # Default exploration behavior, iff `explore`=None is passed into
        # compute_action(s).
        # Set to False for no exploration behavior (e.g., for evaluation).
        "explore": True,
        # Provide a dict specifying the Exploration object's config.
        "exploration_config": {
            # The Exploration class to use. In the simplest case, this is the
            # name (str) of any class present in the
            # `rllib.utils.exploration` package.
            # You can also provide the python class directly or the full
            # location of your class (e.g. "ray.rllib.utils.exploration.
            # epsilon_greedy.EpsilonGreedy").
            "type": exploration.SoftQSchedule,
            # Add constructor kwargs here (if any).
            "temperature_schedule": PiecewiseSchedule(
                endpoints=[
                    (0, 1.0),
                    (int(hp["n_steps_per_epi"] * hp["n_epi"] * 0.75), 0.1)],
                outside_value=0.1,
                framework="torch")
        },

        # General config
        "framework": "torch",
        # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0.
        "num_gpus": int(os.environ.get("RLLIB_NUM_GPUS", "0")),
        # LTFT supports only 1 worker,
        # otherwise it would mix several opponents' trajectories.
        "num_workers": 0,
        # LTFT supports only 1 env per worker,
        # otherwise several episodes would be played at the same time.
        "num_envs_per_worker": 1,
        "batch_mode": "complete_episodes",

        # # === Debug Settings ===
        # # Whether to write episode stats and videos to the agent log dir.
        # # This is typically located in ~/ray_results.
        # "monitor": True,
        # # Set the ray.rllib.* log level for the agent process and its
        # # workers. Should be one of DEBUG, INFO, WARN, or ERROR. The DEBUG
        # # level will also periodically print out summaries of relevant
        # # internal dataflow (this is also printed out once at startup at
        # # the INFO level). When using the `rllib train` command, you can
        # # also use the `-v` and `-vv` flags as shorthand for INFO and
        # # DEBUG.
        # "log_level": "INFO",

        # Callbacks that will be run during various phases of training. See
        # the `DefaultCallbacks` class and
        # `examples/custom_metrics_and_callbacks.py` for more usage
        # information.
        # "callbacks": DefaultCallbacks,
        "callbacks": miscellaneous.merge_callbacks(
            LTFTCallbacks,
            log.get_logging_callbacks_class()),

        # # Whether to attempt to continue training if a worker crashes. The
        # # number of currently healthy workers is reported as the
        # # "num_healthy_workers" metric.
        # "ignore_worker_failures": False,
        # # Log system resource metrics to results. This requires `psutil`
        # # to be installed for sys stats, and `gputil` for GPU metrics.
        # "log_sys_usage": True,
        # # Use fake (infinite speed) sampler. For testing only.
        # "fake_sampler": False,
    }

    return rllib_config, env_config, stop
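# Hypothetical usage sketch (an assumption, not code from the original file):
# running the LTFT experiment with RLlib's DQNTrainer as the base trainer, as
# the nested-policies comment above suggests; `hyperparameters` is a
# placeholder for the hp dict expected by get_rllib_config.
rllib_config, env_config, stop = get_rllib_config(hyperparameters)
tune_analysis = tune.run(DQNTrainer,
                         config=rllib_config,
                         stop=stop,
                         checkpoint_at_end=True)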
def _get_rllib_config(hp: dict):
    stop = {
        "episodes_total": hp["n_epi"],
    }

    env_config = _get_env_config(hp)

    my_uncertain_env_class = add_RewardUncertaintyEnvClassWrapper(
        hp["env_class"],
        reward_uncertainty_std=hp["reward_uncertainty_std"])

    rllib_config = copy.deepcopy(ltft.DEFAULT_CONFIG)
    rllib_config.update({
        "env": my_uncertain_env_class,
        "env_config": env_config,
        "multiagent": {
            "policies": {
                env_config["players_ids"][0]: (
                    None,
                    hp["env_class"]({}).OBSERVATION_SPACE,
                    hp["env_class"].ACTION_SPACE,
                    {},
                ),
                env_config["players_ids"][1]: (
                    None,
                    hp["env_class"]({}).OBSERVATION_SPACE,
                    hp["env_class"].ACTION_SPACE,
                    {},
                ),
            },
            "policy_mapping_fn": lambda agent_id: agent_id,
            # When replay_mode=lockstep, RLlib will replay all the agent
            # transitions at a particular timestep together in a batch.
            # This allows the policy to implement differentiable shared
            # computations between agents it controls at that timestep. When
            # replay_mode=independent,
            # transitions are replayed independently per policy.
            # "replay_mode": "lockstep",
            "observation_fn": ltft.observation_fn,
        },

        # === DQN Models ===
        # Update the target network every `target_network_update_freq` steps.
        "target_network_update_freq": 30 * hp["n_steps_per_epi"],

        # === Replay buffer ===
        # Size of the replay buffer. Note that if async_updates is set, then
        # each worker will have a replay buffer of this size.
        "buffer_size": max(
            int(hp["n_steps_per_epi"] * hp["n_epi"] * hp["buf_frac"]), 5),
        # Whether to use dueling dqn
        "dueling": False,
        # Dense-layer setup for each of the advantage branch and the value
        # branch in a dueling architecture.
        "hiddens": hp["hiddens"],
        # Whether to use double dqn
        "double_q": True,
        # If True prioritized replay buffer will be used.
        "prioritized_replay": False,
        "model": {
            # Number of hidden layers for fully connected net
            "fcnet_hiddens": hp["hiddens"],
            # Nonlinearity for fully connected net (tanh, relu)
            "fcnet_activation": "relu",
        },

        # === Exploration Settings ===
        # Default exploration behavior, iff `explore`=None is passed into
        # compute_action(s).
        # Set to False for no exploration behavior (e.g., for evaluation).
        "explore": True,
        # Provide a dict specifying the Exploration object's config.
        "exploration_config": {
            # The Exploration class to use. In the simplest case,
            # this is the name (str) of any class present in the
            # `rllib.utils.exploration` package.
            # You can also provide the python class directly or
            # the full location of your class (e.g.
            # "ray.rllib.utils.exploration.epsilon_greedy.EpsilonGreedy").
            "type": exploration.SoftQScheduleWtClustering,
            # Add constructor kwargs here (if any).
            "temperature_schedule": hp["temperature_schedule"],
            "clustering_distance": hp["clustering_distance"],
        },

        "gamma": hp["gamma"],
        # Minimum env steps to optimize for per train call. This value does
        # not affect learning, only the length of iterations.
        "timesteps_per_iteration": hp["n_steps_per_epi"]
        if hp["debug"] else
        int(hp["n_steps_per_epi"] * hp["n_epi"] / hp["log_n_points"]),
        "min_iter_time_s": 0.0,
        "seed": tune.grid_search(hp["seeds"]),

        # === Optimization ===
        "optimizer": {"sgd_momentum": hp["sgd_momentum"]},
        # Learning rate for adam optimizer
        "lr": hp["base_lr"],
        # Learning rate schedule
        "lr_schedule": hp["lr_schedule"],
        # If not None, clip gradients during optimization at this value
        "grad_clip": 1,
        # How many steps of the model to sample before learning starts.
        "learning_starts": int(hp["n_steps_per_epi"] * hp["bs_epi_mul"]),
        # Update the replay buffer with this many samples at once.
        # Note that this setting applies per-worker if num_workers > 1.
        "rollout_fragment_length": hp["n_steps_per_epi"],
        "training_intensity": hp["training_intensity"],
        # Size of a batch sampled from replay buffer for training. Note that
        # if async_updates is set, then each worker returns gradients for a
        # batch of this size.
        "train_batch_size": int(hp["n_steps_per_epi"] * hp["bs_epi_mul"]),

        # General config
        "framework": "torch",
        # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0.
        "num_gpus": int(os.environ.get("RLLIB_NUM_GPUS", "0")),
        # LTFTTorchPolicy supports only 1 worker,
        # otherwise it would mix several opponents' trajectories.
        "num_workers": 0,
        # LTFTTorchPolicy supports only 1 env per worker,
        # otherwise several episodes would be played at the same time.
        "num_envs_per_worker": 1,
        "batch_mode": "complete_episodes",
        "logger_config": {
            "wandb": {
                "project": "LTFT",
                "group": hp["exp_name"],
                "api_key_file": os.path.join(os.path.dirname(__file__),
                                             "../../../api_key_wandb"),
                "log_config": True,
            },
        },

        # === Debug Settings ===
        "log_level": "INFO",
        # Callbacks that will be run during various phases of training. See
        # the `DefaultCallbacks` class and
        # `examples/custom_metrics_and_callbacks.py`
        # for more usage information.
        "callbacks": callbacks.merge_callbacks(
            ltft.LTFTCallbacks,
            log.get_logging_callbacks_class(log_full_epi=True),
        ),
    })

    hp, rllib_config, env_config, stop = _modify_config_for_coin_game(
        hp, rllib_config, env_config, stop)

    nested_policies_config = rllib_config["nested_policies"]
    nested_spl_policy_config = nested_policies_config[3]["config_update"]
    nested_spl_policy_config["train_batch_size"] = int(
        hp["n_steps_per_epi"] * hp["bs_epi_mul_spl"])
    rllib_config["nested_policies"] = nested_policies_config

    return rllib_config, env_config, stop