def get_nested_policy_class(hp, welfare_fn):
    NestedPolicyClass = dqn.DQNTorchPolicy
    get_vars = lambda policy: policy.q_func_vars

    if not hp["use_adam"]:
        # Replace the nested DQN policy's default Adam optimizer with SGD.
        def sgd_optimizer_dqn(policy, config) -> "torch.optim.Optimizer":
            return torch.optim.SGD(
                get_vars(policy),
                lr=policy.cur_lr,
                momentum=config["sgd_momentum"])

        NestedPolicyClass = NestedPolicyClass.with_updates(
            optimizer_fn=sgd_optimizer_dqn)

    if hp["debug"]:
        # Log additional stats on top of the default DQN Q-value stats.
        NestedPolicyClass = NestedPolicyClass.with_updates(
            stats_fn=log.stats_fn_wt_additionnal_logs(build_q_stats))

    # The cooperative variant additionally computes the selected welfare
    # during trajectory postprocessing.
    CoopNestedPolicyClass = NestedPolicyClass.with_updates(
        postprocess_fn=miscellaneous.merge_policy_postprocessing_fn(
            postprocessing.get_postprocessing_welfare_function(
                add_utilitarian_welfare=(
                    welfare_fn == postprocessing.WELFARE_UTILITARIAN),
                add_inequity_aversion_welfare=(
                    welfare_fn == postprocessing.WELFARE_INEQUITY_AVERSION),
                inequity_aversion_alpha=hp["alpha"],
                inequity_aversion_beta=hp["beta"],
                inequity_aversion_gamma=hp["gamma"],
                inequity_aversion_lambda=hp["lambda"],
            ),
            postprocess_nstep_and_prio))

    return NestedPolicyClass, CoopNestedPolicyClass
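# Illustrative usage sketch (not part of the original source): builds the two
# nested policy classes. The `hp` values below are hypothetical placeholders;
# the dict only needs the keys read above ("use_adam", "debug", "alpha",
# "beta", "gamma", "lambda").
def _example_build_nested_policy_classes():
    hp = {
        "use_adam": False,  # exercise the SGD optimizer branch
        "debug": True,      # attach the additional-logs stats_fn
        "alpha": 0.0,
        "beta": 1.0,
        "gamma": 0.96,
        "lambda": 0.96,
    }
    selfish_cls, coop_cls = get_nested_policy_class(
        hp, welfare_fn=postprocessing.WELFARE_INEQUITY_AVERSION)
    return selfish_cls, coop_cls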
def main(debug):
    ray.init(num_cpus=os.cpu_count(), num_gpus=0)

    stop = {"episodes_total": 10 if debug else 400}

    env_config = {
        "max_steps": 10,
        "players_ids": ["player_row", "player_col"],
    }

    policies = {
        env_config["players_ids"][0]: (
            None,
            IteratedBoSAndPD.OBSERVATION_SPACE,
            IteratedBoSAndPD.ACTION_SPACE,
            {}),
        env_config["players_ids"][1]: (
            None,
            IteratedBoSAndPD.OBSERVATION_SPACE,
            IteratedBoSAndPD.ACTION_SPACE,
            {}),
    }

    rllib_config = {
        "env": IteratedBoSAndPD,
        "env_config": env_config,
        "num_gpus": 0,
        "num_workers": 1,
        "multiagent": {
            "policies": policies,
            "policy_mapping_fn": (lambda agent_id: agent_id),
        },
        "framework": "torch",
        "gamma": 0.5,
        "callbacks": miscellaneous.merge_callbacks(
            log.get_logging_callbacks_class(),
            postprocessing.OverwriteRewardWtWelfareCallback),
    }

    MyPGTorchPolicy = PGTorchPolicy.with_updates(
        postprocess_fn=miscellaneous.merge_policy_postprocessing_fn(
            postprocessing.get_postprocessing_welfare_function(
                add_inequity_aversion_welfare=True,
                inequity_aversion_beta=1.0,
                inequity_aversion_alpha=0.0,
                inequity_aversion_gamma=1.0,
                inequity_aversion_lambda=0.5),
            pg_torch_policy.post_process_advantages))

    MyPGTrainer = PGTrainer.with_updates(
        default_policy=MyPGTorchPolicy,
        get_policy_class=None)

    tune_analysis = tune.run(
        MyPGTrainer,
        stop=stop,
        checkpoint_freq=10,
        config=rllib_config)

    ray.shutdown()
    return tune_analysis
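# Hypothetical entry point (not in the original excerpt): the conventional way
# to launch the experiment above from the command line, with `debug` toggled
# by hand.
if __name__ == "__main__":
    debug_mode = False
    main(debug_mode)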
def get_nested_policy_class(hp, welfare_fn):
    NestedPolicyClass = amTFT.DEFAULT_NESTED_POLICY_SELFISH

    CoopNestedPolicyClass = NestedPolicyClass.with_updates(
        # TODO problem: this prevents using HP searches on gamma etc.
        postprocess_fn=miscellaneous.merge_policy_postprocessing_fn(
            postprocessing.welfares_postprocessing_fn(
                add_utilitarian_welfare=(
                    welfare_fn == postprocessing.WELFARE_UTILITARIAN),
                add_inequity_aversion_welfare=(
                    welfare_fn == postprocessing.WELFARE_INEQUITY_AVERSION),
                inequity_aversion_alpha=hp["alpha"],
                inequity_aversion_beta=hp["beta"],
                inequity_aversion_gamma=hp["gamma"],
                inequity_aversion_lambda=hp["lambda"],
            ),
            postprocess_nstep_and_prio,
        ))

    return NestedPolicyClass, CoopNestedPolicyClass
def given_an_evader_policy():
    for policy_class, postprocessing_fn, default_config in TEST_POLICIES:
        print("policy_class", policy_class)

        coop_policy_class = policy_class.with_updates(
            postprocess_fn=miscellaneous.merge_policy_postprocessing_fn(
                postprocessing.welfares_postprocessing_fn(),
                postprocessing_fn))

        if "target_network_update_freq" in default_config:
            default_config["target_network_update_freq"] = 1

        config = merge_dicts(
            default_config,
            {
                "nested_policies": [
                    {
                        "Policy_class": coop_policy_class,
                        "config_update": {
                            postprocessing.ADD_UTILITARIAN_WELFARE: True
                        },
                    },
                    {
                        "Policy_class": policy_class,
                        "config_update": {},
                    },
                ],
                "start_exploit_at_step_n": random.randint(1, 1000),
                "copy_weights_every_n_steps": random.randint(1, 1000),
                "welfare_key": postprocessing.WELFARE_UTILITARIAN,
            },
        )

        evader = InfluenceEvaderTorchPolicy(
            observation_space=IteratedPrisonersDilemma.OBSERVATION_SPACE,
            action_space=IteratedPrisonersDilemma.ACTION_SPACE,
            config=config,
        )

        yield (evader,
               config["start_exploit_at_step_n"],
               config["copy_weights_every_n_steps"])
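# Illustrative consumer of the generator above (added sketch, not original
# test code): each yielded tuple carries the evader policy plus the two
# randomized step parameters baked into its config.
def _example_iterate_over_evader_policies():
    for evader, start_exploit_step, copy_weights_freq in \
            given_an_evader_policy():
        # Both values were drawn with random.randint(1, 1000) above.
        assert 1 <= start_exploit_step <= 1000
        assert 1 <= copy_weights_freq <= 1000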
def _modify_policy_to_use_welfare(rllib_config, welfare):
    MyCoopDQNTorchPolicy = augmented_dqn.MyDQNTorchPolicy.with_updates(
        postprocess_fn=miscellaneous.merge_policy_postprocessing_fn(
            postprocessing.welfares_postprocessing_fn(),
            postprocess_nstep_and_prio,
        ))

    policies = rllib_config["multiagent"]["policies"]
    new_policies = {}
    for policies_id, policy_tuple in policies.items():
        # Swap in the cooperative DQN policy class and flag, in the
        # per-policy config, which welfare the postprocessing should add.
        new_policies[policies_id] = list(policy_tuple)
        new_policies[policies_id][0] = MyCoopDQNTorchPolicy
        if welfare == postprocessing.WELFARE_UTILITARIAN:
            new_policies[policies_id][3].update(
                {postprocessing.ADD_UTILITARIAN_WELFARE: True})
        elif welfare == postprocessing.WELFARE_INEQUITY_AVERSION:
            add_ia_w = True
            ia_alpha = 0.0
            ia_beta = 0.5
            ia_gamma = 0.96
            ia_lambda = 0.96
            inequity_aversion_parameters = (
                add_ia_w,
                ia_alpha,
                ia_beta,
                ia_gamma,
                ia_lambda,
            )
            new_policies[policies_id][3].update({
                postprocessing.ADD_INEQUITY_AVERSION_WELFARE:
                    inequity_aversion_parameters
            })

    rllib_config["multiagent"]["policies"] = new_policies
    rllib_config["callbacks"] = callbacks.merge_callbacks(
        log.get_logging_callbacks_class(),
        postprocessing.OverwriteRewardWtWelfareCallback,
    )

    return rllib_config
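# Illustration (not from the original file): the helper above assumes each
# entry of rllib_config["multiagent"]["policies"] is the usual RLlib 4-tuple
# (policy_class, observation_space, action_space, policy_config), since it
# overwrites index 0 and updates the dict at index 3.
def _example_switch_to_utilitarian_welfare(rllib_config):
    return _modify_policy_to_use_welfare(
        rllib_config, welfare=postprocessing.WELFARE_UTILITARIAN)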
def _set_config_to_use_exploiter(rllib_config, env_config, hp):
    exploiter_hp = hp["against_evader_exploiter"]
    n_steps_during_training = hp["n_epi"] * hp["n_steps_per_epi"]

    MyCoopDQNTorchPolicy = augmented_dqn.MyDQNTorchPolicy.with_updates(
        postprocess_fn=miscellaneous.merge_policy_postprocessing_fn(
            postprocessing.welfares_postprocessing_fn(),
            postprocess_nstep_and_prio,
        ))

    exploiter_policy_config = {
        "copy_weights_every_n_steps":
            exploiter_hp["copy_weights_delay"] * n_steps_during_training,
        "start_exploit_at_step_n":
            exploiter_hp["start_exploit"] * n_steps_during_training,
        "welfare_key": postprocessing.WELFARE_UTILITARIAN,
        "nested_policies": [
            # You need to provide the policy class for every nested policy.
            {
                "Policy_class": MyCoopDQNTorchPolicy,
                "config_update": {
                    postprocessing.ADD_UTILITARIAN_WELFARE: True
                },
            },
            {
                "Policy_class": augmented_dqn.MyDQNTorchPolicy,
                "config_update": {},
            },
        ],
    }

    rllib_config["multiagent"]["policies"][env_config["players_ids"][1]] = (
        InfluenceEvaderTorchPolicy,
        hp["env_class"]().OBSERVATION_SPACE,
        hp["env_class"].ACTION_SPACE,
        exploiter_policy_config,
    )

    return rllib_config
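# Hypothetical `hp` fragment (added for illustration): these are the keys
# `_set_config_to_use_exploiter` reads. The numbers are placeholders; the two
# exploiter values act as fractions of the total number of training steps,
# since they are multiplied by n_epi * n_steps_per_epi above.
_EXAMPLE_EXPLOITER_HP = {
    "n_epi": 400,
    "n_steps_per_epi": 20,
    "env_class": IteratedPrisonersDilemma,  # any env exposing both spaces
    "against_evader_exploiter": {
        "copy_weights_delay": 0.05,
        "start_exploit": 0.50,
    },
}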
APPROXIMATION_METHODS = (
    APPROXIMATION_METHOD_Q_VALUE,
    APPROXIMATION_METHOD_ROLLOUTS,
)
WORKING_STATES = (
    "train_coop",
    "train_selfish",
    "eval_amtft",
    "eval_naive_selfish",
    "eval_naive_coop",
)
WORKING_STATES_IN_EVALUATION = WORKING_STATES[2:]

OWN_COOP_POLICY_IDX = 0
OWN_SELFISH_POLICY_IDX = 1
OPP_COOP_POLICY_IDX = 2
OPP_SELFISH_POLICY_IDX = 3

DEFAULT_NESTED_POLICY_SELFISH = DQNTorchPolicy.with_updates(
    stats_fn=log.stats_fn_wt_additionnal_logs(build_q_stats))
DEFAULT_NESTED_POLICY_COOP = DEFAULT_NESTED_POLICY_SELFISH.with_updates(
    postprocess_fn=miscellaneous.merge_policy_postprocessing_fn(
        postprocessing.get_postprocessing_welfare_function(
            add_utilitarian_welfare=True,
        ),
        postprocess_nstep_and_prio))

DEFAULT_CONFIG = merge_dicts(
    hierarchical.DEFAULT_CONFIG,
    {
        # One of WORKING_STATES.
        "working_state": WORKING_STATES[0],
        "debit_threshold": 2.0,
        "punishment_multiplier": 6.0,
        "rollout_length": 40,
        "n_rollout_replicas":