def get_nested_policy_class(hp, welfare_fn):
    NestedPolicyClass = dqn.DQNTorchPolicy
    get_vars = lambda policy: policy.q_func_vars

    if not hp["use_adam"]:
        # Replace the nested DQN policy's default Adam optimizer with SGD.
        def sgd_optimizer_dqn(policy, config) -> "torch.optim.Optimizer":
            return torch.optim.SGD(
                get_vars(policy),
                lr=policy.cur_lr,
                momentum=config["sgd_momentum"])

        NestedPolicyClass = NestedPolicyClass.with_updates(
            optimizer_fn=sgd_optimizer_dqn)

    if hp["debug"]:
        # Log additional stats on top of the default DQN Q-value stats.
        NestedPolicyClass = NestedPolicyClass.with_updates(
            stats_fn=log.stats_fn_wt_additionnal_logs(build_q_stats))

    # The cooperative variant additionally computes the selected welfare
    # during trajectory postprocessing.
    CoopNestedPolicyClass = NestedPolicyClass.with_updates(
        postprocess_fn=miscellaneous.merge_policy_postprocessing_fn(
            postprocessing.get_postprocessing_welfare_function(
                add_utilitarian_welfare=(
                    welfare_fn == postprocessing.WELFARE_UTILITARIAN),
                add_inequity_aversion_welfare=(
                    welfare_fn == postprocessing.WELFARE_INEQUITY_AVERSION),
                inequity_aversion_alpha=hp["alpha"],
                inequity_aversion_beta=hp["beta"],
                inequity_aversion_gamma=hp["gamma"],
                inequity_aversion_lambda=hp["lambda"],
            ),
            postprocess_nstep_and_prio))

    return NestedPolicyClass, CoopNestedPolicyClass
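# Illustrative usage sketch (not part of the original source): builds the two
# nested policy classes. The `hp` values below are hypothetical placeholders;
# the dict only needs the keys read above ("use_adam", "debug", "alpha",
# "beta", "gamma", "lambda").
def _example_build_nested_policy_classes():
    hp = {
        "use_adam": False,  # exercise the SGD optimizer branch
        "debug": True,      # attach the additional-logs stats_fn
        "alpha": 0.0,
        "beta": 1.0,
        "gamma": 0.96,
        "lambda": 0.96,
    }
    selfish_cls, coop_cls = get_nested_policy_class(
        hp, welfare_fn=postprocessing.WELFARE_INEQUITY_AVERSION)
    return selfish_cls, coop_cls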
def main(debug):
    ray.init(num_cpus=os.cpu_count(), num_gpus=0)

    stop = {"episodes_total": 10 if debug else 400}

    env_config = {
        "max_steps": 10,
        "players_ids": ["player_row", "player_col"],
    }

    policies = {
        env_config["players_ids"][0]: (
            None,
            IteratedBoSAndPD.OBSERVATION_SPACE,
            IteratedBoSAndPD.ACTION_SPACE,
            {}),
        env_config["players_ids"][1]: (
            None,
            IteratedBoSAndPD.OBSERVATION_SPACE,
            IteratedBoSAndPD.ACTION_SPACE,
            {}),
    }

    rllib_config = {
        "env": IteratedBoSAndPD,
        "env_config": env_config,
        "num_gpus": 0,
        "num_workers": 1,
        "multiagent": {
            "policies": policies,
            "policy_mapping_fn": (lambda agent_id: agent_id),
        },
        "framework": "torch",
        "gamma": 0.5,
        "callbacks": miscellaneous.merge_callbacks(
            log.get_logging_callbacks_class(),
            postprocessing.OverwriteRewardWtWelfareCallback),
    }

    MyPGTorchPolicy = PGTorchPolicy.with_updates(
        postprocess_fn=miscellaneous.merge_policy_postprocessing_fn(
            postprocessing.get_postprocessing_welfare_function(
                add_inequity_aversion_welfare=True,
                inequity_aversion_beta=1.0,
                inequity_aversion_alpha=0.0,
                inequity_aversion_gamma=1.0,
                inequity_aversion_lambda=0.5),
            pg_torch_policy.post_process_advantages))

    MyPGTrainer = PGTrainer.with_updates(
        default_policy=MyPGTorchPolicy,
        get_policy_class=None)

    tune_analysis = tune.run(
        MyPGTrainer,
        stop=stop,
        checkpoint_freq=10,
        config=rllib_config)

    ray.shutdown()
    return tune_analysis
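# Hypothetical entry point (not in the original excerpt): the conventional way
# to launch the experiment above from the command line, with `debug` toggled
# by hand.
if __name__ == "__main__":
    debug_mode = False
    main(debug_mode)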
def get_nested_policy_class(hp, welfare_fn):
    NestedPolicyClass = amTFT.DEFAULT_NESTED_POLICY_SELFISH

    CoopNestedPolicyClass = NestedPolicyClass.with_updates(
        # TODO problem: this prevents using HP searches on gamma etc.
        postprocess_fn=miscellaneous.merge_policy_postprocessing_fn(
            postprocessing.welfares_postprocessing_fn(
                add_utilitarian_welfare=(
                    welfare_fn == postprocessing.WELFARE_UTILITARIAN),
                add_inequity_aversion_welfare=(
                    welfare_fn == postprocessing.WELFARE_INEQUITY_AVERSION),
                inequity_aversion_alpha=hp["alpha"],
                inequity_aversion_beta=hp["beta"],
                inequity_aversion_gamma=hp["gamma"],
                inequity_aversion_lambda=hp["lambda"],
            ),
            postprocess_nstep_and_prio,
        ))

    return NestedPolicyClass, CoopNestedPolicyClass
def given_an_evader_policy():
    for policy_class, postprocessing_fn, default_config in TEST_POLICIES:
        print("policy_class", policy_class)

        coop_policy_class = policy_class.with_updates(
            postprocess_fn=miscellaneous.merge_policy_postprocessing_fn(
                postprocessing.welfares_postprocessing_fn(),
                postprocessing_fn))

        if "target_network_update_freq" in default_config:
            default_config["target_network_update_freq"] = 1

        config = merge_dicts(
            default_config,
            {
                "nested_policies": [
                    {
                        "Policy_class": coop_policy_class,
                        "config_update": {
                            postprocessing.ADD_UTILITARIAN_WELFARE: True
                        },
                    },
                    {
                        "Policy_class": policy_class,
                        "config_update": {},
                    },
                ],
                "start_exploit_at_step_n": random.randint(1, 1000),
                "copy_weights_every_n_steps": random.randint(1, 1000),
                "welfare_key": postprocessing.WELFARE_UTILITARIAN,
            },
        )

        evader = InfluenceEvaderTorchPolicy(
            observation_space=IteratedPrisonersDilemma.OBSERVATION_SPACE,
            action_space=IteratedPrisonersDilemma.ACTION_SPACE,
            config=config,
        )

        yield (evader,
               config["start_exploit_at_step_n"],
               config["copy_weights_every_n_steps"])
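# Illustrative consumer of the generator above (added sketch, not original
# test code): each yielded tuple carries the evader policy plus the two
# randomized step parameters baked into its config.
def _example_iterate_over_evader_policies():
    for evader, start_exploit_step, copy_weights_freq in \
            given_an_evader_policy():
        # Both values were drawn with random.randint(1, 1000) above.
        assert 1 <= start_exploit_step <= 1000
        assert 1 <= copy_weights_freq <= 1000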
def _modify_policy_to_use_welfare(rllib_config, welfare):
    MyCoopDQNTorchPolicy = augmented_dqn.MyDQNTorchPolicy.with_updates(
        postprocess_fn=miscellaneous.merge_policy_postprocessing_fn(
            postprocessing.welfares_postprocessing_fn(),
            postprocess_nstep_and_prio,
        ))

    policies = rllib_config["multiagent"]["policies"]
    new_policies = {}
    for policies_id, policy_tuple in policies.items():
        # Swap in the cooperative DQN policy class and flag, in the
        # per-policy config, which welfare the postprocessing should add.
        new_policies[policies_id] = list(policy_tuple)
        new_policies[policies_id][0] = MyCoopDQNTorchPolicy
        if welfare == postprocessing.WELFARE_UTILITARIAN:
            new_policies[policies_id][3].update(
                {postprocessing.ADD_UTILITARIAN_WELFARE: True})
        elif welfare == postprocessing.WELFARE_INEQUITY_AVERSION:
            add_ia_w = True
            ia_alpha = 0.0
            ia_beta = 0.5
            ia_gamma = 0.96
            ia_lambda = 0.96
            inequity_aversion_parameters = (
                add_ia_w,
                ia_alpha,
                ia_beta,
                ia_gamma,
                ia_lambda,
            )
            new_policies[policies_id][3].update({
                postprocessing.ADD_INEQUITY_AVERSION_WELFARE:
                    inequity_aversion_parameters
            })

    rllib_config["multiagent"]["policies"] = new_policies
    rllib_config["callbacks"] = callbacks.merge_callbacks(
        log.get_logging_callbacks_class(),
        postprocessing.OverwriteRewardWtWelfareCallback,
    )

    return rllib_config
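# Illustration (not from the original file): the helper above assumes each
# entry of rllib_config["multiagent"]["policies"] is the usual RLlib 4-tuple
# (policy_class, observation_space, action_space, policy_config), since it
# overwrites index 0 and updates the dict at index 3.
def _example_switch_to_utilitarian_welfare(rllib_config):
    return _modify_policy_to_use_welfare(
        rllib_config, welfare=postprocessing.WELFARE_UTILITARIAN)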
def _set_config_to_use_exploiter(rllib_config, env_config, hp):
    exploiter_hp = hp["against_evader_exploiter"]
    n_steps_during_training = hp["n_epi"] * hp["n_steps_per_epi"]

    MyCoopDQNTorchPolicy = augmented_dqn.MyDQNTorchPolicy.with_updates(
        postprocess_fn=miscellaneous.merge_policy_postprocessing_fn(
            postprocessing.welfares_postprocessing_fn(),
            postprocess_nstep_and_prio,
        ))

    exploiter_policy_config = {
        "copy_weights_every_n_steps":
            exploiter_hp["copy_weights_delay"] * n_steps_during_training,
        "start_exploit_at_step_n":
            exploiter_hp["start_exploit"] * n_steps_during_training,
        "welfare_key": postprocessing.WELFARE_UTILITARIAN,
        "nested_policies": [
            # You need to provide the policy class for every nested policy.
            {
                "Policy_class": MyCoopDQNTorchPolicy,
                "config_update": {
                    postprocessing.ADD_UTILITARIAN_WELFARE: True
                },
            },
            {
                "Policy_class": augmented_dqn.MyDQNTorchPolicy,
                "config_update": {},
            },
        ],
    }

    rllib_config["multiagent"]["policies"][env_config["players_ids"][1]] = (
        InfluenceEvaderTorchPolicy,
        hp["env_class"]().OBSERVATION_SPACE,
        hp["env_class"].ACTION_SPACE,
        exploiter_policy_config,
    )

    return rllib_config
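# Hypothetical `hp` fragment (added for illustration): these are the keys
# `_set_config_to_use_exploiter` reads. The numbers are placeholders; the two
# exploiter values act as fractions of the total number of training steps,
# since they are multiplied by n_epi * n_steps_per_epi above.
_EXAMPLE_EXPLOITER_HP = {
    "n_epi": 400,
    "n_steps_per_epi": 20,
    "env_class": IteratedPrisonersDilemma,  # any env exposing both spaces
    "against_evader_exploiter": {
        "copy_weights_delay": 0.05,
        "start_exploit": 0.50,
    },
}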
APPROXIMATION_METHODS = (
    APPROXIMATION_METHOD_Q_VALUE,
    APPROXIMATION_METHOD_ROLLOUTS,
)
WORKING_STATES = (
    "train_coop",
    "train_selfish",
    "eval_amtft",
    "eval_naive_selfish",
    "eval_naive_coop",
)
WORKING_STATES_IN_EVALUATION = WORKING_STATES[2:]

OWN_COOP_POLICY_IDX = 0
OWN_SELFISH_POLICY_IDX = 1
OPP_COOP_POLICY_IDX = 2
OPP_SELFISH_POLICY_IDX = 3

DEFAULT_NESTED_POLICY_SELFISH = DQNTorchPolicy.with_updates(
    stats_fn=log.stats_fn_wt_additionnal_logs(build_q_stats))
DEFAULT_NESTED_POLICY_COOP = DEFAULT_NESTED_POLICY_SELFISH.with_updates(
    postprocess_fn=miscellaneous.merge_policy_postprocessing_fn(
        postprocessing.get_postprocessing_welfare_function(
            add_utilitarian_welfare=True,
        ),
        postprocess_nstep_and_prio))

DEFAULT_CONFIG = merge_dicts(
    hierarchical.DEFAULT_CONFIG,
    {
        # One of WORKING_STATES.
        "working_state": WORKING_STATES[0],
        "debit_threshold": 2.0,
        "punishment_multiplier": 6.0,
        "rollout_length": 40,
        "n_rollout_replicas":