Example No. 1
def main(debug):
    train_n_replicates = 1 if debug else 1
    seeds = miscellaneous.get_random_seeds(train_n_replicates)
    exp_name, _ = log.log_in_current_day_dir("DQN_CG_speed_search")

    env = "CoinGame"
    # env = "SSDMixedMotiveCoinGame"
    # welfare_to_use = None
    # welfare_to_use = postprocessing.WELFARE_UTILITARIAN
    welfare_to_use = postprocessing.WELFARE_INEQUITY_AVERSION

    if "SSDMixedMotiveCoinGame" in env:
        env_class = ssd_mixed_motive_coin_game.SSDMixedMotiveCoinGame
    else:
        env_class = coin_game.CoinGame

    hparams = _get_hyperparameters(seeds, debug, exp_name)

    rllib_config, stop_config = _get_rllib_configs(
        hparams, env_class=env_class
    )

    if welfare_to_use is not None:
        rllib_config = _modify_policy_to_use_welfare(
            rllib_config, welfare_to_use
        )

    rllib_config, stop_config = _add_search_to_config(
        rllib_config, stop_config, hparams
    )
    tune_analysis = _train_dqn_and_plot_logs(
        hparams, rllib_config, stop_config
    )

    return tune_analysis
Example No. 2
def _get_hyparameters(debug,
                      env=None,
                      train_n_replicates=None,
                      against_naive_opp=False):
    if debug:
        train_n_replicates = 1
    elif train_n_replicates is None:
        train_n_replicates = 4

    seeds = miscellaneous.get_random_seeds(train_n_replicates)
    exp_name, _ = log.log_in_current_day_dir("LTFT")

    hparameters = {
        "seeds": seeds,
        "debug": debug,
        "exp_name": exp_name,
        "hiddens": [64],
        "log_n_points": 260,
        "clustering_distance": 0.2,
        "gamma": 0.96,
        "env_name": "IteratedPrisonersDilemma" if env is None else env,
        # "env_name": "CoinGame" if env is None else env,
        "reward_uncertainty_std": 0.1,
        # "against_evader_exploiter": None,
        "against_evader_exploiter": {
            "start_exploit": 0.75,
            "copy_weights_delay": 0.05,
        } if not against_naive_opp else None,
    }

    hparameters = _modify_hyperparams_for_the_selected_env(hparameters)

    return hparameters, exp_name
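
A minimal call sketch for the helper above (the argument values are illustrative and assume the same module-level imports as the snippet):

# Hypothetical usage: LTFT hyperparameters for CoinGame in debug mode.
hparameters, exp_name = _get_hyparameters(
    debug=True,
    env="CoinGame",          # overrides the default IteratedPrisonersDilemma
    against_naive_opp=True,  # sets "against_evader_exploiter" to None
)
print(exp_name, hparameters["env_name"])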
Example No. 3
def init_worker(actions_list=None):
    train_n_replicates = 1
    debug = True
    stop_iters = 200
    tf = False
    seeds = miscellaneous.get_random_seeds(train_n_replicates)
    exp_name, _ = log.log_in_current_day_dir("testing")

    rllib_config, stop_config = get_rllib_config(seeds, debug, stop_iters, tf)
    rllib_config["env"] = FakeEnvWtCstReward
    rllib_config["env_config"]["max_steps"] = EPI_LENGTH
    rllib_config["seed"] = int(time.time())
    if actions_list is not None:
        for policy_id in FakeEnvWtCstReward({}).players_ids:
            policy_to_modify = list(
                rllib_config["multiagent"]["policies"][policy_id]
            )
            policy_to_modify[0] = make_FakePolicyWtDefinedActions(
                copy.deepcopy(actions_list)
            )
            rllib_config["multiagent"]["policies"][
                policy_id
            ] = policy_to_modify

    pg_trainer = PGTrainer(
        rllib_config, logger_creator=_get_logger_creator(exp_name)
    )
    return pg_trainer.workers._local_worker
Example No. 4
    def __init__(
        self,
        exp_name: str,
        local_mode: bool = False,
        use_random_policy_from_own_checkpoint: bool = False,
        use_wandb: bool = False,
    ):
        """
        You should take a look at examples using this class.
        Any training is deactivated here. Only the worker rollout will evaluate
        your policy on the environment.
        Any exploration is deactivated.

        Works for a unique pair of RLLib policies.

        :param exp_name: Normal exp_name argument provided to tune.run().
        :param use_random_policy_from_own_checkpoint: (optional, default to False)
        """
        self.default_selected_order = 0
        self.running_in_local_mode = local_mode
        self.use_wandb = use_wandb
        self.exp_name, self.exp_parent_dir = log.log_in_current_day_dir(
            exp_name)
        self.results_file_name = "SelfAndCrossPlay_save.p"
        self.save_path = os.path.join(self.exp_parent_dir,
                                      self.results_file_name)
        # TODO this var name is not clear enough
        self.use_random_policy_from_own_checkpoint = (
            use_random_policy_from_own_checkpoint)

        self.experiment_defined = False
        self.checkpoints_loaded = False
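
A minimal usage sketch for this evaluator, assuming the __init__ above belongs to the self_and_cross_perf.SelfAndCrossPlayEvaluator class shown in Example No. 13 and that get_rllib_config, get_random_seeds and PGTrainer are available as in that example:

exp_name, _ = log.log_in_current_day_dir("testing")
rllib_config, stop_config = get_rllib_config(seeds=get_random_seeds(1))

# local_mode and use_wandb are the optional flags from the signature above.
evaluator = self_and_cross_perf.SelfAndCrossPlayEvaluator(
    exp_name=exp_name,
    local_mode=False,
    use_wandb=False,
)
evaluator.define_the_experiment_to_run(
    evaluation_config=rllib_config,
    stop_config=stop_config,
    TrainerClass=PGTrainer,
)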
Example No. 5
def main(debug, stop_iters=2000, tf=False):
    train_n_replicates = 1 if debug else 1
    seeds = miscellaneous.get_random_seeds(train_n_replicates)
    exp_name, _ = log.log_in_current_day_dir("PPO_AsymCG")

    ray.init()

    stop = {
        "training_iteration": 2 if debug else stop_iters,
    }

    env_config = {
        "players_ids": ["player_red", "player_blue"],
        "max_steps": 20,
        "grid_size": 3,
        "get_additional_info": True,
    }

    rllib_config = {
        "env": AsymCoinGame,
        "env_config": env_config,
        "multiagent": {
            "policies": {
                env_config["players_ids"][0]:
                (None, AsymCoinGame(env_config).OBSERVATION_SPACE,
                 AsymCoinGame.ACTION_SPACE, {}),
                env_config["players_ids"][1]:
                (None, AsymCoinGame(env_config).OBSERVATION_SPACE,
                 AsymCoinGame.ACTION_SPACE, {}),
            },
            "policy_mapping_fn": lambda agent_id: agent_id,
        },
        # Size of batches collected from each worker.
        "rollout_fragment_length": 20,
        # Number of timesteps collected for each SGD round. This defines the size
        # of each SGD epoch.
        "train_batch_size": 512,
        "model": {
            "dim": env_config["grid_size"],
            "conv_filters": [[16, [3, 3], 1],
                             [32, [3, 3],
                              1]]  # [Channel, [Kernel, Kernel], Stride]]
        },
        "lr": 5e-3,
        "seed": tune.grid_search(seeds),
        "callbacks": log.get_logging_callbacks_class(),
        "num_gpus": int(os.environ.get("RLLIB_NUM_GPUS", "0")),
        "framework": "tf" if tf else "torch",
    }

    tune_analysis = tune.run(PPOTrainer,
                             config=rllib_config,
                             stop=stop,
                             checkpoint_freq=0,
                             checkpoint_at_end=True,
                             name=exp_name)
    ray.shutdown()
    return tune_analysis
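
A minimal entry-point sketch for running this example directly (assuming it lives in its own module; the flag values are illustrative):

if __name__ == "__main__":
    # Short debug run with the torch framework; set debug=False for a full run.
    main(debug=True, stop_iters=2000, tf=False)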
Example No. 6
def main(debug):
    train_n_replicates = 2 if debug else 40
    timestamp = int(time.time())
    seeds = [seed + timestamp for seed in list(range(train_n_replicates))]

    exp_name, _ = log.log_in_current_day_dir("LOLA_DICE")

    hparams = {

        "load_plot_data": None,
        # IPD
        # Example: "load_plot_data": ".../SameAndCrossPlay_save.p",

        "exp_name": exp_name,
        "train_n_replicates": train_n_replicates,
        "env": "IPD",
        # "env": "IMP",
        # "env": "AsymBoS",
        # "env": "CoinGame",
        # "env": "AsymCoinGame",

        "gamma": None,
        "trace_length": 10 if debug else None,

        "epochs": 2 if debug else 200,
        "lr_inner": .1,
        "lr_outer": .2,
        "lr_value": .1,
        "lr_om": .1,
        "inner_asymm": True,
        "n_agents": 2,
        "n_inner_steps": 1 if debug else 2,
        "batch_size": 10 if debug else 64,
        "value_batch_size": 16,
        "value_epochs": 0,
        "om_batch_size": 16,
        "om_epochs": 0,
        "grid_size": 3,
        "use_baseline": False,
        "use_dice": True,
        "use_opp_modeling": False,

        "seed": tune.grid_search(seeds),
        "metric": "ag_0_returns_player_1",
    }

    if hparams["load_plot_data"] is None:
        ray.init(num_cpus=os.cpu_count(), num_gpus=0, local_mode=debug)
        training_results = train(hparams)
    else:
        training_results = None

    evaluate(training_results, hparams, debug)
    ray.shutdown()
Example No. 7
def main(debug, stop_iters=200, tf=False):
    train_n_replicates = 1 if debug else 1
    seeds = miscellaneous.get_random_seeds(train_n_replicates)
    exp_name, _ = log.log_in_current_day_dir("PG_IPD")

    ray.init(num_cpus=os.cpu_count(), num_gpus=0, local_mode=debug)

    rllib_config, stop_config = get_rllib_config(seeds, debug, stop_iters, tf)
    tune_analysis = tune.run(PGTrainer, config=rllib_config, stop=stop_config,
                             checkpoint_freq=0, checkpoint_at_end=True, name=exp_name)
    ray.shutdown()
    return tune_analysis
Example No. 8
def main(debug):
    exp_name, _ = log.log_in_current_day_dir(f"LOLA_PG")

    tune_hparams = {
        "exp_name": exp_name,

        # Dynamically set
        "num_episodes": 3 if debug else None,
        "trace_length": 6 if debug else None,
        "lr": None,
        "gamma": None,
        "batch_size": 12 if debug else None,

        # "exp_name": "IPD",
        # "exp_name": "IMP",
        "exp_name": "CoinGame",
        # "exp_name": "AsymCoinGame",
        "pseudo": False,
        "grid_size": 3,
        "lola_update": True,
        "opp_model": False,
        "mem_efficient": True,
        "lr_correction": 1,
        "bs_mul": 1 / 10,
        "simple_net": True,
        "hidden": 32,
        "reg": 0,
        "set_zero": 0,
        "exact": False,
        "warmup": 1,
        "seed": 1,
        "changed_config": False,
        "ac_lr": 1.0,
        "summary_len": 1,
        "use_MAE": False,
        "use_toolbox_env": True,
        "clip_loss_norm": False,
        "clip_lola_update_norm": False,
        "clip_lola_correction_norm": 3.0,
        "clip_lola_actor_norm": 10.0,
        "entropy_coeff": 0.001,
        "weigth_decay": 0.03,
    }
    tune_config = get_tune_config(tune_hparams)

    ray.init(num_cpus=os.cpu_count(), num_gpus=0)
    tune_analysis = tune.run(lola_training,
                             name=tune_hparams["exp_name"],
                             config=tune_config)
    ray.shutdown()

    return tune_analysis
Example No. 9
def main(debug, welfare=postprocessing.WELFARE_UTILITARIAN):
    train_n_replicates = 1 if debug else 1
    seeds = miscellaneous.get_random_seeds(train_n_replicates)
    exp_name, _ = log.log_in_current_day_dir("DQN_welfare_CG")

    hparams = dqn_coin_game._get_hyperparameters(seeds, debug, exp_name)
    rllib_config, stop_config = dqn_coin_game._get_rllib_configs(hparams)
    rllib_config = _modify_policy_to_use_welfare(rllib_config, welfare)

    tune_analysis = dqn_coin_game._train_dqn_and_plot_logs(
        hparams, rllib_config, stop_config)

    return tune_analysis
Example No. 10
def main(debug):
    train_n_replicates = 1 if debug else 1
    seeds = miscellaneous.get_random_seeds(train_n_replicates)
    exp_name, _ = log.log_in_current_day_dir("DQN_CG")

    hparams = _get_hyperparameters(seeds, debug, exp_name)

    rllib_config, stop_config = _get_rllib_configs(hparams)

    tune_analysis = _train_dqn_and_plot_logs(
        hparams, rllib_config, stop_config)

    return tune_analysis
Example No. 11
def main(debug):
    exp_name, _ = log.log_in_current_day_dir("L1BR_amTFT")

    train_n_replicates = 4 if debug else 8
    pool_of_seeds = miscellaneous.get_random_seeds(train_n_replicates)
    hparams = {
        "debug": debug,
        "filter_utilitarian": False,
        "train_n_replicates": train_n_replicates,
        "seeds": pool_of_seeds,
        "exp_name": exp_name,
        "n_steps_per_epi": 20,
        "bs_epi_mul": 4,
        "welfare_functions":
        [(postprocessing.WELFARE_UTILITARIAN, "utilitarian")],
        "amTFTPolicy": amTFT.amTFTRolloutsTorchPolicy,
        "explore_during_evaluation": True,
        "n_seeds_lvl0": train_n_replicates,
        "n_seeds_lvl1": train_n_replicates // 2,
        "gamma": 0.5,
        "lambda": 0.9,
        "alpha": 0.0,
        "beta": 1.0,
        "temperature_schedule": False,
        "debit_threshold": 4.0,
        "jitter": 0.05,
        "hiddens": [64],
        "env": matrix_sequential_social_dilemma.IteratedPrisonersDilemma,
        # "env": matrix_sequential_social_dilemma.IteratedAsymBoS,
        # "env": matrix_sequential_social_dilemma.IteratedAsymChicken,
        # "env": coin_game.CoinGame
        # "env": coin_game.AsymCoinGame

        # For training speed
        "min_iter_time_s": 0.0 if debug else 3.0,
        "overwrite_reward": True,
        "use_adam": False,
    }

    ray.init(num_cpus=os.cpu_count(), num_gpus=0, local_mode=hparams["debug"])

    hparams = amtft_various_env.modify_hyperparams_for_the_selected_env(
        hparams)
    lvl0_tune_analysis = train_lvl0_population(hp=hparams)
    tune_analysis_lvl1 = train_lvl1_agents(
        hp_lvl1=copy.deepcopy(hparams), tune_analysis_lvl0=lvl0_tune_analysis)
    print(tune_analysis_lvl1.results_df.columns)
    print(tune_analysis_lvl1.results_df.head())

    ray.shutdown()
Example No. 12
def main(debug, use_rllib_policy=False):
    """
    The planner is not modified yet to work with
    policies/agents created with RLLib.
    """
    if use_rllib_policy:
        logger.warning(
            "not possible to use the planner with "
            f"use_rllib_policy: {use_rllib_policy}"
        )

    train_n_replicates = 1 if debug else 5
    timestamp = int(time.time())
    seeds = [seed + timestamp for seed in list(range(train_n_replicates))]

    exp_name, _ = log.log_in_current_day_dir("adaptive_mechanism_design")

    hyperparameters = {
        "exp_name": exp_name,
        "seed": tune.grid_search(seeds),
        "debug": debug,
        "report_every_n": 1,
        "fear": 1,
        # "greed": -1,
        # Selecting greed = 1 to be sure that the agents without a planner
        # learn DD (needed when using the non-simple network)
        "greed": 1,
        "with_redistribution": False,
        "n_planning_eps": math.inf,
        "value_fn_variant": "exact",
        # "value_fn_variant": 'estimated',
        # "value_fn_variant": tune.grid_search(['exact', 'estimated']),
        "action_flip_prob": 0,
        "n_players": 2,
        "with_planner": True and not use_rllib_policy,
        # "with_planner": False,
        # "with_planner": tune.grid_search([True, False]),
        # "env": "FearGreedMatrix",
        "env": "CoinGame",
        "normalize_against_vp": False,
        "normalize_against_v": False,
        "normalize_vp_separated": False,
        "use_rllib_polcy": use_rllib_policy,
    }

    hyperparameters = add_env_hp(hyperparameters)

    train(hyperparameters)
Example No. 13
def _init_evaluator():
    exp_name, _ = log.log_in_current_day_dir("testing")

    rllib_config, stop_config = get_rllib_config(seeds=get_random_seeds(1))

    evaluator = self_and_cross_perf.SelfAndCrossPlayEvaluator(
        exp_name=exp_name,
    )
    evaluator.define_the_experiment_to_run(
        evaluation_config=rllib_config,
        stop_config=stop_config,
        TrainerClass=PGTrainer,
    )

    return evaluator
Example No. 14
def main(debug):
    exp_name, _ = log.log_in_current_day_dir(f"LOLA_DICE")

    tune_hparams = {
        "debug": debug,
        "exp_name": exp_name,

        # "env_name": "IPD",
        # "env_name": "IMP",
        "env_name": "CoinGame",
        # "env_name": "AsymCoinGame",

        "gamma": None,
        "trace_length": None,

        "epochs": 0.2 if debug else 200,
        "lr_inner": .1,
        "lr_outer": .2,
        "lr_value": .1,
        "lr_om": .1,
        "inner_asymm": True,
        "n_agents": 2,
        "n_inner_steps": 1 if debug else 2,
        "batch_size": 4 if debug else 64,
        "value_batch_size": 16,
        "value_epochs": 0,
        "om_batch_size": 16,
        "om_epochs": 0,
        "grid_size": 3,
        "use_baseline": False,
        "use_dice": True,
        "use_opp_modeling": False,

        "seed": 1 if debug else tune.grid_search([1, 2, 3, 4, 5]),
    }

    tune_config = get_tune_config(tune_hparams)

    ray.init(num_cpus=os.cpu_count(), num_gpus=0)
    tune_analysis = tune.run(lola_training, name=tune_hparams["exp_name"], config=tune_config)
    ray.shutdown()

    return tune_analysis
Example No. 15
def main(debug):
    train_n_replicates = 2 if debug else 40
    seeds = miscellaneous.get_random_seeds(train_n_replicates)

    exp_name, _ = log.log_in_current_day_dir("LOLA_Exact")

    hparams = {
        "load_plot_data": None,
        # Example "load_plot_data": ".../SameAndCrossPlay_save.p",
        "exp_name": exp_name,
        "train_n_replicates": train_n_replicates,
        "env": "IPD",
        # "env": "IMP",
        # "env": "AsymBoS",
        "num_episodes": 5 if debug else 50,
        "trace_length": 5 if debug else 200,
        "simple_net": True,
        "corrections": True,
        "pseudo": False,
        "num_hidden": 32,
        "reg": 0.0,
        "lr": 1.,
        "lr_correction": 1.0,
        "gamma": 0.96,
        "seed": tune.grid_search(seeds),
        "metric": "ret1",
        "with_linear_LR_decay_to_zero": False,
        "clip_update": None,

        # "with_linear_LR_decay_to_zero": True,
        # "clip_update": 0.1,
        # "lr": 0.001,
    }

    if hparams["load_plot_data"] is None:
        ray.init(num_cpus=os.cpu_count(), num_gpus=0, local_mode=debug)
        tune_analysis_per_exp = train(hparams)
    else:
        tune_analysis_per_exp = None

    evaluate(tune_analysis_per_exp, hparams)
    ray.shutdown()
Example No. 16
def _train_pg_in_ipd(train_n_replicates):
    debug = True
    stop_iters = 200
    tf = False
    seeds = miscellaneous.get_random_seeds(train_n_replicates)
    exp_name, _ = log.log_in_current_day_dir("testing")

    ray.init(num_cpus=os.cpu_count(), num_gpus=0, local_mode=debug)

    rllib_config, stop_config = get_rllib_config(seeds, debug, stop_iters, tf)
    tune_analysis = tune.run(PGTrainer,
                             config=rllib_config,
                             stop=stop_config,
                             checkpoint_freq=0,
                             checkpoint_at_end=True,
                             name=exp_name,
                             metric="episode_reward_mean",
                             mode="max")
    ray.shutdown()
    return tune_analysis, seeds
Example No. 17
def main(debug):
    train_n_replicates = 1 if debug else 1
    timestamp = int(time.time())
    seeds = [seed + timestamp for seed in list(range(train_n_replicates))]

    exp_name, _ = log.log_in_current_day_dir("adaptive_mechanism_design")

    hyperparameters = {
        "exp_name": exp_name,
        "seed": tune.grid_search(seeds),
        "debug": debug,
        "report_every_n": 1,
        "fear": 1,

        # "greed": -1,
        "greed":
        1,  # Selecting greed = 1 to be sure that the agents without planner learns DD
        # (needed when using the  not simple network)
        "with_redistribution": False,
        "n_planning_eps": math.inf,
        "value_fn_variant": 'exact',
        # "value_fn_variant": 'estimated',
        # "value_fn_variant": tune.grid_search(['exact', 'estimated']),
        "action_flip_prob": 0,
        "n_players": 2,

        # "with_planner": True,
        "with_planner": False,
        # "with_planner": tune.grid_search([True, False]),

        # "env": "FearGreedMatrix",
        "env": "CoinGame",
        "normalize_against_vp": False,
        "normalize_against_v": False,
        "normalize_vp_separated": False,
    }

    hyperparameters = add_env_hp(hyperparameters)

    train(hyperparameters)
Example No. 18
def get_hyperparameters(debug, env):
    exp_name, _ = log.log_in_current_day_dir("L1BR_amTFT")

    train_n_replicates = 4 if debug else 8
    pool_of_seeds = miscellaneous.get_random_seeds(train_n_replicates)
    hparams = {
        "debug": debug,
        "filter_utilitarian": False,
        "train_n_replicates": train_n_replicates,
        "seeds": pool_of_seeds,
        "exp_name": exp_name,
        "welfare_functions":
        [(postprocessing.WELFARE_UTILITARIAN, "utilitarian")],
        "amTFTPolicy": amTFT.AmTFTRolloutsTorchPolicy,
        "explore_during_evaluation": True,
        "n_seeds_lvl0": train_n_replicates,
        "n_seeds_lvl1": train_n_replicates // 2,
        "gamma": 0.96,
        "temperature_schedule": False,
        "jitter": 0.05,
        "hiddens": [64],
        "env_name": "IteratedPrisonersDilemma",
        # "env_name": "IteratedAsymBoS",
        # "env_name": "IteratedAsymChicken",
        # "env_name": "CoinGame",
        # "env_name": "AsymCoinGame",
        "overwrite_reward": True,
        "reward_uncertainty": 0.0,
    }

    if env is not None:
        hparams["env_name"] = env

    hparams = amtft_various_env.modify_hyperparams_for_the_selected_env(
        hparams)

    return hparams, exp_name
Example No. 19
def main(debug):
    train_n_replicates = 1 if debug else 1
    seeds = miscellaneous.get_random_seeds(train_n_replicates)
    exp_name, _ = log.log_in_current_day_dir("LTFT_IPD")

    hparameters = {
        "n_epi": 10 if debug else 200,
        "n_steps_per_epi": 20,
        "bs_epi_mul": 4,
        "base_lr": 0.04,
        "spl_lr_mul": 10.0,
        "seeds": seeds,
        "debug": debug,
    }

    rllib_config, env_config, stop = get_rllib_config(hparameters)
    ray.init(num_cpus=os.cpu_count(), num_gpus=0)
    print("\n========== Training LTFT in self-play ==========\n")
    tune_analysis_self_play = ray.tune.run(
        DQNTrainer, config=rllib_config, verbose=1, checkpoint_freq=0,
        stop=stop, checkpoint_at_end=True, name=exp_name)

    print("\n========== Training LTFT against a naive opponent ==========\n")
    # Set player_col to use a naive policy
    rllib_config["multiagent"]["policies"][env_config["players_ids"][1]] = (
        None,
        IteratedPrisonersDilemma.OBSERVATION_SPACE,
        IteratedPrisonersDilemma.ACTION_SPACE,
        {}
    )
    tune_analysis_naive_opponent = ray.tune.run(
        DQNTrainer, config=rllib_config, verbose=1, checkpoint_freq=0,
        stop=stop, checkpoint_at_end=True, name=exp_name)

    ray.shutdown()
    return tune_analysis_self_play, tune_analysis_naive_opponent
Example No. 20
def main(debug):
    n_in_lvl0_population = 2 if debug else 40
    n_lvl1 = 1 if debug else 1
    timestamp = int(time.time())
    lvl0_seeds = [
        seed + timestamp for seed in list(range(n_in_lvl0_population))
    ]
    lvl1_seeds = list(range(n_lvl1))

    exp_name, _ = log.log_in_current_day_dir("L1BR_LOLA_PG")

    tune_hparams = {
        "exp_name": exp_name,
        "load_data": None,
        # Example: "load_data": ".../lvl1_results.p",
        "load_population": None,
        # Example: "load_population":
        # [".../checkpoint.json", ".../checkpoint.json", ...]
        "num_episodes": 5 if debug else 2000,
        "trace_length": 5 if debug else 20,
        "lr": None,
        "gamma": 0.5,
        "batch_size": 5 if debug else 512,
        # "env_name": "IteratedPrisonersDilemma",
        # "env_name": "IteratedBoS",
        # "env_name": "IteratedAsymBoS",
        "env_name": "VectorizedCoinGame",
        # "env_name": "AsymVectorizedCoinGame",
        "pseudo": False,
        "grid_size": 3,
        "lola_update": True,
        "opp_model": False,
        "mem_efficient": True,
        "lr_correction": 1,
        "bs_mul": 1 / 10,
        "simple_net": True,
        "hidden": 32,
        "reg": 0,
        "set_zero": 0,
        "exact": False,
        "warmup": 1,
        "lvl0_seeds": lvl0_seeds,
        "lvl1_seeds": lvl1_seeds,
        "changed_config": False,
        "ac_lr": 1.0,
        "summary_len": 1,
        "use_MAE": False,
        "use_toolbox_env": True,
        "clip_loss_norm": False,
        "clip_lola_update_norm": False,
        "clip_lola_correction_norm": 3.0,
        "clip_lola_actor_norm": 10.0,
        "entropy_coeff": 0.001,
        "weigth_decay": 0.03,
        "lola_correction_multiplier": 1,
        "lr_decay": True,
        "correction_reward_baseline_per_step": False,
        "use_critic": False,
    }

    rllib_hparams = {
        "debug": debug,
        "n_steps_per_epi": 20,
        "bs_epi_mul": 4,
        "sgd_momentum": 0.9,
        "temperature_schedule": False,
    }

    if tune_hparams["load_data"] is None:
        ray.init(num_cpus=os.cpu_count(), num_gpus=0)

        # # Train
        if tune_hparams["load_population"] is None:
            results_list_lvl0 = train_lvl0_population(tune_hp=tune_hparams)
            log.save_metrics(results_list_lvl0, exp_name, "lvl0_results.p")
        else:
            results_list_lvl0 = []

        results_list_lvl1 = train_lvl1_agents(
            tune_hp=tune_hparams,
            rllib_hp=rllib_hparams,
            results_list_lvl0=results_list_lvl0,
        )
        log.save_metrics(results_list_lvl1,
                         exp_name,
                         "lvl1_results.p",
                         limit=True)

        ray.shutdown()
    else:
        # TODO print that every time, not only when loading
        log.pprint_saved_metrics(
            tune_hparams["load_data"],
            keywords_to_print=[
                "policy_reward_mean",
                "speed.*mean",
                "own.*mean",
                "analysis",
                "^avg$",
                "last-10-avg",
            ],
        )
Example No. 21
def main(debug: bool, env=None):
    """
    Train several LOLA_PG pairs of agent on the selected environment and
    plot their performances in self-play and cross-play.

    :param debug: selection of debug mode using less compute
    :param env: option to overwrite the env selection
    """
    train_n_replicates = 2 if debug else 1
    timestamp = int(time.time())
    seeds = [seed + timestamp for seed in list(range(train_n_replicates))]

    exp_name, _ = log.log_in_current_day_dir("LOLA_PG")

    # The InfluenceEvader(like)
    use_best_exploiter = False
    # use_best_exploiter = True

    high_coop_speed_hp = True if use_best_exploiter else False
    # high_coop_speed_hp = True

    tune_hparams = {
        "debug":
        debug,
        "exp_name":
        exp_name,
        "train_n_replicates":
        train_n_replicates,
        # wandb configuration
        "wandb":
        None if debug else {
            "project":
            "LOLA_PG",
            "group":
            exp_name,
            "api_key_file":
            os.path.join(os.path.dirname(__file__), "../../../api_key_wandb"),
            "log_config":
            True,
        },
        # Print metrics
        "load_plot_data":
        None,
        # Example: "load_plot_data": ".../SelfAndCrossPlay_save.p",
        #
        # "gamma": 0.5,
        # "num_episodes": 3 if debug else 4000 if high_coop_speed_hp else 2000,
        # "trace_length": 4 if debug else 20,
        # "lr": None,
        #
        # "gamma": 0.875,
        # "lr": 0.005 / 4,
        # "num_episodes": 3 if debug else 4000,
        # "trace_length": 4 if debug else 20,
        #
        "gamma":
        0.9375,
        "lr":
        0.005 / 4 if debug else tune.grid_search(
            [0.005 / 4, 0.005 / 4 / 2, 0.005 / 4 / 2 / 2]),
        "num_episodes":
        3 if debug else tune.grid_search([4000, 8000]),
        "trace_length":
        4 if debug else tune.grid_search([40, 80]),
        #
        "batch_size":
        8 if debug else 512,
        # "env_name": "IteratedPrisonersDilemma" if env is None else env,
        # "env_name": "IteratedAsymBoS" if env is None else env,
        "env_name":
        "VectorizedCoinGame" if env is None else env,
        # "env_name": "AsymVectorizedCoinGame" if env is None else env,
        # "env_name": "VectorizedMixedMotiveCoinGame" if env is None else env,
        "pseudo":
        False,
        "grid_size":
        3,
        "lola_update":
        True,
        "opp_model":
        False,
        "mem_efficient":
        True,
        "lr_correction":
        1,
        "bs_mul":
        1 / 10 * 3 if use_best_exploiter else 1 / 10,
        "simple_net":
        True,
        "hidden":
        32,
        "reg":
        0,
        "set_zero":
        0,
        "exact":
        False,
        "warmup":
        1,
        "seed":
        tune.grid_search(seeds),
        "changed_config":
        False,
        "ac_lr":
        1.0,
        "summary_len":
        1,
        "use_MAE":
        False,
        "use_toolbox_env":
        True,
        "clip_loss_norm":
        False,
        "clip_lola_update_norm":
        False,
        "clip_lola_correction_norm":
        3.0,
        # "clip_lola_correction_norm":
        # tune.grid_search([3.0 / 2, 3.0, 3.0 * 2]),
        "clip_lola_actor_norm":
        10.0,
        # "clip_lola_actor_norm": tune.grid_search([10.0 / 2, 10.0, 10.0 * 2]),
        "entropy_coeff":
        0.001,
        # "entropy_coeff": tune.grid_search([0.001/2/2, 0.001/2, 0.001]),
        # "weigth_decay": 0.03,
        "weigth_decay":
        0.03 if debug else tune.grid_search(
            [0.03 / 8 / 2 / 2, 0.03 / 8 / 2, 0.03 / 8]),
        # "lola_correction_multiplier": 1,
        "lola_correction_multiplier":
        1 if debug else tune.grid_search([1 * 4, 1 * 4 * 2, 1 * 4 * 2 * 2]),
        "lr_decay":
        True,
        "correction_reward_baseline_per_step":
        False,
        "use_critic":
        False,
        "plot_keys": [
            "reward",
            "total_reward",
            "entrop",
        ],
        "plot_assemblage_tags": [
            ("total_reward", ),
            ("entrop", ),
        ],
    }

    # Add exploiter hyperparameters
    tune_hparams.update({
        "start_using_exploiter_at_update_n":
        1 if debug else 3000 if high_coop_speed_hp else 1500,
        # PG exploiter
        "use_PG_exploiter":
        True if use_best_exploiter else False,
        "every_n_updates_copy_weights":
        1 if debug else 100,
        # "adding_scaled_weights": False,
        # "adding_scaled_weights": 0.33,
    })

    if tune_hparams["load_plot_data"] is None:
        ray.init(num_cpus=os.cpu_count(), num_gpus=0, local_mode=debug)
        tune_analysis_per_exp = _train(tune_hparams)
    else:
        tune_analysis_per_exp = None

    _evaluate(tune_hparams, debug, tune_analysis_per_exp)
    ray.shutdown()
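
A minimal invocation sketch for this entry point (assuming it is importable from its example module; the env string must be one of the names handled above):

# Hypothetical debug run on the asymmetric vectorized Coin Game.
main(debug=True, env="AsymVectorizedCoinGame")

# Full run on the default env ("VectorizedCoinGame"):
# main(debug=False)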
Example No. 22

# Snippet truncated: these lines are the end of a setup() override on the
# CrossPlayTraining trainable used in tune.run() below.
#         del config['match_mode']
#         del config['cond_params']
        super().setup(config)

if __name__ == '__main__':
    # each logical step of training contains several episodes, each episode is a batch of games
    training_steps = config['training_episodes'] // config['episodes_per_step']
    print(f'Num of training steps: {training_steps}')
    print(f'Episodes per step: {config["episodes_per_step"]}')
    # arbitrator in population training hasn't been tested
    assert not (config['enable_arbitrator']
                and isinstance(config['hidden_embedding_sizes'][0], list))

    exp_name_expanded, exp_dir = log.log_in_current_day_dir(config['name'])

    analysis = tune.run(
        name=exp_name_expanded,
        run_or_experiment=CrossPlayTraining,
        stop={"training_iteration": training_steps},
        config=config,
        checkpoint_freq=1000,
        checkpoint_at_end=True,
        metric='prosocial_reward',
        mode='max',
    )

    log.save_metrics(analysis, exp_name_expanded, "metrics.pickle")
    # log.pprint_saved_metrics(os.path.join(os.path.expanduser('~/ray_results'), exp_name_expanded, 'metrics.pickle'))
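
A small worked example of the step arithmetic in the comment above (the numbers are purely illustrative):

# 10_000 training episodes grouped into 50 episodes per logical step
# gives 200 tune training iterations.
example_config = {"training_episodes": 10_000, "episodes_per_step": 50}
training_steps = example_config["training_episodes"] // example_config["episodes_per_step"]
assert training_steps == 200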
Example No. 23
def init_worker(
    n_rollout_replicas,
    max_steps,
    actions_list_0=None,
    actions_list_1=None,
    actions_list_2=None,
    actions_list_3=None,
):
    train_n_replicates = 1
    debug = True
    exp_name, _ = log.log_in_current_day_dir("testing")

    hparams = get_hyperparameters(
        debug,
        train_n_replicates,
        filter_utilitarian=False,
        env="IteratedPrisonersDilemma",
    )

    _, _, rllib_config = get_rllib_config(
        hparams, welfare_fn=postprocessing.WELFARE_UTILITARIAN
    )

    rllib_config["env"] = FakeEnvWtActionAsReward
    rllib_config["env_config"]["max_steps"] = max_steps
    rllib_config["seed"] = int(time.time())
    for policy_id in FakeEnvWtActionAsReward({}).players_ids:
        policy_to_modify = list(
            rllib_config["multiagent"]["policies"][policy_id]
        )
        policy_to_modify[3]["rollout_length"] = max_steps
        policy_to_modify[3]["n_rollout_replicas"] = n_rollout_replicas
        policy_to_modify[3]["verbose"] = 1
        if actions_list_0 is not None:
            policy_to_modify[3]["nested_policies"][0][
                "Policy_class"
            ] = make_FakePolicyWtDefinedActions(
                copy.deepcopy(actions_list_0), DEFAULT_NESTED_POLICY_COOP
            )
        if actions_list_1 is not None:
            policy_to_modify[3]["nested_policies"][1][
                "Policy_class"
            ] = make_FakePolicyWtDefinedActions(
                copy.deepcopy(actions_list_1), DEFAULT_NESTED_POLICY_SELFISH
            )
        if actions_list_2 is not None:
            policy_to_modify[3]["nested_policies"][2][
                "Policy_class"
            ] = make_FakePolicyWtDefinedActions(
                copy.deepcopy(actions_list_2), DEFAULT_NESTED_POLICY_COOP
            )
        if actions_list_3 is not None:
            policy_to_modify[3]["nested_policies"][3][
                "Policy_class"
            ] = make_FakePolicyWtDefinedActions(
                copy.deepcopy(actions_list_3), DEFAULT_NESTED_POLICY_SELFISH
            )
        rllib_config["multiagent"]["policies"][policy_id] = tuple(
            policy_to_modify
        )

    dqn_trainer = DQNTrainer(
        rllib_config, logger_creator=_get_logger_creator(exp_name)
    )
    worker = dqn_trainer.workers._local_worker

    am_tft_policy_row = worker.get_policy("player_row")
    am_tft_policy_col = worker.get_policy("player_col")
    am_tft_policy_row.working_state = WORKING_STATES[2]
    am_tft_policy_col.working_state = WORKING_STATES[2]

    return worker, am_tft_policy_row, am_tft_policy_col
Example No. 24
def main(debug):
    exp_name, _ = log.log_in_current_day_dir("InequityAversion")

    ray.init(num_cpus=os.cpu_count(), num_gpus=0)

    stop = {"episodes_total": 10 if debug else 400}

    env_config = {
        "max_steps": 10,
        "players_ids": ["player_row", "player_col"],
    }

    policies = {
        env_config["players_ids"][0]:
        (None, IteratedBoSAndPD.OBSERVATION_SPACE,
         IteratedBoSAndPD.ACTION_SPACE, {}),
        env_config["players_ids"][1]:
        (None, IteratedBoSAndPD.OBSERVATION_SPACE,
         IteratedBoSAndPD.ACTION_SPACE, {})
    }

    rllib_config = {
        "env":
        IteratedBoSAndPD,
        "env_config":
        env_config,
        "num_gpus":
        0,
        "num_workers":
        1,
        "multiagent": {
            "policies": policies,
            "policy_mapping_fn": (lambda agent_id: agent_id),
        },
        "framework":
        "torch",
        "gamma":
        0.5,
        "callbacks":
        callbacks.merge_callbacks(
            log.get_logging_callbacks_class(),
            postprocessing.OverwriteRewardWtWelfareCallback),
    }

    MyPGTorchPolicy = PGTorchPolicy.with_updates(
        postprocess_fn=miscellaneous.merge_policy_postprocessing_fn(
            postprocessing.welfares_postprocessing_fn(
                add_inequity_aversion_welfare=True,
                inequity_aversion_beta=1.0,
                inequity_aversion_alpha=0.0,
                inequity_aversion_gamma=1.0,
                inequity_aversion_lambda=0.5),
            pg_torch_policy.post_process_advantages))
    MyPGTrainer = PGTrainer.with_updates(default_policy=MyPGTorchPolicy,
                                         get_policy_class=None)
    tune_analysis = tune.run(MyPGTrainer,
                             stop=stop,
                             checkpoint_freq=10,
                             config=rllib_config,
                             name=exp_name)
    ray.shutdown()
    return tune_analysis
Example No. 25
def main(debug, train_n_replicates=None, filter_utilitarian=None):

    train_n_replicates = 1 if debug else train_n_replicates
    train_n_replicates = 40 if train_n_replicates is None else train_n_replicates
    n_times_more_utilitarians_seeds = 4
    pool_of_seeds = miscellaneous.get_random_seeds(
        train_n_replicates * (1 + n_times_more_utilitarians_seeds))
    exp_name, _ = log.log_in_current_day_dir("amTFT")
    hparams = {
        "debug":
        debug,
        "filter_utilitarian":
        filter_utilitarian if filter_utilitarian is not None else not debug,
        "train_n_replicates":
        train_n_replicates,
        "n_times_more_utilitarians_seeds":
        n_times_more_utilitarians_seeds,
        "load_plot_data":
        None,
        # Example: "load_plot_data": ".../SelfAndCrossPlay_save.p",
        "exp_name":
        exp_name,
        "n_steps_per_epi":
        20,
        "bs_epi_mul":
        4,
        "welfare_functions":
        [(postprocessing.WELFARE_INEQUITY_AVERSION, "inequity_aversion"),
         (postprocessing.WELFARE_UTILITARIAN, "utilitarian")],
        "seeds":
        pool_of_seeds,
        "amTFTPolicy":
        amTFT.amTFTRolloutsTorchPolicy,
        "explore_during_evaluation":
        True,
        "gamma":
        0.5,
        "lambda":
        0.9,
        "alpha":
        0.0,
        "beta":
        1.0,
        "temperature_schedule":
        False,
        "debit_threshold":
        4.0,
        "jitter":
        0.05,
        "hiddens": [64],

        # If not in self play then amTFT will be evaluated against a naive selfish policy
        "self_play":
        True,
        # "self_play": False, # Not tested

        # "env": matrix_sequential_social_dilemma.IteratedPrisonersDilemma,
        # "utilitarian_filtering_threshold": -2.5,
        "env":
        matrix_sequential_social_dilemma.IteratedAsymBoS,
        "utilitarian_filtering_threshold":
        3.2,
        # "env": matrix_sequential_social_dilemma.IteratedAsymChicken,
        # "utilitarian_filtering_threshold": ...,
        # "env": coin_game.CoinGame
        # "env": coin_game.AsymCoinGame
        # "utilitarian_filtering_threshold": ...,

        # For training speed
        "min_iter_time_s":
        0.0 if debug else 3.0,
        "overwrite_reward":
        True,
        "use_adam":
        False,
    }
    hparams = modify_hyperparams_for_the_selected_env(hparams)

    if hparams["load_plot_data"] is None:
        ray.init(num_cpus=os.cpu_count(),
                 num_gpus=0,
                 local_mode=hparams["debug"])

        # Train
        tune_analysis_per_welfare = train_for_each_welfare_function(hparams)
        # Eval & Plot
        analysis_metrics_per_mode = evaluate_self_and_cross_perf(
            tune_analysis_per_welfare, hparams)

        ray.shutdown()
    else:
        tune_analysis_per_welfare = None
        # Plot
        analysis_metrics_per_mode = evaluate_self_and_cross_perf(
            tune_analysis_per_welfare, hparams)

    return tune_analysis_per_welfare, analysis_metrics_per_mode
Example No. 26
def env_name():
    exp_name, _ = log.log_in_current_day_dir("testing")
    return exp_name
Example No. 27
def main(debug):
    train_n_replicates = 2 if debug else 40
    timestamp = int(time.time())
    seeds = [seed + timestamp for seed in list(range(train_n_replicates))]

    exp_name, _ = log.log_in_current_day_dir("LOLA_PG")

    use_best_exploiter = False
    # use_best_exploiter = True

    high_coop_speed_hp = True if use_best_exploiter else False
    # high_coop_speed_hp = True

    tune_hparams = {
        "debug": debug,
        "exp_name": exp_name,
        "train_n_replicates": train_n_replicates,

        # Print metrics
        "load_plot_data": None,
        # Example: "load_plot_data": ".../SameAndCrossPlay_save.p",

        # Dynamically set
        "num_episodes": 3 if debug else 4000 if high_coop_speed_hp else 2000,
        # "num_episodes": tune.grid_search([2000, 4000, 6000]),
        "trace_length": 4 if debug else 20,
        "lr": None,
        "gamma": 0.5,
        "batch_size": 8 if debug else 512,

        # "env": IteratedPrisonersDilemma,
        # "env": IteratedBoS,
        # "env": IteratedAsymBoS,
        "env": CoinGame,
        # "env": AsymCoinGame,
        "pseudo": False,
        "grid_size": 3,
        "lola_update": True,
        "opp_model": False,
        "mem_efficient": True,
        "lr_correction": 1,
        "bs_mul": 1 / 10 * 3 if use_best_exploiter else 1 / 10,
        "simple_net": True,
        "hidden": 32,
        "reg": 0,
        "set_zero": 0,
        "exact": False,
        "warmup": 1,
        "seed": tune.grid_search(seeds),
        "changed_config": False,
        "ac_lr": 1.0,
        "summary_len": 1,
        "use_MAE": False,

        # "use_toolbox_env": True,
        "clip_loss_norm": False,
        "clip_lola_update_norm": False,
        "clip_lola_correction_norm": 3.0,
        "clip_lola_actor_norm": 10.0,
        "entropy_coeff": 0.001,
        "weigth_decay": 0.03,
        "lola_correction_multiplier": 1,
        # "lola_correction_multiplier": tune.grid_search([1, 0.75, 0.5, 0.25]),
        "lr_decay": True,
        "correction_reward_baseline_per_step": False,
        "use_critic": False,
    }

    # Add exploiter hyperparameters
    tune_hparams.update({
        "playing_against_exploiter":
        False,
        # "playing_against_exploiter": True,
        "start_using_exploiter_at_update_n":
        1 if debug else 3000 if high_coop_speed_hp else 1500,
        # "use_exploiter_on_fraction_of_batch": 0.5 if debug else 1.0,
        "use_exploiter_on_fraction_of_batch":
        0.5 if debug else 0.1,

        # DQN exploiter
        "use_DQN_exploiter":
        False,
        # "use_DQN_exploiter": True,
        "train_exploiter_n_times_per_epi":
        3,
        "exploiter_base_lr":
        0.1,
        "exploiter_decay_lr_in_n_epi":
        3000 if high_coop_speed_hp else 1500,
        "exploiter_stop_training_after_n_epi":
        3000 if high_coop_speed_hp else 1500,
        "exploiter_rolling_avg":
        0.9,
        "always_train_PG":
        True,
        # If not None, the DQN exploiter uses thresholds on opponent cooperation
        # to switch between policies; otherwise it uses the best policy
        # (based on simulated reward).
        # "exploiter_thresholds": None,
        "exploiter_thresholds": [0.6, 0.7] if debug else [0.80, 0.95],

        # PG exploiter
        # "use_PG_exploiter": False,
        "use_PG_exploiter":
        True if use_best_exploiter else False,
        "every_n_updates_copy_weights":
        1 if debug else 100,
        "adding_scaled_weights":
        False,
        # "adding_scaled_weights": 0.33,

        # Destabilizer exploiter
        "use_destabilizer":
        True,
        # "use_destabilizer": False,
    })

    if tune_hparams["load_plot_data"] is None:
        ray.init(num_cpus=os.cpu_count(), num_gpus=0, local_mode=debug)
        tune_analysis_per_exp = train(tune_hparams)
    else:
        tune_analysis_per_exp = None

    evaluate(tune_hparams, debug, tune_analysis_per_exp)
    ray.shutdown()
Example No. 28
def get_hyperparameters(
    debug,
    train_n_replicates=None,
    filter_utilitarian=None,
    env=None,
    reward_uncertainty=0.0,
):
    if debug:
        train_n_replicates = 2
        n_times_more_utilitarians_seeds = 1
    elif train_n_replicates is None:
        n_times_more_utilitarians_seeds = 4
        train_n_replicates = 4
    else:
        n_times_more_utilitarians_seeds = 4

    n_seeds_to_prepare = train_n_replicates * (1 +
                                               n_times_more_utilitarians_seeds)
    pool_of_seeds = miscellaneous.get_random_seeds(n_seeds_to_prepare)
    exp_name, _ = log.log_in_current_day_dir("amTFT")
    hparams = {
        "debug":
        debug,
        "filter_utilitarian":
        filter_utilitarian if filter_utilitarian is not None else not debug,
        "seeds":
        pool_of_seeds,
        "train_n_replicates":
        train_n_replicates,
        "n_times_more_utilitarians_seeds":
        n_times_more_utilitarians_seeds,
        "exp_name":
        exp_name,
        "log_n_points":
        250,
        "load_plot_data":
        None,
        # Example: "load_plot_data": ".../SelfAndCrossPlay_save.p",
        "load_policy_data":
        None,
        # "load_policy_data": {
        #     "Util": [
        #         ".../IBP/amTFT/trials/"
        #         "DQN_AsymCoinGame_...",
        #         ".../IBP/amTFT/trials/"
        #         "DQN_AsymCoinGame_..."],
        #     'IA':[
        #         ".../temp/IBP/amTFT/trials/"
        #         "DQN_AsymCoinGame_...",
        #         ".../IBP/amTFT/trials/"
        #         "DQN_AsymCoinGame_..."],
        # },
        # "load_policy_data": {
        #     "Util": [
        #         "~/dev-maxime/CLR/vm-data/instance-60-cpu-1-preemtible/amTFT"
        #         "/2021_03_28/19_38_55/utilitarian_welfare/coop"
        #         "/DQN_VectMixedMotiveCG_06231_00000_0_seed=1616960338_2021-03-29_00-52-23/checkpoint_250/checkpoint-250",
        #         # "~/dev-maxime/CLR/vm-data/instance-60-cpu-1-preemtible/amTFT"
        #         # "/2021_03_24/18_22_47/utilitarian_welfare/coop"
        #         # "/DQN_VectMixedMotiveCG_e1de7_00001_1_seed=1616610171_2021-03-25_00-27-29/checkpoint_250/checkpoint-250",
        #         # "~/dev-maxime/CLR/vm-data/instance-60-cpu-1-preemtible/amTFT"
        #         # "/2021_03_24/18_22_47/utilitarian_welfare/coop"
        #         # "/DQN_VectMixedMotiveCG_e1de7_00002_2_seed=1616610172_2021-03-25_00-27-29/checkpoint_250/checkpoint-250",
        #         ],
        #     'IA':[
        #         "~/dev-maxime/CLR/vm-data/instance-60-cpu-1-preemtible"
        #         "/amTFT/2021_03_28/19_38_55/inequity_aversion_welfare/coop"
        #         "/DQN_VectMixedMotiveCG_d5a2a_00000_0_seed=1616960335_2021-03-28_21-23-26/checkpoint_250/checkpoint-250",
        #         # "~/dev-maxime/CLR/vm-data/instance-60-cpu-1-preemtible"
        #         # "/amTFT/2021_03_24/18_22_47/inequity_aversion_welfare/coop"
        #         # "/DQN_VectMixedMotiveCG_9cfe6_00001_1_seed=1616610168_2021-03-24_20-22-11/checkpoint_250/checkpoint-250",
        #         # "~/dev-maxime/CLR/vm-data/instance-60-cpu-1-preemtible"
        #         # "/amTFT/2021_03_24/18_22_47/inequity_aversion_welfare/coop"
        #         # "/DQN_VectMixedMotiveCG_9cfe6_00002_2_seed=1616610169_2021-03-24_20-22-11/checkpoint_250/checkpoint-250",
        #         ],
        # },
        # "load_policy_data": {
        #     "Util": [
        #         "~/ray_results/amTFT"
        #         "/2021_03_24/18_22_47/utilitarian_welfare/coop"
        #         "/DQN_VectMixedMotiveCG_e1de7_00000_0_seed=1616610170_2021-03-25_00-27-29/checkpoint_250/checkpoint-250",
        #         "~/ray_results/amTFT"
        #         "/2021_03_24/18_22_47/utilitarian_welfare/coop"
        #         "/DQN_VectMixedMotiveCG_e1de7_00001_1_seed=1616610171_2021-03-25_00-27-29/checkpoint_250/checkpoint-250",
        #         "~/ray_results/amTFT"
        #         "/2021_03_24/18_22_47/utilitarian_welfare/coop"
        #         "/DQN_VectMixedMotiveCG_e1de7_00002_2_seed=1616610172_2021-03-25_00-27-29/checkpoint_250/checkpoint-250",
        #     ],
        #     'IA': [
        #         "~/ray_results"
        #         "/amTFT/2021_03_24/18_22_47/inequity_aversion_welfare/coop"
        #         "/DQN_VectMixedMotiveCG_9cfe6_00000_0_seed=1616610167_2021-03-24_20-22-10/checkpoint_250/checkpoint-250",
        #         "~/ray_results"
        #         "/amTFT/2021_03_24/18_22_47/inequity_aversion_welfare/coop"
        #         "/DQN_VectMixedMotiveCG_9cfe6_00001_1_seed=1616610168_2021-03-24_20-22-11/checkpoint_250/checkpoint-250",
        #         "~/ray_results"
        #         "/amTFT/2021_03_24/18_22_47/inequity_aversion_welfare/coop"
        #         "/DQN_VectMixedMotiveCG_9cfe6_00002_2_seed=1616610169_2021-03-24_20-22-11/checkpoint_250/checkpoint-250",
        #     ],
        # },
        "amTFTPolicy":
        amTFT.AmTFTRolloutsTorchPolicy,
        "welfare_functions": [
            (postprocessing.WELFARE_INEQUITY_AVERSION, "inequity_aversion"),
            (postprocessing.WELFARE_UTILITARIAN, "utilitarian"),
        ],
        "jitter":
        0.05,
        "hiddens": [64],
        "gamma":
        0.96,
        # If not in self play then amTFT
        # will be evaluated against a naive selfish policy or an exploiter
        "self_play":
        True,
        # "self_play": False, # Not tested
        "env_name":
        "IteratedPrisonersDilemma" if env is None else env,
        # "env_name": "IteratedAsymBoS" if env is None else env,
        # "env_name": "CoinGame" if env is None else env,
        # "env_name": "AsymCoinGame" if env is None else env,
        # "env_name": "MixedMotiveCoinGame" if env is None else env,
        # "env_name": "SSDMixedMotiveCoinGame" if env is None else env,
        "overwrite_reward":
        True,
        "explore_during_evaluation":
        True,
        "reward_uncertainty":
        reward_uncertainty,
    }

    hparams = modify_hyperparams_for_the_selected_env(hparams)
    hparams["plot_keys"] = amTFT.PLOT_KEYS + hparams["plot_keys"]
    hparams["plot_assemblage_tags"] = (amTFT.PLOT_ASSEMBLAGE_TAGS +
                                       hparams["plot_assemblage_tags"])

    return hparams
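
A minimal call sketch for this helper (argument values are illustrative; in debug mode the function forces 2 replicates regardless of the argument):

hparams = get_hyperparameters(
    debug=False,
    train_n_replicates=8,
    filter_utilitarian=True,
    env="CoinGame",          # one of the env names listed above
    reward_uncertainty=0.1,  # illustrative value
)
print(hparams["exp_name"], len(hparams["seeds"]))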