def main(debug):
    train_n_replicates = 1 if debug else 1
    seeds = miscellaneous.get_random_seeds(train_n_replicates)
    exp_name, _ = log.log_in_current_day_dir("DQN_CG_speed_search")

    env = "CoinGame"
    # env = "SSDMixedMotiveCoinGame"

    # welfare_to_use = None
    # welfare_to_use = postprocessing.WELFARE_UTILITARIAN
    welfare_to_use = postprocessing.WELFARE_INEQUITY_AVERSION

    if "SSDMixedMotiveCoinGame" in env:
        env_class = ssd_mixed_motive_coin_game.SSDMixedMotiveCoinGame
    else:
        env_class = coin_game.CoinGame

    hparams = _get_hyperparameters(seeds, debug, exp_name)
    rllib_config, stop_config = _get_rllib_configs(
        hparams, env_class=env_class
    )

    if welfare_to_use is not None:
        rllib_config = _modify_policy_to_use_welfare(
            rllib_config, welfare_to_use
        )

    rllib_config, stop_config = _add_search_to_config(
        rllib_config, stop_config, hparams
    )
    tune_analysis = _train_dqn_and_plot_logs(
        hparams, rllib_config, stop_config
    )

    return tune_analysis

def _get_hyparameters(
    debug, env=None, train_n_replicates=None, against_naive_opp=False
):
    if debug:
        train_n_replicates = 1
    elif train_n_replicates is None:
        train_n_replicates = 4

    seeds = miscellaneous.get_random_seeds(train_n_replicates)
    exp_name, _ = log.log_in_current_day_dir("LTFT")

    hparameters = {
        "seeds": seeds,
        "debug": debug,
        "exp_name": exp_name,
        "hiddens": [64],
        "log_n_points": 260,
        "clustering_distance": 0.2,
        "gamma": 0.96,
        "env_name": "IteratedPrisonersDilemma" if env is None else env,
        # "env_name": "CoinGame" if env is None else env,
        "reward_uncertainty_std": 0.1,
        # "against_evader_exploiter": None,
        "against_evader_exploiter": {
            "start_exploit": 0.75,
            "copy_weights_delay": 0.05,
        }
        if not against_naive_opp
        else None,
    }

    hparameters = _modify_hyperparams_for_the_selected_env(hparameters)

    return hparameters, exp_name

def init_worker(actions_list=None):
    train_n_replicates = 1
    debug = True
    stop_iters = 200
    tf = False

    seeds = miscellaneous.get_random_seeds(train_n_replicates)
    exp_name, _ = log.log_in_current_day_dir("testing")
    rllib_config, stop_config = get_rllib_config(seeds, debug, stop_iters, tf)
    rllib_config["env"] = FakeEnvWtCstReward
    rllib_config["env_config"]["max_steps"] = EPI_LENGTH
    rllib_config["seed"] = int(time.time())

    if actions_list is not None:
        for policy_id in FakeEnvWtCstReward({}).players_ids:
            policy_to_modify = list(
                rllib_config["multiagent"]["policies"][policy_id]
            )
            policy_to_modify[0] = make_FakePolicyWtDefinedActions(
                copy.deepcopy(actions_list)
            )
            rllib_config["multiagent"]["policies"][
                policy_id
            ] = policy_to_modify

    pg_trainer = PGTrainer(
        rllib_config, logger_creator=_get_logger_creator(exp_name)
    )
    return pg_trainer.workers._local_worker

def __init__(
    self,
    exp_name: str,
    local_mode: bool = False,
    use_random_policy_from_own_checkpoint: bool = False,
    use_wandb: bool = False,
):
    """
    You should take a look at examples using this class.

    Training is deactivated here: only the rollout workers evaluate your
    policy on the environment, and exploration is deactivated as well.
    Works for a unique pair of RLlib policies.

    :param exp_name: the usual exp_name argument provided to tune.run().
    :param local_mode: (optional, default to False) run Ray in local mode.
    :param use_random_policy_from_own_checkpoint: (optional, default to False)
    :param use_wandb: (optional, default to False) log to Weights & Biases.
    """
    self.default_selected_order = 0
    self.running_in_local_mode = local_mode
    self.use_wandb = use_wandb

    self.exp_name, self.exp_parent_dir = log.log_in_current_day_dir(exp_name)
    self.results_file_name = "SelfAndCrossPlay_save.p"
    self.save_path = os.path.join(self.exp_parent_dir, self.results_file_name)

    # TODO this var name is not clear enough
    self.use_random_policy_from_own_checkpoint = (
        use_random_policy_from_own_checkpoint
    )

    self.experiment_defined = False
    self.checkpoints_loaded = False

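# A minimal usage sketch for this evaluator, assembled from the constructor
# above and the call pattern used elsewhere in this codebase (the variable
# names rllib_config / stop_config and the choice of PGTrainer are
# illustrative assumptions, not requirements):
#
#     evaluator = self_and_cross_perf.SelfAndCrossPlayEvaluator(
#         exp_name="my_evaluation",
#     )
#     evaluator.define_the_experiment_to_run(
#         evaluation_config=rllib_config,
#         stop_config=stop_config,
#         TrainerClass=PGTrainer,
#     )
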
def main(debug, stop_iters=2000, tf=False):
    train_n_replicates = 1 if debug else 1
    seeds = miscellaneous.get_random_seeds(train_n_replicates)
    exp_name, _ = log.log_in_current_day_dir("PPO_AsymCG")

    ray.init()

    stop = {
        "training_iteration": 2 if debug else stop_iters,
    }

    env_config = {
        "players_ids": ["player_red", "player_blue"],
        "max_steps": 20,
        "grid_size": 3,
        "get_additional_info": True,
    }

    rllib_config = {
        "env": AsymCoinGame,
        "env_config": env_config,
        "multiagent": {
            "policies": {
                env_config["players_ids"][0]: (
                    None,
                    AsymCoinGame(env_config).OBSERVATION_SPACE,
                    AsymCoinGame.ACTION_SPACE,
                    {},
                ),
                env_config["players_ids"][1]: (
                    None,
                    AsymCoinGame(env_config).OBSERVATION_SPACE,
                    AsymCoinGame.ACTION_SPACE,
                    {},
                ),
            },
            "policy_mapping_fn": lambda agent_id: agent_id,
        },
        # Size of batches collected from each worker.
        "rollout_fragment_length": 20,
        # Number of timesteps collected for each SGD round.
        # This defines the size of each SGD epoch.
        "train_batch_size": 512,
        "model": {
            "dim": env_config["grid_size"],
            # [Channel, [Kernel, Kernel], Stride]
            "conv_filters": [[16, [3, 3], 1], [32, [3, 3], 1]],
        },
        "lr": 5e-3,
        "seed": tune.grid_search(seeds),
        "callbacks": log.get_logging_callbacks_class(),
        "num_gpus": int(os.environ.get("RLLIB_NUM_GPUS", "0")),
        "framework": "tf" if tf else "torch",
    }

    tune_analysis = tune.run(
        PPOTrainer,
        config=rllib_config,
        stop=stop,
        checkpoint_freq=0,
        checkpoint_at_end=True,
        name=exp_name,
    )
    ray.shutdown()
    return tune_analysis

def main(debug):
    train_n_replicates = 2 if debug else 40
    timestamp = int(time.time())
    seeds = [seed + timestamp for seed in list(range(train_n_replicates))]

    exp_name, _ = log.log_in_current_day_dir("LOLA_DICE")

    hparams = {
        "load_plot_data": None,
        # IPD
        # Example: "load_plot_data": ".../SameAndCrossPlay_save.p",
        "exp_name": exp_name,
        "train_n_replicates": train_n_replicates,
        "env": "IPD",
        # "env": "IMP",
        # "env": "AsymBoS",
        # "env": "CoinGame",
        # "env": "AsymCoinGame",
        "gamma": None,
        "trace_length": 10 if debug else None,
        "epochs": 2 if debug else 200,
        "lr_inner": 0.1,
        "lr_outer": 0.2,
        "lr_value": 0.1,
        "lr_om": 0.1,
        "inner_asymm": True,
        "n_agents": 2,
        "n_inner_steps": 1 if debug else 2,
        "batch_size": 10 if debug else 64,
        "value_batch_size": 16,
        "value_epochs": 0,
        "om_batch_size": 16,
        "om_epochs": 0,
        "grid_size": 3,
        "use_baseline": False,
        "use_dice": True,
        "use_opp_modeling": False,
        "seed": tune.grid_search(seeds),
        "metric": "ag_0_returns_player_1",
    }

    if hparams["load_plot_data"] is None:
        ray.init(num_cpus=os.cpu_count(), num_gpus=0, local_mode=debug)
        training_results = train(hparams)
    else:
        training_results = None

    evaluate(training_results, hparams, debug)
    ray.shutdown()

def main(debug, stop_iters=200, tf=False):
    train_n_replicates = 1 if debug else 1
    seeds = miscellaneous.get_random_seeds(train_n_replicates)
    exp_name, _ = log.log_in_current_day_dir("PG_IPD")

    ray.init(num_cpus=os.cpu_count(), num_gpus=0, local_mode=debug)

    rllib_config, stop_config = get_rllib_config(seeds, debug, stop_iters, tf)

    tune_analysis = tune.run(
        PGTrainer,
        config=rllib_config,
        stop=stop_config,
        checkpoint_freq=0,
        checkpoint_at_end=True,
        name=exp_name,
    )
    ray.shutdown()
    return tune_analysis

def main(debug): exp_name, _ = log.log_in_current_day_dir(f"LOLA_PG") tune_hparams = { "exp_name": exp_name, # Dynamically set "num_episodes": 3 if debug else None, "trace_length": 6 if debug else None, "lr": None, "gamma": None, "batch_size": 12 if debug else None, # "exp_name": "IPD", # "exp_name": "IMP", "exp_name": "CoinGame", # "exp_name": "AsymCoinGame", "pseudo": False, "grid_size": 3, "lola_update": True, "opp_model": False, "mem_efficient": True, "lr_correction": 1, "bs_mul": 1 / 10, "simple_net": True, "hidden": 32, "reg": 0, "set_zero": 0, "exact": False, "warmup": 1, "seed": 1, "changed_config": False, "ac_lr": 1.0, "summary_len": 1, "use_MAE": False, "use_toolbox_env": True, "clip_loss_norm": False, "clip_lola_update_norm": False, "clip_lola_correction_norm": 3.0, "clip_lola_actor_norm": 10.0, "entropy_coeff": 0.001, "weigth_decay": 0.03, } tune_config = get_tune_config(tune_hparams) ray.init(num_cpus=os.cpu_count(), num_gpus=0) tune_analysis = tune.run(lola_training, name=tune_hparams["exp_name"], config=tune_config) ray.shutdown() return tune_analysis
def main(debug, welfare=postprocessing.WELFARE_UTILITARIAN):
    train_n_replicates = 1 if debug else 1
    seeds = miscellaneous.get_random_seeds(train_n_replicates)
    exp_name, _ = log.log_in_current_day_dir("DQN_welfare_CG")

    hparams = dqn_coin_game._get_hyperparameters(seeds, debug, exp_name)
    rllib_config, stop_config = dqn_coin_game._get_rllib_configs(hparams)
    rllib_config = _modify_policy_to_use_welfare(rllib_config, welfare)
    tune_analysis = dqn_coin_game._train_dqn_and_plot_logs(
        hparams, rllib_config, stop_config
    )

    return tune_analysis

def main(debug):
    train_n_replicates = 1 if debug else 1
    seeds = miscellaneous.get_random_seeds(train_n_replicates)
    exp_name, _ = log.log_in_current_day_dir("DQN_CG")

    hparams = _get_hyperparameters(seeds, debug, exp_name)
    rllib_config, stop_config = _get_rllib_configs(hparams)
    tune_analysis = _train_dqn_and_plot_logs(
        hparams, rllib_config, stop_config
    )

    return tune_analysis

def main(debug): exp_name, _ = log.log_in_current_day_dir("L1BR_amTFT") train_n_replicates = 4 if debug else 8 pool_of_seeds = miscellaneous.get_random_seeds(train_n_replicates) hparams = { "debug": debug, "filter_utilitarian": False, "train_n_replicates": train_n_replicates, "seeds": pool_of_seeds, "exp_name": exp_name, "n_steps_per_epi": 20, "bs_epi_mul": 4, "welfare_functions": [(postprocessing.WELFARE_UTILITARIAN, "utilitarian")], "amTFTPolicy": amTFT.amTFTRolloutsTorchPolicy, "explore_during_evaluation": True, "n_seeds_lvl0": train_n_replicates, "n_seeds_lvl1": train_n_replicates // 2, "gamma": 0.5, "lambda": 0.9, "alpha": 0.0, "beta": 1.0, "temperature_schedule": False, "debit_threshold": 4.0, "jitter": 0.05, "hiddens": [64], "env": matrix_sequential_social_dilemma.IteratedPrisonersDilemma, # "env": matrix_sequential_social_dilemma.IteratedAsymBoS, # "env": matrix_sequential_social_dilemma.IteratedAsymChicken, # "env": coin_game.CoinGame # "env": coin_game.AsymCoinGame # For training speed "min_iter_time_s": 0.0 if debug else 3.0, "overwrite_reward": True, "use_adam": False, } ray.init(num_cpus=os.cpu_count(), num_gpus=0, local_mode=hparams["debug"]) hparams = amtft_various_env.modify_hyperparams_for_the_selected_env( hparams) lvl0_tune_analysis = train_lvl0_population(hp=hparams) tune_analysis_lvl1 = train_lvl1_agents( hp_lvl1=copy.deepcopy(hparams), tune_analysis_lvl0=lvl0_tune_analysis) print(tune_analysis_lvl1.results_df.columns) print(tune_analysis_lvl1.results_df.head()) ray.shutdown()
def main(debug, use_rllib_policy=False):
    """
    The planner has not yet been modified to work with policies/agents
    created with RLlib.
    """
    if use_rllib_policy:
        logger.warning(
            "Not possible to use the planner with "
            f"use_rllib_policy: {use_rllib_policy}"
        )

    train_n_replicates = 1 if debug else 5
    timestamp = int(time.time())
    seeds = [seed + timestamp for seed in list(range(train_n_replicates))]

    exp_name, _ = log.log_in_current_day_dir("adaptive_mechanism_design")

    hyperparameters = {
        "exp_name": exp_name,
        "seed": tune.grid_search(seeds),
        "debug": debug,
        "report_every_n": 1,
        "fear": 1,
        # "greed": -1,
        # Selecting greed = 1 to be sure that the agents without planner
        # learn DD (needed when using the not simple network).
        "greed": 1,
        "with_redistribution": False,
        "n_planning_eps": math.inf,
        "value_fn_variant": "exact",
        # "value_fn_variant": "estimated",
        # "value_fn_variant": tune.grid_search(["exact", "estimated"]),
        "action_flip_prob": 0,
        "n_players": 2,
        "with_planner": True and not use_rllib_policy,
        # "with_planner": False,
        # "with_planner": tune.grid_search([True, False]),
        # "env": "FearGreedMatrix",
        "env": "CoinGame",
        "normalize_against_vp": False,
        "normalize_against_v": False,
        "normalize_vp_separated": False,
        "use_rllib_polcy": use_rllib_policy,
    }
    hyperparameters = add_env_hp(hyperparameters)
    train(hyperparameters)

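# Minimal invocation sketch (this entry point is illustrative, not part of
# the original module): with the default use_rllib_policy=False the planner
# is enabled; passing use_rllib_policy=True logs the warning above and
# disables the planner.
#
#     if __name__ == "__main__":
#         main(debug=True)
#         # main(debug=True, use_rllib_policy=True)
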
def _init_evaluator():
    exp_name, _ = log.log_in_current_day_dir("testing")
    rllib_config, stop_config = get_rllib_config(seeds=get_random_seeds(1))

    evaluator = self_and_cross_perf.SelfAndCrossPlayEvaluator(
        exp_name=exp_name,
    )
    evaluator.define_the_experiment_to_run(
        evaluation_config=rllib_config,
        stop_config=stop_config,
        TrainerClass=PGTrainer,
    )
    return evaluator

def main(debug): exp_name, _ = log.log_in_current_day_dir(f"LOLA_DICE") tune_hparams = { "debug": debug, "exp_name": exp_name, # "env_name": "IPD", # "env_name": "IMP", "env_name": "CoinGame", # "env_name": "AsymCoinGame", "gamma": None, "trace_length": None, "epochs": 0.2 if debug else 200, "lr_inner": .1, "lr_outer": .2, "lr_value": .1, "lr_om": .1, "inner_asymm": True, "n_agents": 2, "n_inner_steps": 1 if debug else 2, "batch_size": 4 if debug else 64, "value_batch_size": 16, "value_epochs": 0, "om_batch_size": 16, "om_epochs": 0, "grid_size": 3, "use_baseline": False, "use_dice": True, "use_opp_modeling": False, "seed": 1 if debug else tune.grid_search([1, 2, 3, 4, 5]), } tune_config = get_tune_config(tune_hparams) ray.init(num_cpus=os.cpu_count(), num_gpus=0) tune_analysis = tune.run(lola_training, name=tune_hparams["exp_name"], config=tune_config) ray.shutdown() return tune_analysis
def main(debug):
    train_n_replicates = 2 if debug else 40
    seeds = miscellaneous.get_random_seeds(train_n_replicates)
    exp_name, _ = log.log_in_current_day_dir("LOLA_Exact")

    hparams = {
        "load_plot_data": None,
        # Example: "load_plot_data": ".../SameAndCrossPlay_save.p",
        "exp_name": exp_name,
        "train_n_replicates": train_n_replicates,
        "env": "IPD",
        # "env": "IMP",
        # "env": "AsymBoS",
        "num_episodes": 5 if debug else 50,
        "trace_length": 5 if debug else 200,
        "simple_net": True,
        "corrections": True,
        "pseudo": False,
        "num_hidden": 32,
        "reg": 0.0,
        "lr": 1.0,
        "lr_correction": 1.0,
        "gamma": 0.96,
        "seed": tune.grid_search(seeds),
        "metric": "ret1",
        "with_linear_LR_decay_to_zero": False,
        "clip_update": None,
        # "with_linear_LR_decay_to_zero": True,
        # "clip_update": 0.1,
        # "lr": 0.001,
    }

    if hparams["load_plot_data"] is None:
        ray.init(num_cpus=os.cpu_count(), num_gpus=0, local_mode=debug)
        tune_analysis_per_exp = train(hparams)
    else:
        tune_analysis_per_exp = None

    evaluate(tune_analysis_per_exp, hparams)
    ray.shutdown()

def _train_pg_in_ipd(train_n_replicates):
    debug = True
    stop_iters = 200
    tf = False

    seeds = miscellaneous.get_random_seeds(train_n_replicates)
    exp_name, _ = log.log_in_current_day_dir("testing")

    ray.init(num_cpus=os.cpu_count(), num_gpus=0, local_mode=debug)
    rllib_config, stop_config = get_rllib_config(seeds, debug, stop_iters, tf)
    tune_analysis = tune.run(
        PGTrainer,
        config=rllib_config,
        stop=stop_config,
        checkpoint_freq=0,
        checkpoint_at_end=True,
        name=exp_name,
        metric="episode_reward_mean",
        mode="max",
    )
    ray.shutdown()
    return tune_analysis, seeds

def main(debug):
    train_n_replicates = 1 if debug else 1
    timestamp = int(time.time())
    seeds = [seed + timestamp for seed in list(range(train_n_replicates))]

    exp_name, _ = log.log_in_current_day_dir("adaptive_mechanism_design")

    hyperparameters = {
        "exp_name": exp_name,
        "seed": tune.grid_search(seeds),
        "debug": debug,
        "report_every_n": 1,
        "fear": 1,
        # "greed": -1,
        # Selecting greed = 1 to be sure that the agents without planner
        # learn DD (needed when using the not simple network).
        "greed": 1,
        "with_redistribution": False,
        "n_planning_eps": math.inf,
        "value_fn_variant": "exact",
        # "value_fn_variant": "estimated",
        # "value_fn_variant": tune.grid_search(["exact", "estimated"]),
        "action_flip_prob": 0,
        "n_players": 2,
        # "with_planner": True,
        "with_planner": False,
        # "with_planner": tune.grid_search([True, False]),
        # "env": "FearGreedMatrix",
        "env": "CoinGame",
        "normalize_against_vp": False,
        "normalize_against_v": False,
        "normalize_vp_separated": False,
    }
    hyperparameters = add_env_hp(hyperparameters)
    train(hyperparameters)

def get_hyperparameters(debug, env):
    exp_name, _ = log.log_in_current_day_dir("L1BR_amTFT")
    train_n_replicates = 4 if debug else 8
    pool_of_seeds = miscellaneous.get_random_seeds(train_n_replicates)

    hparams = {
        "debug": debug,
        "filter_utilitarian": False,
        "train_n_replicates": train_n_replicates,
        "seeds": pool_of_seeds,
        "exp_name": exp_name,
        "welfare_functions": [
            (postprocessing.WELFARE_UTILITARIAN, "utilitarian")
        ],
        "amTFTPolicy": amTFT.AmTFTRolloutsTorchPolicy,
        "explore_during_evaluation": True,
        "n_seeds_lvl0": train_n_replicates,
        "n_seeds_lvl1": train_n_replicates // 2,
        "gamma": 0.96,
        "temperature_schedule": False,
        "jitter": 0.05,
        "hiddens": [64],
        "env_name": "IteratedPrisonersDilemma",
        # "env_name": "IteratedAsymBoS",
        # "env_name": "IteratedAsymChicken",
        # "env_name": "CoinGame",
        # "env_name": "AsymCoinGame",
        "overwrite_reward": True,
        "reward_uncertainty": 0.0,
    }

    if env is not None:
        hparams["env_name"] = env

    hparams = amtft_various_env.modify_hyperparams_for_the_selected_env(
        hparams
    )

    return hparams, exp_name

def main(debug):
    train_n_replicates = 1 if debug else 1
    seeds = miscellaneous.get_random_seeds(train_n_replicates)
    exp_name, _ = log.log_in_current_day_dir("LTFT_IPD")

    hparameters = {
        "n_epi": 10 if debug else 200,
        "n_steps_per_epi": 20,
        "bs_epi_mul": 4,
        "base_lr": 0.04,
        "spl_lr_mul": 10.0,
        "seeds": seeds,
        "debug": debug,
    }

    rllib_config, env_config, stop = get_rllib_config(hparameters)

    ray.init(num_cpus=os.cpu_count(), num_gpus=0)

    print("\n========== Training LTFT in self-play ==========\n")
    tune_analysis_self_play = ray.tune.run(
        DQNTrainer,
        config=rllib_config,
        verbose=1,
        checkpoint_freq=0,
        stop=stop,
        checkpoint_at_end=True,
        name=exp_name,
    )

    print("\n========== Training LTFT against a naive opponent ==========\n")
    # Set player_col to use a naive policy
    rllib_config["multiagent"]["policies"][env_config["players_ids"][1]] = (
        None,
        IteratedPrisonersDilemma.OBSERVATION_SPACE,
        IteratedPrisonersDilemma.ACTION_SPACE,
        {},
    )
    tune_analysis_naive_opponent = ray.tune.run(
        DQNTrainer,
        config=rllib_config,
        verbose=1,
        checkpoint_freq=0,
        stop=stop,
        checkpoint_at_end=True,
        name=exp_name,
    )

    ray.shutdown()
    return tune_analysis_self_play, tune_analysis_naive_opponent

def main(debug):
    n_in_lvl0_population = 2 if debug else 40
    n_lvl1 = 1 if debug else 1
    timestamp = int(time.time())
    lvl0_seeds = [
        seed + timestamp for seed in list(range(n_in_lvl0_population))
    ]
    lvl1_seeds = list(range(n_lvl1))

    exp_name, _ = log.log_in_current_day_dir("L1BR_LOLA_PG")

    tune_hparams = {
        "exp_name": exp_name,

        "load_data": None,
        # Example: "load_data": ".../lvl1_results.p",
        "load_population": None,
        # Example: "load_population":
        #     [".../checkpoint.json", ".../checkpoint.json", ...]

        "num_episodes": 5 if debug else 2000,
        "trace_length": 5 if debug else 20,
        "lr": None,
        "gamma": 0.5,
        "batch_size": 5 if debug else 512,

        # "env_name": "IteratedPrisonersDilemma",
        # "env_name": "IteratedBoS",
        # "env_name": "IteratedAsymBoS",
        "env_name": "VectorizedCoinGame",
        # "env_name": "AsymVectorizedCoinGame",

        "pseudo": False,
        "grid_size": 3,
        "lola_update": True,
        "opp_model": False,
        "mem_efficient": True,
        "lr_correction": 1,
        "bs_mul": 1 / 10,
        "simple_net": True,
        "hidden": 32,
        "reg": 0,
        "set_zero": 0,
        "exact": False,
        "warmup": 1,
        "lvl0_seeds": lvl0_seeds,
        "lvl1_seeds": lvl1_seeds,
        "changed_config": False,
        "ac_lr": 1.0,
        "summary_len": 1,
        "use_MAE": False,
        "use_toolbox_env": True,
        "clip_loss_norm": False,
        "clip_lola_update_norm": False,
        "clip_lola_correction_norm": 3.0,
        "clip_lola_actor_norm": 10.0,
        "entropy_coeff": 0.001,
        "weigth_decay": 0.03,
        "lola_correction_multiplier": 1,
        "lr_decay": True,
        "correction_reward_baseline_per_step": False,
        "use_critic": False,
    }

    rllib_hparams = {
        "debug": debug,
        "n_steps_per_epi": 20,
        "bs_epi_mul": 4,
        "sgd_momentum": 0.9,
        "temperature_schedule": False,
    }

    if tune_hparams["load_data"] is None:
        ray.init(num_cpus=os.cpu_count(), num_gpus=0)

        # Train
        if tune_hparams["load_population"] is None:
            results_list_lvl0 = train_lvl0_population(tune_hp=tune_hparams)
            log.save_metrics(results_list_lvl0, exp_name, "lvl0_results.p")
        else:
            results_list_lvl0 = []

        results_list_lvl1 = train_lvl1_agents(
            tune_hp=tune_hparams,
            rllib_hp=rllib_hparams,
            results_list_lvl0=results_list_lvl0,
        )
        log.save_metrics(
            results_list_lvl1, exp_name, "lvl1_results.p", limit=True
        )

        ray.shutdown()
    else:
        # TODO print that every time, not only when loading
        log.pprint_saved_metrics(
            tune_hparams["load_data"],
            keywords_to_print=[
                "policy_reward_mean",
                "speed.*mean",
                "own.*mean",
                "analysis",
                "^avg$",
                "last-10-avg",
            ],
        )

def main(debug: bool, env=None):
    """
    Train several LOLA_PG pairs of agents on the selected environment and
    plot their performances in self-play and cross-play.

    :param debug: selection of debug mode using less compute
    :param env: option to overwrite the env selection
    """
    train_n_replicates = 2 if debug else 1
    timestamp = int(time.time())
    seeds = [seed + timestamp for seed in list(range(train_n_replicates))]

    exp_name, _ = log.log_in_current_day_dir("LOLA_PG")

    # The InfluenceEvader(like)
    use_best_exploiter = False
    # use_best_exploiter = True

    high_coop_speed_hp = True if use_best_exploiter else False
    # high_coop_speed_hp = True

    tune_hparams = {
        "debug": debug,
        "exp_name": exp_name,
        "train_n_replicates": train_n_replicates,

        # wandb configuration
        "wandb": None
        if debug
        else {
            "project": "LOLA_PG",
            "group": exp_name,
            "api_key_file": os.path.join(
                os.path.dirname(__file__), "../../../api_key_wandb"
            ),
            "log_config": True,
        },

        # Print metrics
        "load_plot_data": None,
        # Example: "load_plot_data": ".../SelfAndCrossPlay_save.p",
        #
        # "gamma": 0.5,
        # "num_episodes": 3 if debug else 4000 if high_coop_speed_hp else 2000,
        # "trace_length": 4 if debug else 20,
        # "lr": None,
        #
        # "gamma": 0.875,
        # "lr": 0.005 / 4,
        # "num_episodes": 3 if debug else 4000,
        # "trace_length": 4 if debug else 20,
        #
        "gamma": 0.9375,
        "lr": 0.005 / 4
        if debug
        else tune.grid_search([0.005 / 4, 0.005 / 4 / 2, 0.005 / 4 / 2 / 2]),
        "num_episodes": 3 if debug else tune.grid_search([4000, 8000]),
        "trace_length": 4 if debug else tune.grid_search([40, 80]),
        #
        "batch_size": 8 if debug else 512,
        #
        # "env_name": "IteratedPrisonersDilemma" if env is None else env,
        # "env_name": "IteratedAsymBoS" if env is None else env,
        "env_name": "VectorizedCoinGame" if env is None else env,
        # "env_name": "AsymVectorizedCoinGame" if env is None else env,
        # "env_name": "VectorizedMixedMotiveCoinGame" if env is None else env,

        "pseudo": False,
        "grid_size": 3,
        "lola_update": True,
        "opp_model": False,
        "mem_efficient": True,
        "lr_correction": 1,
        "bs_mul": 1 / 10 * 3 if use_best_exploiter else 1 / 10,
        "simple_net": True,
        "hidden": 32,
        "reg": 0,
        "set_zero": 0,
        "exact": False,
        "warmup": 1,
        "seed": tune.grid_search(seeds),
        "changed_config": False,
        "ac_lr": 1.0,
        "summary_len": 1,
        "use_MAE": False,
        "use_toolbox_env": True,
        "clip_loss_norm": False,
        "clip_lola_update_norm": False,
        "clip_lola_correction_norm": 3.0,
        # "clip_lola_correction_norm":
        #     tune.grid_search([3.0 / 2, 3.0, 3.0 * 2]),
        "clip_lola_actor_norm": 10.0,
        # "clip_lola_actor_norm": tune.grid_search([10.0 / 2, 10.0, 10.0 * 2]),
        "entropy_coeff": 0.001,
        # "entropy_coeff": tune.grid_search([0.001 / 2 / 2, 0.001 / 2, 0.001]),
        # "weigth_decay": 0.03,
        "weigth_decay": 0.03
        if debug
        else tune.grid_search([0.03 / 8 / 2 / 2, 0.03 / 8 / 2, 0.03 / 8]),
        # "lola_correction_multiplier": 1,
        "lola_correction_multiplier": 1
        if debug
        else tune.grid_search([1 * 4, 1 * 4 * 2, 1 * 4 * 2 * 2]),
        "lr_decay": True,
        "correction_reward_baseline_per_step": False,
        "use_critic": False,

        "plot_keys": [
            "reward",
            "total_reward",
            "entrop",
        ],
        "plot_assemblage_tags": [
            ("total_reward",),
            ("entrop",),
        ],
    }

    # Add exploiter hyperparameters
    tune_hparams.update(
        {
            "start_using_exploiter_at_update_n": 1
            if debug
            else 3000
            if high_coop_speed_hp
            else 1500,
            # PG exploiter
            "use_PG_exploiter": True if use_best_exploiter else False,
            "every_n_updates_copy_weights": 1 if debug else 100,
            # "adding_scaled_weights": False,
            # "adding_scaled_weights": 0.33,
        }
    )

    if tune_hparams["load_plot_data"] is None:
        ray.init(num_cpus=os.cpu_count(), num_gpus=0, local_mode=debug)
        tune_analysis_per_exp = _train(tune_hparams)
    else:
        tune_analysis_per_exp = None

    _evaluate(tune_hparams, debug, tune_analysis_per_exp)
    ray.shutdown()

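# Minimal invocation sketch (this entry point is illustrative, not part of
# the original module): the env argument overrides the default
# "VectorizedCoinGame" selection above.
#
#     if __name__ == "__main__":
#         main(debug=True)
#         # main(debug=False, env="AsymVectorizedCoinGame")
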
        # del config['match_mode']
        # del config['cond_params']
        super().setup(config)


if __name__ == '__main__':
    # Each logical step of training contains several episodes;
    # each episode is a batch of games.
    training_steps = (
        config['training_episodes'] // config['episodes_per_step']
    )
    print(f'Num of training steps: {training_steps}')
    print(f'Episodes per step: {config["episodes_per_step"]}')

    # The arbitrator in population training hasn't been tested.
    assert not (
        config['enable_arbitrator']
        and isinstance(config['hidden_embedding_sizes'][0], list)
    )

    exp_name_expanded, exp_dir = log.log_in_current_day_dir(config['name'])

    analysis = tune.run(
        name=exp_name_expanded,
        run_or_experiment=CrossPlayTraining,
        stop={"training_iteration": training_steps},
        config=config,
        checkpoint_freq=1000,
        checkpoint_at_end=True,
        metric='prosocial_reward',
        mode='max',
    )

    log.save_metrics(analysis, exp_name_expanded, "metrics.pickle")
    # log.pprint_saved_metrics(
    #     os.path.join(
    #         os.path.expanduser('~/ray_results'),
    #         exp_name_expanded,
    #         'metrics.pickle',
    #     )
    # )

def init_worker(
    n_rollout_replicas,
    max_steps,
    actions_list_0=None,
    actions_list_1=None,
    actions_list_2=None,
    actions_list_3=None,
):
    train_n_replicates = 1
    debug = True

    exp_name, _ = log.log_in_current_day_dir("testing")
    hparams = get_hyperparameters(
        debug,
        train_n_replicates,
        filter_utilitarian=False,
        env="IteratedPrisonersDilemma",
    )
    _, _, rllib_config = get_rllib_config(
        hparams, welfare_fn=postprocessing.WELFARE_UTILITARIAN
    )
    rllib_config["env"] = FakeEnvWtActionAsReward
    rllib_config["env_config"]["max_steps"] = max_steps
    rllib_config["seed"] = int(time.time())

    for policy_id in FakeEnvWtActionAsReward({}).players_ids:
        policy_to_modify = list(
            rllib_config["multiagent"]["policies"][policy_id]
        )
        policy_to_modify[3]["rollout_length"] = max_steps
        policy_to_modify[3]["n_rollout_replicas"] = n_rollout_replicas
        policy_to_modify[3]["verbose"] = 1

        if actions_list_0 is not None:
            policy_to_modify[3]["nested_policies"][0][
                "Policy_class"
            ] = make_FakePolicyWtDefinedActions(
                copy.deepcopy(actions_list_0), DEFAULT_NESTED_POLICY_COOP
            )
        if actions_list_1 is not None:
            policy_to_modify[3]["nested_policies"][1][
                "Policy_class"
            ] = make_FakePolicyWtDefinedActions(
                copy.deepcopy(actions_list_1), DEFAULT_NESTED_POLICY_SELFISH
            )
        if actions_list_2 is not None:
            policy_to_modify[3]["nested_policies"][2][
                "Policy_class"
            ] = make_FakePolicyWtDefinedActions(
                copy.deepcopy(actions_list_2), DEFAULT_NESTED_POLICY_COOP
            )
        if actions_list_3 is not None:
            policy_to_modify[3]["nested_policies"][3][
                "Policy_class"
            ] = make_FakePolicyWtDefinedActions(
                copy.deepcopy(actions_list_3), DEFAULT_NESTED_POLICY_SELFISH
            )

        rllib_config["multiagent"]["policies"][policy_id] = tuple(
            policy_to_modify
        )

    dqn_trainer = DQNTrainer(
        rllib_config, logger_creator=_get_logger_creator(exp_name)
    )
    worker = dqn_trainer.workers._local_worker
    am_tft_policy_row = worker.get_policy("player_row")
    am_tft_policy_col = worker.get_policy("player_col")
    am_tft_policy_row.working_state = WORKING_STATES[2]
    am_tft_policy_col.working_state = WORKING_STATES[2]
    return worker, am_tft_policy_row, am_tft_policy_col

def main(debug): exp_name, _ = log.log_in_current_day_dir("InequityAversion") ray.init(num_cpus=os.cpu_count(), num_gpus=0) stop = {"episodes_total": 10 if debug else 400} env_config = { "max_steps": 10, "players_ids": ["player_row", "player_col"], } policies = { env_config["players_ids"][0]: (None, IteratedBoSAndPD.OBSERVATION_SPACE, IteratedBoSAndPD.ACTION_SPACE, {}), env_config["players_ids"][1]: (None, IteratedBoSAndPD.OBSERVATION_SPACE, IteratedBoSAndPD.ACTION_SPACE, {}) } rllib_config = { "env": IteratedBoSAndPD, "env_config": env_config, "num_gpus": 0, "num_workers": 1, "multiagent": { "policies": policies, "policy_mapping_fn": (lambda agent_id: agent_id), }, "framework": "torch", "gamma": 0.5, "callbacks": callbacks.merge_callbacks( log.get_logging_callbacks_class(), postprocessing.OverwriteRewardWtWelfareCallback), } MyPGTorchPolicy = PGTorchPolicy.with_updates( postprocess_fn=miscellaneous.merge_policy_postprocessing_fn( postprocessing.welfares_postprocessing_fn( add_inequity_aversion_welfare=True, inequity_aversion_beta=1.0, inequity_aversion_alpha=0.0, inequity_aversion_gamma=1.0, inequity_aversion_lambda=0.5), pg_torch_policy.post_process_advantages)) MyPGTrainer = PGTrainer.with_updates(default_policy=MyPGTorchPolicy, get_policy_class=None) tune_analysis = tune.run(MyPGTrainer, stop=stop, checkpoint_freq=10, config=rllib_config, name=exp_name) ray.shutdown() return tune_analysis
def main(debug, train_n_replicates=None, filter_utilitarian=None):
    train_n_replicates = 1 if debug else train_n_replicates
    train_n_replicates = (
        40 if train_n_replicates is None else train_n_replicates
    )
    n_times_more_utilitarians_seeds = 4
    pool_of_seeds = miscellaneous.get_random_seeds(
        train_n_replicates * (1 + n_times_more_utilitarians_seeds)
    )
    exp_name, _ = log.log_in_current_day_dir("amTFT")

    hparams = {
        "debug": debug,
        "filter_utilitarian": filter_utilitarian
        if filter_utilitarian is not None
        else not debug,
        "train_n_replicates": train_n_replicates,
        "n_times_more_utilitarians_seeds": n_times_more_utilitarians_seeds,

        "load_plot_data": None,
        # Example: "load_plot_data": ".../SelfAndCrossPlay_save.p",

        "exp_name": exp_name,
        "n_steps_per_epi": 20,
        "bs_epi_mul": 4,
        "welfare_functions": [
            (postprocessing.WELFARE_INEQUITY_AVERSION, "inequity_aversion"),
            (postprocessing.WELFARE_UTILITARIAN, "utilitarian"),
        ],
        "seeds": pool_of_seeds,
        "amTFTPolicy": amTFT.amTFTRolloutsTorchPolicy,
        "explore_during_evaluation": True,
        "gamma": 0.5,
        "lambda": 0.9,
        "alpha": 0.0,
        "beta": 1.0,
        "temperature_schedule": False,
        "debit_threshold": 4.0,
        "jitter": 0.05,
        "hiddens": [64],

        # If not in self play then amTFT will be evaluated against a naive
        # selfish policy.
        "self_play": True,
        # "self_play": False,  # Not tested

        # "env": matrix_sequential_social_dilemma.IteratedPrisonersDilemma,
        # "utilitarian_filtering_threshold": -2.5,
        "env": matrix_sequential_social_dilemma.IteratedAsymBoS,
        "utilitarian_filtering_threshold": 3.2,
        # "env": matrix_sequential_social_dilemma.IteratedAsymChicken,
        # "utilitarian_filtering_threshold": ...,
        # "env": coin_game.CoinGame
        # "env": coin_game.AsymCoinGame
        # "utilitarian_filtering_threshold": ...,

        # For training speed
        "min_iter_time_s": 0.0 if debug else 3.0,

        "overwrite_reward": True,
        "use_adam": False,
    }
    hparams = modify_hyperparams_for_the_selected_env(hparams)

    if hparams["load_plot_data"] is None:
        ray.init(
            num_cpus=os.cpu_count(), num_gpus=0, local_mode=hparams["debug"]
        )

        # Train
        tune_analysis_per_welfare = train_for_each_welfare_function(hparams)
        # Eval & Plot
        analysis_metrics_per_mode = evaluate_self_and_cross_perf(
            tune_analysis_per_welfare, hparams
        )

        ray.shutdown()
    else:
        tune_analysis_per_welfare = None
        # Plot
        analysis_metrics_per_mode = evaluate_self_and_cross_perf(
            tune_analysis_per_welfare, hparams
        )

    return tune_analysis_per_welfare, analysis_metrics_per_mode

def env_name():
    exp_name, _ = log.log_in_current_day_dir("testing")
    return exp_name

def main(debug):
    train_n_replicates = 2 if debug else 40
    timestamp = int(time.time())
    seeds = [seed + timestamp for seed in list(range(train_n_replicates))]

    exp_name, _ = log.log_in_current_day_dir("LOLA_PG")

    use_best_exploiter = False
    # use_best_exploiter = True

    high_coop_speed_hp = True if use_best_exploiter else False
    # high_coop_speed_hp = True

    tune_hparams = {
        "debug": debug,
        "exp_name": exp_name,
        "train_n_replicates": train_n_replicates,

        # Print metrics
        "load_plot_data": None,
        # Example: "load_plot_data": ".../SameAndCrossPlay_save.p",

        # Dynamically set
        "num_episodes": 3 if debug else 4000 if high_coop_speed_hp else 2000,
        # "num_episodes": tune.grid_search([2000, 4000, 6000]),
        "trace_length": 4 if debug else 20,
        "lr": None,
        "gamma": 0.5,
        "batch_size": 8 if debug else 512,

        # "env": IteratedPrisonersDilemma,
        # "env": IteratedBoS,
        # "env": IteratedAsymBoS,
        "env": CoinGame,
        # "env": AsymCoinGame,

        "pseudo": False,
        "grid_size": 3,
        "lola_update": True,
        "opp_model": False,
        "mem_efficient": True,
        "lr_correction": 1,
        "bs_mul": 1 / 10 * 3 if use_best_exploiter else 1 / 10,
        "simple_net": True,
        "hidden": 32,
        "reg": 0,
        "set_zero": 0,
        "exact": False,
        "warmup": 1,
        "seed": tune.grid_search(seeds),
        "changed_config": False,
        "ac_lr": 1.0,
        "summary_len": 1,
        "use_MAE": False,
        # "use_toolbox_env": True,
        "clip_loss_norm": False,
        "clip_lola_update_norm": False,
        "clip_lola_correction_norm": 3.0,
        "clip_lola_actor_norm": 10.0,
        "entropy_coeff": 0.001,
        "weigth_decay": 0.03,
        "lola_correction_multiplier": 1,
        # "lola_correction_multiplier": tune.grid_search([1, 0.75, 0.5, 0.25]),
        "lr_decay": True,
        "correction_reward_baseline_per_step": False,
        "use_critic": False,
    }

    # Add exploiter hyperparameters
    tune_hparams.update(
        {
            "playing_against_exploiter": False,
            # "playing_against_exploiter": True,
            "start_using_exploiter_at_update_n": 1
            if debug
            else 3000
            if high_coop_speed_hp
            else 1500,
            # "use_exploiter_on_fraction_of_batch": 0.5 if debug else 1.0,
            "use_exploiter_on_fraction_of_batch": 0.5 if debug else 0.1,

            # DQN exploiter
            "use_DQN_exploiter": False,
            # "use_DQN_exploiter": True,
            "train_exploiter_n_times_per_epi": 3,
            "exploiter_base_lr": 0.1,
            "exploiter_decay_lr_in_n_epi": 3000
            if high_coop_speed_hp
            else 1500,
            "exploiter_stop_training_after_n_epi": 3000
            if high_coop_speed_hp
            else 1500,
            "exploiter_rolling_avg": 0.9,
            "always_train_PG": True,
            # If not None, the DQN exploiter uses thresholds on the opponent's
            # cooperation to switch between policies; otherwise it will use
            # the best policy (from simulated reward).
            # "exploiter_thresholds": None,
            "exploiter_thresholds": [0.6, 0.7] if debug else [0.80, 0.95],

            # PG exploiter
            # "use_PG_exploiter": False,
            "use_PG_exploiter": True if use_best_exploiter else False,
            "every_n_updates_copy_weights": 1 if debug else 100,
            "adding_scaled_weights": False,
            # "adding_scaled_weights": 0.33,

            # Destabilizer exploiter
            "use_destabilizer": True,
            # "use_destabilizer": False,
        }
    )

    if tune_hparams["load_plot_data"] is None:
        ray.init(num_cpus=os.cpu_count(), num_gpus=0, local_mode=debug)
        tune_analysis_per_exp = train(tune_hparams)
    else:
        tune_analysis_per_exp = None

    evaluate(tune_hparams, debug, tune_analysis_per_exp)
    ray.shutdown()

def get_hyperparameters(
    debug,
    train_n_replicates=None,
    filter_utilitarian=None,
    env=None,
    reward_uncertainty=0.0,
):
    if debug:
        train_n_replicates = 2
        n_times_more_utilitarians_seeds = 1
    elif train_n_replicates is None:
        n_times_more_utilitarians_seeds = 4
        train_n_replicates = 4
    else:
        n_times_more_utilitarians_seeds = 4

    n_seeds_to_prepare = train_n_replicates * (
        1 + n_times_more_utilitarians_seeds
    )
    pool_of_seeds = miscellaneous.get_random_seeds(n_seeds_to_prepare)

    exp_name, _ = log.log_in_current_day_dir("amTFT")

    hparams = {
        "debug": debug,
        "filter_utilitarian": filter_utilitarian
        if filter_utilitarian is not None
        else not debug,
        "seeds": pool_of_seeds,
        "train_n_replicates": train_n_replicates,
        "n_times_more_utilitarians_seeds": n_times_more_utilitarians_seeds,
        "exp_name": exp_name,
        "log_n_points": 250,

        "load_plot_data": None,
        # Example: "load_plot_data": ".../SelfAndCrossPlay_save.p",

        "load_policy_data": None,
        # "load_policy_data": {
        #     "Util": [
        #         ".../IBP/amTFT/trials/"
        #         "DQN_AsymCoinGame_...",
        #         ".../IBP/amTFT/trials/"
        #         "DQN_AsymCoinGame_..."],
        #     'IA': [
        #         ".../temp/IBP/amTFT/trials/"
        #         "DQN_AsymCoinGame_...",
        #         ".../IBP/amTFT/trials/"
        #         "DQN_AsymCoinGame_..."],
        # },
        # "load_policy_data": {
        #     "Util": [
        #         "~/dev-maxime/CLR/vm-data/instance-60-cpu-1-preemtible/amTFT"
        #         "/2021_03_28/19_38_55/utilitarian_welfare/coop"
        #         "/DQN_VectMixedMotiveCG_06231_00000_0_seed=1616960338_2021-03-29_00-52-23/checkpoint_250/checkpoint-250",
        #         # "~/dev-maxime/CLR/vm-data/instance-60-cpu-1-preemtible/amTFT"
        #         # "/2021_03_24/18_22_47/utilitarian_welfare/coop"
        #         # "/DQN_VectMixedMotiveCG_e1de7_00001_1_seed=1616610171_2021-03-25_00-27-29/checkpoint_250/checkpoint-250",
        #         # "~/dev-maxime/CLR/vm-data/instance-60-cpu-1-preemtible/amTFT"
        #         # "/2021_03_24/18_22_47/utilitarian_welfare/coop"
        #         # "/DQN_VectMixedMotiveCG_e1de7_00002_2_seed=1616610172_2021-03-25_00-27-29/checkpoint_250/checkpoint-250",
        #     ],
        #     'IA': [
        #         "~/dev-maxime/CLR/vm-data/instance-60-cpu-1-preemtible"
        #         "/amTFT/2021_03_28/19_38_55/inequity_aversion_welfare/coop"
        #         "/DQN_VectMixedMotiveCG_d5a2a_00000_0_seed=1616960335_2021-03-28_21-23-26/checkpoint_250/checkpoint-250",
        #         # "~/dev-maxime/CLR/vm-data/instance-60-cpu-1-preemtible"
        #         # "/amTFT/2021_03_24/18_22_47/inequity_aversion_welfare/coop"
        #         # "/DQN_VectMixedMotiveCG_9cfe6_00001_1_seed=1616610168_2021-03-24_20-22-11/checkpoint_250/checkpoint-250",
        #         # "~/dev-maxime/CLR/vm-data/instance-60-cpu-1-preemtible"
        #         # "/amTFT/2021_03_24/18_22_47/inequity_aversion_welfare/coop"
        #         # "/DQN_VectMixedMotiveCG_9cfe6_00002_2_seed=1616610169_2021-03-24_20-22-11/checkpoint_250/checkpoint-250",
        #     ],
        # },
        # "load_policy_data": {
        #     "Util": [
        #         "~/ray_results/amTFT"
        #         "/2021_03_24/18_22_47/utilitarian_welfare/coop"
        #         "/DQN_VectMixedMotiveCG_e1de7_00000_0_seed=1616610170_2021-03-25_00-27-29/checkpoint_250/checkpoint-250",
        #         "~/ray_results/amTFT"
        #         "/2021_03_24/18_22_47/utilitarian_welfare/coop"
        #         "/DQN_VectMixedMotiveCG_e1de7_00001_1_seed=1616610171_2021-03-25_00-27-29/checkpoint_250/checkpoint-250",
        #         "~/ray_results/amTFT"
        #         "/2021_03_24/18_22_47/utilitarian_welfare/coop"
        #         "/DQN_VectMixedMotiveCG_e1de7_00002_2_seed=1616610172_2021-03-25_00-27-29/checkpoint_250/checkpoint-250",
        #     ],
        #     'IA': [
        #         "~/ray_results"
        #         "/amTFT/2021_03_24/18_22_47/inequity_aversion_welfare/coop"
        #         "/DQN_VectMixedMotiveCG_9cfe6_00000_0_seed=1616610167_2021-03-24_20-22-10/checkpoint_250/checkpoint-250",
        #         "~/ray_results"
        #         "/amTFT/2021_03_24/18_22_47/inequity_aversion_welfare/coop"
        #         "/DQN_VectMixedMotiveCG_9cfe6_00001_1_seed=1616610168_2021-03-24_20-22-11/checkpoint_250/checkpoint-250",
        #         "~/ray_results"
        #         "/amTFT/2021_03_24/18_22_47/inequity_aversion_welfare/coop"
        #         "/DQN_VectMixedMotiveCG_9cfe6_00002_2_seed=1616610169_2021-03-24_20-22-11/checkpoint_250/checkpoint-250",
        #     ],
        # },

        "amTFTPolicy": amTFT.AmTFTRolloutsTorchPolicy,
        "welfare_functions": [
            (postprocessing.WELFARE_INEQUITY_AVERSION, "inequity_aversion"),
            (postprocessing.WELFARE_UTILITARIAN, "utilitarian"),
        ],
        "jitter": 0.05,
        "hiddens": [64],
        "gamma": 0.96,

        # If not in self play then amTFT
        # will be evaluated against a naive selfish policy or an exploiter.
        "self_play": True,
        # "self_play": False,  # Not tested

        "env_name": "IteratedPrisonersDilemma" if env is None else env,
        # "env_name": "IteratedAsymBoS" if env is None else env,
        # "env_name": "CoinGame" if env is None else env,
        # "env_name": "AsymCoinGame" if env is None else env,
        # "env_name": "MixedMotiveCoinGame" if env is None else env,
        # "env_name": "SSDMixedMotiveCoinGame" if env is None else env,

        "overwrite_reward": True,
        "explore_during_evaluation": True,
        "reward_uncertainty": reward_uncertainty,
    }

    hparams = modify_hyperparams_for_the_selected_env(hparams)
    hparams["plot_keys"] = amTFT.PLOT_KEYS + hparams["plot_keys"]
    hparams["plot_assemblage_tags"] = (
        amTFT.PLOT_ASSEMBLAGE_TAGS + hparams["plot_assemblage_tags"]
    )

    return hparams