def generate_fake_multiagent_batch(env, policies):
    multi_agent_batch_builder = MultiAgentSampleBatchBuilder(
        policy_map={
            player: policy
            for player, policy in zip(env.players_ids, policies)
        },
        clip_rewards=False,
        callbacks=DefaultCallbacks(),
    )
    fake_actions = generate_fake_discrete_actions(env)
    env.reset()
    observations, rewards, done, info = env.step(fake_actions)
    for player_id in env.players_ids:
        step_player_values = {
            "eps_id": 0,
            "obs": observations[player_id],
            "new_obs": observations[player_id],
            "actions": fake_actions[player_id],
            "prev_actions": fake_actions[player_id],
            "rewards": rewards[player_id],
            "prev_rewards": rewards[player_id],
            "dones": True,
        }
        multi_agent_batch_builder.add_values(
            agent_id=player_id, policy_id=player_id, **step_player_values)
    multiagent_batch = multi_agent_batch_builder.build_and_reset()
    return multiagent_batch
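A minimal usage sketch, assuming `env` and `policies` come from the surrounding test fixtures (one policy per player id) and that each policy exposes the standard `Policy.learn_on_batch` method:

# Hypothetical usage sketch: feed the fake MultiAgentBatch back to each policy.
multiagent_batch = generate_fake_multiagent_batch(env, policies)
for player_id, policy in zip(env.players_ids, policies):
    # Each per-agent SampleBatch is stored under its policy_id.
    policy_batch = multiagent_batch.policy_batches[player_id]
    stats = policy.learn_on_batch(policy_batch)
    print(player_id, "rows:", policy_batch.count, "stats:", stats)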
def get_fake_training_batch_for_ppo_in_ipd(policy):
    policy_id = "fake_player"
    players = {policy_id: policy}
    multi_agent_batch_builder = MultiAgentSampleBatchBuilder(
        policy_map={
            player_id: player for player_id, player in players.items()
        },
        clip_rewards=False,
        callbacks=DefaultCallbacks(),
    )
    n_steps_in_epi = 20
    for step_n in range(n_steps_in_epi):
        step_player_values = {
            SampleBatch.EPS_ID: 0,
            SampleBatch.OBS: 0,
            SampleBatch.NEXT_OBS: 0,
            SampleBatch.ACTIONS: 0,
            SampleBatch.REWARDS: random.randint(0, 10),
            SampleBatch.PREV_REWARDS: random.randint(0, 10),
            SampleBatch.VF_PREDS: random.randint(0, 10),
            SampleBatch.DONES: step_n == n_steps_in_epi - 1,
            SampleBatch.ACTION_DIST_INPUTS: [random.random(),
                                             random.random()],
            SampleBatch.ACTION_LOGP: random.random(),
        }
        multi_agent_batch_builder.add_values(
            agent_id=policy_id, policy_id=policy_id, **step_player_values)
    multiagent_batch = multi_agent_batch_builder.build_and_reset()
    return multiagent_batch.policy_batches[policy_id]
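A sketch of how this fake batch might be checked in a unit test; `ppo_policy` is an assumed, already-built PPO policy for the iterated prisoner's dilemma spaces, and the keys follow the SampleBatch constants used above:

# Hypothetical test sketch: one row per step, with the keys PPO's loss expects.
fake_batch = get_fake_training_batch_for_ppo_in_ipd(ppo_policy)
assert fake_batch.count == 20
for key in (SampleBatch.OBS, SampleBatch.ACTIONS, SampleBatch.REWARDS,
            SampleBatch.VF_PREDS, SampleBatch.ACTION_LOGP):
    assert key in fake_batch.keys()
# PPO also needs advantage estimates; compute_advantages() from
# ray.rllib.evaluation.postprocessing could be applied before learn_on_batch.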
def get_batch_builder():
    if batch_builder_pool:
        return batch_builder_pool.pop()
    else:
        # ===MOD===
        # We use the below.
        # ===MOD===
        return MultiAgentSampleBatchBuilder(
            policies, clip_rewards, callbacks.get("on_postprocess_traj"))
def get_batch_builder():
    if batch_builder_pool:
        return batch_builder_pool.pop()
    else:
        return MultiAgentSampleBatchBuilder(policies, clip_rewards)
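A sketch of the pooling pattern these two helpers rely on; `batch_builder_pool`, `policies`, and `clip_rewards` are assumed to be defined in the enclosing sampler scope, and the `return_batch_builder` helper below is illustrative, not an RLlib API:

# Illustrative pooling sketch: reuse builders instead of re-creating them.
def return_batch_builder(builder):
    # Hand the builder back to the pool once its batch was built and reset.
    batch_builder_pool.append(builder)

builder = get_batch_builder()
# ... add_values(...) calls while the episode is collected ...
episode_batch = builder.build_and_reset()
return_batch_builder(builder)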
def __init__(
    self,
    env,
    batch_size,
    trace_length,
    grid_size,
    exploiter_base_lr,
    exploiter_decay_lr_in_n_epi,
    exploiter_stop_training_after_n_epi,
    train_exploiter_n_times_per_epi,
):
    self.stop_training_after_n_epi = exploiter_stop_training_after_n_epi
    self.train_exploiter_n_times_per_epi = train_exploiter_n_times_per_epi

    # with tf.variable_scope(f"dqn_exploiter"):
    # Create the dqn policy for the exploiter
    dqn_config = copy.deepcopy(DEFAULT_CONFIG)
    dqn_config.update({
        "prioritized_replay": False,
        "double_q": True,
        "buffer_size": 50000,
        "dueling": False,
        "learning_starts": min(
            int((batch_size - 1) * (trace_length - 1)), 64),
        "model": {
            "dim": grid_size,
            # [Channel, [Kernel, Kernel], Stride]
            "conv_filters": [[16, [3, 3], 1], [32, [3, 3], 1]],
            # "fcnet_hiddens": [self.env.NUM_ACTIONS],
            "max_seq_len": trace_length,
            # Number of hidden layers for fully connected net
            "fcnet_hiddens": [64],
            # Nonlinearity for fully connected net (tanh, relu)
            "fcnet_activation": "relu",
        },
        # Update the replay buffer with this many samples at once. Note that
        # this setting applies per-worker if num_workers > 1.
        "rollout_fragment_length": 1,
        # Size of a batch sampled from replay buffer for training. Note that
        # if async_updates is set, then each worker returns gradients for a
        # batch of this size.
        "train_batch_size": min(int(batch_size * trace_length), 64),
        "explore": False,
        "grad_clip": 1,
        "gamma": 0.5,
        "lr": exploiter_base_lr,
        # Learning rate schedule
        "lr_schedule": [
            (0, exploiter_base_lr / 1000),
            (100, exploiter_base_lr),
            (exploiter_decay_lr_in_n_epi, exploiter_base_lr / 1e9),
        ],
        "sgd_momentum": 0.9,
    })
    print("dqn_config", dqn_config)

    self.local_replay_buffer = LocalReplayBuffer(
        num_shards=1,
        learning_starts=dqn_config["learning_starts"],
        buffer_size=dqn_config["buffer_size"],
        replay_batch_size=dqn_config["train_batch_size"],
        replay_mode=dqn_config["multiagent"]["replay_mode"],
        replay_sequence_length=dqn_config["replay_sequence_length"],
    )

    # self.dqn_exploiter = DQNTFPolicy(obs_space=self.env.OBSERVATION_SPACE,
    #                                  action_space=self.env.ACTION_SPACE,
    #                                  config=dqn_config)

    def sgd_optimizer_dqn(policy, config) -> "torch.optim.Optimizer":
        return torch.optim.SGD(
            policy.q_func_vars,
            lr=policy.cur_lr,
            momentum=config["sgd_momentum"],
        )

    MyDQNTorchPolicy = DQNTorchPolicy.with_updates(
        optimizer_fn=sgd_optimizer_dqn)

    self.dqn_policy = MyDQNTorchPolicy(
        obs_space=env.OBSERVATION_SPACE,
        action_space=env.ACTION_SPACE,
        config=dqn_config,
    )

    self.multi_agent_batch_builders = [
        MultiAgentSampleBatchBuilder(
            policy_map={"player_blue": self.dqn_policy},
            clip_rewards=False,
            callbacks=DefaultCallbacks(),
        )
        # for _ in range(self.batch_size)
    ]
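A sketch of how these attributes might fit together during training: build the accumulated batch for the exploiter, push it into the replay buffer, then update the DQN policy from a replayed batch. The method below is hypothetical (not part of the original class), and it assumes `LocalReplayBuffer.add_batch`/`replay` behave as in the RLlib execution utilities of this Ray version.

# Hypothetical helper on the same class, sketching one exploiter update.
def _update_exploiter_once(self):
    # Build the MultiAgentBatch accumulated for "player_blue" ...
    multiagent_batch = self.multi_agent_batch_builders[0].build_and_reset()
    # ... push it into the local replay buffer ...
    self.local_replay_buffer.add_batch(multiagent_batch)
    # ... and, once learning_starts samples were stored, learn on a replay.
    replayed = self.local_replay_buffer.replay()
    if replayed is not None:
        self.dqn_policy.learn_on_batch(
            replayed.policy_batches["player_blue"])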
class AdaptiveMechanismDesign(tune.Trainable):
    def _init_algo(self, fear, greed, n_players, use_simple_agents,
                   action_flip_prob, max_reward_strength, value_fn_variant,
                   cost_param, with_redistribution, n_planning_eps,
                   env_config, n_units, n_episodes, env, lr, gamma,
                   weight_decay, loss_mul_planner, mean_theta, with_planner,
                   std_theta, add_state_grad, planner_momentum,
                   planner_clip_norm, entropy_coeff, seed, normalize_planner,
                   no_weights_decay_planner, planner_std_theta_mul,
                   use_adam_optimizer, use_softmax_hot, report_every_n,
                   momentum, weight_decay_pl_mul, square_cost,
                   normalize_against_v, use_v_pl, normalize_against_vp,
                   normalize_vp_separated, use_rllib_polcy, **kwargs):
        if not use_simple_agents:
            speed_ratio = 5.0
            lr = lr / speed_ratio
            loss_mul_planner = loss_mul_planner * speed_ratio**2 / 2 / 2
            cost_param = cost_param * 1.5
            if n_units == 64:
                lr = lr / 8

        print("args not used:", kwargs)

        convert_a_to_one_hot = not use_simple_agents

        np.random.seed(seed)
        tf.set_random_seed(seed)
        random.seed(seed)

        if env == "FearGreedMatrix":
            env = define_greed_fear_matrix_game(fear=fear,
                                                greed=greed)(env_config)
        elif env == "CoinGame":
            env = CoinGame(env_config)
        env.seed(seed=seed)

        agents = create_population(env,
                                   n_players,
                                   use_simple_agents=use_simple_agents,
                                   n_units=n_units,
                                   lr=lr,
                                   gamma=gamma,
                                   weight_decay=weight_decay,
                                   mean_theta=mean_theta,
                                   std_theta=std_theta,
                                   entropy_coeff=entropy_coeff,
                                   use_adam_optimizer=use_adam_optimizer,
                                   momentum=momentum,
                                   use_rllib_polcy=use_rllib_polcy)

        np.random.seed(seed + 1)
        tf.set_random_seed(seed + 1)
        random.seed(seed + 1)

        if with_planner:
            std_theta = std_theta * planner_std_theta_mul
            weight_decay = weight_decay * weight_decay_pl_mul
            if no_weights_decay_planner:
                weight_decay = 0.0
            planning_agent = Planning_Agent(
                env,
                agents,
                learning_rate=lr,
                max_reward_strength=max_reward_strength,
                cost_param=cost_param,
                with_redistribution=with_redistribution,
                value_fn_variant=value_fn_variant,
                n_units=n_units,
                weight_decay=weight_decay,
                convert_a_to_one_hot=convert_a_to_one_hot,
                loss_mul_planner=loss_mul_planner,
                mean_theta=mean_theta,
                std_theta=std_theta,
                planner_clip_norm=planner_clip_norm,
                normalize_planner=normalize_planner,
                add_state_grad=add_state_grad,
                planner_momentum=planner_momentum,
                use_adam_optimizer=use_adam_optimizer,
                use_softmax_hot=use_softmax_hot,
                square_cost=square_cost,
                normalize_against_v=normalize_against_v,
                use_v_pl=use_v_pl,
                normalize_against_vp=normalize_against_vp,
                normalize_vp_separated=normalize_vp_separated)
        else:
            planning_agent = None

        self.epi_n = 0
        self.players = agents
        self.env = env
        self.action_flip_prob = action_flip_prob
        self.planning_agent = planning_agent
        self.with_redistribution = with_redistribution
        self.n_planning_eps = n_planning_eps
        self.player_ids = env.players_ids
        self.n_players = n_players
        self.n_episodes = n_episodes
        self.max_reward_strength = max_reward_strength
        self.cost_param = cost_param
        self.value_fn_variant = value_fn_variant
        self.fear = fear
        self.greed = greed
        self.report_every_n = report_every_n
        self.normalize_vp_separated = normalize_vp_separated
        self.use_rllib_polcy = use_rllib_polcy
        self.avg_planning_rewards_per_round = []
        self.episode_reward = []
        self.training_epi_avg_reward = []

        if self.use_rllib_polcy:
            self.multi_agent_batch_builder = MultiAgentSampleBatchBuilder(
                policy_map={
                    idx: player for idx, player in enumerate(self.players)
                },
                clip_rewards=False,
                callbacks=DefaultCallbacks())

    def setup(self, config):
        print("_init_algo", config)
        self._init_algo(**config)

    def step(self):
        (loss, cost, extra_loss, g_V, planning_rs, mean_v, mean_vp,
         planning_reward_when_pick_own_coin,
         planning_reward_when_pick_opp_coin,
         planning_reward_when_no_picking,
         planning_reward_when_specific_action) = [None] * 11

        for _ in range(self.report_every_n):
            self.epi_n += 1
            to_report = {"episodes_total": self.epi_n}
            s_rllib_format = self.env.reset()
            obs_before_act = convert_from_rllib_env_format(
                s_rllib_format,
                self.player_ids,
                state=True,
                n_states=self.env.n_features,
                coin_game=self.env.NAME == "CoinGame",
            )
            flag = isinstance(obs_before_act, list)
            cum_planning_rs = [0] * len(self.players)
            planning_reward_when_pick_own_coin = [None] * len(self.players)
            planning_reward_when_pick_opp_coin = [None] * len(self.players)
            planning_reward_when_no_picking = [None] * len(self.players)
            planning_reward_when_specific_action = [
                [None] * self.env.NUM_ACTIONS
            ] * len(self.players)
            if self.use_rllib_polcy:
                prev_r = [0.0] * len(self.players)
                prev_a = [0] * len(self.players)
            done = False
            while not done:
                # choose action based on s
                if self.use_rllib_polcy:
                    actions = [
                        player.compute_actions(
                            obs_before_act[None, ...])[0][0]
                        for player in self.players
                    ]
                else:
                    if flag:
                        actions = [
                            player.choose_action(obs_before_act[idx])
                            for idx, player in enumerate(self.players)
                        ]
                    else:
                        actions = [
                            player.choose_action(obs_before_act)
                            for player in self.players
                        ]
                actions_rllib_format = convert_to_rllib_env_format(
                    actions,
                    self.player_ids,
                    coin_game=self.env.NAME == "CoinGame")

                # take action and get next s and reward
                (s_rllib_format, rewards_rllib_format, done_rllib_format,
                 info_rllib_format) = self.env.step(actions_rllib_format)
                obs_after_act = convert_from_rllib_env_format(
                    s_rllib_format,
                    self.player_ids,
                    state=True,
                    n_states=self.env.n_features,
                    coin_game=self.env.NAME == "CoinGame")
                rewards = convert_from_rllib_env_format(
                    rewards_rllib_format, self.player_ids)
                done = convert_from_rllib_env_format(done_rllib_format,
                                                     self.player_ids)
                self.episode_reward.append(rewards)

                # perturbed_actions = [(1 - a if np.random.binomial(1, self.action_flip_prob) else a) for a in actions]
                # Make it work for discrete action space of N
                perturbed_actions = []
                for a in actions:
                    if np.random.binomial(1, self.action_flip_prob):
                        perturbed_a = a
                        while perturbed_a == a:
                            perturbed_a = random.randint(
                                0, self.env.n_features - 1)
                            print("perturbed_a == a", perturbed_a, a,
                                  perturbed_a == a)
                        perturbed_actions.append(perturbed_a)
                    else:
                        perturbed_actions.append(a)
                # print("perturbed_actions", perturbed_actions)

                env_rewards = rewards
                if (self.planning_agent is not None
                        and self.epi_n < self.n_planning_eps):
                    planning_rs = self.planning_agent.choose_action(
                        obs_before_act, perturbed_actions)
                    if self.with_redistribution:
                        sum_planning_r = sum(planning_rs)
                        mean_planning_r = sum_planning_r / self.n_players
                        planning_rs = [
                            r - mean_planning_r for r in planning_rs
                        ]
                    rewards = [sum(r) for r in zip(rewards, planning_rs)]
                    cum_planning_rs = [
                        sum(r) for r in zip(cum_planning_rs, planning_rs)
                    ]
                    # Training planning agent
                    # TODO using the past rewards is not working since I
                    #  perturbate the actions
                    (action, loss, g_Vp, g_V, r_players, cost, extra_loss,
                     l1, mean_v, vp, values,
                     mean_vp) = self.planning_agent.learn(
                         obs_before_act,
                         perturbed_actions,
                         coin_game=self.env.NAME == "CoinGame",
                         env_rewards=env_rewards)

                for idx, player in enumerate(self.players):
                    if self.use_rllib_polcy:
                        step_player_values = {
                            "eps_id": self.epi_n,
                            "obs": obs_before_act,  # [None,...]
"new_obs": obs_after_act, "actions": actions[idx], "prev_actions": prev_a[idx], "rewards": rewards[idx], "prev_rewards": prev_r[idx], "dones": done, "vf_preds": player._value(obs_before_act, prev_a[idx], prev_r[idx]) } self.multi_agent_batch_builder.add_values( agent_id=idx, policy_id=idx, **step_player_values) else: if flag: critic_loss, advantage = player.learn( obs_before_act[idx], actions[idx], rewards[idx], obs_after_act[idx], obs_before_act, obs_after_act) else: critic_loss, advantage = player.learn( obs_before_act, actions[idx], rewards[idx], obs_after_act) to_report[f"critic_loss_p_{idx}"] = critic_loss[0, 0] to_report[f"advantage_loss_p_{idx}"] = advantage if self.planning_agent is not None: opp_idx = (idx + 1) % 2 if env_rewards[idx] == 1.0 and env_rewards[ opp_idx] == 0.0: planning_reward_when_pick_own_coin[ idx] = planning_rs[idx] if env_rewards[idx] == 1.0 and env_rewards[ opp_idx] == -2.0: planning_reward_when_pick_opp_coin[ idx] = planning_rs[idx] if env_rewards[idx] == 0.0 and env_rewards[ opp_idx] == 0.0: planning_reward_when_no_picking[idx] = planning_rs[ idx] planning_reward_when_specific_action[idx][ actions[idx]] = planning_rs[idx] if self.use_rllib_polcy and done: multiagent_batch = self.multi_agent_batch_builder.build_and_reset( ) for idx, player in enumerate(self.players): stats = player.learn_on_batch( multiagent_batch.policy_batches[idx]) # Does slow down the training # for k, v in stats["learner_stats"].items(): # to_report[f"p{idx}_{k}"] = v prev_r = rewards prev_a = actions elif done: for player in self.players: player.learn_at_episode_end() # swap s obs_before_act = obs_after_act if self.planning_agent is not None and self.epi_n < self.n_planning_eps: self.avg_planning_rewards_per_round.append([ r / self.env.step_count_in_current_episode for r in cum_planning_rs ]) epi_rewards = np.array(self.episode_reward) self.training_epi_avg_reward.append(np.mean(epi_rewards, axis=0)) self.episode_reward.clear() to_report = self._add_info_to_log( to_report, actions_rllib_format, info_rllib_format, epi_rewards, loss, cost, extra_loss, g_V, planning_rs, mean_v, mean_vp, planning_reward_when_pick_own_coin, planning_reward_when_pick_opp_coin, planning_reward_when_no_picking, planning_reward_when_specific_action) return to_report def _add_info_to_log(self, to_report, actions_rllib_format, info_rllib_format, epi_rewards, loss, cost, extra_loss, g_V, planning_rs, mean_v, mean_vp, planning_reward_when_pick_own_coin, planning_reward_when_pick_opp_coin, planning_reward_when_no_picking, planning_reward_when_specific_action): for k, v in actions_rllib_format.items(): to_report[f"act_{k}"] = v to_report.update(info_rllib_format) to_report["mean_reward_p1"] = np.mean(epi_rewards, axis=0)[0] to_report["mean_reward_p2"] = np.mean(epi_rewards, axis=0)[1] to_report["mean_reward"] = np.sum(np.mean(epi_rewards, axis=0)) if self.planning_agent is not None: to_report[f"loss_planner"] = loss to_report[f"loss_pl_grad"] = cost to_report[f"loss_rew_cost"] = extra_loss to_report[f"g_V"] = g_V planner_weights_norm = self.planning_agent.get_weights_norm() to_report[f"planner_weights_norm"] = planner_weights_norm to_report["planning_reward_player1"] = planning_rs[0] to_report["planning_reward_player2"] = planning_rs[1] to_report["mean_v"] = mean_v if not self.normalize_vp_separated: to_report["mean_vp"] = mean_vp else: to_report["mean_vp0"] = mean_vp[0] to_report["mean_vp1"] = mean_vp[1] for idx, player in enumerate(self.players): if not self.use_rllib_polcy: ac_weights_norm, cr_weights_norm = 
player.get_weights_norm() to_report[f"actor_weights_norm_p_{idx}"] = ac_weights_norm to_report[f"critic_weights_norm_p_{idx}"] = cr_weights_norm if planning_reward_when_pick_own_coin[idx] is not None: to_report[ f"pl_rw_p{idx}_pick_own_coin"] = planning_reward_when_pick_own_coin[ idx] if planning_reward_when_pick_opp_coin[idx] is not None: to_report[ f"pl_rw_p{idx}_pick_opp_coin"] = planning_reward_when_pick_opp_coin[ idx] if planning_reward_when_no_picking[idx] is not None: to_report[ f"pl_rw_p{idx}_no_picking"] = planning_reward_when_no_picking[ idx] for act_v in range(self.env.NUM_ACTIONS): if planning_reward_when_specific_action[idx][ act_v] is not None: to_report[ f"pl_rw_p{idx}_a{act_v}"] = planning_reward_when_specific_action[ idx][act_v] return to_report
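Since AdaptiveMechanismDesign is a tune.Trainable, it would typically be launched through Ray Tune. A minimal launch sketch, assuming `full_algo_config` is a dict that provides every keyword argument of _init_algo above (the stopping criterion shown is a placeholder):

# Hypothetical launch sketch; full_algo_config is assumed to exist and to
# contain all _init_algo kwargs (env, seed, lr, n_players, ...).
import ray
from ray import tune

ray.init()
tune.run(
    AdaptiveMechanismDesign,
    name="adaptive_mechanism_design",
    stop={"episodes_total": 10},  # placeholder stopping criterion
    config=full_algo_config,
)
ray.shutdown()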