def __init__(self, obs_space_n, act_space_n, agent_index, lr, no_layers, num_units, tau,
             noise=0.0, use_ounoise=False, logger=None):
    self.logger = logger
    assert isinstance(obs_space_n[0], Space)
    obs_shape_n = u.space_n_to_shape_n(obs_space_n)
    act_shape_n = u.space_n_to_shape_n(act_space_n)
    act_type = type(act_space_n[0])
    # Actor and its target network; the target starts as an exact copy of the online network.
    self.policy = MADDPGPolicyNetwork(no_layers, num_units, lr, obs_shape_n,
                                      act_shape_n[agent_index], act_type, agent_index,
                                      noise, use_ounoise)
    self.policy_target = MADDPGPolicyNetwork(no_layers, num_units, lr, obs_shape_n,
                                             act_shape_n[agent_index], act_type, agent_index,
                                             noise, use_ounoise)
    self.policy_target.model.set_weights(self.policy.model.get_weights())
    self.agent_index = agent_index
    self.tau = tau
def __init__(self, obs_space_n, act_space_n, agent_index, batch_size, buff_size, lr, num_layer,
             num_units, gamma, tau, prioritized_replay=False, alpha=0.6, max_step=None,
             initial_beta=0.6, prioritized_replay_eps=1e-6, _run=None):
    """
    An object containing the critic, actor and training functions for Multi-Agent DDPG.
    """
    self._run = _run
    assert isinstance(obs_space_n[0], Space)
    obs_shape_n = space_n_to_shape_n(obs_space_n)
    act_shape_n = space_n_to_shape_n(act_space_n)
    super().__init__(buff_size, obs_shape_n, act_shape_n, batch_size, prioritized_replay, alpha,
                     max_step, initial_beta, prioritized_replay_eps=prioritized_replay_eps)
    act_type = type(act_space_n[0])
    # Critic and its target network, initialised with identical weights.
    self.critic = MADDPGCriticNetwork(num_layer, num_units, lr, obs_shape_n, act_shape_n,
                                      act_type, agent_index)
    self.critic_target = MADDPGCriticNetwork(num_layer, num_units, lr, obs_shape_n, act_shape_n,
                                             act_type, agent_index)
    self.critic_target.model.set_weights(self.critic.model.get_weights())
    # Actor (policy) and its target network, initialised with identical weights.
    self.policy = MADDPGPolicyNetwork(num_layer, num_units, lr, obs_shape_n,
                                      act_shape_n[agent_index], act_type, 1, self.critic,
                                      agent_index)
    self.policy_target = MADDPGPolicyNetwork(num_layer, num_units, lr, obs_shape_n,
                                             act_shape_n[agent_index], act_type, 1, self.critic,
                                             agent_index)
    self.policy_target.model.set_weights(self.policy.model.get_weights())
    self.batch_size = batch_size
    self.agent_index = agent_index
    self.decay = gamma
    self.tau = tau
def __init__(self, no_neighbors, obs_space_n, act_space_n, agent_index, batch_size, buff_size,
             lr, num_layer, num_units, gamma, tau, prioritized_replay=False, alpha=0.6,
             max_step=None, initial_beta=0.6, prioritized_replay_eps=1e-6, logger=None,
             history_size=0, noise=0.0, use_ounoise=False, temporal_mode='rnn'):
    """
    An object containing the critic, actor and training functions for Multi-Agent DDPG.
    """
    self.logger = logger
    assert isinstance(obs_space_n[0], Space)
    obs_shape_n = space_n_to_shape_n(obs_space_n)
    act_shape_n = space_n_to_shape_n(act_space_n)
    self.no_neighbors = no_neighbors
    self.no_agents = len(obs_shape_n)
    self.no_features = obs_shape_n[0][0]
    self.no_actions = act_shape_n[0][0]
    # Indices of the k nearest neighbours used when building the adjacency, e.g. [2, 3] for k = 2.
    self.k_lst = list(range(self.no_neighbors + 2))[2:]
    super().__init__(buff_size, obs_shape_n, act_shape_n, batch_size, prioritized_replay, alpha,
                     max_step, initial_beta, prioritized_replay_eps=prioritized_replay_eps,
                     history_size=history_size)
    act_type = type(act_space_n[0])
    # Critic and its target network, initialised with identical weights.
    self.critic = MADDPGCriticNetwork(no_neighbors, num_layer, num_units, lr, obs_shape_n,
                                      act_shape_n, act_type, agent_index)
    self.critic_target = MADDPGCriticNetwork(no_neighbors, num_layer, num_units, lr, obs_shape_n,
                                             act_shape_n, act_type, agent_index)
    self.critic_target.model.set_weights(self.critic.model.get_weights())
    # Actor (policy) and its target network, initialised with identical weights.
    self.policy = MADDPGPolicyNetwork(history_size, num_layer, num_units, lr, obs_shape_n,
                                      act_shape_n[agent_index], act_type, 1, self.critic,
                                      agent_index, noise, use_ounoise, temporal_mode)
    self.policy_target = MADDPGPolicyNetwork(history_size, num_layer, num_units, lr, obs_shape_n,
                                             act_shape_n[agent_index], act_type, 1, self.critic,
                                             agent_index, noise, use_ounoise, temporal_mode)
    self.policy_target.model.set_weights(self.policy.model.get_weights())
    self.batch_size = batch_size
    self.agent_index = agent_index
    self.decay = gamma
    self.tau = tau
def main():
    no_agents = arglist.no_agents
    u.create_seed(arglist.seed)
    env = make_env(arglist.scenario)
    logger = RLLogger(arglist.exp_name, env.n, env.n_adversaries, arglist.save_rate, arglist)
    obs_shape_n = u.space_n_to_shape_n(env.observation_space)
    act_shape_n = u.space_n_to_shape_n(env.action_space)

    # Result paths
    model_path = os.path.join("results", arglist.exp_name, 'models')
    os.makedirs(model_path, exist_ok=True)

    # Shared (centralised) critic and its target network
    critic = MADDPGCriticNetwork(arglist.no_layers, arglist.no_critic_neurons, arglist.lr,
                                 obs_shape_n, act_shape_n, wd=1e-5)
    critic_target = MADDPGCriticNetwork(arglist.no_layers, arglist.no_critic_neurons, arglist.lr,
                                        obs_shape_n, act_shape_n, wd=1e-5)
    critic_target.model.set_weights(critic.model.get_weights())

    agents = get_agents(env, arglist.lr, arglist.no_layers, arglist.no_actor_neurons,
                        arglist.tau, arglist.noise, arglist.use_ounoise, logger)
    obs_n = env.reset()

    # Init buffer
    replay_buffer = EfficientReplayBuffer(int(arglist.max_buffer_size), no_agents,
                                          obs_shape_n, act_shape_n)

    # Load previous results if necessary
    if arglist.restore_fp:
        print('Loading previous state...')
        for ag_idx, agent in enumerate(agents):
            fp = os.path.join(model_path, 'agent_{}'.format(ag_idx))
            agent.load(fp)
        critic.load(model_path + '/critic.h5')
        critic_target.load(model_path + '/critic_target.h5')

    print('Starting iterations...')
    while True:
        logger.episode_step += 1
        action_n = [agent.action(obs.astype(np.float32)).numpy()
                    for agent, obs in zip(agents, obs_n)]
        new_obs_n, rew_n, done_n, _ = env.step(action_n)
        cooperative_reward = rew_n[0]
        terminal = (logger.episode_step >= arglist.max_episode_len)
        done = all(done_n) or terminal

        # Collect experience
        replay_buffer.add(obs_n, action_n, cooperative_reward, new_obs_n, done)
        obs_n = new_obs_n

        if done:
            obs_n = env.reset()
            episode_step = 0
            logger.record_episode_end(agents, arglist.display)

        for ag_idx, _ in enumerate(rew_n):
            logger.cur_episode_reward += cooperative_reward
            logger.agent_rewards[ag_idx][-1] += cooperative_reward

        logger.train_step += 1

        train_cond = not arglist.display
        if train_cond and len(replay_buffer) > arglist.batch_size:
            # Only update every `update_rate` episodes
            if len(logger.episode_rewards) % arglist.update_rate == 0:
                for _ in range(arglist.update_times):
                    # Sample: shapes --> (no_agents, batch_size, features)
                    state, actions, rewards, new_state, dones = replay_buffer.sample(arglist.batch_size)

                    # TD target from the target policies and the target critic
                    target_act_next = [a.target_action(obs) for a, obs in zip(agents, new_state)]
                    target_q_next = critic_target.predict(new_state, target_act_next)
                    q_train_target = rewards + (1. - dones) * arglist.gamma * target_q_next

                    loss, td_loss = critic.train_step(state, actions, q_train_target)
                    logger.save_logger("critic_loss", np.mean(td_loss), logger.train_step, 0)
                    update_target_networks(critic, critic_target)
                    critic.save(model_path + '/critic.h5')
                    critic_target.save(model_path + '/critic_target.h5')
                    for agent in agents:
                        pol_loss = agent.update(state, actions, critic, logger.train_step)

        # For displaying learned policies
        if arglist.display:
            time.sleep(0.1)
            env.render()

        # Saves logger outputs to a file, similar to the original MADDPG implementation
        if len(logger.episode_rewards) > arglist.no_episodes:
            logger.experiment_end()
            return logger.get_sacred_results()
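# `update_target_networks(critic, critic_target)` is called in the training loop above but is
# not defined in this section. The sketch below is an assumption of what such a helper typically
# does for these network wrappers: a Polyak soft update with rate `tau`, mirroring the explicit
# soft-update formula used for the VDN target model in the script further below. The name
# `polyak_update` and its default rate are hypothetical; the actual helper may differ.
def polyak_update(net, target_net, tau=0.01):
    """Illustrative Polyak averaging of `net` into `target_net` (not the repository's helper)."""
    weights = net.model.get_weights()
    target_weights = target_net.model.get_weights()
    # new_target = tau * online + (1 - tau) * old_target, applied weight tensor by weight tensor
    mixed = [tau * w + (1.0 - tau) * tw for w, tw in zip(weights, target_weights)]
    target_net.model.set_weights(mixed)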
def main(arglist):
    global no_actions, no_features, no_agents
    env = u.make_env(arglist.scenario, arglist.no_agents)
    obs_shape_n = env.observation_space
    act_shape_n = u.space_n_to_shape_n(env.action_space)
    no_agents = env.n
    batch_size = arglist.batch_size
    no_neighbors = arglist.no_neighbors
    k_lst = list(range(no_neighbors + 2))[2:]  # e.g. [2, 3] for two neighbours
    u.create_seed(arglist.seed)

    # Exploration noise (Ornstein-Uhlenbeck process, decayed multiplicatively during training)
    noise_mode = OUNoise(act_shape_n[0], scale=1.0)
    noise = 0.1
    reduction_noise = 0.999

    # Observation layout: Velocity.x Velocity.y Pos.x Pos.y {Land.Pos.x Land.Pos.y}*10 {Ent.Pos.x Ent.Pos.y}*9
    no_features = obs_shape_n[0].shape[0]
    no_actions = act_shape_n[0][0]

    model, model_t = __build_conf()
    optimizer = AdamW(learning_rate=arglist.lr, weight_decay=1e-5)

    # Results
    episode_rewards = [0.0]  # sum of rewards for all agents
    result_path = os.path.join("results", arglist.exp_name)
    res = os.path.join(result_path, "%s.csv" % arglist.exp_name)
    if not os.path.exists(result_path):
        os.makedirs(result_path)

    # Init buffer
    replay_buffer = ReplayBuffer(arglist.max_buffer_size)

    episode_step = 0
    train_step = 0
    t_start = time.time()

    obs_n = env.reset()
    adj = u.get_adj(obs_n, k_lst, no_agents, is_gcn=True)

    print('Starting iterations...')
    while True:
        episode_step += 1
        terminal = (episode_step >= arglist.max_episode_len)
        # Recompute the adjacency every third step
        if episode_step % 3 == 0:
            adj = u.get_adj(obs_n, k_lst, no_agents, is_gcn=True)

        predictions = get_predictions(u.to_tensor(np.array(obs_n)), adj, model)
        actions = get_actions(predictions, noise, noise_mode)

        # Observe next state, reward and done value
        new_obs_n, rew_n, done_n, _ = env.step(actions)
        done = all(done_n) or terminal
        cooperative_reward = rew_n[0]

        # Store the data in the replay memory
        replay_buffer.add(obs_n, adj, actions, cooperative_reward, new_obs_n, done)
        obs_n = new_obs_n
        episode_rewards[-1] += cooperative_reward

        if done or terminal:
            obs_n = env.reset()
            episode_step = 0
            episode_rewards.append(0)

        # Increment global step counter
        train_step += 1

        # For displaying learned policies
        if arglist.display:
            time.sleep(0.1)
            env.render()
            continue

        # Train the models
        train_cond = not arglist.display
        if train_cond and len(replay_buffer) > arglist.batch_size:
            # Only update every `update_rate` episodes
            if len(episode_rewards) % arglist.update_rate == 0:
                for _ in range(arglist.update_times):
                    state, adj_n, actions, rewards, new_state, dones = replay_buffer.sample(batch_size)
                    noise *= reduction_noise

                    # Calculate the TD target
                    with tf.GradientTape() as tape:
                        target_q_values = model_t([new_state, adj_n])
                        # Apply max(Q) per agent to obtain the TD target
                        target_q_tot = tf.reduce_max(target_q_values, axis=-1)
                        # Apply VDN: sum over the agent dimension
                        max_q_tot = tf.reduce_sum(target_q_tot, axis=-1)
                        y = rewards + (1. - dones) * arglist.gamma * max_q_tot

                        # Predictions for the actions actually taken
                        action_one_hot = tf.one_hot(tf.argmax(actions, axis=2, name='action_one_hot'),
                                                    no_actions)
                        q_values = model([state, adj_n])
                        q_tot = tf.reduce_sum(q_values * action_one_hot, axis=-1, name='q_acted')
                        pred = tf.reduce_sum(q_tot, axis=1)

                        if "huber" in arglist.loss_type:
                            loss = tf.reduce_sum(u.huber_loss(pred, tf.stop_gradient(y)))
                        elif "mse" in arglist.loss_type:
                            loss = tf.losses.mean_squared_error(pred, tf.stop_gradient(y))
                        else:
                            raise RuntimeError("Loss function should be either Huber or MSE. %s found!"
                                               % arglist.loss_type)

                    gradients = tape.gradient(loss, model.trainable_variables)
                    local_clipped = u.clip_by_local_norm(gradients, 0.1)
                    optimizer.apply_gradients(zip(local_clipped, model.trainable_variables))

                tf.saved_model.save(model, result_path)

        # Display training output
        if train_step % arglist.save_rate == 0:
            # eval_reward = get_eval_reward(env, model)
            with open(res, "a+") as f:
                mes_dict = {
                    "steps": train_step,
                    "episodes": len(episode_rewards),
                    "train_episode_reward": np.round(np.mean(episode_rewards[-arglist.save_rate:]), 3),
                    # "eval_episode_reward": np.round(np.mean(eval_reward), 3),
                    "time": round(time.time() - t_start, 3),
                }
                print(mes_dict)
                for item in list(mes_dict.values()):
                    f.write("%s\t" % item)
                f.write("\n")
            t_start = time.time()

        # Update the target network: Polyak soft update every step, or a hard copy
        # at episode end every 200 steps
        if arglist.soft_update:
            weights = model.get_weights()
            target_weights = model_t.get_weights()
            for w in range(len(weights)):
                target_weights[w] = arglist.tau * weights[w] + (1 - arglist.tau) * target_weights[w]
            model_t.set_weights(target_weights)
        elif terminal and train_step % 200 == 0:
            model_t.set_weights(model.get_weights())
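# `u.get_adj(obs_n, k_lst, no_agents, is_gcn=True)` builds the graph consumed by the GCN model,
# but its implementation is not part of this section. The sketch below is only an illustration of
# how a k-nearest-neighbour adjacency could be derived from agent observations; it assumes each
# agent's position occupies indices 2:4 of its observation vector (consistent with the
# "Velocity.x Velocity.y Pos.x Pos.y ..." layout comment above), and the function name and
# signature are hypothetical, not the repository's API.
import numpy as np


def knn_adjacency_sketch(obs_n, k, pos_slice=slice(2, 4)):
    """Return a symmetric (no_agents, no_agents) 0/1 adjacency over the k nearest agents."""
    positions = np.asarray([np.asarray(obs)[pos_slice] for obs in obs_n])  # (no_agents, 2)
    diffs = positions[:, None, :] - positions[None, :, :]                  # pairwise offsets
    dists = np.linalg.norm(diffs, axis=-1)                                 # pairwise distances
    np.fill_diagonal(dists, np.inf)                                        # exclude self-loops
    nearest = np.argsort(dists, axis=-1)[:, :k]                            # k closest agents per row
    adj = np.zeros_like(dists)
    rows = np.repeat(np.arange(len(obs_n)), k)
    adj[rows, nearest.ravel()] = 1.0
    return np.maximum(adj, adj.T)                                          # symmetrise (undirected graph)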