# Module-level imports assumed by this section; q_train and p_train are the
# graph-building helpers defined earlier in this file.
import numpy as np
import tensorflow as tf

import maddpg.common.tf_util as U
from maddpg import AgentTrainer
from maddpg.trainer.replay_buffer import ReplayBuffer


class MATD3AgentTrainer(AgentTrainer):
    def __init__(self, name, model, obs_shape_n, act_space_n, agent_index,
                 args, local_q_func=False):
        self.name = name
        self.n = len(obs_shape_n)
        self.agent_index = agent_index
        self.args = args
        obs_ph_n = []
        for i in range(self.n):
            obs_ph_n.append(
                U.BatchInput(obs_shape_n[i],
                             name="observation" + str(i)).get())

        # Create all the functions necessary to train the model. Two critics
        # are built for TD3-style clipped double-Q learning.
        self.q_train1, self.q_update1, self.q_debug1 = q_train(
            scope=self.name,
            make_obs_ph_n=obs_ph_n,
            act_space_n=act_space_n,
            agent_idx=agent_index,
            q_function_idx=1,
            q_func=model,
            optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
            grad_norm_clipping=0.5,
            local_q_func=local_q_func,
            num_units=args.num_units)
        self.q_train2, self.q_update2, self.q_debug2 = q_train(
            scope=self.name,
            make_obs_ph_n=obs_ph_n,
            act_space_n=act_space_n,
            agent_idx=agent_index,
            q_function_idx=2,
            q_func=model,
            optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
            grad_norm_clipping=0.5,
            local_q_func=local_q_func,
            num_units=args.num_units)
        self.act, self.p_train, self.p_update, self.p_debug = p_train(
            scope=self.name,
            make_obs_ph_n=obs_ph_n,
            act_space_n=act_space_n,
            agent_idx=agent_index,
            p_func=model,
            q_func=model,
            optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
            grad_norm_clipping=0.5,
            local_q_func=local_q_func,
            num_units=args.num_units)

        # Create the experience buffer.
        self.replay_buffer = ReplayBuffer(1e6)
        self.min_replay_buffer_len = args.batch_size * args.max_episode_len
        self.replay_sample_index = None

        # Dump the graph once for TensorBoard inspection.
        writer = tf.summary.FileWriter("logdirMaddpg", tf.get_default_graph())
        writer.flush()
        writer.close()

    def action(self, obs):
        return self.act(obs[None])[0]

    def experience(self, obs, act, rew, new_obs, done, terminal):
        # Store the transition in the replay buffer.
        self.replay_buffer.add(obs, act, rew, new_obs, float(done))

    def preupdate(self):
        self.replay_sample_index = None

    @property
    def q_debug(self):
        return self.q_debug1

    def update(self, agents, train_step):
        if len(self.replay_buffer) < self.min_replay_buffer_len:
            # The replay buffer is not large enough yet.
            return
        if train_step % self.args.update_rate != 0:
            # Only update every `update_rate` steps.
            return

        self.replay_sample_index = self.replay_buffer.generate_sample_indices(
            self.args.batch_size)
        # Collect the replay sample from all agents.
        obs_n = []
        obs_next_n = []
        act_n = []
        index = self.replay_sample_index
        for i in range(self.n):
            obs, act, rew, obs_next, done = \
                agents[i].replay_buffer.sample_index(index)
            obs_n.append(obs)
            obs_next_n.append(obs_next)
            act_n.append(act)
        obs, act, rew, obs_next, done = self.replay_buffer.sample_index(index)

        # Train the Q networks. Optionally smooth the target policy by adding
        # clipped Gaussian noise to the target actions (TD3-style).
        target_act_next_n = [
            agents[i].p_debug['target_act'](obs_next_n[i])
            for i in range(self.n)
        ]
        if self.args.use_critic_noise:
            # Perturb every agent's target action.
            for agent_idx in range(self.n):
                noise = np.random.normal(
                    0,
                    self.args.critic_action_noise_stddev,
                    size=target_act_next_n[agent_idx].shape)
                clipped_noise = np.clip(noise, -self.args.action_noise_clip,
                                        self.args.action_noise_clip)
                target_act_next_n[agent_idx] = (target_act_next_n[agent_idx] +
                                                clipped_noise).tolist()
        elif self.args.use_critic_noise_self:
            # Perturb only this agent's own target action.
            noise = np.random.normal(
                0,
                self.args.critic_action_noise_stddev,
                size=target_act_next_n[self.agent_index].shape)
            clipped_noise = np.clip(noise, -self.args.action_noise_clip,
                                    self.args.action_noise_clip)
            target_act_next_n[self.agent_index] = (
                target_act_next_n[self.agent_index] + clipped_noise).tolist()

        # Clipped double-Q: bootstrap from the minimum of the two target
        # critics to reduce overestimation bias.
        target_q_next1 = self.q_debug1['target_q_values'](
            *(obs_next_n + target_act_next_n))
        target_q_next2 = self.q_debug2['target_q_values'](
            *(obs_next_n + target_act_next_n))
        target_q_next = np.min([target_q_next1, target_q_next2], axis=0)
        if self.args.critic_zero_if_done:
            done_cond = done == 1.0  # `done` is stored as a float
            target_q_next[done_cond] = 0
        target_q = rew + self.args.gamma * target_q_next
        q1_loss = self.q_train1(*(obs_n + act_n + [target_q]))
        q2_loss = self.q_train2(*(obs_n + act_n + [target_q]))

        # Train the p network less frequently (delayed policy updates), and
        # only then move the target networks.
        if train_step % (self.args.update_rate *
                         self.args.policy_update_rate) == 0:
            p_loss = self.p_train(*(obs_n + act_n))
            self.p_update()
            self.q_update1()
            self.q_update2()

        # Only the second critic's loss is reported in the diagnostics.
        return [
            q2_loss,
            np.mean(target_q),
            np.mean(rew),
            np.mean(target_q_next),
            np.std(target_q)
        ]
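
# The update() above layers the three TD3 ingredients on top of MADDPG:
# clipped double-Q targets, target policy smoothing via clipped Gaussian
# noise, and delayed policy updates. Below is a minimal NumPy sketch of the
# resulting target computation; it is illustrative only, is never called by
# the trainers, and its name and argument names are assumptions rather than
# part of this module's API.
def _clipped_double_q_target_sketch(rew, done, q1_next, q2_next, gamma=0.95):
    """Illustrative TD3-style target: bootstrap from the minimum of the two
    target critics, zeroed on terminal transitions."""
    q_next = np.minimum(q1_next, q2_next)  # clipped double-Q
    q_next[done.astype(bool)] = 0.0        # no bootstrap past episode end
    return rew + gamma * q_next
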
class MADDPGAgentTrainer(AgentTrainer):
    def __init__(self, name, model, obs_shape_n, act_space_n, agent_index,
                 args, local_q_func=False):
        self.name = name
        self.n = len(obs_shape_n)
        self.agent_index = agent_index
        self.args = args
        obs_ph_n = []
        for i in range(self.n):
            obs_ph_n.append(
                U.BatchInput(obs_shape_n[i],
                             name="observation" + str(i)).get())

        # Create all the functions necessary to train the model.
        self.q_train, self.q_update, self.q_debug = q_train(
            scope=self.name,
            make_obs_ph_n=obs_ph_n,
            act_space_n=act_space_n,
            q_index=agent_index,
            q_func=model,
            optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
            grad_norm_clipping=0.5,
            local_q_func=local_q_func,
            num_units=args.num_units)
        self.act, self.p_train, self.p_update, self.p_debug = p_train(
            scope=self.name,
            make_obs_ph_n=obs_ph_n,
            act_space_n=act_space_n,
            p_index=agent_index,
            p_func=model,
            q_func=model,
            optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
            grad_norm_clipping=0.5,
            local_q_func=local_q_func,
            num_units=args.num_units)

        # Create the experience buffer.
        self.replay_buffer = ReplayBuffer(1e6)
        self.min_replay_buffer_len = args.batch_size * args.max_episode_len
        self.replay_sample_index = None

        # Dump the graph once for TensorBoard inspection.
        writer = tf.summary.FileWriter("logdirMaddpg", tf.get_default_graph())
        writer.flush()
        writer.close()

    def action(self, obs):
        return self.act(obs[None])[0]

    def experience(self, obs, act, rew, new_obs, done, terminal):
        # Store the transition in the replay buffer.
        self.replay_buffer.add(obs, act, rew, new_obs, float(done))

    def preupdate(self):
        self.replay_sample_index = None

    def update(self, agents, t):
        if len(self.replay_buffer) < self.min_replay_buffer_len:
            # The replay buffer is not large enough yet.
            return
        if t % self.args.update_rate != 0:
            # Only update every `update_rate` steps.
            return

        self.replay_sample_index = self.replay_buffer.generate_sample_indices(
            self.args.batch_size)
        # Collect the replay sample from all agents.
        obs_n = []
        obs_next_n = []
        act_n = []
        index = self.replay_sample_index
        for i in range(self.n):
            obs, act, rew, obs_next, done = \
                agents[i].replay_buffer.sample_index(index)
            obs_n.append(obs)
            obs_next_n.append(obs_next)
            act_n.append(act)
        obs, act, rew, obs_next, done = self.replay_buffer.sample_index(index)

        # Train the Q network against the one-step TD target built from the
        # target policies of all agents.
        target_act_next_n = [
            agents[i].p_debug['target_act'](obs_next_n[i])
            for i in range(self.n)
        ]
        target_q_next = self.q_debug['target_q_values'](
            *(obs_next_n + target_act_next_n))
        if self.args.critic_zero_if_done:
            done_cond = done == 1.0  # `done` is stored as a float
            target_q_next[done_cond] = 0
        target_q = rew + self.args.gamma * target_q_next
        q_loss = self.q_train(*(obs_n + act_n + [target_q]))

        # Train the p network, then move both target networks.
        p_loss = self.p_train(*(obs_n + act_n))
        self.p_update()
        self.q_update()

        return [
            q_loss, p_loss,
            np.mean(target_q),
            np.mean(rew),
            np.mean(target_q_next),
            np.std(target_q)
        ]
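
# Usage sketch: how these trainers are typically wired into a maddpg-style
# training loop. `env`, `mlp_model` (a feed-forward network builder), and
# `arglist` (providing lr, num_units, batch_size, max_episode_len, gamma,
# update_rate, policy_update_rate, and the critic-noise flags used above)
# are assumed names from the surrounding training script, not defined in
# this module.
#
#   obs_shape_n = [env.observation_space[i].shape for i in range(env.n)]
#   trainers = [
#       MATD3AgentTrainer("agent_%d" % i, mlp_model, obs_shape_n,
#                         env.action_space, i, arglist)
#       for i in range(env.n)
#   ]
#   # Each environment step: act, store experience, then periodically update.
#   action_n = [agent.action(obs) for agent, obs in zip(trainers, obs_n)]
#   # ... step the env and call experience() on each trainer, then:
#   for agent in trainers:
#       agent.preupdate()
#   for agent in trainers:
#       agent.update(trainers, train_step)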