class SheldonPolicy(Policy):
    def __init__(self, env, landmark_id, args):
        super(SheldonPolicy, self).__init__()
        self.env = env
        self.landmark_id = landmark_id
        # dummy replay buffer for collecting experiences
        self.replay_buffer = ReplayBuffer(
            args.num_episodes * args.max_episode_len
            if args.benchmark and args.save_replay else 1e6)

    def action(self, obs):
        # relative position of the assigned landmark in the observation
        delta_pos = obs[(4 + self.landmark_id * 2):(4 + self.landmark_id * 2 + 2)]
        # move toward the assigned landmark based on its relative position
        if self.env.discrete_action_input:  # not tested!
            u = 0
            horizontal = abs(delta_pos[0]) > abs(delta_pos[1])
            if horizontal and delta_pos[0] < 0:
                u = 1  # LEFT
            if horizontal and delta_pos[0] > 0:
                u = 2  # RIGHT
            if not horizontal and delta_pos[1] < 0:
                u = 3  # UP
            if not horizontal and delta_pos[1] > 0:
                u = 4  # DOWN
        else:
            u = np.zeros(5)  # 5-d because of no-move action
            if delta_pos[0] > 0:
                u[1] += delta_pos[0]  # RIGHT
            if delta_pos[0] < 0:
                u[2] += -delta_pos[0]  # LEFT
            if delta_pos[1] > 0:
                u[3] += delta_pos[1]  # UP
            if delta_pos[1] < 0:
                u[4] += -delta_pos[1]  # DOWN
        # print(delta_pos, u)
        # return np.concatenate([u, np.zeros(self.env.world.dim_c)])
        return u

    def experience(self, obs, act, rew, new_obs, done, terminal):
        # Store transition in the replay buffer.
        self.replay_buffer.add(obs, act, rew, new_obs, float(done))
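# A hedged illustration of the observation layout SheldonPolicy.action() assumes:
# an MPE-style vector of [velocity (2), position (2), landmark 0 relative position (2),
# landmark 1 relative position (2), ...], so landmark k's offset starts at index 4 + 2*k.
# The concrete numbers below are made up for the example.
import numpy as np

obs = np.array([0.0, 0.0,    # agent velocity
                0.1, -0.2,   # agent position
                0.5, 0.3,    # landmark 0 relative position
                -0.4, 0.7])  # landmark 1 relative position
landmark_id = 1
delta_pos = obs[(4 + landmark_id * 2):(4 + landmark_id * 2 + 2)]
print(delta_pos)  # [-0.4  0.7]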
class MADDPG():
    def __init__(self, obs_shape_n, act_info_n, agent_index, args, local_q_func=False):
        self.n = len(obs_shape_n)
        self.agent_index = agent_index
        self.args = args
        self.grad_norm_clipping = 0.5

        # Networks
        self.device = args.device
        self.vf = Critic(
            obs_shape_n=obs_shape_n,
            act_info_n=act_info_n,
            num_units=args.num_units,
            q_index=agent_index,
            local_q_func=local_q_func,
        ).to(self.device)
        act_dim, self.pdtype = act_info_n[agent_index]
        self.pi = MLP(obs_shape_n[agent_index], act_dim,
                      num_units=args.num_units).to(self.device)

        # Initialize
        init_params(self.vf)
        init_params(self.pi)

        # Target Networks
        self.pi_targ = deepcopy(self.pi)
        for p in self.pi_targ.parameters():
            p.requires_grad = False
        self.vf_targ = deepcopy(self.vf)
        for p in self.vf_targ.parameters():
            p.requires_grad = False

        # Optimizer
        self.pi_optim = Adam(self.pi.parameters(), lr=args.lr)
        self.vf_optim = Adam(self.vf.parameters(), lr=args.lr)

        # Create Replay Buffer
        self.replay_buffer = ReplayBuffer(1e6)
        self.max_replay_buffer_len = args.batch_size * args.max_episode_len
        self.replay_sample_index = None

    @torch.no_grad()
    def action(self, x):
        return self.pdtype(
            self.pi(torch.FloatTensor(x).to(self.device)).cpu()).sample().numpy()

    def experience(self, obs, act, rew, new_obs, done, terminal):
        # Store transition in the replay buffer.
        self.replay_buffer.add(obs, act, rew, new_obs, float(done))

    def preupdate(self):
        self.replay_sample_index = None

    def update(self, agents, t):
        if len(self.replay_buffer) < self.max_replay_buffer_len:
            return
        if not (t % 100 == 0):  # only update every 100 steps
            return

        self.replay_sample_index = self.replay_buffer.make_index(self.args.batch_size)
        # collect replay sample from all agents
        obs_n = []
        obs_next_n = []
        act_n = []
        index = self.replay_sample_index
        for i in range(self.n):
            obs, act, rew, obs_next, done = agents[i].replay_buffer.sample_index(index)
            obs_n.append(torch.FloatTensor(obs).to(self.device))
            obs_next_n.append(torch.FloatTensor(obs_next).to(self.device))
            act_n.append(torch.FloatTensor(act).to(self.device))
        obs, act, rew, obs_next, done = self.replay_buffer.sample_index(index)

        # Create tensors
        rew = torch.FloatTensor(rew).to(self.device)
        done = torch.FloatTensor(done).to(self.device)

        # Calculate q loss
        num_sample = 1
        target_q = 0.0
        with torch.no_grad():
            for _ in range(num_sample):
                target_act_next_n = [
                    self.pdtype(agents[i].pi_targ(obs_next_n[i])).sample()
                    for i in range(self.n)
                ]
                target_q_next = self.vf_targ(obs_next_n, target_act_next_n).squeeze(-1)
                target_q += rew + self.args.gamma * (1.0 - done) * target_q_next
            target_q /= num_sample
        q = self.vf(obs_n, act_n).squeeze(-1)
        vf_loss = torch.mean(torch.square(q - target_q))

        # optimization step
        self.vf_optim.zero_grad(set_to_none=True)
        vf_loss.backward()
        nn.utils.clip_grad_norm_(self.vf.parameters(), self.grad_norm_clipping)
        self.vf_optim.step()

        # Calculate policy loss
        for p in self.vf.parameters():
            p.requires_grad = False
        piflat = self.pi(obs_n[self.agent_index])
        p_reg = torch.mean(torch.square(piflat))
        act_input_n = copy(act_n)
        act_input_n[self.agent_index] = self.pdtype(piflat).sample()
        pg_loss = -self.vf(obs_n, act_input_n).mean()
        pi_loss = pg_loss + p_reg * 1e-3

        self.pi_optim.zero_grad(set_to_none=True)
        pi_loss.backward()
        nn.utils.clip_grad_norm_(self.pi.parameters(), self.grad_norm_clipping)
        self.pi_optim.step()
        for p in self.vf.parameters():
            p.requires_grad = True

        make_update_exp(self.pi, self.pi_targ)
        make_update_exp(self.vf, self.vf_targ)

        return [pi_loss.item(), vf_loss.item()]
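# The update() above ends with make_update_exp(), which is not defined in this section.
# A minimal sketch of the assumed behaviour, a Polyak (soft) update of the target network
# toward the online network; the mixing rate `polyak` below is an assumption.
import torch

def make_update_exp(source: torch.nn.Module, target: torch.nn.Module, polyak: float = 1e-2):
    # target <- (1 - polyak) * target + polyak * source, applied parameter-wise
    with torch.no_grad():
        for p_src, p_targ in zip(source.parameters(), target.parameters()):
            p_targ.mul_(1.0 - polyak)
            p_targ.add_(polyak * p_src)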
class MADDPGAgentTrainer(AgentTrainer):
    def __init__(self, name, learning_rate, obs_shape_n, act_space_n,
                 agent_index, args, local_q_func=False):
        self.name = name
        self.learning_rate = learning_rate
        self.n = len(obs_shape_n)
        self.agent_index = agent_index
        self.obs_size = obs_shape_n[agent_index]
        self.joint_obs_size = np.sum(obs_shape_n)
        self.act_size = act_space_n[agent_index].n
        self.act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n]
        self.joint_act_size = 0
        for i_act in act_space_n:
            self.joint_act_size += i_act.n
        self.args = args

        self.actor = Actor(self.obs_size, self.act_size)
        self.actor_target = Actor(self.obs_size, self.act_size)
        self.critic = self.build_critic()
        self.critic_target = self.build_critic()
        update_target(self.actor, self.actor_target, 0)
        update_target(self.critic, self.critic_target, 0)
        # self.actor, self.critic = self.build_model()
        # self.actor_target, self.critic_target = self.build_model()
        self.actor_optimizer = self.build_actor_optimizer()

        # Create experience buffer
        self.replay_buffer = ReplayBuffer(1e6)
        self.max_replay_buffer_len = args.batch_size * args.max_episode_len
        self.replay_sample_index = None

        gpu = -1
        self.device = "/gpu:{}".format(gpu) if gpu >= 0 else "/cpu:0"

    def build_model(self):
        """ actor (policy) neural network """
        inp = Input(self.obs_size)
        x = Dense(64, activation='relu')(inp)
        x = Dense(64, activation='relu')(x)
        actor_out = Dense(self.act_size)(x)
        actor = Model(inp, actor_out)
        # Note: "actor" is not compiled because we want to customize the training process

        """ critic (value) neural network """
        inp = Input((self.joint_obs_size + self.joint_act_size, ))
        x = Dense(64, activation='relu')(inp)
        x = Dense(64, activation='relu')(x)
        critic_out = Dense(1, activation='linear')(x)
        critic = Model(inp, critic_out)
        critic.compile(loss="mse",
                       optimizer=Adam(lr=self.learning_rate, clipnorm=0.5))
        return actor, critic

    def build_critic(self):
        """ critic (value) neural network """
        inp = Input((self.joint_obs_size + self.joint_act_size, ))
        x = Dense(64, activation='relu')(inp)
        x = Dense(64, activation='relu')(x)
        critic_out = Dense(1, activation='linear')(x)
        critic = Model(inp, critic_out)
        critic.compile(loss="mse",
                       optimizer=Adam(lr=self.learning_rate, clipnorm=0.5))
        return critic

    def build_actor_optimizer(self):
        return Adam(learning_rate=self.learning_rate, clipnorm=0.5)

    def action(self, obs):
        # a = self.sample_action(obs[None])
        # print(obs[None].shape)
        # a = self._get_action_body(tf.constant(obs[None], dtype='float32'))
        # a = self.actor.predict_on_batch(tf.constant(obs[None], dtype='float32'))
        # print(a)
        a = self.actor.action(tf.constant(obs[None], dtype='float32'))
        # a = self.actor(self.actor.dist(obs[None]))
        return a[0]

    def sample_action(self, obs):
        logits = self.actor.predict(obs, batch_size=len(obs))
        u = np.random.uniform(size=logits.shape)
        # Gumbel-softmax sampling (numpy analogue of _get_action_body below)
        gumbel = logits - np.log(-np.log(u))
        a = np.exp(gumbel) / np.sum(np.exp(gumbel), axis=-1, keepdims=True)
        return a

    """
    @tf.function
    def _get_action_body(self, obs_tensor):
        with tf.device(self.device):
            logits = self.actor(obs_tensor)
            act_pd = self.act_pdtype_n[self.agent_index].pdfromflat(logits)
            a = act_pd.sample()
        return a
    """

    @tf.function
    def _get_action_body(self, obs_tensor):
        logits = self.actor(obs_tensor)
        u = tf.random.uniform(tf.shape(logits))
        a = tf.nn.softmax(logits - tf.math.log(-tf.math.log(u)), axis=-1)
        return a

    def experience(self, obs, act, rew, new_obs, done, terminal):
        # Store transition in the replay buffer.
        self.replay_buffer.add(obs, act, rew, new_obs, float(done))

    def preupdate(self):
        self.replay_sample_index = None

    def update(self, agents, t):
        if len(self.replay_buffer) < self.max_replay_buffer_len:
            # replay buffer is not large enough
            return
        if not t % 100 == 0:  # only update every 100 steps
            return

        self.replay_sample_index = self.replay_buffer.make_index(self.args.batch_size)
        # collect replay sample from all agents
        obs_n = []
        obs_next_n = []
        act_n = []
        index = self.replay_sample_index
        for i in range(self.n):
            obs, act, rew, obs_next, done = agents[i].replay_buffer.sample_index(index)
            obs_n.append(obs)
            obs_next_n.append(obs_next)
            act_n.append(act)
        obs, act, rew, obs_next, done = self.replay_buffer.sample_index(index)
        rew = np.expand_dims(rew, axis=-1)
        done = np.expand_dims(done, axis=-1)

        # train q network
        num_sample = 1
        target_q = 0.0
        """
        next_logits = self.actor_target.predict(obs_next)
        next_act_pd = self.act_pdtype_n[self.agent_index].pdfromflat(next_logits)
        new_next_act = next_act_pd.sample()
        """
        # train critic
        for i in range(num_sample):
            target_act_next_n = []
            for j in range(self.n):
                new_next_act = agents[j].actor_target.predict_many(obs_next_n[j])
                target_act_next_n.append(new_next_act)
                # TODO: mode
            # target_act_next_n[self.agent_index] = new_next_act
            next_state_action_n = np.concatenate((obs_next_n, target_act_next_n), axis=-1)
            next_state_action_attached = np.concatenate(next_state_action_n, axis=0)
            target_q_next = self.critic_target.predict(next_state_action_attached)
            target_q += rew + self.args.gamma * (1.0 - done) * target_q_next
        target_q /= num_sample

        state_action_n = np.concatenate((obs_n, act_n), axis=-1)
        state_action_attached = np.concatenate(state_action_n, axis=0)
        hist = self.critic.fit(state_action_attached, target_q, epochs=1, verbose=0)
        # q_loss = self.critic.train_on_batch(state_action_attached, target_q)
        q_loss = hist.history['loss'][0]

        obs_tensor = tf.constant(obs, dtype=tf.float32)
        obs_n_tensor = tf.constant(obs_n, dtype=tf.float32)
        act_n_tensor = tf.constant(act_n, dtype=tf.float32)
        # obs_tensor = tf.Variable(obs, dtype=tf.float32)
        # obs_n_tensor = tf.Variable(np.array(obs_n), dtype=tf.float32)
        # act_n_tensor = tf.Variable(np.array(act_n), dtype=tf.float32)

        # train actor network
        # p_loss = self.update_actor(obs, obs_n, act_n)
        p_loss = self.update_actor(obs_tensor, obs_n_tensor, act_n_tensor)
        """
        logits = self.actor.predict(obs)
        act_pd = self.act_pdtype_n[self.agent_index].pdfromflat(logits)
        new_act = act_pd.mode()
        act_n[self.agent_index] = new_act
        grads = self.critic.gradients(obs_n, act_n)
        np.concatenate(act_n, )
        state_action_n = np.concatenate((obs_n, act_n), axis=-1)
        state_action_attached = np.concatenate(state_action_n, axis=-1)
        hist = self.actor.fit(obs, state_action_attached, epochs=1, verbose=0)
        p_loss = hist.history['loss'][0]
        """

        update_target(self.actor, self.actor_target)
        update_target(self.critic, self.critic_target)

        return [
            q_loss, p_loss,
            np.mean(target_q),
            np.mean(rew),
            np.mean(target_q_next),
            np.std(target_q)
        ]

    @tf.function
    def update_actor(self, obs, obs_n, act_n):
        with tf.GradientTape() as tape:
            logits = self.actor(obs)
            # new_act = self.act_pdtype_n[self.agent_index].pdfromflat(logits).mode()
            new_act = self.actor.dist(logits)
            new_act = tf.expand_dims(new_act, axis=0)
            new_act_head = act_n[:self.agent_index]
            new_act_tail = act_n[self.agent_index + 1:]
            new_act_n = tf.concat((new_act_head, new_act, new_act_tail), axis=0)
            state_action_n = tf.concat((obs_n, new_act_n), axis=-1)
            state_action_attached = tf.squeeze(state_action_n)
            q_val = self.critic(state_action_attached)[:, 0]
            p_loss = -tf.reduce_mean(q_val)
            reg_loss = tf.reduce_mean(tf.square(logits))
            total_loss = p_loss + reg_loss * 1e-3
        actor_grad = tape.gradient(total_loss, self.actor.trainable_weights)
        self.actor_optimizer.apply_gradients(zip(actor_grad, self.actor.trainable_weights))
        return total_loss

    def _actor_loss(self, actions_and_values, logits):
        # A trick to input actions and advantages through the same API.
        actions, values = tf.split(actions_and_values, 2, axis=-1)
        # Sparse categorical CE loss obj that supports sample_weight arg on `call()`.
        # `from_logits` argument ensures transformation into normalized probabilities.
        # weighted_sparse_ce = kls.SparseCategoricalCrossentropy(from_logits=True)
        # Policy loss is defined by policy gradients, weighted by advantages.
        # Note: we only calculate the loss on the actions we've actually taken.
        # actions = tf.cast(actions, tf.int32)
        # policy_loss = weighted_sparse_ce(actions, logits, sample_weight=values)
        # Entropy loss can be calculated as cross-entropy over itself.
        # probs = tf.nn.softmax(logits)
        # entropy_loss = kls.categorical_crossentropy(probs, probs)
        print(values)
        print(tf.shape(logits))
        policy_loss = -tf.reduce_mean(values)
        logits_loss = tf.reduce_mean(logits)
        # We want to minimize the policy loss and maximize the entropy loss.
        # Here signs are flipped because the optimizer minimizes.
        return policy_loss + 1e-3 * logits_loss

    def _actor_loss(self, values, logits):
        p_loss = -tf.reduce_mean(values)
        reg_loss = tf.reduce_mean(tf.square(logits))
        total_loss = p_loss + reg_loss * 1e-3
        return total_loss

    def load_models(self, path, version_name):
        file_name = 'a' + str(self.agent_index) + 'A' + version_name
        self.actor.load_weights(path + file_name)
        file_name = 'a' + str(self.agent_index) + 'C' + version_name
        self.critic.load_weights(path + file_name)
        file_name = 'a' + str(self.agent_index) + 'AT' + version_name
        self.actor_target.load_weights(path + file_name)
        file_name = 'a' + str(self.agent_index) + 'CT' + version_name
        self.critic_target.load_weights(path + file_name)

    def save_models(self, path, version_name):
        file_name = 'a' + str(self.agent_index) + 'A' + version_name
        self.actor.save_weights(path + file_name)
        file_name = 'a' + str(self.agent_index) + 'C' + version_name
        self.critic.save_weights(path + file_name)
        file_name = 'a' + str(self.agent_index) + 'AT' + version_name
        self.actor_target.save_weights(path + file_name)
        file_name = 'a' + str(self.agent_index) + 'CT' + version_name
        self.critic_target.save_weights(path + file_name)
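# The Keras trainer above calls an update_target() helper that is not shown. A minimal
# sketch of the assumed semantics: target <- tau * target + (1 - tau) * model, so that
# update_target(model, target, 0) performs the hard copy used at construction time.
# The default tau below is an assumption, not taken from the source.
def update_target(model, target_model, tau=0.99):
    weights = model.get_weights()
    target_weights = target_model.get_weights()
    mixed = [tau * tw + (1.0 - tau) * w for w, tw in zip(weights, target_weights)]
    target_model.set_weights(mixed)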
class MADDPGAgentTrainerIndepLearner(AgentTrainer):
    def __init__(self, name, model, obs_shape_n, act_space_n, agent_index,
                 args, local_q_func=False, u_estimation=False):
        print('in here')
        self.name = name
        self.n = 1  # len(obs_shape_n)
        self.agent_index = agent_index
        self.args = args
        obs_ph_n = []
        obs_ph_n.append(
            U.BatchInput(obs_shape_n[agent_index], name="observation0").get())
        self.u_estimation = u_estimation

        # Create all the functions necessary to train the model
        l = q_train(scope=self.name,
                    make_obs_ph_n=obs_ph_n,
                    act_space_n=act_space_n,
                    q_index=agent_index,
                    q_func=model,
                    u_func=model,
                    optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
                    grad_norm_clipping=0.5,
                    local_q_func=local_q_func,
                    num_units=args.num_units,
                    u_estimation=self.u_estimation)
        if self.u_estimation:
            self.q_train, self.q_update, self.u_update, self.q_debug = l
        else:
            self.q_train, self.q_update, self.q_debug = l
        self.act, self.p_train, self.p_update, self.p_debug = p_train(
            scope=self.name,
            make_obs_ph_n=obs_ph_n,
            act_space_n=act_space_n,
            p_index=agent_index,
            p_func=model,
            q_func=model,
            optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
            grad_norm_clipping=0.5,
            local_q_func=local_q_func,
            num_units=args.num_units)

        # Create experience buffer
        self.replay_buffer = ReplayBuffer(1e6)
        self.max_replay_buffer_len = args.batch_size * args.max_episode_len
        self.replay_sample_index = None

    def action(self, obs):
        return self.act(obs[None])[0]

    def experience(self, obs, act, rew, new_obs, done, terminal):
        # Store transition in the replay buffer.
        self.replay_buffer.add(obs, act, rew, new_obs, float(done))

    def preupdate(self):
        self.replay_sample_index = None

    def update(self, agents, t):
        if len(self.replay_buffer) < self.max_replay_buffer_len:
            # replay buffer is not large enough
            return
        if not t % 100 == 0:  # only update every 100 steps
            return

        self.replay_sample_index = self.replay_buffer.make_index(self.args.batch_size)
        # collect replay sample (independent learner: only this agent's buffer)
        index = self.replay_sample_index
        obs, act, rew, obs_next, done = self.replay_buffer.sample_index(index)
        obs_n = [obs]  # + [np.zeros_like(obs)] * (self.n - 1)
        obs_next_n = [obs_next]  # + [np.zeros_like(obs_next)] * (self.n - 1)
        act_n = [act]  # + [np.zeros_like(act)] * (self.n - 1)

        # train q network
        num_sample = 1
        target_q = 0.0
        target_u = 0.0
        for i in range(num_sample):
            t_act = self.p_debug['target_act'](obs_next_n[0])
            target_act_next_n = [t_act]  # + [np.zeros_like(t_act)] * (self.n - 1)
            # print('target_act_next_n ', np.asarray(target_act_next_n).shape)
            # print('obs_next_n', len(obs_next_n), obs_next_n[0].shape)
            target_q_next = self.q_debug['target_q_values'](
                *(obs_next_n + target_act_next_n))
            if self.u_estimation:
                target_u_next = self.q_debug['target_u_values'](
                    *(obs_next_n + target_act_next_n))
                target_u += math.pow(self.args.gamma, 2.0) * (1.0 - done) * target_u_next
            target_q += self.args.gamma * (1.0 - done) * target_q_next
        target_q /= num_sample

        if self.u_estimation:
            q_loss, u_loss = self.q_train(*(obs_n + act_n + [target_q] + [target_u] + [rew]))
        else:
            q_loss = self.q_train(*(obs_n + act_n + [target_q] + [rew]))
        var_rew = np.array(self.q_debug['var'](*(obs_n + act_n + [target_q] + [rew]))).mean()

        # train p network
        p_loss = self.p_train(*(obs_n + act_n))

        self.p_update()
        self.q_update()
        if self.u_estimation:
            self.u_update()

        return [
            np.asarray(q_loss).mean(),
            np.asarray(p_loss).mean(),
            np.mean(target_q),
            np.mean(rew),
            var_rew,
            np.mean(target_q_next),
            np.std(target_q)
        ]
class MADDPGAgentTrainer(AgentTrainer):
    def __init__(self, name, model, obs_shape_n, act_space_n, agent_index, args,
                 actor_lr=None, critic_lr=None, gamma=None, num_units=None,
                 rb_size=None, batch_size=None, max_episode_len=None,
                 clip_norm=0.5, local_q_func=False):
        self.name = name
        self.n = len(obs_shape_n)
        self.agent_index = agent_index
        self.args = args

        # training parameters
        self.actor_lr = actor_lr if actor_lr else args.lr
        self.critic_lr = critic_lr if critic_lr else args.lr
        self.gamma = gamma if gamma else args.gamma
        self.num_units = num_units if num_units else args.num_units
        self.rb_size = rb_size if rb_size else args.rb_size
        self.batch_size = batch_size if batch_size else args.batch_size
        self.max_episode_len = max_episode_len if max_episode_len else args.max_episode_len
        self.clip_norm = clip_norm

        # TODO: remove after testing
        import models.config as Config
        assert actor_lr == Config.maddpg_train_args['actor_lr']
        assert critic_lr == Config.maddpg_train_args['critic_lr']
        assert gamma == Config.maddpg_train_args['gamma']
        assert num_units == Config.maddpg_train_args['num_hidden']
        assert rb_size == Config.maddpg_train_args['rb_size']
        assert batch_size == Config.maddpg_train_args['batch_size']
        assert max_episode_len == Config.maddpg_train_args['nb_rollout_steps']
        assert clip_norm == Config.maddpg_train_args['clip_norm']

        obs_ph_n = []
        for i in range(self.n):
            obs_ph_n.append(
                U.BatchInput(obs_shape_n[i], name="observation" + str(i)).get())

        # Create all the functions necessary to train the model
        self.q_train, self.q_update, self.q_debug = q_train(
            scope=self.name,
            make_obs_ph_n=obs_ph_n,
            act_space_n=act_space_n,
            q_index=agent_index,
            q_func=model,
            optimizer=tf.train.AdamOptimizer(learning_rate=self.critic_lr),
            grad_norm_clipping=self.clip_norm,
            local_q_func=local_q_func,
            num_units=self.num_units)
        self.act, self.p_train, self.p_update, self.p_debug = p_train(
            scope=self.name,
            make_obs_ph_n=obs_ph_n,
            act_space_n=act_space_n,
            p_index=agent_index,
            p_func=model,
            q_func=model,
            optimizer=tf.train.AdamOptimizer(learning_rate=self.actor_lr),
            grad_norm_clipping=self.clip_norm,
            local_q_func=local_q_func,
            num_units=self.num_units)

        # Create experience buffer
        self.replay_buffer = ReplayBuffer(self.rb_size)
        self.max_replay_buffer_len = self.batch_size * self.max_episode_len
        self.replay_sample_index = None
        self.loss_names = [
            'q_loss', 'p_loss', 'mean_target_q', 'mean_rew',
            'mean_target_q_next', 'std_target_q'
        ]

    def action(self, obs):
        return self.act(obs[None])[0]

    def experience(self, obs, act, rew, new_obs, done, terminal):
        # Store transition in the replay buffer.
        self.replay_buffer.add(obs, act, rew, new_obs, float(done))

    def preupdate(self):
        self.replay_sample_index = None

    def update(self, agents, t):
        if len(self.replay_buffer) < self.max_replay_buffer_len:
            # replay buffer is not large enough
            return
        if not t % 100 == 0:  # only update every 100 steps
            return

        self.replay_sample_index = self.replay_buffer.make_index(self.batch_size)
        # collect replay sample from all agents
        obs_n = []
        obs_next_n = []
        act_n = []
        index = self.replay_sample_index
        for i in range(self.n):
            obs, act, rew, obs_next, done = agents[i].replay_buffer.sample_index(index)
            obs_n.append(obs)
            obs_next_n.append(obs_next)
            act_n.append(act)
        obs, act, rew, obs_next, done = self.replay_buffer.sample_index(index)

        # train q network
        num_sample = 1
        act_space = act.shape[-1]
        target_q = 0.0
        for i in range(num_sample):
            target_act_next_n = [
                agents[i].p_debug['target_act'](obs_next_n[i])
                for i in range(self.n)
            ]

            # flatten multi agent actions and observations
            act_serial_vals = self.q_debug['act_serial_values'](*(target_act_next_n))
            obs_serial_vals = self.q_debug['obs_serial_values'](*(obs_next_n))
            assert len(act_serial_vals) == self.batch_size
            assert len(obs_serial_vals) == self.batch_size

            # compute L2 normalized partial derivatives of target Q function wrt actions
            # NOTE: this is done one sample at a time to prevent tf.gradient
            # from summing over all target q values
            grad_norm_value = [
                self.q_debug['grad_norm_value'](*([[obs_serial_vals[j]]] +
                                                  [[act_serial_vals[j]]]))
                for j in range(self.batch_size)
            ]
            assert len(grad_norm_value) == self.batch_size

            # scale the raw gradients by alpha
            # TODO: set alpha during init or compute as function of policy or loss
            perturb = np.array(grad_norm_value) * 0.01

            # update leader actions using gradients
            for b in range(self.batch_size):
                # find all the leaders wrt current agent (agent_index)
                leading_agents = [
                    [1.0] * act_space
                    if obs_next_n[k][b][2] > obs_next_n[self.agent_index][b][2]
                    else [0.0] * act_space for k in range(self.n)
                ]
                # filter perturbations to only apply for leading agents
                # scale by L2 norm of original actions to prevent the perturb
                # from overwhelming the action
                epsilon = perturb[b].flatten() * np.array(leading_agents).flatten() * \
                    np.linalg.norm(act_serial_vals[b], 2)
                act_serial_vals[b] += epsilon

            # target_q_next = self.q_debug['target_q_values'](*(obs_next_n + target_act_next_n))
            target_q_next = self.q_debug['target_q_values'](
                *([obs_serial_vals] + [act_serial_vals]))
            target_q += rew + self.gamma * (1.0 - done) * target_q_next
        target_q /= num_sample
        q_loss = self.q_train(*(obs_n + act_n + [target_q]))

        # get current actions and observations flattened
        act_serial_vals = self.q_debug['act_serial_values'](*(act_n))
        obs_serial_vals = self.q_debug['obs_serial_values'](*(obs_n))

        # compute L2 normalized partial derivatives of Q function wrt actions
        grad_norm_value = [
            self.p_debug['grad_norm_value'](*([[obs_serial_vals[j]]] +
                                              [[act_serial_vals[j]]]))
            for j in range(self.batch_size)
        ]
        assert len(grad_norm_value) == self.batch_size

        # scale the raw gradients by alpha
        perturb = np.array(grad_norm_value) * 0.01

        # update leader actions using these perturbations
        for b in range(self.batch_size):
            # find all the leaders wrt current agent (agent_index)
            leading_agents = [
                [1.0] * act_space
                if obs_next_n[k][b][2] > obs_next_n[self.agent_index][b][2]
                else [0.0] * act_space for k in range(self.n)
            ]
            # filter perturbations to only apply for leading agents
            epsilon = perturb[b].flatten() * np.array(leading_agents).flatten() * \
                np.linalg.norm(act_serial_vals[b], 2)
            epsilon_n = [
                epsilon[k * act_space:(k * act_space) + act_space]
                for k in range(self.n)
            ]
            # update each agent action for current batch sample "b"
            for k in range(self.n):
                act_n[k][b] += epsilon_n[k]

        # train p network
        p_loss = self.p_train(*(obs_n + act_n))

        self.p_update()
        self.q_update()

        return [
            q_loss, p_loss,
            np.mean(target_q),
            np.mean(rew),
            np.mean(target_q_next),
            np.std(target_q)
        ]
class MADDPGAgentTrainer(AgentTrainer):
    def __init__(self, name, model, obs_shape_n, act_space_n, agent_index,
                 args, safety_layer=None, local_q_func=False):
        self.name = name
        self.n = len(obs_shape_n)
        self.agent_index = agent_index
        self.args = args
        self.safety_layer = safety_layer
        obs_ph_n = []
        for i in range(self.n):
            obs_ph_n.append(
                U.BatchInput(obs_shape_n[i], name="observation" + str(i)).get())

        # Create all the functions necessary to train the model
        self.q_train, self.q_update, self.q_debug = q_train(
            scope=self.name,
            make_obs_ph_n=obs_ph_n,
            act_space_n=act_space_n,
            q_index=agent_index,
            q_func=model,
            optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
            grad_norm_clipping=0.5,
            local_q_func=local_q_func,
            num_units=args.num_units)
        self.act, self.p_train, self.p_update, self.p_debug = p_train(
            scope=self.name,
            make_obs_ph_n=obs_ph_n,
            act_space_n=act_space_n,
            p_index=agent_index,
            p_func=model,
            q_func=model,
            optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
            grad_norm_clipping=0.5,
            local_q_func=local_q_func,
            num_units=args.num_units)

        # Create experience buffer
        self.replay_buffer = ReplayBuffer(1e6)
        self.max_replay_buffer_len = args.batch_size * args.max_episode_len
        self.replay_sample_index = None

    def action(self, obs, c=None, env=None):
        action = self.act(obs[None])[0]
        if_call = False
        return action, if_call

    def action_real(self, obs, c=None, env=None):
        # get action from DDPG
        action = self.act(obs[None])[0]
        action_real = action
        if_call = False
        dist = np.sqrt(
            np.sum(
                np.square(env.agents[0].state.p_pos -
                          env.world.landmarks[-1].state.p_pos)))
        # call the safety layer
        if self.safety_layer and c is not None and env is not None and dist > 1.5:
            # check for collisions over the next N rollout steps
            collision_flag = False
            env_future = copy.deepcopy(env)
            obs_future = copy.deepcopy(obs)
            trajectory = np.zeros([4, self.safety_layer.UAV_config.N + 1])
            trajectory[0, 0] = obs_future[2]
            trajectory[1, 0] = obs_future[3]
            trajectory[2, 0] = obs_future[4]
            trajectory[3, 0] = obs_future[5]
            for i in range(self.safety_layer.UAV_config.N):
                action_future = [self.act(obs_future[None])[0]]
                # environment step
                new_obs_n, rew_n, done_n, info_n = env_future.step(action_future)
                is_any_collision = []
                for agent in env_future.agents:
                    temp = False
                    for _, landmark in enumerate(env_future.world.landmarks[0:-1]):
                        dist = np.sqrt(np.sum(np.square(agent.state.p_pos - landmark.state.p_pos))) \
                            - (agent.size + landmark.size)
                        if dist <= 0:
                            temp = True
                    is_any_collision.append(temp)
                if is_any_collision[0]:
                    collision_flag = True
                done_future = all(done_n)
                if done_future:
                    break
                obs_future = new_obs_n[0]
                trajectory[0, i + 1] = obs_future[2]
                trajectory[1, i + 1] = obs_future[3]
                trajectory[2, i + 1] = obs_future[4]
                trajectory[3, i + 1] = obs_future[5]
            if not collision_flag:
                return action_real, action, if_call
            action, if_call = self.safety_layer.get_safe_action(obs, action, trajectory)
        return action_real, action, if_call

    def set_safety_layer(self, safety_layer):
        self.safety_layer = safety_layer

    def experience(self, obs, act, rew, new_obs, done, terminal):
        # Store transition in the replay buffer.
        self.replay_buffer.add(obs, act, rew, new_obs, float(done))

    def preupdate(self):
        self.replay_sample_index = None

    def update(self, agents, t):
        if len(self.replay_buffer) < self.max_replay_buffer_len:
            # replay buffer is not large enough
            return
        if not t % 100 == 0:  # only update every 100 steps
            return

        self.replay_sample_index = self.replay_buffer.make_index(self.args.batch_size)
        # collect replay sample from all agents
        obs_n = []
        obs_next_n = []
        act_n = []
        index = self.replay_sample_index
        for i in range(self.n):
            obs, act, rew, obs_next, done = agents[i].replay_buffer.sample_index(index)
            obs_n.append(obs)
            obs_next_n.append(obs_next)
            act_n.append(act)
        obs, act, rew, obs_next, done = self.replay_buffer.sample_index(index)

        # train q network
        num_sample = 1
        target_q = 0.0
        for i in range(num_sample):
            target_act_next_n = [
                agents[i].p_debug['target_act'](obs_next_n[i])
                for i in range(self.n)
            ]
            target_q_next = self.q_debug['target_q_values'](
                *(obs_next_n + target_act_next_n))
            target_q += rew + self.args.gamma * (1.0 - done) * target_q_next
        target_q /= num_sample
        q_loss = self.q_train(*(obs_n + act_n + [target_q]))

        # train p network
        p_loss = self.p_train(*(obs_n + act_n))

        self.p_update()
        self.q_update()

        return [
            q_loss, p_loss,
            np.mean(target_q),
            np.mean(rew),
            np.mean(target_q_next),
            np.std(target_q)
        ]
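# Every trainer in this section relies on the same ReplayBuffer interface
# (__len__, add, make_index, sample_index). A minimal sketch written from how the
# buffer is called here; the original implementation is not shown, so details such
# as the circular overwrite are assumptions.
import random
import numpy as np

class ReplayBuffer:
    def __init__(self, size):
        self._maxsize = int(size)
        self._storage = []
        self._next_idx = 0

    def __len__(self):
        return len(self._storage)

    def add(self, obs, act, rew, new_obs, done):
        data = (obs, act, rew, new_obs, done)
        if self._next_idx >= len(self._storage):
            self._storage.append(data)
        else:
            self._storage[self._next_idx] = data  # overwrite the oldest entry
        self._next_idx = (self._next_idx + 1) % self._maxsize

    def make_index(self, batch_size):
        # random indices; sharing them across agents keeps joint samples aligned
        return [random.randint(0, len(self._storage) - 1) for _ in range(batch_size)]

    def sample_index(self, idxes):
        obs, act, rew, obs_next, done = [], [], [], [], []
        for i in idxes:
            o, a, r, o2, d = self._storage[i]
            obs.append(o)
            act.append(a)
            rew.append(r)
            obs_next.append(o2)
            done.append(d)
        return (np.array(obs), np.array(act), np.array(rew),
                np.array(obs_next), np.array(done))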
class MADDPGAgentTrainer(AgentTrainer):
    def __init__(self, env, name, model, CNN_model, obs_shape_n, obs_map_shape_n,
                 act_space_n, agent_index, args, local_q_func=False):
        self.name = name
        self.n = len(obs_shape_n)
        self.agent_index = agent_index
        self.args = args
        obs_ph_n = []
        obs_map_ph_n = []
        for i in range(self.n):
            obs_ph_n.append(U.BatchInput(obs_shape_n[i], name="observation" + str(i)).get())
            obs_map_ph_n.append(U.BatchInput(obs_map_shape_n[i], name="observation_map" + str(i)).get())

        # Create all the functions necessary to train the model
        self.q_train, self.q_update, self.q_debug = q_train(
            scope=self.name,
            make_obs_ph_n=obs_ph_n,
            act_space_n=act_space_n,
            q_index=agent_index,
            q_func=model,
            shared_CNN=CNN_model,
            optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
            grad_norm_clipping=0.5,
            local_q_func=local_q_func,
            num_units=args.num_units,
            make_obs_map_ph_n=obs_map_ph_n
        )
        self.act, self.p_train, self.vf_t, self.p_update, self.vf_u, self.p_debug = p_train(
            scope=self.name,
            env=env,
            make_obs_ph_n=obs_ph_n,
            act_space_n=act_space_n,
            p_index=agent_index,
            vf_func=model,
            shana=GMMPolicy,
            q_func=model,
            shared_CNN=CNN_model,
            optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
            grad_norm_clipping=0.5,
            local_q_func=local_q_func,
            num_units=args.num_units,
            make_obs_map_ph_n=obs_map_ph_n
        )

        # Create experience buffer
        self.replay_buffer = ReplayBuffer(1e6)
        self.max_replay_buffer_len = args.batch_size * args.max_episode_len
        self.replay_sample_index = None
        self.batch_size = args.batch_size

    def action(self, obs):
        return self.act([obs[0]], [obs[1]])[0]

    def experience(self, obs, act, rew, new_obs, done, terminal):
        # Store transition in the replay buffer.
        self.replay_buffer.add(obs, act, rew, new_obs, float(done))

    def preupdate(self):
        self.replay_sample_index = None

    def update(self, agents, t):
        if len(self.replay_buffer) < self.max_replay_buffer_len:
            # replay buffer is not large enough
            return
        if not t % 100 == 0:  # only update every 100 steps
            return

        self.replay_sample_index = self.replay_buffer.make_index(self.args.batch_size)
        # collect replay sample from all agents
        # obs_n = []
        # obs_next_n = []
        # act_n = []
        # index = self.replay_sample_index
        # for i in range(self.n):
        #     obs, act, rew, obs_next, done = agents[i].replay_buffer.sample_index(index)
        #     obs_n.append(obs)
        #     obs_next_n.append(obs_next)
        #     act_n.append(act)
        # obs, act, rew, obs_next, done = self.replay_buffer.sample_index(index)
        obs_n = []
        obs_map_n = []
        obs_next_map = []
        obs_next_n = []
        act_n = []
        index = self.replay_sample_index
        for i in range(self.n):
            obs, act, rew, obs_next, done = agents[i].replay_buffer.sample_index(index)
            # pdb.set_trace()
            obs_n.append(obs[:, 0].tolist())
            obs_next_n.append(obs_next[:, 0].tolist())
            obs_map_n.append(obs[:, 1].tolist())
            obs_next_map.append(obs_next[:, 1].tolist())
            act_n.append(act)

        # train q network
        num_sample = 1
        target_q = 0.0
        for i in range(num_sample):
            # current_target_act_n = [agents[i].p_debug['target_act'](obs_n[i]) for i in range(self.n)]
            current_target_act_n = [
                np.array([
                    np.reshape(
                        np.array(agents[i].p_debug['target_act']([obs_n[i][j]], [obs_map_n[i][j]])), -1)
                    for j in range(self.batch_size)
                ]) for i in range(self.n)
            ]
            target_vf_next = self.q_debug['target_vf_values'](*(obs_next_n + obs_next_map))
            target_q += rew + self.args.gamma * (1.0 - done) * target_vf_next
        target_q /= num_sample
        # pdb.set_trace()
        q_loss = self.q_train(*(obs_n + obs_map_n + act_n + [target_q]))

        # train p network
        p_loss = self.p_train(*(obs_n + act_n + obs_map_n))
        vf_loss = self.vf_t(*(obs_n + current_target_act_n + obs_map_n))

        self.p_update()
        self.q_update()
        self.vf_u()

        return [
            q_loss, p_loss,
            np.mean(target_q),
            np.mean(rew),
            np.mean(target_vf_next),
            np.std(target_q)
        ]
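# A hedged illustration of the (vector, map) observation pairs the CNN-based trainer
# above assumes are stored in its replay buffer: each stored observation behaves like a
# length-2 object array, so a sampled batch supports obs[:, 0] for the flat vectors and
# obs[:, 1] for the maps. Shapes below are made up for the example.
import numpy as np

obs_vec = np.zeros(16)            # flat observation vector (illustrative size)
obs_map = np.zeros((32, 32, 3))   # local map observation (illustrative shape)
stored = np.empty(2, dtype=object)
stored[0], stored[1] = obs_vec, obs_map

batch = np.stack([stored, stored])              # shape (batch=2, 2), dtype=object
print(batch[:, 0].shape, batch[:, 1][0].shape)  # (2,) and (32, 32, 3)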
class MADDPGAgentTrainerIndepLearner(AgentTrainer):
    def __init__(self, name, model, obs_shape_n, act_space_n, agent_index,
                 args, agent_type, local_q_func=False):
        self.name = name
        self.n = 1
        self.agent_index = agent_index
        self.args = args
        self.u_estimation = args.u_estimation
        self.constrained = args.constrained
        self.constraint_type = args.constraint_type
        self.agent_type = agent_type
        if self.agent_type == "good":
            cvar_alpha = args.cvar_alpha_good_agent
        elif self.agent_type == "adversary":
            cvar_alpha = args.cvar_alpha_adv_agent
        obs_ph_n = []
        obs_ph_n.append(
            U.BatchInput(obs_shape_n[agent_index], name="observation0").get())

        # Create all the functions necessary to train the model
        self.q_train, self.q_train2, self.q_train3, self.q_update, self.u_update, self.q_debug = q_train(
            scope=self.name,
            make_obs_ph_n=obs_ph_n,
            act_space_n=act_space_n,
            q_index=agent_index,
            q_func=model,
            u_func=model,
            optimizer=tf.train.AdamOptimizer(learning_rate=args.lr_critic),
            optimizer_lamda=tf.train.AdamOptimizer(learning_rate=args.lr_lamda),
            exp_var_alpha=args.exp_var_alpha,
            cvar_alpha=cvar_alpha,
            cvar_beta=args.cvar_beta,
            grad_norm_clipping=0.5,
            local_q_func=local_q_func,
            num_units=args.num_units,
            u_estimation=self.u_estimation,
            constrained=self.constrained,
            constraint_type=self.constraint_type,
            agent_type=self.agent_type)
        self.act, self.p_train, self.p_update, self.p_debug = p_train(
            scope=self.name,
            make_obs_ph_n=obs_ph_n,
            act_space_n=act_space_n,
            p_index=agent_index,
            p_func=model,
            q_func=model,
            optimizer=tf.train.AdamOptimizer(learning_rate=args.lr_actor),
            grad_norm_clipping=0.5,
            local_q_func=local_q_func,
            num_units=args.num_units)

        # Create experience buffer
        self.replay_buffer = ReplayBuffer(1e6)
        self.max_replay_buffer_len = args.batch_size * args.max_episode_len
        self.replay_sample_index = None

    def action(self, obs):
        return self.act(obs[None])[0]

    def experience(self, obs, act, rew, new_obs, done, terminal):
        # Store transition in the replay buffer.
        self.replay_buffer.add(obs, act, rew, new_obs, float(done))

    def preupdate(self):
        self.replay_sample_index = None

    def update(self, agents, t, frozen=False):
        if len(self.replay_buffer) < self.max_replay_buffer_len:
            # replay buffer is not large enough
            return
        if not t % 100 == 0:  # only update every 100 steps
            return

        self.replay_sample_index = self.replay_buffer.make_index(self.args.batch_size)
        # collect replay sample (independent learner: only this agent's buffer)
        obs_n = []
        obs_next_n = []
        act_n = []
        index = self.replay_sample_index
        obs, act, rew, obs_next, done = self.replay_buffer.sample_index(index)
        obs_n.append(obs)
        obs_next_n.append(obs_next)
        act_n.append(act)

        # train q network
        num_sample = 1
        target_q = 0.0
        if self.u_estimation:
            target_u = 0.0
        for i in range(num_sample):
            target_act_next_n = [
                self.p_debug['target_act'](obs_next_n[0])
            ]  # WHY IS THIS ON AGENT[0]'s target_act ????????
            target_q_next = self.q_debug['target_q_values'](
                *(obs_next_n + target_act_next_n))
            target_q = self.args.gamma * (1.0 - done) * target_q_next
            if self.u_estimation:
                target_u_next = self.q_debug['target_u_values'](
                    *(obs_next_n + target_act_next_n))
                target_u = math.pow(self.args.gamma, 2.0) * (1.0 - done) * target_u_next
            # rew += (rew - self.lamda_constraint*(var_rew - self.args.alpha))
        target_q /= num_sample
        if self.u_estimation:
            target_u /= num_sample

        if not frozen:
            if self.u_estimation:
                q_loss, u_loss = self.q_train(*(obs_n + act_n + [target_q] + [target_u] + [rew]))
                if self.constrained:
                    q_loss2 = self.q_train2(*(obs_n + act_n + [target_q] + [target_u] + [rew]))
            else:
                q_loss = self.q_train(*(obs_n + act_n + [target_q] + [rew]))
                if self.constrained:
                    q_loss2 = self.q_train2(*(obs_n + act_n + [target_q] + [rew]))

            # train p network
            p_loss = self.p_train(*(obs_n + act_n))

            self.p_update()
            self.q_update()
            if self.u_estimation:
                self.u_update()

            # NOTE: update_v_constraint_only is assumed to be defined elsewhere
            # (e.g. a module-level flag); it is not set in this class.
            if update_v_constraint_only and not self.constrained:
                v_constraint_loss = self.q_train3(*(obs_n + act_n + [target_q] + [rew]))
            else:
                v_constraint_loss = 0.0

        if self.constrained:
            lamda_constraint = np.array(self.q_debug['lamda_constraint'].eval()).mean()
            if lamda_constraint <= 0:
                print("Value of Lamda violated", lamda_constraint)
        else:
            lamda_constraint = 0.0

        if self.constraint_type == "CVAR":
            v_constraint = np.array(self.q_debug['v_constraint'].eval()).mean()
        else:
            v_constraint = 0.0

        if self.u_estimation:
            var_rew = np.array(self.q_debug['var'](
                *(obs_n + act_n + [target_q] + [target_u] + [rew]))).mean()
        else:
            var_rew = np.array(self.q_debug['var'](
                *(obs_n + act_n + [target_q] + [rew]))).mean()

        if self.constrained and self.constraint_type == "CVAR":
            cvar = np.array(self.q_debug['cvar'](*(obs_n + act_n + [target_q] + [rew]))).mean()
        else:
            cvar = 0.0

        if not frozen:
            q_loss_mean = np.asarray(q_loss).mean()
            if self.u_estimation:
                u_loss_mean = np.asarray(u_loss).mean()
            else:
                u_loss_mean = 0.0
            p_loss_mean = np.asarray(p_loss).mean()
            if self.constrained:
                q_loss2_mean = np.asarray(q_loss2).mean()
            else:
                q_loss2_mean = 0.0
        else:
            q_loss_mean = 0.0
            u_loss_mean = 0.0
            p_loss_mean = 0.0
            q_loss2_mean = 0.0

        q_values = np.asarray(self.q_debug['q_values'](*(obs_n + act_n)))
        # print('q_values', q_values.shape)
        mean_q_values = np.mean(q_values)
        std_q_values = np.std(q_values)

        return [
            q_loss_mean, u_loss_mean, q_loss2_mean, p_loss_mean,
            np.mean(rew), var_rew, cvar, lamda_constraint, v_constraint,
            mean_q_values, std_q_values
        ]
class MADDPGAgentTrainer(AgentTrainer):
    def __init__(self, name, model, obs_shape_n, act_space_n, agent_index,
                 args, agent_type="good", local_q_func=False):
        self.name = name
        self.n = len(obs_shape_n)
        self.agent_index = agent_index
        self.args = args
        obs_ph_n = []
        for i in range(self.n):
            obs_ph_n.append(
                U.BatchInput(obs_shape_n[i], name="observation" + str(i)).get())

        if agent_type == "good":
            self.mic = float(args.good_mic)
        else:
            self.mic = float(args.adv_mic)
        print("MIC for ", agent_type, " agent is ", self.mic)
        self.agent_type = agent_type

        # maintain a multivariate Gaussian over actions for each agent
        self.multivariate_mean = None
        self.multivariate_cov = None
        self.margian_aprox_lr = 1e-2
        self.action_history = []

        # Create all the functions necessary to train the model
        self.q_train, self.q_update, self.q_debug = q_train(
            scope=self.name,
            make_obs_ph_n=obs_ph_n,
            act_space_n=act_space_n,
            q_index=agent_index,
            q_func=model,
            optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
            mut_inf_coef=self.mic,
            grad_norm_clipping=0.5,
            local_q_func=local_q_func,
            num_units=args.num_units)
        self.act, self.p_train, self.p_update, self.p_debug = p_train(
            scope=self.name,
            make_obs_ph_n=obs_ph_n,
            act_space_n=act_space_n,
            p_index=agent_index,
            p_func=model,
            q_func=model,
            optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
            mut_inf_coef=self.mic,
            grad_norm_clipping=0.5,
            local_q_func=local_q_func,
            num_units=args.num_units)

        # Create experience buffer
        self.replay_buffer = ReplayBuffer(1e6)
        self.max_replay_buffer_len = args.batch_size * args.max_episode_len
        self.replay_sample_index = None

    def sleep_regimen(self):
        return self.args.sleep_regimen

    def agent_mic(self):
        return self.mic

    def action(self, obs):
        action = self.act(obs[None])[0]
        if len(self.replay_buffer) > self.max_replay_buffer_len:
            # don't add random warm-up actions to the action history
            self.action_history.append(action)
        if self.mic > 0 and len(self.action_history) >= 100:
            actions = np.stack(self.action_history)
            act_mu = actions.mean(axis=0)
            act_std = actions.std(axis=0)
            if self.multivariate_mean is None:
                self.multivariate_mean = act_mu
            else:
                previous_mean = self.multivariate_mean
                self.multivariate_mean = (
                    (1 - self.margian_aprox_lr) * self.multivariate_mean) + (
                        self.margian_aprox_lr * act_mu)
            if self.multivariate_cov is None:
                self.multivariate_cov = np.diag(act_std)
            else:
                cov = (self.margian_aprox_lr * np.diag(act_std) +
                       (1 - self.margian_aprox_lr) * self.multivariate_cov)
                mom_1 = (self.margian_aprox_lr * np.square(np.diag(act_mu))) + (
                    (1 - self.margian_aprox_lr) * np.square(np.diag(previous_mean)))
                mom_2 = np.square((self.margian_aprox_lr * np.diag(act_mu)) +
                                  (1 - self.margian_aprox_lr) * np.diag(previous_mean))
                self.multivariate_cov = cov + mom_1 - mom_2
        if len(self.action_history) > 100:
            self.action_history.pop(0)
        return action

    def experience(self, obs, act, rew, new_obs, done, terminal):
        # Store transition in the replay buffer.
        self.replay_buffer.add(obs, act, rew, new_obs, float(done))

    def preupdate(self):
        self.replay_sample_index = None

    def update(self, agents, t, sleeping=False):
        if len(self.replay_buffer) < self.max_replay_buffer_len:
            # replay buffer is not large enough
            return
        if not t % 100 == 0:  # only update every 100 steps
            return

        self.replay_sample_index = self.replay_buffer.make_index(self.args.batch_size)
        obs_n = []
        obs_next_n = []
        act_n = []
        index = self.replay_sample_index
        for i in range(self.n):
            obs, act, rew, obs_next, done = agents[i].replay_buffer.sample_index(index)
            obs_n.append(obs)
            obs_next_n.append(obs_next)
            act_n.append(act)
        obs, act, rew, obs_next, done = self.replay_buffer.sample_index(index)

        mir_penalty = 0
        if self.mic > 0 and (not self.args.sleep_regimen or
                             (self.args.sleep_regimen and sleeping)):
            # If the sleep regimen is on, only use mic when sleeping
            try:
                multivar = multivariate_normal(self.multivariate_mean, self.multivariate_cov)
                logp_phi = multivar.logpdf(act)
                logp_phi = logp_phi.reshape(self.args.batch_size, )
                p_phi = multivar.pdf(act)
                p_phi = p_phi.reshape(self.args.batch_size, )

                action_mean = np.mean(act, axis=0)
                action_std = np.std(act, axis=0)
                action_cov = np.diag(action_std)
                policy_multivar = multivariate_normal(action_mean, action_cov)
                logp_pi = policy_multivar.logpdf(act)
                logp_pi = logp_pi.reshape(self.args.batch_size, )
                p_pi = policy_multivar.pdf(act)
                p_pi = p_pi.reshape(self.args.batch_size, )

                phi_entropy = -1 * np.sum(logp_phi * p_phi)
                pi_entropy = -1 * np.sum(logp_pi * p_pi)
                mir_penalty = self.mic * (phi_entropy - pi_entropy)
            except:
                mir_penalty = 0
            print(mir_penalty)

        num_sample = 1
        target_q = 0.0
        for i in range(num_sample):
            target_act_next_n = [
                agents[i].p_debug['target_act'](obs_next_n[i])
                for i in range(self.n)
            ]
            target_q_next = self.q_debug['target_q_values'](
                *(obs_next_n + target_act_next_n))
            target_q += (rew - mir_penalty) + self.args.gamma * (1.0 - done) * target_q_next
        target_q /= num_sample
        # print(target_q)
        # assert (False)  # leftover debugging halt, disabled so the update can run
        q_loss = self.q_train(*(obs_n + act_n + [target_q]))

        # train p network
        p_loss = self.p_train(*(obs_n + act_n))

        self.p_update()
        self.q_update()

        return [
            q_loss, p_loss,
            np.mean(target_q),
            np.mean(rew),
            np.mean(target_q_next),
            np.std(target_q)
        ]
class MADDPGAgentTrainer(AgentTrainer):
    def __init__(self, name, model, obs_shape_n, act_space_n, agent_index,
                 args, local_q_func=False):
        self.name = name
        self.n = len(obs_shape_n)
        self.agent_index = agent_index
        self.args = args
        obs_ph_mems = []
        for i in range(args.num_groups):
            # assumes agents have same observation shape
            obs_ph_mems.append(
                U.BatchInput(obs_shape_n[i], name="observation" + str(i)).get())

        # Create all the functions necessary to train the model
        self.q_train, self.q_update, self.q_debug = q_train(
            scope=self.name,
            make_obs_ph_n=obs_ph_mems,
            act_space_n=act_space_n,
            q_index=agent_index,
            q_func=model,
            optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=args.lr),
            grad_norm_clipping=0.5,
            local_q_func=local_q_func,
            num_units=args.num_units,
            num_groups=args.num_groups)
        self.act, self.p_train, self.p_update, self.p_debug = p_train(
            scope=self.name,
            make_obs_ph_n=obs_ph_mems,
            act_space_n=act_space_n,
            p_index=agent_index,
            p_func=model,
            q_func=model,
            optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=args.lr),
            grad_norm_clipping=0.5,
            local_q_func=local_q_func,
            num_units=args.num_units,
            num_groups=args.num_groups)

        # Create experience buffer
        self.replay_buffer = ReplayBuffer(1e6)
        self.max_replay_buffer_len = args.batch_size * args.max_episode_len
        self.replay_sample_index = None

    def action(self, obs):
        return self.act(obs[None])[0]

    def experience(self, obs, act, rew, new_obs, done, terminal, emergency_score, group_members):
        # Store transition in the replay buffer.
        self.replay_buffer.add(obs, act, rew, new_obs, float(done),
                               emergency_score, group_members)

    def preupdate(self):
        self.replay_sample_index = None

    def update(self, agents, t):
        if len(self.replay_buffer) < self.max_replay_buffer_len:
            # replay buffer is not large enough
            return
        if not t % 100 == 0:  # only update every 100 steps
            return

        self.replay_sample_index = self.replay_buffer.make_index(self.args.batch_size)
        # collect replay sample from all agents
        obs_n = []
        obs_next_n = []
        act_n = []
        emerg_n = []
        mems_n = []
        index = self.replay_sample_index
        obs, act, rew, obs_next, done, emerg, mems = self.replay_buffer.sample_index(index)
        for i in range(self.n):
            obs_i, act_i, _, obs_next_i, _, emerg_i, mems_i = agents[i].replay_buffer.sample_index(index)
            obs_n.append(obs_i)
            obs_next_n.append(obs_next_i)
            act_n.append(act_i)
            emerg_n.append(emerg_i)
            mems_n.append(mems_i)

        # train q network
        num_sample = 1
        target_q = 0.0
        for i in range(num_sample):
            target_act_next_n = [
                agents[i].p_debug['target_act'](obs_next_n[i])
                for i in range(self.n)
            ]  # 9*1024*(act_size)
            obs_next_mems = []
            target_act_next_mems = []
            for b, group_members in enumerate(mems):
                # potential performance optimization here
                curr_obs_next_mems = [obs_next_n[self.agent_index][b]]
                curr_act_next_mems = [target_act_next_n[self.agent_index][b]]  # 3*(obs_size)
                for i in group_members:
                    if i == self.agent_index:
                        continue
                    curr_obs_next_mems.append(obs_next_n[i][b])
                    curr_act_next_mems.append(target_act_next_n[i][b])
                obs_next_mems.append(curr_obs_next_mems)
                target_act_next_mems.append(curr_act_next_mems)
            target_act_next_mems = np.swapaxes(target_act_next_mems, 0, 1)
            obs_next_mems = np.swapaxes(obs_next_mems, 0, 1)
            input_list = list(obs_next_mems) + list(target_act_next_mems)
            target_q_next = self.q_debug['target_q_values'](*input_list)
            target_q += rew + self.args.gamma * (1.0 - done) * target_q_next
            """
            target_act_next_mems = []
            for b, group_members in enumerate(mems):
                # potential performance optimization here
                target_act_next_mems.append([target_act_next_n[i][b] for i in group_members])
            """
        """
        ### OLD CODE ###
        target_act_next_n = [agents[i].p_debug['target_act'](obs_next_n[i]) for i in range(self.n)]
        target_q_next = self.q_debug['target_q_values'](*(obs_next_n + target_act_next_n))
        target_q += rew + self.args.gamma * (1.0 - done) * target_q_next
        """
        target_q /= num_sample

        act_mems = []
        obs_mems = []
        for b, group_members in enumerate(mems):
            # potential performance optimization here
            curr_obs_mems = [obs_n[i][b] for i in group_members]
            curr_act_mems = [act_n[i][b] for i in group_members]
            obs_mems.append(curr_obs_mems)
            act_mems.append(curr_act_mems)
        act_mems = np.swapaxes(act_mems, 0, 1)
        obs_mems = np.swapaxes(obs_mems, 0, 1)

        input_list = list(obs_mems) + list(act_mems) + [target_q]
        q_loss = self.q_train(*input_list)

        # train p network
        input_list = list(obs_mems) + list(act_mems)
        p_loss = self.p_train(*input_list)

        self.p_update()
        self.q_update()

        return [
            q_loss, p_loss,
            np.mean(target_q),
            np.mean(rew),
            np.mean(target_q_next),
            np.std(target_q)
        ]
class MADDPGAgentTrainer():
    """Train MADDPG Agent.

    The vast majority of the modifications to this class (as well as other
    parts of this file) are drawn from
    https://github.com/sunshineclt/maddpg/blob/master/maddpg/trainer/maddpg.py.
    """

    def __init__(self, name, model_value, model_policy, obs_shape_n, act_space_n,
                 agent_index, args, hparams, summary_writer=None,
                 local_q_func=False, rngseed=None):
        self.name = name
        self.rngseed = rngseed
        self.n = len(obs_shape_n)
        self.agent_index = agent_index
        self.args = args
        self.hparams = hparams
        obs_ph_n = []
        for i in range(self.n):
            obs_ph_n.append(U.BatchInput(obs_shape_n[i], name="observation" + str(i)).get())

        # Create all the functions necessary to train the model
        # train critic
        self.q_train, self.q_update, self.q_debug = q_train(
            scope=self.name,
            make_obs_ph_n=obs_ph_n,
            act_space_n=act_space_n,
            q_index=agent_index,
            q_func=model_value,
            optimizer=tf.train.AdamOptimizer(learning_rate=hparams['learning_rate']),
            grad_norm_clipping=hparams['grad_norm_clipping'],
            local_q_func=local_q_func,
            num_units=args.num_units
        )
        # train policy
        self.act, self.p_train, self.p_update, self.p_debug = p_train(
            scope=self.name,
            make_obs_ph_n=obs_ph_n,
            act_space_n=act_space_n,
            p_index=agent_index,
            p_func=model_policy,
            q_func=model_value,
            optimizer=tf.train.AdamOptimizer(learning_rate=hparams['learning_rate']),
            grad_norm_clipping=hparams['grad_norm_clipping'],
            local_q_func=local_q_func,
            num_units=args.num_units
        )

        # Create experience buffer
        self.replay_buffer = ReplayBuffer(hparams['replay_buffer_len'], self.rngseed)
        try:
            if hparams['test_saving']:
                self.max_replay_buffer_len = 100
            else:
                self.max_replay_buffer_len = hparams['batch_size'] * args.max_episode_len
        except KeyError:
            self.max_replay_buffer_len = hparams['batch_size'] * args.max_episode_len
        self.replay_sample_index = None
        self.summary_writer = summary_writer

    def action(self, obs):
        # return self.act(obs[None])[0]
        theac = self.act(obs[None])[0]
        # print("p", self.p_debug["p_values"](obs[None])[0])
        # print("act", self.act(obs[None])[0])
        if any(np.isnan(theac)):
            print('NaN action in MADDPGAgentTrainer')
            pdb.set_trace()
            print('NaN action in MADDPGAgentTrainer')
        return theac

    def experience(self, obs, act, rew, new_obs, done, terminal):
        # Store transition in the replay buffer.
        self.replay_buffer.add(obs, act, rew, new_obs, float(done))

    def preupdate(self):
        self.replay_sample_index = None

    def set_memory_index(self, replay_sample_index):
        self.replay_sample_index = replay_sample_index

    def get_memory_index(self, batch_size):
        return self.replay_buffer.make_index(batch_size)

    def get_replay_data(self):
        return self.replay_buffer.sample_index(self.replay_sample_index)

    def get_target_act(self, obs):
        return self.p_debug['target_act'](obs[self.agent_index])

    def update(self, agents, t, episodenum, savestuff=False):
        """Pull from replay buffer and update policy and critic."""
        # replay buffer is not large enough
        if len(self.replay_buffer) < self.max_replay_buffer_len:
            return False, []
        if not t % 100 == 0:  # only update every 100 steps
            return False, []

        self.replay_sample_index = \
            self.replay_buffer.make_index(self.hparams['batch_size'])
        # collect replay sample from all agents
        obs_n = []
        obs_next_n = []
        act_n = []
        index = self.replay_sample_index
        for i in range(self.n):
            obs, act, rew, obs_next, done = \
                agents[i].replay_buffer.sample_index(index)
            obs_n.append(obs)
            obs_next_n.append(obs_next)
            act_n.append(act)
        obs, act, rew, obs_next, done = self.replay_buffer.sample_index(index)

        # train Q-function network
        num_sample = 1
        target_q = 0.0
        for i in range(num_sample):
            target_act_next_n = \
                [agents[i].p_debug['target_act'](obs_next_n[i]) for i in range(self.n)]
            target_q_next = self.q_debug['target_q_values'](*(obs_next_n + target_act_next_n))
            target_q += rew + self.hparams['gamma'] * (1.0 - done) * target_q_next
        target_q /= float(num_sample)
        q_loss, q_loss_summary = self.q_train(*(obs_n + act_n + [target_q]))
        if q_loss > 10000000:
            print('Huge Q loss! Seed was {}'.format(self.rngseed))
            pdb.set_trace()
            print('Huge Q loss! Seed was {}'.format(self.rngseed))

        # train policy network
        p_loss, p_summary = self.p_train(*(obs_n + act_n))
        if p_loss > 10000000:
            print('Huge policy loss! Seed was {}'.format(self.rngseed))
            pdb.set_trace()
            print('Huge policy loss! Seed was {}'.format(self.rngseed))

        if self.summary_writer is not None and savestuff:
            self.summary_writer.add_summary(p_summary, global_step=episodenum)
            self.summary_writer.add_summary(q_loss_summary, global_step=episodenum)

        self.p_update()  # update policy
        self.q_update()  # update critic

        return True, [q_loss, p_loss, np.mean(target_q), np.mean(rew),
                      np.mean(target_q_next), np.std(target_q)]
class MADDPGAgentTrainer(AgentTrainer):
    """ Agent Trainer using MADDPG Algorithm """

    def __init__(self, name, model, obs_shape_n, act_space_n, agent_index,
                 args, role="", local_q_func=False):
        """
        Args:
            name (str): Name of the agent
            model (function): MLP Neural Network model for the agent.
            obs_shape_n (tf.placeholder): Placeholder for the observation space of all agents
            act_space_n (list): A list of the action spaces for all agents
            agent_index (int): Agent index number
            args (argparse.Namespace): Parsed commandline arguments object
            role (str): Role of the agent, i.e. adversary
            local_q_func (boolean): Flag for using local q function
        """
        # super(MADDPGAgentTrainer, self).__init__()
        self.name = name
        self.n = len(obs_shape_n)
        self.agent_index = agent_index
        self.args = args

        # Set up observation space placeholder
        obs_ph_n = []
        for i in range(self.n):
            obs_ph_n.append(
                tf_util.BatchInput(obs_shape_n[i], name="observation" + str(i)).get())

        # Create all the functions necessary to train the model
        self.q_train, self.q_update, self.q_debug = q_train(
            scope=self.name,
            make_obs_ph_n=obs_ph_n,
            act_space_n=act_space_n,
            q_index=agent_index,
            q_func=model,
            optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
            grad_norm_clipping=0.5,
            local_q_func=local_q_func,
            num_units=args.num_units)
        self.act, self.p_train, self.p_update, self.p_debug = p_train(
            scope=self.name,
            make_obs_ph_n=obs_ph_n,
            act_space_n=act_space_n,
            p_index=agent_index,
            p_func=model,
            q_func=model,
            optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
            grad_norm_clipping=0.5,
            local_q_func=local_q_func,
            num_units=args.num_units)

        # Create experience buffer
        self.replay_buffer = ReplayBuffer(int(1e6))
        self.max_replay_buffer_len = 30  # args.batch_size * args.max_episode_len TODO: Change back
        self.replay_sample_index = None

    def action(self, obs):
        """ Retrieves the action for an agent from the P network given the observations

        Args:
            obs (np.array): Observations of the world for an agent

        Returns:
            Action for an agent
        """
        return self.act(obs[None])[0]

    def experience(self, obs, act, rew, new_obs, done, terminal):
        """ Store a transition in the replay buffer.

        Args:
            obs (np.array): Observations of the world for an agent
            act (list): Action for an agent
            rew (float): Reward for an agent
            new_obs (np.array): New observations of the world for an agent
            done (bool): Done flag for an agent
            terminal (boolean): Flag for whether the final episode step has been reached.
        """
        self.replay_buffer.add(obs, act, rew, new_obs, float(done))

    def preupdate(self):
        """ Reset replay_sample_index to None. """
        self.replay_sample_index = None

    def update(self, agents, steps):
        """ Update agent networks

        Args:
            agents (list): List of MADDPGAgentTrainer objects
            steps (int): Current training step

        Returns:
            (list) Training loss for the agents
                [q_loss, p_loss, mean_target_q, mean_reward, mean_target_q_next, std_target_q]
        """
        # Replay buffer is not large enough
        if len(self.replay_buffer) < self.max_replay_buffer_len:
            return

        # Only update every 100 steps
        if not steps % 100 == 0:
            return

        # Collect replay sample from all agents
        obs_n = []
        obs_next_n = []
        act_n = []
        self.replay_sample_index = self.replay_buffer.make_index(self.args.batch_size)
        self_index = self.replay_sample_index
        for i in range(self.n):
            index = agents[i].replay_buffer.make_index(self.args.batch_size)
            obs, act, rew, obs_next, done = agents[i].replay_buffer.sample_index(index)
            obs_n.append(obs)
            obs_next_n.append(obs_next)
            act_n.append(act)
        obs, act, rew, obs_next, done = self.replay_buffer.sample_index(self_index)

        # Train Q Network
        num_sample = 1
        target_q = 0.0
        target_q_next = 0.0
        for i in range(num_sample):
            target_act_next_n = [
                agents[i].p_debug['target_act'](obs_next_n[i])
                for i in range(self.n)
            ]
            target_q_next = self.q_debug['target_q_values'](
                *(obs_next_n + target_act_next_n))
            target_q += rew + self.args.gamma * (1.0 - done) * target_q_next
        target_q /= num_sample
        q_loss = self.q_train(*(obs_n + act_n + [target_q]))

        # Train P Network
        p_loss = self.p_train(*(obs_n + act_n))

        self.p_update()
        self.q_update()

        return [
            q_loss, p_loss,
            np.mean(target_q),
            np.mean(rew),
            np.mean(target_q_next),
            np.std(target_q)
        ]
class MADDPGAgentTrainer(AgentTrainer):
    def __init__(self, name, model, obs_shape_n, act_space_n, agent_index,
                 args, local_q_func=False):
        self.name = name
        self.n = len(obs_shape_n)
        self.agent_index = agent_index
        self.args = args
        obs_ph_n = []
        for i in range(self.n):
            obs_ph_n.append(
                U.BatchInput(obs_shape_n[i], name="observation" + str(i)).get())

        # Create all the functions necessary to train the model
        self.q_train, self.q_update, self.q_debug = q_train(
            scope=self.name,
            make_obs_ph_n=obs_ph_n,
            act_space_n=act_space_n,
            q_index=agent_index,
            q_func=model,
            optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
            grad_norm_clipping=0.5,
            local_q_func=local_q_func,
            num_units=args.num_units,
            # reuse=tf.compat.v1.AUTO_REUSE,
        )
        self.act, self.p_train, self.p_update, self.p_debug, num_actions = p_train(
            scope=self.name,
            make_obs_ph_n=obs_ph_n,
            act_space_n=act_space_n,
            p_index=agent_index,
            p_func=model,
            q_func=model,
            optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
            grad_norm_clipping=0.5,
            local_q_func=local_q_func,
            num_units=args.num_units,
            # reuse=tf.compat.v1.AUTO_REUSE,
        )

        # Create experience buffer
        self.replay_buffer = ReplayBuffer(1e6, args.batch_size, num_actions,
                                          obs_ph_n[0].shape[1])
        # self.max_replay_buffer_len = args.batch_size * args.max_episode_len
        self.max_replay_buffer_len = args.batch_size
        # I mean this is how it should be. This is what we're actually doing...
        self.replay_sample_index = None

    def action(self, obs):
        return self.act(obs[None])[0]

    def experience(self, obs, act, rew, new_obs, done, terminal):
        # Store transition in the replay buffer.
        self.replay_buffer.add(obs, act, rew, new_obs, float(done))

    def preupdate(self):
        self.replay_sample_index = None

    def update(self, agents, t, replay_index=None):
        single_nn = replay_index is not None
        if single_nn:
            assert self.agent_index == 0
        else:
            replay_index = self.agent_index
        if len(self.replay_buffer) < self.max_replay_buffer_len:
            # replay buffer is not large enough
            return
        if not t % 100 == 0:  # only update every 100 steps
            return

        self.replay_sample_index = self.replay_buffer.make_index(self.args.batch_size)
        # collect replay sample from all agents
        # This is silly. We only need to do this once per step, not for each agent.
        # That is true also when we have multiple nn.
        obs_n = []
        obs_next_n = []
        act_n = []
        index = self.replay_sample_index
        for i in range(self.n):
            j = (i + replay_index) % self.n if single_nn else i
            obs, act, rew, obs_next, done = agents[j].replay_buffer.sample_index(index)
            obs_n.append(obs)
            obs_next_n.append(obs_next)
            act_n.append(act)
        obs, act, rew, obs_next, done = agents[replay_index].replay_buffer.sample_index(index)

        # train q network. I don't understand how this matters. Where do we use the q-network????
        # we have a separate q-network in p_train; are they connected because they share
        # the same name in tf??
        num_sample = 1
        target_q = 0.0
        for i in range(num_sample):
            if single_nn:
                target_act_next_n = [
                    self.p_debug['target_act'](obs_next_n[i])
                    for i in range(self.n)
                ]
            else:
                target_act_next_n = [
                    agents[i].p_debug['target_act'](obs_next_n[i])
                    for i in range(self.n)
                ]
            target_q_next = self.q_debug['target_q_values'](
                *(obs_next_n + target_act_next_n))
            target_q += rew + self.args.gamma * (1.0 - done) * target_q_next
        target_q /= num_sample
        q_loss = self.q_train(*(obs_n + act_n + [target_q]))

        # train p network
        p_loss = self.p_train(*(obs_n + act_n))

        self.p_update()
        self.q_update()

        return [
            q_loss, p_loss,
            np.mean(target_q),
            np.mean(rew),
            np.mean(target_q_next),
            np.std(target_q)
        ]
class IBMACAgentTrainer(AgentTrainer): def __init__(self, name, before_com_model, channel, after_com_model, critic_mlp_model, obs_shape_n, act_space_n, args, local_q_func=False): self.name = name self.n = len(obs_shape_n) self.args = args obs_ph_n = [] for i in range(self.n): obs_ph_n.append( U.BatchInput(obs_shape_n[i], name="observation_" + str(i)).get()) # Create all the functions necessary to train the model self.q_train, self.q_update, self.q_debug = q_train( scope=self.name, make_obs_ph_n=obs_ph_n, act_space_n=act_space_n, q_func=critic_mlp_model, optimizer=tf.train.AdamOptimizer(learning_rate=args.lr), grad_norm_clipping=0.5, local_q_func=local_q_func, num_units=args.num_units, ) self.act, self.p_train, self.p_update, self.p_debug = p_train( scope=self.name, make_obs_ph_n=obs_ph_n, act_space_n=act_space_n, before_com_func=before_com_model, channel=channel, after_com_func=after_com_model, q_func=critic_mlp_model, optimizer=tf.train.AdamOptimizer(learning_rate=args.lr), grad_norm_clipping=0.5, local_q_func=local_q_func, num_units=args.num_units, beta=args.beta, ibmac_com=args.ibmac_com, ) # Create experience buffer self.replay_buffer = ReplayBuffer(1e6) # self.max_replay_buffer_len = 50 * args.max_episode_len self.max_replay_buffer_len = args.batch_size * args.max_episode_len self.replay_sample_index = None self.message_1_for_record = [] def action(self, obs_n, is_norm_training=False, is_inference=False): obs = [obs[None] for obs in obs_n] message_n = self.p_debug['check_message_n']( *(list(obs) + [is_norm_training, is_inference])) self.message_1_for_record.append(message_n[0]) if len(self.message_1_for_record) % 2500 == 0: # print(np.var(self.message_1_for_record, axis=0)) # print(0.5 * np.log(2 * np.pi * np.mean(np.var(self.message_1_for_record, axis=0))) + 0.5) self.message_1_for_record = [] return self.act(*(list(obs) + [is_norm_training, is_inference])) def experience(self, obs, act, rew, new_obs, done, terminal): # Store transition in the replay buffer. 
self.replay_buffer.add(obs, act, rew, new_obs, [float(d) for d in done]) def preupdate(self): self.replay_sample_index = None def update(self, agents, t): if len( self.replay_buffer ) < self.max_replay_buffer_len: # replay buffer is not large enough return if not t % 100 == 0: # only update every 100 steps return is_norm_training = True is_inference = False self.replay_sample_index = self.replay_buffer.make_index( self.args.batch_size) # collect replay sample from all agents obs_n = [] obs_next_n = [] act_n = [] index = self.replay_sample_index samples = self.replay_buffer.sample_index(index) obs_n, act_n, rew_n, obs_next_n, done_n = [ np.swapaxes(item, 0, 1) for item in samples ] # for i in range(self.n): # obs, act, rew, obs_next, done = agents[i].replay_buffer.sample_index(index) # obs_n.append(obs) # obs_next_n.append(obs_next) # act_n.append(act) # obs, act, rew, obs_next, done = self.replay_buffer.sample_index(index) # train q network num_sample = 1 target_q = 0.0 # print(len(obs_next_n)) for i in range(num_sample): target_act_next_n = self.p_debug['target_act']( *(list(obs_next_n) + [is_norm_training, is_inference])) target_q_next_n = self.q_debug['target_q_values']( *(list(obs_next_n) + list(target_act_next_n) + [is_norm_training, is_inference])) target_q_n = [ rew + self.args.gamma * (1.0 - done) * target_q_next for rew, done, target_q_next in zip(rew_n, done_n, target_q_next_n) ] target_q_n = [target_q / num_sample for target_q in target_q_n] q_loss = self.q_train(*(list(obs_n) + list(act_n) + target_q_n + [is_norm_training, is_inference])) # train p network p_loss = self.p_train(*(list(obs_n) + list(act_n) + [is_norm_training, is_inference])) self.p_update() self.q_update() # p_values = self.p_debug['p_values'](*(list(obs_n))) kl_loss = self.p_debug['kl_loss'](*(list(obs_n) + list(act_n) + [is_norm_training, is_inference])) # print('kl_loss', self.p_debug['kl_loss'](*(list(obs_n) + list(act_n)))) # if t % 5000 == 0: # print('p_values', p_values[0][0]) # print('check_value', self.p_debug['p_values'](*(list(obs_n)))[0][0]) # print('check_mu', self.p_debug['check_mu'](*(list(obs_n)))[0][0]) # print('check_log', self.p_debug['check_log'](*(list(obs_n)))[0][0]) # print('kl_loss', kl_loss) # message_n = self.p_debug['check_message_n'](*(list(obs_n)+[is_norm_training, is_inference])) # hiddens_n = self.p_debug['check_hiddens_n'](*list(obs_n)) # print("message_n", message_n[0][0]) # for message in message_n: # print("mean, var", np.mean(message, axis=0), np.var(message,axis=0)) # print("hiddens_n", hiddens_n[0][0]) # entropy = self.p_debug['check_entropy'](*list(obs_n)) # print("entropy",np.mean(entropy, (1,2))) return [ q_loss, p_loss, np.mean(target_q), np.mean(rew_n), np.mean(target_q_next_n), np.std(target_q), kl_loss ]
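# The commented-out diagnostic in IBMACAgentTrainer.action() above estimates the differential
# entropy of the recorded messages under a Gaussian fit: 0.5 * log(2 * pi * var) + 0.5.
# A small helper computing the same quantity, assuming messages are stacked along axis 0
# (a sketch, not part of the original trainer).
import numpy as np

def gaussian_message_entropy(messages):
    """Differential entropy of a Gaussian with the empirical variance of the messages."""
    var = np.mean(np.var(np.asarray(messages), axis=0))
    return 0.5 * np.log(2.0 * np.pi * var) + 0.5

print(gaussian_message_entropy(np.random.randn(1000, 4)))  # close to 0.5*log(2*pi*e) ~= 1.42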
class COMAAgentTrainer(AgentTrainer):
    def __init__(self, name, model, obs_shape_n, act_space_n, agent_index, action_number, args):
        self.name = name
        self.n = len(obs_shape_n)
        self.agent_index = agent_index
        self.args = args
        obs_ph_n = []
        for i in range(self.n):
            obs_ph_n.append(U.BatchInput(obs_shape_n[i], name="observation" + str(i)).get())
        # Create all the functions necessary to train the model
        self.q_train, self.q_update, self.q_debug = q_train(
            scope=self.name,
            make_obs_ph_n=obs_ph_n,
            act_space_n=act_space_n,
            q_index=agent_index,
            q_func=model,
            optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
            grad_norm_clipping=0.5,
            num_units=args.num_units,
            num_outputs=action_number)
        self.act, self.p_train, self.p_update, self.p_debug = p_train(
            scope=self.name,
            make_obs_ph_n=obs_ph_n,
            act_space_n=act_space_n,
            p_index=agent_index,
            p_func=model,
            q_func=model,
            optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
            grad_norm_clipping=0.5,
            local_q_func=False,
            num_units=args.num_units,
            num_outputs=action_number)
        # Create experience buffer
        self.replay_buffer = ReplayBuffer(1e6)
        self.max_replay_buffer_len = args.batch_size * args.max_episode_len
        self.replay_sample_index = None

    def get_inputs(self):
        pass

    def action(self, obs):
        return self.act(obs[None])[0]

    def experience(self, obs, act, rew, new_obs, done, terminal):
        # Store transition in the replay buffer.
        self.replay_buffer.add(obs, act, rew, new_obs, float(done))

    def preupdate(self):
        self.replay_sample_index = None

    def update(self, agents, t):
        if len(self.replay_buffer) < self.max_replay_buffer_len:  # replay buffer is not large enough
            return
        if not t % 100 == 0:  # only update every 100 steps
            return
        self.replay_sample_index = self.replay_buffer.make_index(self.args.batch_size)
        # collect replay sample from all agents
        obs_n = []
        obs_next_n = []
        act_n = []
        index = self.replay_sample_index
        # each agent draws a replay batch with the shared indices
        for i in range(self.n):
            obs, act, rew, obs_next, done = agents[i].replay_buffer.sample_index(index)
            obs_n.append(obs)
            obs_next_n.append(obs_next)
            act_picked = [softmax_act.tolist().index(max(softmax_act)) for softmax_act in act]
            act_n.append(act_picked)
        # replay batch of the current trainer
        obs, act, rew, obs_next, done = self.replay_buffer.sample_index(index)
        # train q network
        # one-step lookahead, i.e. a one-step TD target
        num_sample = 1
        target_q = 0.0
        for i in range(num_sample):
            # next-step actions of all agents; each agent decides from its own local next-step observation
            target_act_next_n = [
                agents[i].p_debug['target_act'](obs_next_n[i]) for i in range(self.n)
            ]
            act_picked = []
            for i in range(self.n):
                act_picked += [
                    softmax_act.tolist().index(max(softmax_act))
                    for softmax_act in target_act_next_n[i]
                ]
            # use the target network to obtain the target q values
            target_q_next = self.q_debug['target_q_values'](*(obs_next_n + act_picked))
            # the Q network outputs one value per candidate action of the current agent;
            # the loss needs the Q of the action that was actually taken
            target_q_picked_next = [
                q[act] for act, q in zip(act_picked[self.agent_index], target_q_next)
            ]
            target_q += rew + self.args.gamma * (1.0 - done) * target_q_picked_next
        target_q /= num_sample
        q_loss = self.q_train(*(obs_n + act_n + [target_q]))
        # train p network
        p_loss = self.p_train(*(obs_n + act_n))
        self.p_update()
        self.q_update()
        return [q_loss, p_loss, np.mean(target_q), np.mean(rew), np.mean(target_q_next), np.std(target_q)]
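# The softmax_act.tolist().index(max(softmax_act)) idiom used in the COMA update above is a
# per-sample argmax over the policy's softmax output. A vectorized equivalent (a sketch, not
# taken from the source):
import numpy as np

def pick_actions(softmax_batch):
    """Return the index of the highest-probability action for each sample in the batch."""
    return np.argmax(np.asarray(softmax_batch), axis=-1).tolist()

print(pick_actions([[0.1, 0.7, 0.2], [0.5, 0.3, 0.2]]))  # [1, 0]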
class MADDPGApproxAgentTrainer(AgentTrainer): def __init__(self, name, model, obs_shape_n, act_space_n, agent_index, args, use_approx_policy = True, sync_replay = True, local_q_func=False, update_gap=100): self.use_approx_policy = use_approx_policy self.name = name self.n = len(obs_shape_n) self.agent_index = agent_index args.num_units = 64 self.sync_replay = sync_replay self.counter = 0 self.args = args self.update_gap = update_gap obs_ph_n = [] for i in range(self.n): obs_ph_n.append(U.BatchInput(obs_shape_n[i], name="observation" + str(i)).get()) # Create all the functions necessary to train the model self.q_train, self.q_update, self.q_sync, self.q_debug = q_train( scope=self.name, make_obs_ph_n=obs_ph_n, # [lambda name: U.BatchInput(obs_shape, name=name) for obs_shape in obs_shape_n], act_space_n=act_space_n, q_index=agent_index, q_func=model, optimizer=tf.train.AdamOptimizer(learning_rate=args.lr), grad_norm_clipping=0.5, local_q_func=local_q_func, num_units=args.num_units ) self.act, self.p_train, self.p_update, self.p_sync, self.p_debug = p_train( scope=self.name, make_obs_ph_n=obs_ph_n, # [lambda name: U.BatchInput(obs_shape, name=name) for obs_shape in obs_shape_n], act_space_n=act_space_n, p_index=agent_index, p_func=model, q_func=model, optimizer=tf.train.AdamOptimizer(learning_rate=args.lr), grad_norm_clipping=0.5, local_q_func=local_q_func, num_units=args.num_units ) self.approx_act, self.approx_p_train, self.approx_p_update, self.approx_p_sync, self.approx_p_debug = [],[],[],[],[] for i in range(self.n): if i == self.agent_index: t_act, t_p_train, t_p_update, t_p_sync, t_p_debug = self.act, self.p_train, self.p_update, self.p_sync, self.p_debug else: t_act, t_p_train, t_p_update, t_p_sync, t_p_debug = p_approx_train( scope=self.name+'approx_p_%d'%i, make_obs_ph_n=obs_ph_n, # [lambda name: U.BatchInput(obs_shape, name=name) for obs_shape in obs_shape_n], act_space_n=act_space_n, p_index=i, p_func=model, q_func=model, optimizer=tf.train.AdamOptimizer(learning_rate=args.lr), grad_norm_clipping=0.5, local_q_func=local_q_func, num_units=args.num_units ) self.approx_act.append(t_act) self.approx_p_train.append(t_p_train) self.approx_p_update.append(t_p_update) self.approx_p_sync.append(t_p_sync) self.approx_p_debug.append(t_p_debug) # Create experience buffer self.replay_buffer = ReplayBuffer(int(1e6)) self.max_replay_buffer_len = args.batch_size * args.max_episode_len self.replay_sample_index = None def action(self, obs): return self.act(obs[None])[0] # return self.p_debug['target_act'](obs[None])[0] def experience(self, obs, act, rew, new_obs, done, terminal): # Store transition in the replay buffer. 
self.replay_buffer.add(obs, act, rew, new_obs, float(done)) def sync_target_nets(self): for i in range(self.n): self.approx_p_sync[i]() self.q_sync() def preupdate(self): self.replay_sample_index = None self.counter += 1 def update(self, agents): # replay buffer is not large enough if len(self.replay_buffer) < self.max_replay_buffer_len: return None if not self.counter % self.update_gap == 0: return None # agree on a replay samples across all agents # as in https://arxiv.org/abs/1703.06182 if self.sync_replay: if agents[0].replay_sample_index is None: agents[0].replay_sample_index = agents[0].replay_buffer.make_index(agents[0].args.batch_size) self.replay_sample_index = agents[0].replay_sample_index else: self.replay_sample_index = self.replay_buffer.make_index(self.args.batch_size) # collect replay sample from all agents obs_n = [] obs_next_n = [] act_n = [] index = self.replay_sample_index for i in range(self.n): obs, act, rew, obs_next, done = agents[i].replay_buffer.sample_index(index) obs_n.append(obs) obs_next_n.append(obs_next) act_n.append(act) obs, act, rew, obs_next, done = self.replay_buffer.sample_index(index) # evaluate kl divergence between approximate policy and the target policy target_logits_n = [agents[i].p_debug['target_p_values'](obs_n[i]) for i in range(self.n)] kl_loss = 0.0 for i in range(self.n): if i == self.agent_index: continue kl_loss += self.approx_p_debug[i]['kl_loss'](obs_n[i], target_logits_n[i]) # collect latest samples for approximate policy latest_obs_n = [] latest_act_n = [] latest_index = self.replay_buffer.make_latest_index(self.update_gap) for i in range(self.n): # TODO: now we approximate the *true policy*, but what we want is actually the target_policy! # Shall we approximate the target net instead??? t_obs, t_act, _, _, _ = agents[i].replay_buffer.sample_index(latest_index) #t_act = agents[i].p_debug['target_act'](t_obs) latest_obs_n.append(t_obs) latest_act_n.append(t_act) # train approximate p network for i in range(self.n): if i == self.agent_index: continue self.approx_p_train[i](*(latest_obs_n + latest_act_n)) self.approx_p_update[i]() # train q network if self.use_approx_policy: target_act_next_n = [self.approx_p_debug[i]['target_act'](obs_next_n[i]) for i in range(self.n)] else: # use true policy target_act_next_n = [agents[i].p_debug['target_act'](obs_next_n[i]) for i in range(self.n)] target_q_next = self.q_debug['target_q_values'](*(obs_next_n + target_act_next_n)) target_q = rew + self.args.gamma * (1.0 - done) * target_q_next q_loss = self.q_train(*(obs_n + act_n + [target_q])) # train p network p_loss = self.p_train(*(obs_n + act_n)) self.p_update() self.q_update() return [q_loss, p_loss, kl_loss]
class MADDPGAgentTrainer(AgentTrainer):
    # model is the neural-network function used to build both the actor and the critic
    def __init__(self, name, model, obs_shape_n, act_space_n, agent_index, args, local_q_func=False):  # local_q_func: whether to train with plain (local) DDPG instead
        self.name = name
        self.n = len(obs_shape_n)  # total number of agents
        self.agent_index = agent_index  # index of this agent
        self.args = args  # training hyper-parameters parsed from the command line
        obs_ph_n = []
        for i in range(self.n):
            # batch placeholders for every agent's observations, one per agent,
            # sized according to each agent's observation shape
            obs_ph_n.append(U.BatchInput(obs_shape_n[i], name="observation" + str(i)).get())
        # Create all the functions necessary to train the model
        # critic training op, target-critic update op, and a debug dict exposing the
        # q values and target-q values (already wrapped as session-backed functions)
        self.q_train, self.q_update, self.q_debug = q_train(
            scope=self.name,
            make_obs_ph_n=obs_ph_n,
            act_space_n=act_space_n,
            q_index=agent_index,
            q_func=model,
            optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
            grad_norm_clipping=0.5,
            local_q_func=local_q_func,
            num_units=args.num_units
        )
        # action function, policy training op, target-policy update op, and a debug dict
        # exposing the policy values and the target policy's sampled actions
        self.act, self.p_train, self.p_update, self.p_debug = p_train(
            scope=self.name,
            make_obs_ph_n=obs_ph_n,
            act_space_n=act_space_n,
            p_index=agent_index,
            p_func=model,
            q_func=model,
            optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
            grad_norm_clipping=0.5,
            local_q_func=local_q_func,
            num_units=args.num_units)
        # Create experience buffer
        self.replay_buffer = ReplayBuffer(1e6)
        self.max_replay_buffer_len = args.batch_size * args.max_episode_len
        self.replay_sample_index = None

    def action(self, obs):  # choose an action
        return self.act(obs[None])[0]  # obs[None] adds a batch dimension; return the first (only) action of the batch

    def experience(self, obs, act, rew, new_obs, done, terminal):  # collect experience for this agent
        # Store transition in the replay buffer.
        self.replay_buffer.add(obs, act, rew, new_obs, float(done))

    def preupdate(self):
        self.replay_sample_index = None  # clear the cached batch of sample indices

    def update(self, agents, t):  # train this agent from replayed experience; only runs when t is a multiple of 100
        if len(self.replay_buffer) < self.max_replay_buffer_len:  # replay buffer is not large enough
            return
        if not t % 100 == 0:  # only update every 100 steps
            return
        self.replay_sample_index = self.replay_buffer.make_index(self.args.batch_size)  # a batch-sized array of randomly generated indices
        # collect replay sample from all agents
        obs_n = []
        obs_next_n = []
        act_n = []
        index = self.replay_sample_index  # the shared index array
        for i in range(self.n):  # sample from every agent's replay buffer
            obs, act, rew, obs_next, done = agents[i].replay_buffer.sample_index(index)  # one batch of agent i's experience
            obs_n.append(obs)  # obs_n collects one batch per agent; likewise for obs_next_n and act_n
            obs_next_n.append(obs_next)
            act_n.append(act)
        obs, act, rew, obs_next, done = self.replay_buffer.sample_index(index)  # sample this agent's own experience with the same indices
        # obs_n now holds n entries, each a batch_size-sized set of experiences for one agent

        # train q network
        num_sample = 1
        target_q = 0.0
        for i in range(num_sample):
            # p_debug is a dict whose values are callable functions
            target_act_next_n = [
                agents[i].p_debug['target_act'](obs_next_n[i]) for i in range(self.n)
            ]  # next actions of all agents as predicted by their target policies; each agent's own target policy is used here, so there is no separate opponent model
            target_q_next = self.q_debug['target_q_values'](*(obs_next_n + target_act_next_n))  # target critic's Q value for the next joint observation and joint action
            target_q += rew + self.args.gamma * (1.0 - done) * target_q_next  # the TD target
        target_q /= num_sample
        q_loss = self.q_train(*(obs_n + act_n + [target_q]))  # train the critic on the sampled joint experience
        # train p network
        p_loss = self.p_train(*(obs_n + act_n))  # all observations and actions are fed because the critic guides the policy update; acting itself only uses the policy network
        self.p_update()  # soft-update the target policy after each training step (which itself runs every 100 env steps)
        self.q_update()  # likewise for the target critic
        return [
            q_loss, p_loss,
            np.mean(target_q),
            np.mean(rew),
            np.mean(target_q_next),
            np.std(target_q)
        ]
class MADDPGAgentTrainer(): def __init__(self, name, model_value, model_policy, obs_shape_n, act_space_n, agent_index, args, board_writer, local_q_func=False): self.name = name self.n = len(obs_shape_n) self.agent_index = agent_index self.args = args obs_ph_n = [] for i in range(self.n): obs_ph_n.append( U.BatchInput(obs_shape_n[i], name="observation" + str(i)).get()) # Create all the functions necessary to train the model self.q_train, self.q_update, self.q_debug = q_train( scope=self.name, make_obs_ph_n=obs_ph_n, act_space_n=act_space_n, q_index=agent_index, q_func=model_value, optimizer=tf.train.AdamOptimizer(learning_rate=args.lr), grad_norm_clipping=5, local_q_func=local_q_func, num_units=args.num_units) self.act, self.p_train, self.p_update, self.p_debug = p_train( scope=self.name, make_obs_ph_n=obs_ph_n, act_space_n=act_space_n, p_index=agent_index, p_func=model_policy, q_func=model_value, optimizer=tf.train.AdamOptimizer(learning_rate=args.lr), grad_norm_clipping=5, local_q_func=local_q_func, num_units=args.num_units) # Create experience buffer self.replay_buffer = ReplayBuffer(1e6) self.max_replay_buffer_len = args.batch_size_q * args.max_episode_len self.replay_sample_index = None self.board_writer = board_writer def action(self, obs): # print("p", self.p_debug["p_values"](obs[None])[0]) # print("act", self.act(obs[None])[0]) return self.act(obs[None])[0] def experience(self, obs, act, rew, new_obs, done, terminal): # Store transition in the replay buffer. self.replay_buffer.add(obs, act, rew, new_obs, float(done)) def preupdate(self): self.replay_sample_index = None def set_memory_index(self, replay_sample_index): self.replay_sample_index = replay_sample_index def get_memory_index(self, batch_size): return self.replay_buffer.make_index(batch_size) def get_replay_data(self): return self.replay_buffer.sample_index(self.replay_sample_index) def get_target_act(self, obs): return self.p_debug['target_act'](obs[self.agent_index]) def update_q(self, t, obs_n, act_n, obs_next_n, target_act_next_n): obs, act, rew, obs_next, done = self.replay_buffer.sample_index( self.replay_sample_index) # train q network target_q = 0.0 target_q_next = self.q_debug['target_q_values'](*(obs_next_n + target_act_next_n)) target_q += rew + self.args.gamma * (1.0 - done) * target_q_next q_loss, q_loss_summary = self.q_train(*(obs_n + act_n + [target_q])) self.board_writer.add_summary(q_loss_summary, global_step=t) self.q_update() def update_p(self, t, obs_n, target_act_next_n): # train p network p_loss, p_summary = self.p_train(*(obs_n + target_act_next_n)) self.board_writer.add_summary(p_summary, global_step=t) self.p_update()
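# Unlike the other trainers, the variant above splits the update into set_memory_index /
# get_replay_data / get_target_act / update_q / update_p, so an outer loop has to coordinate
# the agents. A hypothetical driver showing how those calls could fit together (the function
# name, argument order, and overall structure are my assumptions, not from the source):
def joint_update(trainers, batch_size, t):
    """Drive one joint update across all trainers using a shared batch of indices."""
    index = trainers[0].get_memory_index(batch_size)
    for tr in trainers:
        tr.set_memory_index(index)
    obs_n, act_n, obs_next_n = [], [], []
    for tr in trainers:
        obs, act, rew, obs_next, done = tr.get_replay_data()  # rew/done are re-read inside update_q
        obs_n.append(obs)
        act_n.append(act)
        obs_next_n.append(obs_next)
    target_act_next_n = [tr.get_target_act(obs_next_n) for tr in trainers]
    for tr in trainers:
        tr.update_q(t, obs_n, act_n, obs_next_n, target_act_next_n)
        tr.update_p(t, obs_n, target_act_next_n)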
class MATD3AgentTrainer(AgentTrainer): def __init__(self, name, model, obs_shape_n, act_space_n, agent_index, args, local_q_func=False): self.name = name self.n = len(obs_shape_n) self.agent_index = agent_index self.args = args obs_ph_n = [] for i in range(self.n): obs_ph_n.append( U.BatchInput(obs_shape_n[i], name="observation" + str(i)).get()) # Create all the functions necessary to train the model self.q_train1, self.q_update1, self.q_debug1 = q_train( scope=self.name, make_obs_ph_n=obs_ph_n, act_space_n=act_space_n, agent_idx=agent_index, q_function_idx=1, q_func=model, optimizer=tf.train.AdamOptimizer(learning_rate=args.lr), grad_norm_clipping=0.5, local_q_func=local_q_func, num_units=args.num_units) self.q_train2, self.q_update2, self.q_debug2 = q_train( scope=self.name, make_obs_ph_n=obs_ph_n, act_space_n=act_space_n, agent_idx=agent_index, q_func=model, q_function_idx=2, optimizer=tf.train.AdamOptimizer(learning_rate=args.lr), grad_norm_clipping=0.5, local_q_func=local_q_func, num_units=args.num_units) self.act, self.p_train, self.p_update, self.p_debug = p_train( scope=self.name, make_obs_ph_n=obs_ph_n, act_space_n=act_space_n, agent_idx=agent_index, p_func=model, q_func=model, #MLPmodel() optimizer=tf.train.AdamOptimizer(learning_rate=args.lr), grad_norm_clipping=0.5, local_q_func=local_q_func, num_units=args.num_units) # Create experience buffer self.replay_buffer = ReplayBuffer(1e6) self.min_replay_buffer_len = args.batch_size * args.max_episode_len self.replay_sample_index = None a = tf.summary.FileWriter("logdirMaddpg", tf.get_default_graph()) a.flush() a.close() def action(self, obs): return self.act(obs[None])[0] def experience(self, obs, act, rew, new_obs, done, terminal): # Store transition in the replay buffer. self.replay_buffer.add(obs, act, rew, new_obs, float(done)) def preupdate(self): self.replay_sample_index = None @property def q_debug(self): return self.q_debug1 def update(self, agents, train_step): if len( self.replay_buffer ) < self.min_replay_buffer_len: # replay buffer is not large enough return if not train_step % self.args.update_rate == 0: return self.replay_sample_index = self.replay_buffer.generate_sample_indices( self.args.batch_size) # collect replay sample from all agents obs_n = [] obs_next_n = [] act_n = [] index = self.replay_sample_index for i in range(self.n): obs, act, rew, obs_next, done = agents[ i].replay_buffer.sample_index(index) obs_n.append(obs) obs_next_n.append(obs_next) act_n.append(act) obs, act, rew, obs_next, done = self.replay_buffer.sample_index(index) # train q network target_act_next_n = [ agents[i].p_debug['target_act'](obs_next_n[i]) for i in range(self.n) ] if self.args.use_critic_noise: for agent_idx in range(self.n): noise = np.random.normal( 0, self.args.critic_action_noise_stddev, size=target_act_next_n[agent_idx].shape) clipped_noise = np.clip(noise, -self.args.action_noise_clip, self.args.action_noise_clip) target_act_next_n[agent_idx] = (target_act_next_n[agent_idx] + clipped_noise).tolist() elif self.args.use_critic_noise_self: noise = np.random.normal( 0, self.args.critic_action_noise_stddev, size=target_act_next_n[self.agent_index].shape) clipped_noise = np.clip(noise, -self.args.action_noise_clip, self.args.action_noise_clip) target_act_next_n[self.agent_index] = target_act_next_n[ self.agent_index] + clipped_noise target_act_next_n = target_act_next_n.tolist() else: target_act_next_n = target_act_next_n target_q_next1 = self.q_debug1['target_q_values'](*(obs_next_n + target_act_next_n)) target_q_next2 = 
self.q_debug2['target_q_values'](*(obs_next_n + target_act_next_n))
        target_q_next = np.min([target_q_next1, target_q_next2], 0)  # clipped double-Q: element-wise minimum of the two target critics
        if self.args.critic_zero_if_done:
            done_cond = (done == 1.0)
            target_q_next[done_cond] = 0
        target_q = rew + self.args.gamma * target_q_next
        q_loss = self.q_train1(*(obs_n + act_n + [target_q]))
        q_loss = self.q_train2(*(obs_n + act_n + [target_q]))  # note: only the second critic's loss is kept and returned
        # train p network (delayed policy and target updates)
        if train_step % (self.args.update_rate * self.args.policy_update_rate) == 0:
            p_loss = self.p_train(*(obs_n + act_n))
            self.p_update()
            self.q_update1()
            self.q_update2()
        # print('Agent' + str(self.agent_index) + ' Qloss = ' + str(q_loss) + ' Ploss = ' + str(p_loss))
        # print('Replay buffer size:' + str(len(self.replay_buffer)))
        return [
            q_loss,
            np.mean(target_q),
            np.mean(rew),
            np.mean(target_q_next),
            np.std(target_q)
        ]
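# The MATD3 update above forms its target from the element-wise minimum of the two target
# critics (clipped double-Q) and optionally zeroes the bootstrap on terminal transitions.
# A framework-neutral sketch of that target; the 0.95 discount default is an assumption.
import numpy as np

def td3_target(rew, done, q1_next, q2_next, gamma=0.95):
    """Clipped double-Q target: y = r + gamma * min(Q1', Q2'), with no bootstrap when done."""
    q_next = np.minimum(q1_next, q2_next)
    q_next = np.where(done, 0.0, q_next)
    return rew + gamma * q_next

print(td3_target(np.array([1.0]), np.array([0.0]), np.array([2.0]), np.array([1.5])))  # [2.425]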
class MADDPGAgentTrainer(AgentTrainer): def __init__(self, name, model, obs_shape_n, act_space_n, agent_index, args, local_q_func=False): self.name = name # name of the agent self.n = len(obs_shape_n) # number of agents self.agent_index = agent_index # Index of the specific agent self.args = args # Settings of hyper-parameters obs_ph_n = [] for i in range(self.n): obs_ph_n.append(U.BatchInput(obs_shape_n[i], name="observation"+str(i)).get()) # Creates a placeholder for a batch of tensors of a given shape and dtype. # [Create all the functions necessary to train the model] # train: U.function(inputs=obs_ph_n + act_ph_n + [target_ph], outputs=loss, updates=[optimize_expr]) # update_target_q: make_update_exp(q_func_vars, target_q_func_vars) # q_values: U.function(obs_ph_n + act_ph_n, q) # target_q_values: U.function(obs_ph_n + act_ph_n, target_q) self.q_train, self.q_update, self.q_debug = q_train( scope=self.name, # String: "agent_1" or "agent_2" or ... make_obs_ph_n=obs_ph_n, act_space_n=act_space_n, # action_space. q_index=agent_index, # Index of the specific agent. q_func=model, # Defined model. optimizer=tf.train.AdamOptimizer(learning_rate=args.lr), # 优化方法 --- 自适应矩估计 --- Adam法 --- 学习率设定 grad_norm_clipping=0.5, # 梯度剪切 --- 防止梯度爆炸 --- 梯度超过该值,直接设定为该值 local_q_func=local_q_func, num_units=args.num_units # Hidden layers 隐藏节点数 ) # act: U.function(inputs=[obs_ph_n[p_index]], outputs=act_sample) # train: U.function(inputs=obs_ph_n + act_ph_n, outputs=loss, updates=[optimize_expr]) # update_target_p: make_update_exp(p_func_vars, target_p_func_vars) # p_values: U.function([obs_ph_n[p_index]], p) # target_act: U.function(inputs=[obs_ph_n[p_index]], outputs=target_act_sample) self.act, self.p_train, self.p_update, self.p_debug = p_train( scope=self.name, make_obs_ph_n=obs_ph_n, act_space_n=act_space_n, p_index=agent_index, p_func=model, q_func=model, optimizer=tf.train.AdamOptimizer(learning_rate=args.lr), grad_norm_clipping=0.5, local_q_func=local_q_func, num_units=args.num_units ) # Create experience buffer self.replay_buffer = ReplayBuffer(1e6) self.max_replay_buffer_len = args.batch_size * args.max_episode_len self.replay_sample_index = None def action(self, obs): return self.act(obs[None])[0] def experience(self, obs, act, rew, new_obs, done, terminal): # Store transition in the replay buffer. self.replay_buffer.add(obs, act, rew, new_obs, float(done)) def preupdate(self): self.replay_sample_index = None # Input: agents --> all the trainers # t --> increment global step counter # Output: loss --> [loss of q_train, # loss of p_train, # mean of target_q, # mean of reward, # mean of next target_q, # std of target_q] def update(self, agents, t): if len(self.replay_buffer) < self.max_replay_buffer_len: # replay buffer is not large enough return if not t % 100 == 0: # only update every 100 steps return # Random sample from the replay buffer (Experience replay mechanism) self.replay_sample_index = self.replay_buffer.make_index(self.args.batch_size) # Random sample from the replay_buffer --- return the sample index. # collect replay sample from all agents obs_n = [] # Clearly, 'n' indicates the number of the total agents. (Clear the past memory.) obs_next_n = [] act_n = [] index = self.replay_sample_index for i in range(self.n): # Fetch the [all agents'] information. obs, act, rew, obs_next, done = agents[i].replay_buffer.sample_index(index) # Fetch the observation, action, rewerds, next observation, done from the buffer. 
            obs_n.append(obs)
            obs_next_n.append(obs_next)
            act_n.append(act)
        # obs_n, obs_next_n, act_n now each hold one batch per agent
        obs, act, rew, obs_next, done = self.replay_buffer.sample_index(index)  # Fetch this agent's own batch.

        # train q network [Critic network]
        num_sample = 1
        target_q = 0.0
        for i in range(num_sample):
            # The critic takes every agent's observation and action (plus the target value) as input and outputs a scalar value.
            target_act_next_n = [agents[i].p_debug['target_act'](obs_next_n[i]) for i in range(self.n)]  # next-step action of each agent from its target policy, given its next observation
            target_q_next = self.q_debug['target_q_values'](*(obs_next_n + target_act_next_n))  # target critic value for the next joint observation and action
            target_q += rew + self.args.gamma * (1.0 - done) * target_q_next  # rewards + gamma * target_q_next * (1 - done); done masks the bootstrap on terminal transitions
        target_q /= num_sample  # average over the samples (num_sample is 1 here)
        q_loss = self.q_train(*(obs_n + act_n + [target_q]))  # critic training step

        # train p network [Actor network]
        p_loss = self.p_train(*(obs_n + act_n))  # obs_n + act_n is list concatenation; the critic guides the policy update, while acting only uses the policy network
        self.p_update()  # p network: soft target update (make_update_exp)
        self.q_update()  # q network: soft target update (make_update_exp)
        return [q_loss, p_loss, np.mean(target_q), np.mean(rew), np.mean(target_q_next), np.std(target_q)]
class MADDPGAgentTrainer(AgentTrainer): def __init__(self, name, model, obs_shape_n, act_space_n, agent_index, args, local_q_func=False): self.name = name self.n = len(obs_shape_n) self.agent_index = agent_index self.args = args obs_ph_n = [] for i in range(self.n): obs_ph_n.append(U.BatchInput(obs_shape_n[i], name="observation"+str(i)).get()) # Create all the functions necessary to train the model self.q_train, self.q_update, self.q_debug = q_train( scope=self.name, make_obs_ph_n=obs_ph_n, act_space_n=act_space_n, q_index=agent_index, q_func=model, optimizer=tf.train.AdamOptimizer(learning_rate=args.lr), grad_norm_clipping=0.5, local_q_func=local_q_func, num_units=args.num_units ) self.act, self.p_train, self.p_update, self.p_debug = p_train( scope=self.name, make_obs_ph_n=obs_ph_n, act_space_n=act_space_n, p_index=agent_index, p_func=model, q_func=model, optimizer=tf.train.AdamOptimizer(learning_rate=args.lr), grad_norm_clipping=0.5, local_q_func=local_q_func, num_units=args.num_units ) # Create experience buffer self.replay_buffer = ReplayBuffer(1e6) self.max_replay_buffer_len = args.batch_size * args.max_episode_len self.replay_sample_index = None def action(self, obs): return self.act(obs[None])[0] def experience(self, obs, act, rew, new_obs, done, terminal): # Store transition in the replay buffer. self.replay_buffer.add(obs, act, rew, new_obs, float(done)) def preupdate(self): self.replay_sample_index = None def update(self, agents, t): if len(self.replay_buffer) < self.max_replay_buffer_len: # replay buffer is not large enough return if not t % 100 == 0: # only update every 100 steps return self.replay_sample_index = self.replay_buffer.make_index(self.args.batch_size) # collect replay sample from all agents obs_n = [] obs_next_n = [] act_n = [] index = self.replay_sample_index for i in range(self.n): obs, act, rew, obs_next, done = agents[i].replay_buffer.sample_index(index) obs_n.append(obs) obs_next_n.append(obs_next) act_n.append(act) obs, act, rew, obs_next, done = self.replay_buffer.sample_index(index) # train q network num_sample = 1 target_q = 0.0 for i in range(num_sample): target_act_next_n = [agents[i].p_debug['target_act'](obs_next_n[i]) for i in range(self.n)] target_q_next = self.q_debug['target_q_values'](*(obs_next_n + target_act_next_n)) target_q += rew + self.args.gamma * (1.0 - done) * target_q_next target_q /= num_sample q_loss = self.q_train(*(obs_n + act_n + [target_q])) # train p network p_loss = self.p_train(*(obs_n + act_n)) self.p_update() self.q_update() return [q_loss, p_loss, np.mean(target_q), np.mean(rew), np.mean(target_q_next), np.std(target_q)]
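# Every trainer in this collection bootstraps with the same one-step target,
# y = r + gamma * (1 - done) * Q'(o', a'). A tiny numeric check of that formula
# (the values below are made up for illustration):
import numpy as np

rew = np.array([1.0, 0.5])
done = np.array([0.0, 1.0])
target_q_next = np.array([2.0, 3.0])
gamma = 0.95
target_q = rew + gamma * (1.0 - done) * target_q_next
print(target_q)  # [2.9  0.5] -- the terminal transition does not bootstrap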
class MADDPGAgentTrainer(AgentTrainer): def __init__(self, name, p_model, q_model, obs_shape_n, act_space_n, num_adversaries, args, local_q_func=False): self.name = name self.n = len(obs_shape_n) self.args = args self.neighbor_n = 2 self.num_adversaries = num_adversaries adj_n = [] obs_ph_n = [] agent_n = [] for i in range(self.n): obs_ph_n.append( U.BatchInput(obs_shape_n[i], name="observation" + str(i)).get()) adj_n.append( U.BatchInput([ self.neighbor_n, num_adversaries if i < num_adversaries else (self.n - num_adversaries) ], name="adjacency" + str(i)).get()) # Create all the functions necessary to train the model self.q_train, self.q_update, self.q_values, self.target_q_values = q_train( name=self.name, scope=self.name, make_obs_ph_n=obs_ph_n, adj_n=adj_n, act_space_n=act_space_n, num_adversaries=num_adversaries, neighbor_n=self.neighbor_n, q_func=q_model, agent_n=self.n, optimizer=tf.train.AdamOptimizer(learning_rate=args.lr), grad_norm_clipping=0.5, local_q_func=local_q_func, num_units=args.num_units) self.act, self.p_train, self.p_update, self.p_values, self.target_act = p_train( name=self.name, scope=self.name, make_obs_ph_n=obs_ph_n, adj_n=adj_n, act_space_n=act_space_n, neighbor_n=self.neighbor_n, p_index=agent_n, p_func=p_model, q_func=q_model, num_adversaries=self.num_adversaries, optimizer=tf.train.AdamOptimizer(learning_rate=args.lr), grad_norm_clipping=0.5, local_q_func=local_q_func, num_units=args.num_units, ) # Create experience buffer self.replay_buffer = ReplayBuffer(1e6) self.max_replay_buffer_len = args.batch_size * args.max_episode_len self.replay_sample_index = None def action(self, obs): for _ in range(len(obs)): obs[_] = obs[_][None] return self.act(*obs) def experience(self, obs, act, rew, new_obs, done, adj, new_adj, terminal): # Store transition in the replay buffer. 
done_int = [float(x) for x in done] self.replay_buffer.add(obs, act, rew, new_obs, done_int, adj, new_adj) def pre_update(self): self.replay_sample_index = None def update(self, agents, t): if len( self.replay_buffer ) < self.max_replay_buffer_len: # replay buffer is not large enough return if not t % 100 == 0: # only update every 100 steps return # collect replay sample from all agents self.replay_sample_index = self.replay_buffer.make_index( self.args.batch_size) index = self.replay_sample_index obs_n = [] obs_next_n = [] act_n = [] adj_n = [] adj_next_n = [] for i in range(len(agents)): obs_record, act_record, rew_record, obs_next_record, done_record, adj_record, adj_next_record = \ agents[i].replay_buffer.sample_index(index) obs_n.append(obs_record) obs_next_n.append(obs_next_record) act_n.append(act_record) adj_n.append(adj_record) adj_next_n.append(adj_next_record) obs, act, rew, obs_next, done, adj, adj_next = self.replay_buffer.sample_index( index) target_act_next_n = [] target_q_next_input_obs = [] target_q_next_input_act = [] q_input_obs_n = [] q_input_act_n = [] p_input_adj_n = [] for _, agent in enumerate(agents): # traverse every species q_input_obs = [] q_input_act = [] p_input_adj = [] target_act_next_input_obs = [] target_act_next_input_adj = [] for j in range( obs_n[_].shape[1]): # traverse every agent in each species _obs = [] _act = [] _adj = [] _obs_next = [] _adj_next = [] for i in range(self.args.batch_size): # traverse each instance _obs.append(obs_n[_][i][j]) _act.append(act_n[_][i][j]) _adj.append(adj_n[_][i][j]) _obs_next.append(obs_next_n[_][i][j]) _adj_next.append(adj_next_n[_][i][j]) q_input_obs.append(np.array(_obs)) q_input_act.append(np.array(_act)) p_input_adj.append(np.array(_adj)) target_act_next_input_obs.append(np.array(_obs_next)) target_act_next_input_adj.append(np.array(_adj_next)) vec = matlib.repmat([1, 0], self.args.batch_size, 1) vec = np.expand_dims(vec, axis=1) target_act_next_input = target_act_next_input_obs + target_act_next_input_adj + [ vec ] temp = agent.target_act(*target_act_next_input) target_act_next_n.append(temp) target_q_next_input_obs.extend(target_act_next_input_obs) target_q_next_input_act.extend(temp) q_input_obs_n.extend(q_input_obs) q_input_act_n.extend(q_input_act) p_input_adj_n.extend(p_input_adj) target_q = 0.0 target_q_next = self.target_q_values(*(target_q_next_input_obs + target_q_next_input_act)) #rew = np.sum(rew, 1) / 4 # used to be (1 - done) but actually what's 'done' is not defined in "simple-world-comm" scenario, # thus should be considered again how to define "done" for species target_q_next = np.transpose(np.array(target_q_next)) target_q += rew + self.args.gamma * target_q_next target_q_list = [ target_q.transpose()[i] for i in range(np.shape(target_q)[1]) ] # train the critic network # q_train_input = q_input_obs_n + q_input_act_n + [target_q] q_loss = [ self.q_train[i](*(q_input_obs_n + q_input_act_n + [target_q_list[i]])) for i in range(len(self.q_train)) ] # train the policy network p_loss = self.p_train(*(q_input_obs_n + q_input_act_n + p_input_adj_n + [vec])) self.p_update() self.q_update() return [ q_loss, p_loss, np.mean(target_q), np.mean(rew), np.mean(target_q_next), np.std(target_q) ]
def train(arglist, PID=None, lock=None): start_time = time.time() # global replay_buffer with U.single_threaded_session() as sess: # Create environment env = make_env(arglist.scenario, arglist, arglist.benchmark) # Create agents networks obs_shape_n = [env.observation_space[i].shape for i in range(env.n)] ####changed by yuan li num_adversaries = copy.deepcopy(env.num_adversaries) arglist.num_adversaries = copy.deepcopy(num_adversaries) if comm_rank != 0 and comm_rank != 1: req = None wait_flag = False actors = get_agents(env, num_adversaries, obs_shape_n, arglist) U.initialize() #var_list = [var for var in tf.trainable_variables()] #加载模型 var_list_n = [] for actor in actors: var_list_n.extend(actor.get_variable_list()) saver = tf.train.Saver(var_list=var_list_n, max_to_keep=20) if arglist.load_dir != "": U.load_state(arglist.load_dir, saver) episode_rewards, agent_rewards, final_ep_rewards, final_ep_ag_rewards, agent_info = initialize_variables( env) obs_n = env.reset() step = 0 episode_step = 0 sample_number = 0 t_start = time.time() updata_time = 0 print('Starting iterations...') invalid_train, red_win, red_leave, green_win, green_leave = 0, 0, 0, 0, 0 while True: if not wait_flag: #req = comm.irecv(350000, source=(comm_rank - 1 + comm_size) % comm_size, tag=11) req = comm.irecv(350000, source=0, tag=11) wait_flag = True else: data_recv = req.test() if data_recv[0]: wait_flag = False if data_recv[1] == 'finish': #finish = True comm.send('finish', dest=1, tag=11) break else: update_start = time.time() i = 0 j = 0 for var in tf.trainable_variables(): if 11 < (i % 24) < 24: var.load(data_recv[1][j], sess) j += 1 i += 1 #for var in var_list: # var.load(data_recv[1][i], sess) # i += 1 #print("111111111111111111111111,load param") #for i, actor in enumerate(actors): # actor.load_weights(data_recv[1][i], sess) update_end = time.time() #print("step:{}, rank0_update_end_time:{}".format(step, update_end)) updata_time += (update_end - update_start) step += 1 else: wait_flag = True # get action action_n = [ agent.action(obs) for agent, obs in zip(actors, obs_n) ] # environment step new_obs_n, rew_n, done_n, info_n = env.step(action_n) episode_step += 1 # changed by liyuan done = any(done_n) terminal = (episode_step >= arglist.max_episode_len) ###liyuan: compute the arverage win rate if green_leave_screen(env) or adversary_all_die( env) or adversary_leave_screen(env): terminal = True if adversary_all_die(env): green_win += 1 if green_leave_screen(env): invalid_train += 1 green_leave += 1 if adversary_leave_screen(env): red_leave += 1 if episode_step >= arglist.max_episode_len: for i, agent in enumerate(env.agents): if agent.adversary: rew_n[i] -= 50 if adversary_all_die(env): for i, agent in enumerate(env.agents): if agent.adversary: rew_n[i] -= 100 if done: red_win = red_win + 1 for i, agent in enumerate(env.agents): if agent.adversary: rew_n[i] += 200 rew_n[i] += ( arglist.max_episode_len - episode_step) / arglist.max_episode_len #send data data = [obs_n, action_n, rew_n, new_obs_n, done_n] comm.send(data, dest=1, tag=11) sample_number += 1 #replay_buffer.add(obs_n, action_n, rew_n, new_obs_n, done_n) obs_n = new_obs_n for i, rew in enumerate(rew_n): episode_rewards[-1] += rew agent_rewards[i][-1] += rew if done or terminal: obs_n = env.reset() episode_step = 0 episode_rewards.append(0) for a in agent_rewards: a.append(0) agent_info.append([[]]) # save model, display training output if (terminal or done) and (len(episode_rewards) % arglist.save_rate == 0): if red_win >= 0.8 * arglist.save_rate: 
temp_dir = arglist.save_dir + "_" + str( len(episode_rewards)) + "_" + str( red_win) + "_{}".format(PID) U.save_state(temp_dir, saver=saver) # print statement depends on whether or not there are adversaries if num_adversaries == 0: print( "Rank {}, steps: {}, episodes: {}, mean episode reward: {}, time: {}" .format( comm_rank, sample_number, len(episode_rewards), np.mean(episode_rewards[-arglist. save_rate:]), round(time.time() - t_start, 3))) else: print( "Rank {}, steps: {}, episodes: {}, mean episode reward: {}, agent episode reward: {}, time: {}" .format( comm_rank, sample_number, len(episode_rewards), np.mean(episode_rewards[-arglist. save_rate:]), [ np.mean(rew[-arglist.save_rate:]) for rew in agent_rewards ], round(time.time() - t_start, 3))) print( "Rank {}, red win: {}, green win: {}, red all leave: {}, green all leave: {}" .format(comm_rank, red_win, green_win, red_leave, green_leave)) middle_time = time.time() print( "sample_number:{}, train_step:{}, update_time:{}, total_time:{}" .format(sample_number, step, updata_time, middle_time - start_time)) mydata = [] mydata.append(str(len(episode_rewards))) mydata.append( str( np.mean(episode_rewards[-arglist. save_rate:]))) mydata.append( str( np.mean(agent_rewards[0] [-arglist.save_rate:]))) mydata.append( str( np.mean(agent_rewards[1] [-arglist.save_rate:]))) mydata.append( str( np.mean(agent_rewards[2] [-arglist.save_rate:]))) mydata.append(str(red_win)) mydata.append( str(round(time.time() - t_start, 3))) out = open('1mydata_{}.csv'.format(comm_rank), 'a', newline='') csv_write = csv.writer(out, dialect='excel') csv_write.writerow(mydata) if len(episode_rewards) > 3000: U.save_state(arglist.save_dir, saver=saver) invalid_train, red_win, red_leave, green_win, green_leave = 0, 0, 0, 0, 0 t_start = time.time() # Keep track of final episode reward final_ep_rewards.append( np.mean(episode_rewards[-arglist.save_rate:])) for rew in agent_rewards: final_ep_ag_rewards.append( np.mean(rew[-arglist.save_rate:])) end_time = time.time() print("rank{}_time:{}".format(comm_rank, end_time - start_time)) print("rank{}_update_time:{}".format(comm_rank, updata_time)) print("rank{}_step:{}".format(comm_rank, step)) if comm_rank == 1: replay_buffer = ReplayBuffer(1e6) wait_flag_1 = False wait_flag_2 = False wait_flag_3 = False req1 = None req2 = None req3 = None sample = 0 step = 0 req_list = [] while True: if not wait_flag_1 or not wait_flag_2 or not wait_flag_3: if not wait_flag_1: req1 = comm.irecv(source=2, tag=11) wait_flag_1 = True if not wait_flag_2: req2 = comm.irecv(source=3, tag=11) wait_flag_2 = True if not wait_flag_3: req3 = comm.irecv(source=4, tag=11) wait_flag_3 = True else: data_recv_1 = req1.test() data_recv_2 = req2.test() data_recv_3 = req3.test() if data_recv_1[0] or data_recv_2[0] or data_recv_3[0]: if data_recv_1[0]: wait_flag_1 = False if data_recv_1[1] == 'finish': break else: obs_n, action_n, rew_n, new_obs_n, done_n = data_recv_1[ 1] replay_buffer.add(obs_n, action_n, rew_n, new_obs_n, done_n) sample += 1 if data_recv_2[0]: wait_flag_2 = False if data_recv_2[1] == 'finish': break else: obs_n, action_n, rew_n, new_obs_n, done_n = data_recv_2[ 1] replay_buffer.add(obs_n, action_n, rew_n, new_obs_n, done_n) sample += 1 if data_recv_3[0]: wait_flag_3 = False if data_recv_3[1] == 'finish': break else: obs_n, action_n, rew_n, new_obs_n, done_n = data_recv_3[ 1] replay_buffer.add(obs_n, action_n, rew_n, new_obs_n, done_n) sample += 1 ''' #计算接收100个样本然后发送样本用的时间 if (sample % 100 == 0) and len(replay_buffer) >= arglist.batch_size * 
arglist.max_episode_len: start = time.time() replay_sample_index = replay_buffer.make_index(arglist.batch_size) send_data = replay_buffer.sample_index(replay_sample_index) #send_data = (obs_n_a, act_n_a, rew_n_a, obs_next_n_a, done_n_a) comm.send(send_data, dest=(comm_rank + 1) % comm_size, tag=11) sample = 0 step += 1 end = time.time() print("rank1 send sample time:", end-start) ''' else: wait_flag_1 = True wait_flag_2 = True wait_flag_3 = True if (sample // 100 > 0) and len( replay_buffer ) >= arglist.batch_size * arglist.max_episode_len: replay_sample_index = replay_buffer.make_index( arglist.batch_size) send_data = replay_buffer.sample_index( replay_sample_index) #send_data = (obs_n_a, act_n_a, rew_n_a, obs_next_n_a, done_n_a) comm.send(send_data, dest=0, tag=11) sample = 0 step += 1 end_time = time.time() print("rank1_time:", end_time - start_time) print("rank1_step", step) if comm_rank == 0: extract_time = 0 step = 0 learners = get_agents(env, num_adversaries, obs_shape_n, arglist) var_list_n = [] for learner in learners: var_list_n.extend(learner.get_variable_list()) U.initialize() #var_list = [var for var in tf.trainable_variables()] # 加载模型 saver = tf.train.Saver(var_list=var_list_n, max_to_keep=20) if arglist.load_dir != "": U.load_state(arglist.load_dir, saver) while True: if step >= STEP: for i in range(comm_size - 2): comm.send('finish', dest=(i + 2), tag=11) break else: start = time.time() data_recv = comm.recv(source=1, tag=11) for i, agent in enumerate(learners): agent.update(learners, data_recv) #dict_list = [] param = [] extract_start = time.time() i = 0 for var in tf.trainable_variables(): if 11 < (i % 24) < 24: param.append(sess.run(var)) i += 1 #print("2222222222222222 load weights") #for var in var_list: # param.append(sess.run(var)) extract_end = time.time() extract_time += (extract_end - extract_start) for i in range(comm_size - 2): comm.send(param, dest=(i + 2), tag=11) #print("222222222222222222222222,send param") step += 1 end = time.time() #print("rank2 train time:{}, extract_time:{}".format(end - start, extract_end - extract_start)) end_time = time.time() print("rank0_time:", end_time - start_time) print("rank0_extract_time:", extract_time) print("rank0_step:", step)
class MADDPGAgentTrainerFull(AgentTrainer): def __init__(self, name, p_policy, p_predict, q_model, obs_shape_n, act_space_n, state_shape_n, agent_index, args, local_q_func=False): self.name = name self.n = len(obs_shape_n) self.agent_index = agent_index self.args = args self.obs_shape = obs_shape_n[agent_index] self.state_shape = state_shape_n[agent_index] self.p_predict = p_predict obs_ph_n = [] obs_next_n = [] obs_pred_n = [] state_ph_n = [] for i in range(self.n): obs_ph_n.append( U.BatchInput(obs_shape_n[i], name="observation" + str(i)).get()) obs_next_n.append( U.BatchInput(obs_shape_n[i], name="next_obs" + str(i)).get()) obs_pred_n.append( U.BatchInput(obs_shape_n[i], name="pred_obs" + str(i)).get()) state_ph_n.append( U.BatchInput(state_shape_n[i], name="state" + str(i)).get()) # Create all the functions necessary to train the critic net # q_train is used for optimize Q net according to the loss in this batch # q_update is used to update the parameter of target net θ'i = τθi + (1 − τ)θ'i self.q_train, self.q_update, self.q_debug = q_train( scope=self.name, make_obs_ph_n=obs_ph_n, act_space_n=act_space_n, q_index=agent_index, q_func=q_model, optimizer=tf.train.AdamOptimizer(learning_rate=args.lr), grad_norm_clipping=0.5, local_q_func=local_q_func, num_units=args.num_units) # step return the action and new_state given the obs and state # p_train is used to optimize p Net # p_update is used to update target p net as θ'i = τθi + (1 − τ)θ'i self.step, self.predict, self.p_train, self.p_update, self.p_debug = p_train_recurrent( scope=self.name, make_obs_ph_n=obs_ph_n, make_state_ph_n=state_ph_n, act_space_n=act_space_n, make_obs_next_n=obs_next_n, make_obs_pred_n=obs_pred_n, p_index=agent_index, p_policy=p_policy, p_predict=p_predict, q_func=q_model, optimizer=tf.train.AdamOptimizer(learning_rate=args.lr), grad_norm_clipping=0.5, local_q_func=local_q_func, num_units=args.num_units, reuse=tf.AUTO_REUSE) # Create experience buffer self.replay_buffer = ReplayBuffer(1e6) self.max_replay_buffer_len = args.batch_size * args.max_episode_len self.replay_sample_index = None ''' def predict(self, act_input, gru_out): with tf.variable_scope(self.name, reuse=tf.AUTO_REUSE): obs_pred = self.p_predict(act_input[None], gru_out, int(self.obs_shape[0]), scope="p_predict", num_units=self.args.num_units) return obs_pred def target_predict(self, act_input, gru_out): with tf.variable_scope(self.name, reuse=None): obs_pred = self.p_predict(act_input, gru_out, int(self.obs_shape[0]), scope="target_p_predict", num_units=self.args.num_units) return obs_pred ''' # return the zero state of GRU def p_init_state(self, batch_size): return np.zeros([batch_size, self.state_shape[0]]) def init_pred(self, batch_size): return np.zeros([batch_size, self.obs_shape[0]]) # given the obs and current state, return the action and new state def take_action(self, obs, state, pred): act, new_state, gru_out = self.step(obs[None], state, pred) act = act[0] return act, new_state, gru_out def experience(self, obs, act, rew, new_obs, done, terminal): # Store transition in the replay buffer. 
self.replay_buffer.add(obs, act, rew, new_obs, float(done)) def preupdate(self): self.replay_sample_index = None def update(self, agents, t, step_size=16, burn_in_step=8): if len( self.replay_buffer ) < self.max_replay_buffer_len: # replay buffer is not large enough return if not t % 100 == 0: # only update every 100 steps return # sample experience self.replay_sample_index = self.replay_buffer.make_index( self.args.batch_size) # collect replay sample from all agents obs_seq_n = [] obs_next_seq_n = [] act_seq_n = [] finish_index = self.replay_sample_index for i in range(self.n): obs_seq, act_seq, rew_seq, obs_next_seq, done_seq = agents[ i].replay_buffer.sequence_sample_index(finish_index, step_size) obs_seq_n.append(obs_seq) obs_next_seq_n.append(obs_next_seq) act_seq_n.append(act_seq) obs_seq, act_seq, rew_seq, obs_next_seq, done_seq = self.replay_buffer.sequence_sample_index( finish_index, step_size) state_n = [ agents[i].p_init_state(self.args.batch_size) for i in range(self.n) ] pred_n = [ agents[i].init_pred(self.args.batch_size) for i in range(self.n) ] target_state_n = [ agents[i].p_init_state(self.args.batch_size) for i in range(self.n) ] target_pred_n = [ agents[i].init_pred(self.args.batch_size) for i in range(self.n) ] act_n = [x[0] for x in act_seq_n] temp = [ agents[i].p_debug['target_step'](obs_seq_n[i][0], target_state_n[i], target_pred_n[i]) for i in range(self.n) ] target_state_n = [x[1] for x in temp] target_gru_out_n = [x[2] for x in temp] target_pred_n = [ agents[i].p_debug['target_predict'](act_n[i], target_gru_out_n[i]) for i in range(self.n) ] # burn in stage, don't update the net for step in range(burn_in_step): act_n = [x[step] for x in act_seq_n] act_next_n = [x[step + 1] for x in act_seq_n] # target agent step temp = [ agents[i].p_debug['target_step'](obs_next_seq_n[i][step], target_state_n[i], target_pred_n[i]) for i in range(self.n) ] target_state_n = [x[1] for x in temp] target_gru_out_n = [x[2] for x in temp] target_pred_n = [ agents[i].p_debug['target_predict'](act_next_n[i], target_gru_out_n[i]) for i in range(self.n) ] # agents step temp = [ agents[i].step(obs_seq_n[i][step], state_n[i], pred_n[i]) for i in range(self.n) ] state_n = [x[1] for x in temp] gru_out_n = [x[2] for x in temp] pred_n = [ agents[i].predict(act_n[i], gru_out_n[i]) for i in range(self.n) ] q_loss = 0 p_loss = 0 # update the agents for step in range(burn_in_step, step_size): obs_n = [x[step] for x in obs_seq_n] act_n = [x[step] for x in act_seq_n] if step < (step_size - 1): act_next_n = [x[step + 1] for x in act_seq_n] obs_next_n = [x[step] for x in obs_next_seq_n] # target agents step, get the action in the next step temp = [ agents[i].p_debug['target_step'](obs_next_seq_n[i][step], target_state_n[i], target_pred_n[i]) for i in range(self.n) ] target_act_n = [x[0] for x in temp] target_state_n = [x[1] for x in temp] target_gru_out_n = [x[2] for x in temp] if step < (step_size - 1): target_pred_n = [ agents[i].p_debug['target_predict'](act_next_n[i], target_gru_out_n[i]) for i in range(self.n) ] # infer y from target action target_q_next = self.q_debug['target_q_values'](*(obs_next_n + target_act_n)) target_q = rew_seq[step] + self.args.gamma * ( 1.0 - done_seq[step]) * target_q_next q_loss += self.q_train(*(obs_n + act_n + [target_q])) p_loss += self.p_train(*(obs_n + state_n + act_n + obs_next_n + pred_n)) # agents step state_n = [x[1] for x in temp] gru_out_n = [x[2] for x in temp] pred_n = [ agents[i].predict(act_n[i], gru_out_n[i]) for i in range(self.n) ] # update the target 
net self.p_update() self.q_update() return [ q_loss, p_loss, np.mean(target_q), np.mean(rew_seq[step]), np.mean(target_q_next), np.std(target_q) ]
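# The recurrent trainer above uses a burn-in: the first burn_in_step steps of each sampled
# sequence only advance the GRU state and the observation predictions, and the q/p losses are
# accumulated over the remaining steps. A minimal sketch of that split (hypothetical helper,
# not from the source):
def split_burn_in(seq, burn_in_step=8):
    """Split a sampled trajectory into a burn-in prefix and a training suffix."""
    return seq[:burn_in_step], seq[burn_in_step:]

warmup, train_part = split_burn_in(list(range(16)), burn_in_step=8)
print(warmup)      # [0, ..., 7]  -> only warms up the recurrent state
print(train_part)  # [8, ..., 15] -> contributes to q_loss / p_loss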
class MADDPGAgentTrainerCCM(AgentTrainer): """ Agent Trainer using MADDPG Algorithm and CCM """ def __init__(self, name, model, obs_shape_n, act_space_n, agent_index, args, role="", local_q_func=False): """ Args: name (str): Name of the agent model (function): MLP Neural Network model for the agent. obs_shape_n (tf.placeholder): Placeholder for the observation space of all agents act_space_n (list): A list of the action spaces for all agents agent_index (int): Agent index number args (argparse.Namespace): Parsed commandline arguments object role (str): Role of the agent i.e. adversary local_q_func (boolean): Flag for using local q function """ super(MADDPGAgentTrainerCCM, self).__init__() self.name = name self.role = role self.n = len(obs_shape_n) self.agent_index = agent_index self.args = args obs_ph_n = [] act_history_ph_n = [] obs_history_ph_n = [] hist = self.args.training_history obs_history_n = [(hist * x[0], ) for x in obs_shape_n] act_history_n = [(hist * act.n, ) for act in act_space_n] # act_history_n = [Discrete(act.n*(3-1)) for act in act_space_n] # for act_space in act_space_n: # act_space.n = act_space.n*3 # if act_history_n[0].n != 15: # print("Line 158") for i in range(self.n): obs_ph_n.append( tf_util.BatchInput(obs_shape_n[i], name="observation" + str(i)).get()) obs_history_ph_n.append( tf_util.BatchInput(obs_history_n[i], name="observationhistory" + str(i)).get()) act_history_ph_n.append( tf_util.BatchInput(act_history_n[i], name="actionhistory" + str(i)).get()) # obs_ph_n = [tf.concat(3*[x],1,name="observation{}".format(i)) for i,x in enumerate(obs_ph_n)] # Create all the functions necessary to train the model self.q_train, self.q_update, self.q_debug = q_train( scope=self.name, make_obs_ph_n=obs_ph_n, act_space_n=act_space_n, make_obs_history_n=obs_history_ph_n, make_act_history_n=act_history_ph_n, q_index=agent_index, q_func=model, optimizer=tf.train.AdamOptimizer(learning_rate=args.lr), grad_norm_clipping=0.5, local_q_func=local_q_func, num_units=args.num_units) self.act, self.p_train, self.p_update, self.p_debug = p_train( scope=self.name, make_obs_ph_n=obs_ph_n, act_space_n=act_space_n, make_obs_history_n=obs_history_ph_n, make_act_history_n=act_history_ph_n, p_index=agent_index, p_func=model, q_func=model, optimizer=tf.train.AdamOptimizer(learning_rate=args.lr), grad_norm_clipping=0.5, local_q_func=local_q_func, num_units=args.num_units) # Create experience buffer self.replay_buffer = ReplayBuffer(1e6) self.max_replay_buffer_len = 4 * args.batch_size * args.max_episode_len self.replay_sample_index = None def action(self, obs): """ Retrieves action for agent from the P network given the observations Args: obs (np.array): Observations of the world for an agent Returns: Action for an agent """ hist = self.args.training_history if len(self.replay_buffer) > (hist + 1): _, _, _, _, _, obs_h, _, _, _, _ = self.replay_buffer.sample_index( [len(self.replay_buffer)], hist) if len(obs_h) > 0: obs_h = obs_h[0] # obs = np.concatenate((obs,ob[0]),0) else: obs_h = np.array((hist) * list(obs)) return self.act(obs[None], obs_h[None])[0] def experience(self, obs, act, rew, new_obs, done, terminal): """ Store transition in the replay buffer. Args: obs (np.array): Observations of the world for an agent act (list): Action for an agent rew (float): Reward for an agent new_obs (np.array): New observations of the world for an agent done (): Done for an agent terminal (boolean): Flag for whether the final episode has been reached. 
""" self.replay_buffer.add(obs, act, rew, new_obs, float(done)) def preupdate(self): """ Reset replay_sample_index to None. """ self.replay_sample_index = None def update(self, agents, steps): """ Update agent networks Args: agents (list): List of MADDPGAgentTrainer objects steps (int): Current training step Returns: (list) Training loss for the agents [q_loss, p_loss, mean_target_q, mean_reward, mean_target_q_next, std_target_q] """ # Replay buffer is not large enough # if len(self.replay_buffer) < self.max_replay_buffer_len: if len(self.replay_buffer) < 12500: return # Only update every 100 steps if not steps % 100 == 0: return self.replay_sample_index = self.replay_buffer.make_index( self.args.batch_size) hist = self.args.training_history # ************************************************************************************************ ccm_loss = np.array([0.0]) ccm_lambda = np.array([self.args.ccm_lambda]) ccm_switch = np.array([0.0]) # ************************************************************************************************ # Collect replay sample from all agents obs_n = [] obs_h_n = [] obs_next_n = [] obs_next_h_n = [] act_n = [] act_h_n = [] index = self.replay_sample_index for i in range(self.n): obs, act, rew, obs_next, done, obs_h, act_h, rew_h, obs_next_h, done_h = agents[i].\ replay_buffer.sample_index(index, history=hist) obs_n.append(obs) obs_h_n.append(obs_h) obs_next_n.append(obs_next) obs_next_h_n.append(obs_next_h) act_n.append(act) act_h_n.append(act_h) _, _, rew, _, done, _, _, rew_h, _, done_h = self.replay_buffer.sample_index( index, history=0) obs_h_n = [[list() for _ in range(len(obs_n[0]))] if len(x) == 0 else x for x in obs_h_n] obs_next_h_n = [ [list() for _ in range(len(obs_next_n[0]))] if len(x) == 0 else x for x in obs_next_h_n ] act_h_n = [[list() for _ in range(len(act_n[0]))] if len(x) == 0 else x for x in act_h_n] # rew = rew.T[0] # done = done.T[0] # train q network # print(*([x + act_n[i][j] for i,xx in enumerate(obs_n) for j,x in enumerate(xx)])) num_sample = 1 target_q = 0.0 target_q_next = 0.0 for i in range(num_sample): target_act_next_n = [ agents[i].p_debug['target_act'](obs_next_n[i], obs_next_h_n[i]) for i in range(self.n) ] target_q_next = self.q_debug['target_q_values']( *(obs_next_n + obs_next_h_n + target_act_next_n + act_h_n)) # TODO: Possible error point target_q += rew + self.args.gamma * (1.0 - done) * target_q_next target_q /= num_sample # TODO: Possible error point q_loss = self.q_train(*(obs_n + obs_h_n + act_n + act_h_n + [target_q])) # Train P network # p_loss = self.p_train(*(obs_n + act_n)) p_loss = self.p_train(*(obs_n + obs_h_n + act_n + act_h_n + [ccm_loss] + [ccm_lambda] + [ccm_switch])) self.p_update() self.q_update() return [ q_loss, p_loss, np.mean(target_q), np.mean(rew), np.mean(target_q_next), np.std(target_q) ] def ccm_update(self, agents, steps): """ CCM Update agent networks Args: agents (list): List of MADDPGAgentTrainer objects steps (int): Current training step Returns: (list) Training loss for the agents [q_loss, p_loss, mean_target_q, mean_reward, mean_target_q_next, std_target_q] """ # Replay buffer is not large enough # if len(self.replay_buffer) < self.max_replay_buffer_len: if len(self.replay_buffer) < 12500: # print("{}/{}".format(len(self.replay_buffer),self.max_replay_buffer_len)) return # Only update every 4 episodes if not steps % (4 * self.args.max_episode_len) == 0: return # Only CCM update for adversaries if not self.role == "adversary": return # batch_ep_size = int(round(self.args.batch_size / 
    def ccm_update(self, agents, steps):
        """ CCM update of agent networks.

        Args:
            agents (list): List of MADDPGAgentTrainer objects
            steps (int): Current training step

        Returns:
            (list) Training loss for the agents
                [q_loss, p_loss, mean_target_q, mean_reward, mean_target_q_next, std_target_q]
        """
        # Replay buffer is not large enough
        # if len(self.replay_buffer) < self.max_replay_buffer_len:
        if len(self.replay_buffer) < 12500:
            # print("{}/{}".format(len(self.replay_buffer),self.max_replay_buffer_len))
            return

        # Only update every 4 episodes
        if not steps % (4 * self.args.max_episode_len) == 0:
            return

        # Only CCM update for adversaries
        if not self.role == "adversary":
            return

        # batch_ep_size = int(round(self.args.batch_size / self.args.max_episode_len))
        batch_ep_size = self.args.ccm_pool
        self.replay_sample_index, self.ccm_episode_index = self.replay_buffer.\
            make_episode_index(batch_ep_size,
                               self.args.max_episode_len,
                               shuffle=not self.args.ccm_on_policy)
        hist = self.args.training_history

        # Collect replay sample from all agents
        obs_n = []
        obs_h_n = []
        obs_next_n = []
        obs_next_h_n = []
        act_n = []
        act_h_n = []
        ccm_act_n = []
        index = self.replay_sample_index
        for i in range(self.n):
            obs, act, rew, obs_next, done, obs_h, act_h, rew_h, obs_next_h, done_h = agents[i].\
                replay_buffer.sample_index(index, history=hist)
            obs_n.append(obs)
            obs_h_n.append(obs_h)
            obs_next_n.append(obs_next)
            obs_next_h_n.append(obs_next_h)
            act_n.append(act)
            act_h_n.append(act_h)

            ccm_act = []
            for ep in self.ccm_episode_index:
                _, act, _, _, _, _, _, _, _, _ = agents[
                    i].replay_buffer.sample_index(ep)
                act = np.array(act)
                ccm_act.append(act[:, 1] - act[:, 2])
            ccm_act_n.append(np.array(ccm_act))

        # print("Action CCM: {}".format(ccm.get_score(ccm_act_n[1],ccm_act_n[2],Emax=5,tau=1)))
        # print("Action CCM: {}".format(ccm_act_n))

        ccm_loss = np.array([0.0])
        ccm_lambda = np.array([self.args.ccm_lambda])
        ccm_switch = np.array([1.0])
        if self.agent_index != 1:
            t_start = time.time()
            # ccm_scores = [ccm.get_score(ccm_act_n[agent_index], ccm_act_n[i], e_max=5, tau=None)
            #               for i in range(len(ccm_act_n)) if i != agent_index]
            if self.args.specific_leader_ccm is None and self.args.specific_agent_ccm is None:
                ccm_scores = [
                    ccm.get_score(ccm_act_n[self.agent_index],
                                  ccm_act_n[i],
                                  e_max=5,
                                  tau=1) for i in range(self.n)
                    if i != self.agent_index and agents[i].role == "adversary"
                ]
            elif self.args.specific_agent_ccm is None:
                if self.agent_index == self.args.specific_leader_ccm:
                    ccm_scores = [
                        ccm.get_score(ccm_act_n[i],
                                      ccm_act_n[self.agent_index],
                                      e_max=5,
                                      tau=1) for i in range(self.n)
                        if i != self.agent_index and agents[i].role == "adversary"
                    ]
                else:
                    ccm_scores = [
                        ccm.get_score(ccm_act_n[self.agent_index],
                                      ccm_act_n[i],
                                      e_max=5,
                                      tau=1) for i in range(self.n)
                        if i == self.args.specific_leader_ccm
                    ]
            else:
                ccm_scores = [
                    ccm.get_score(ccm_act_n[self.agent_index],
                                  ccm_act_n[self.args.specific_leader_ccm],
                                  e_max=5,
                                  tau=1) for i in range(self.n)
                    if i == self.args.specific_leader_ccm
                ]
            # ccm_loss = [1*(x[0]-(x[1]-0.01)) for x in ccm_scores]
            ccm_loss = [x[0] - np.exp(x[1] - 0.01) for x in ccm_scores]
            ccm_loss = np.array([np.mean(ccm_loss)])
            # print("CCM Loop Time at Trial {}: {}".format(steps,time.time() - t_start))

        # Original implementation
        # obs, act, rew, obs_next, done = self.replay_buffer.sample_index(index)
        # Modified
        _, _, rew, _, done, _, _, rew_h, _, done_h = self.replay_buffer.sample_index(
            index, history=0)
        obs_h_n = [[list() for _ in range(len(obs_n[0]))] if len(x) == 0 else x
                   for x in obs_h_n]
        obs_next_h_n = [
            [list() for _ in range(len(obs_next_n[0]))] if len(x) == 0 else x
            for x in obs_next_h_n
        ]
        act_h_n = [[list() for _ in range(len(act_n[0]))] if len(x) == 0 else x
                   for x in act_h_n]

        num_sample = 1
        target_q = 0.0
        target_q_next = 0.0
        for i in range(num_sample):
            target_act_next_n = [
                agents[i].p_debug['target_act'](obs_next_n[i], obs_next_h_n[i])
                for i in range(self.n)
            ]
            target_q_next = self.q_debug['target_q_values'](
                *(obs_next_n + obs_next_h_n + target_act_next_n + act_h_n))
            # TODO: Possible error point
            target_q += rew + self.args.gamma * (1.0 - done) * target_q_next
        target_q /= num_sample

        # TODO: Possible error point
        q_loss = self.q_train(*(obs_n + obs_h_n + act_n + act_h_n + [target_q]))

        # Train P network
        # p_loss = self.p_train(*(obs_n + act_n))
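        # NOTE (editorial): here ccm_switch is 1.0, whereas update() passes 0.0,
        # so this p_train call presumably applies the CCM penalty weighted by
        # ccm_lambda while the regular update leaves it disabled.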
        p_loss = self.p_train(*(obs_n + obs_h_n + act_n + act_h_n +
                                [ccm_loss] + [ccm_lambda] + [ccm_switch]))

        self.p_update()
        # self.q_update()

        return [
            q_loss, p_loss,
            np.mean(target_q),
            np.mean(rew),
            np.mean(target_q_next),
            np.std(target_q)
        ]
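# --- Illustrative sketch (not part of the original code) ---------------------
# ccm_update() pairs this adversary's per-episode action signal with the other
# adversaries' signals via ccm.get_score() and collapses the two returned scores
# into a scalar loss, mean(score_xy - exp(score_yx - 0.01)). The helper below
# mirrors the default pairing branch in isolation; it assumes numpy is imported
# as np (as in this module) and that get_score(x, y, e_max, tau) returns a
# (score_xy, score_yx) pair as used above. The function name is hypothetical.
def ccm_pairing_loss_sketch(ccm_act_n, agent_index, roles, get_score):
    # Score this agent's signal against every other adversary's signal.
    scores = [
        get_score(ccm_act_n[agent_index], ccm_act_n[i], e_max=5, tau=1)
        for i in range(len(ccm_act_n))
        if i != agent_index and roles[i] == "adversary"
    ]
    # Same reduction as in ccm_update(): forward score minus exponentiated
    # (offset) reverse score, averaged over the paired agents.
    losses = [x[0] - np.exp(x[1] - 0.01) for x in scores]
    return np.array([np.mean(losses)])
# ------------------------------------------------------------------------------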
class I3MADDPGAgentTrainer(AgentTrainer):
    def __init__(self, name, model, obs_shape_n, act_space_n, act_traj_shape_n,
                 intent_shape, agent_index, args, local_q_func=False):
        self.name = name
        self.n = len(obs_shape_n)
        self.agent_index = agent_index
        self.args = args
        obs_ph_n = []
        act_traj_ph_n = []
        intent_ph_n = []
        for i in range(self.n):
            obs_ph_n.append(U.BatchInput(obs_shape_n[i], name="observation" + str(i)).get())
            act_traj_ph_n.append(U.BatchInput(act_traj_shape_n[i], name="action_trajectory" + str(i)).get())
            intent_ph_n.append(U.BatchInput(intent_shape[i], name="intent" + str(i)).get())
        self.act_size = act_space_n[0].n

        self.get_intent, self.i_train, self.i_update, self.i_debug = i_train(
            scope=self.name,
            make_obs_ph_n=obs_ph_n,
            intent_ph_n=intent_ph_n,
            act_space_n=act_space_n,
            make_act_traj_ph_n=act_traj_ph_n,
            make_intent_ph_n=intent_ph_n,
            i_func=model,
            i_index=agent_index,
            output_size=(self.n - 1) * self.act_size,
            optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
            grad_norm_clipping=0.5,
            num_units=args.num_units,
            reuse=False
        )
        # Create all the functions necessary to train the model
        self.q_train, self.q_update, self.q_debug = q_train(
            scope=self.name,
            make_obs_ph_n=obs_ph_n,
            act_space_n=act_space_n,
            make_intent_ph_n=intent_ph_n,
            q_index=agent_index,
            q_func=model,
            optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
            grad_norm_clipping=0.5,
            local_q_func=local_q_func,
            num_units=args.num_units
        )
        self.act, self.p_train, self.p_update, self.p_debug = p_train(
            scope=self.name,
            make_obs_ph_n=obs_ph_n,
            act_space_n=act_space_n,
            make_intent_ph_n=intent_ph_n,
            p_index=agent_index,
            p_func=model,
            q_func=model,
            optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
            grad_norm_clipping=0.5,
            local_q_func=local_q_func,
            num_units=args.num_units
        )
        # Create experience buffer
        self.replay_buffer = ReplayBuffer(1e6)
        self.max_replay_buffer_len = args.batch_size * args.max_episode_len
        self.replay_sample_index = None

    def intent(self, obs, act_traj):
        # print(np.array(act_traj).shape)
        # print(np.array(obs).shape)
        intent = self.get_intent(*([[obs]] + [[act_traj]]))[0]
        return intent

    def onpolicy_train_i(self, obs, act_traj, true_act):
        # print(np.array(act_traj).shape)
        # For each agent, collect the true actions of all other agents
        true_actions = []
        for i in range(len(true_act)):
            true_actions.append([])
            for j in range(len(true_act)):
                if j != i:
                    true_actions[i].append(true_act[j])
        obs = [o for o in np.reshape(obs, (len(obs), 1, -1))]
        act_traj = [a for a in np.reshape(act_traj, (len(obs), 1, len(act_traj[0]),
                                                     len(act_traj[0][0]), len(act_traj[0][0][0])))]
        true_act = [t for t in np.reshape(true_actions, (len(obs), 1, -1))]
        i_loss = self.i_train(*(obs + act_traj + true_act))
        self.i_update()
        return i_loss

    def action(self, obs, intent):
        return self.act(*([[obs]] + [[intent]]))[0]
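    # NOTE (editorial): get_intent and act are compiled TF functions that expect
    # batched placeholder inputs, so intent() and action() wrap a single observation
    # (and trajectory/intent) as a batch of one via [[obs]] and unwrap the result
    # with [0].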
    def experience(self, obs, act, rew, new_obs, act_traj, intent,
                   act_traj_next, intent_next, done, terminal):
        # Store transition in the replay buffer.
        self.replay_buffer.add(obs, act, rew, new_obs, act_traj, intent,
                               act_traj_next, intent_next, float(done))

    def preupdate(self):
        self.replay_sample_index = None

    def update(self, agents, t):
        if len(self.replay_buffer) < self.max_replay_buffer_len:  # replay buffer is not large enough
            return
        if not t % 100 == 0:  # only update every 100 steps
            return

        self.replay_sample_index = self.replay_buffer.make_index(self.args.batch_size)
        # Collect replay sample from all agents
        obs_n = []
        obs_next_n = []
        act_n = []
        act_traj_n = []
        intent_n = []
        act_traj_next_n = []
        intent_next_n = []
        index = self.replay_sample_index
        # Zero placeholders used for agents that do not store trajectories/intents
        intent_temp = np.zeros((len(self.replay_sample_index), (self.n - 1) * self.act_size))
        act_traj_temp = np.zeros((len(self.replay_sample_index), (self.n - 1),
                                  self.args.timestep, self.act_size))
        if self.args.good_i3 == 1 and self.args.adv_i3 == 1:
            for i in range(self.n):
                obs, act, rew, obs_next, act_traj, intent, act_traj_next, intent_next, done = \
                    agents[i].replay_buffer.sample_index(index)
                obs_n.append(obs)
                obs_next_n.append(obs_next)
                act_n.append(act)
                act_traj_n.append(act_traj)
                intent_n.append(intent)
                act_traj_next_n.append(act_traj_next)
                intent_next_n.append(intent_next)
        elif self.args.good_i3 == 1 and self.args.adv_i3 == 0:
            for i in range(self.n):
                if i < self.args.num_adversaries:
                    obs, act, rew, obs_next, done = agents[i].replay_buffer.sample_index(index)
                    obs_n.append(obs)
                    obs_next_n.append(obs_next)
                    act_n.append(act)
                    act_traj_n.append(act_traj_temp)
                    intent_n.append(intent_temp)
                    act_traj_next_n.append(act_traj_temp)
                    intent_next_n.append(intent_temp)
                else:
                    obs, act, rew, obs_next, act_traj, intent, act_traj_next, intent_next, done = \
                        agents[i].replay_buffer.sample_index(index)
                    obs_n.append(obs)
                    obs_next_n.append(obs_next)
                    act_n.append(act)
                    act_traj_n.append(act_traj)
                    intent_n.append(intent)
                    act_traj_next_n.append(act_traj_next)
                    intent_next_n.append(intent_next)
        elif self.args.good_i3 == 0 and self.args.adv_i3 == 1:
            for i in range(self.n):
                if i < self.args.num_adversaries:
                    obs, act, rew, obs_next, act_traj, intent, act_traj_next, intent_next, done = \
                        agents[i].replay_buffer.sample_index(index)
                    obs_n.append(obs)
                    obs_next_n.append(obs_next)
                    act_n.append(act)
                    act_traj_n.append(act_traj)
                    intent_n.append(intent)
                    act_traj_next_n.append(act_traj_next)
                    intent_next_n.append(intent_next)
                else:
                    obs, act, rew, obs_next, done = agents[i].replay_buffer.sample_index(index)
                    obs_n.append(obs)
                    obs_next_n.append(obs_next)
                    act_n.append(act)
                    act_traj_n.append(act_traj_temp)
                    intent_n.append(intent_temp)
                    act_traj_next_n.append(act_traj_temp)
                    intent_next_n.append(intent_temp)
        else:
            for i in range(self.n):
                obs, act, rew, obs_next, done = agents[i].replay_buffer.sample_index(index)
                obs_n.append(obs)
                obs_next_n.append(obs_next)
                act_n.append(act)
                act_traj_n.append(act_traj_temp)
                intent_n.append(intent_temp)
                act_traj_next_n.append(act_traj_temp)
                intent_next_n.append(intent_temp)

        obs, act, rew, obs_next, act_traj, intent, act_traj_next, intent_next, done = \
            self.replay_buffer.sample_index(index)

        num_sample = 1
        target_q = 0.0
        target_act_next_n = []
        if self.args.good_i3 == 1 and self.args.adv_i3 == 1:
            target_act_next_n = [
                agents[i].p_debug['target_act'](*([obs_next_n[i]] + [intent_next_n[i]]))
                for i in range(self.n)
            ]
        elif self.args.good_i3 == 1 and self.args.adv_i3 == 0:
            for i in range(self.n):
                if i >= self.args.num_adversaries:
                    target_act_next_n.append(
                        agents[i].p_debug['target_act'](*([obs_next_n[i]] + [intent_next_n[i]])))
                else:
                    target_act_next_n.append(agents[i].p_debug['target_act'](obs_next_n[i]))
        elif self.args.good_i3 == 0 and self.args.adv_i3 == 1:
            for i in range(self.n):
                if i < self.args.num_adversaries:
                    target_act_next_n.append(
                        agents[i].p_debug['target_act'](*([obs_next_n[i]] + [intent_next_n[i]])))
                else:
                    target_act_next_n.append(agents[i].p_debug['target_act'](obs_next_n[i]))
        else:
            target_act_next_n = [agents[i].p_debug['target_act'](obs_next_n[i])
                                 for i in range(self.n)]

        target_q_next = self.q_debug['target_q_values'](
            *(obs_next_n + target_act_next_n + intent_next_n))
        target_q += rew + self.args.gamma * (1.0 - done) * target_q_next
        target_q /= num_sample

        q_loss = self.q_train(*(obs_n + act_n + intent_n + [target_q]))
        p_loss = self.p_train(*(obs_n + act_n + intent_n))

        self.p_update()
        self.q_update()

        i_loss = 0
        if self.args.onpolicy_i == 0:
            # Build each agent's "true actions" target from the last step of the
            # other agents' next action trajectories
            true_actions = []
            for i in range(len(act_traj_next_n)):
                true_actions.append([])
                agent = act_traj_next_n[i]
                for j in range(len(agent)):
                    true_actions[i].append([])
                    for k in range(len(agent[j])):
                        a = deepcopy(agent[j][k][-1])
                        true_actions[i][j] = np.concatenate((true_actions[i][j], a), axis=0)
            i_loss = self.i_train(*(obs_n + act_traj_n + true_actions))
            self.i_update()

        return [q_loss, p_loss, i_loss,
                np.mean(target_q),
                np.mean(rew),
                np.mean(target_q_next),
                np.std(target_q)]
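# --- Illustrative sketch (not part of the original code) ---------------------
# Rough shape of the per-step rollout loop this trainer expects: each agent first
# infers an intent from the other agents' recent action trajectories, acts on
# (obs, intent), and the transition plus both trajectory/intent snapshots is
# stored for the centralized update. All names here (env, trainers, act_traj_n,
# act_traj_next_n, the env.step return convention) are hypothetical assumptions,
# not taken from this module.
def i3_rollout_step_sketch(env, trainers, obs_n, act_traj_n, act_traj_next_n):
    # Infer intents from current observations and action trajectories
    intent_n = [tr.intent(obs, traj) for tr, obs, traj in zip(trainers, obs_n, act_traj_n)]
    # Act conditioned on observation and inferred intent
    act_n = [tr.action(obs, it) for tr, obs, it in zip(trainers, obs_n, intent_n)]
    new_obs_n, rew_n, done_n, _ = env.step(act_n)
    # Infer next-step intents from the new observations and updated trajectories
    intent_next_n = [tr.intent(obs, traj)
                     for tr, obs, traj in zip(trainers, new_obs_n, act_traj_next_n)]
    # Store the full I3 transition for each agent
    for i, tr in enumerate(trainers):
        tr.experience(obs_n[i], act_n[i], rew_n[i], new_obs_n[i],
                      act_traj_n[i], intent_n[i],
                      act_traj_next_n[i], intent_next_n[i],
                      done_n[i], terminal=False)
    return new_obs_n, rew_n, done_n
# ------------------------------------------------------------------------------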