class SheldonPolicy(Policy):
    def __init__(self, env, landmark_id, args):
        super(SheldonPolicy, self).__init__()
        self.env = env
        self.landmark_id = landmark_id
        # dummy replay buffer for collecting experiences
        self.replay_buffer = ReplayBuffer(
            args.num_episodes * args.max_episode_len
            if args.benchmark and args.save_replay else 1e6)

    def action(self, obs):
        # relative position of the assigned landmark (x, y)
        delta_pos = obs[(4 + self.landmark_id * 2):(4 + self.landmark_id * 2 + 2)]
        if self.env.discrete_action_input:  # not tested!
            u = 0
            horizontal = abs(delta_pos[0]) > abs(delta_pos[1])
            if horizontal and delta_pos[0] < 0:
                u = 1  # LEFT
            if horizontal and delta_pos[0] > 0:
                u = 2  # RIGHT
            if not horizontal and delta_pos[1] < 0:
                u = 3  # UP
            if not horizontal and delta_pos[1] > 0:
                u = 4  # DOWN
        else:
            u = np.zeros(5)  # 5-d because of no-move action
            if delta_pos[0] > 0:
                u[1] += delta_pos[0]   # RIGHT
            if delta_pos[0] < 0:
                u[2] += -delta_pos[0]  # LEFT
            if delta_pos[1] > 0:
                u[3] += delta_pos[1]   # UP
            if delta_pos[1] < 0:
                u[4] += -delta_pos[1]  # DOWN
        # return np.concatenate([u, np.zeros(self.env.world.dim_c)])
        return u

    def experience(self, obs, act, rew, new_obs, done, terminal):
        # Store transition in the replay buffer.
        self.replay_buffer.add(obs, act, rew, new_obs, float(done))
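# For reference: the 5-d action vector returned above follows the usual MPE
# continuous-control convention (index 0 is the no-move slot, 1/2 push along
# +x/-x, 3/4 along +y/-y). A minimal sketch of how such a vector collapses
# into a net force, assuming that convention holds for this environment:
import numpy as np

def net_force(u):
    """Collapse a 5-d MPE-style action vector into an (fx, fy) force."""
    fx = u[1] - u[2]  # RIGHT minus LEFT
    fy = u[3] - u[4]  # UP minus DOWN
    return np.array([fx, fy])

# Example: landmark 0.3 to the right and 0.1 below the agent.
u_example = np.zeros(5)
u_example[1] += 0.3  # RIGHT
u_example[4] += 0.1  # DOWN
assert np.allclose(net_force(u_example), [0.3, -0.1])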
def __init__(self, name, model, obs_shape_n, act_space_n, agent_index, args,
             local_q_func=False):
    self.name = name
    self.n = len(obs_shape_n)
    self.agent_index = agent_index
    self.args = args
    obs_ph_n = []
    for i in range(self.n):
        obs_ph_n.append(
            U.BatchInput(obs_shape_n[i], name="observation" + str(i)).get())

    # Create all the functions necessary to train the model (two critics, indexed 1 and 2)
    self.q_train1, self.q_update1, self.q_debug1 = q_train(
        scope=self.name,
        make_obs_ph_n=obs_ph_n,
        act_space_n=act_space_n,
        agent_idx=agent_index,
        q_function_idx=1,
        q_func=model,
        optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
        grad_norm_clipping=0.5,
        local_q_func=local_q_func,
        num_units=args.num_units)
    self.q_train2, self.q_update2, self.q_debug2 = q_train(
        scope=self.name,
        make_obs_ph_n=obs_ph_n,
        act_space_n=act_space_n,
        agent_idx=agent_index,
        q_func=model,
        q_function_idx=2,
        optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
        grad_norm_clipping=0.5,
        local_q_func=local_q_func,
        num_units=args.num_units)
    self.act, self.p_train, self.p_update, self.p_debug = p_train(
        scope=self.name,
        make_obs_ph_n=obs_ph_n,
        act_space_n=act_space_n,
        agent_idx=agent_index,
        p_func=model,
        q_func=model,  # MLPmodel()
        optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
        grad_norm_clipping=0.5,
        local_q_func=local_q_func,
        num_units=args.num_units)

    # Create experience buffer
    self.replay_buffer = ReplayBuffer(1e6)
    self.min_replay_buffer_len = args.batch_size * args.max_episode_len
    self.replay_sample_index = None

    # Dump the current graph for inspection in TensorBoard.
    summary_writer = tf.summary.FileWriter("logdirMaddpg", tf.get_default_graph())
    summary_writer.flush()
    summary_writer.close()
def __init__(self, name, model, obs_shape_n, act_space_n, agent_index, args, agent_type, local_q_func=False): self.name = name self.n = 1 self.agent_index = agent_index self.args = args self.u_estimation = args.u_estimation self.constrained = args.constrained self.constraint_type = args.constraint_type self.agent_type = agent_type if self.agent_type == "good": cvar_alpha = args.cvar_alpha_good_agent elif self.agent_type == "adversary": cvar_alpha = args.cvar_alpha_adv_agent obs_ph_n = [] obs_ph_n.append( U.BatchInput(obs_shape_n[agent_index], name="observation0").get()) # Create all the functions necessary to train the model self.q_train, self.q_train2, self.q_train3, self.q_update, self.u_update, self.q_debug = q_train( scope=self.name, make_obs_ph_n=obs_ph_n, act_space_n=act_space_n, q_index=agent_index, q_func=model, u_func=model, optimizer=tf.train.AdamOptimizer(learning_rate=args.lr_critic), optimizer_lamda=tf.train.AdamOptimizer( learning_rate=args.lr_lamda), exp_var_alpha=args.exp_var_alpha, cvar_alpha=cvar_alpha, cvar_beta=args.cvar_beta, grad_norm_clipping=0.5, local_q_func=local_q_func, num_units=args.num_units, u_estimation=self.u_estimation, constrained=self.constrained, constraint_type=self.constraint_type, agent_type=self.agent_type) self.act, self.p_train, self.p_update, self.p_debug = p_train( scope=self.name, make_obs_ph_n=obs_ph_n, act_space_n=act_space_n, p_index=agent_index, p_func=model, q_func=model, optimizer=tf.train.AdamOptimizer(learning_rate=args.lr_actor), grad_norm_clipping=0.5, local_q_func=local_q_func, num_units=args.num_units) # Create experience buffer self.replay_buffer = ReplayBuffer(1e6) self.max_replay_buffer_len = args.batch_size * args.max_episode_len self.replay_sample_index = None
def __init__(self, name, model, obs_shape_n, act_space_n, agent_index, args, local_q_func="maddpg"): self.name = name self.n = len(obs_shape_n) self.agent_index = agent_index self.args = args self.local_q_func = local_q_func obs_ph_n = [] for i in range(self.n): obs_ph_n.append( U.BatchInput(obs_shape_n[i], name="observation" + str(i)).get()) if local_q_func == "ddpg" or local_q_func == "maddpg": # Create all the functions necessary to train the model self.q_train, self.q_update, self.q_debug = q_train( scope=self.name, make_obs_ph_n=obs_ph_n, act_space_n=act_space_n, q_index=agent_index, q_func=model, optimizer=tf.train.AdamOptimizer(learning_rate=args.lr), grad_norm_clipping=0.5, local_q_func=local_q_func, num_units=args.num_units) self.act, self.p_train, self.p_update, self.p_debug = p_train( scope=self.name, make_obs_ph_n=obs_ph_n, act_space_n=act_space_n, p_index=agent_index, p_func=model, q_func=model, optimizer=tf.train.AdamOptimizer(learning_rate=args.lr), grad_norm_clipping=0.5, local_q_func=local_q_func, num_units=args.num_units) if local_q_func == "dqn": self.act, self.p_train, self.p_update, self.p_debug = dqn_train( scope=self.name, make_obs_ph_n=obs_ph_n, act_space_n=act_space_n, p_index=agent_index, p_func=model, q_func=model, optimizer=tf.train.AdamOptimizer(learning_rate=args.lr), grad_norm_clipping=0.5, local_q_func=local_q_func, num_units=args.num_units) # Create experience buffer self.replay_buffer = ReplayBuffer(1e6) self.max_replay_buffer_len = args.batch_size * args.max_episode_len self.replay_sample_index = None
def __init__(self, name, model, obs_shape_n, act_space_n, act_traj_shape_n,intent_shape, agent_index, args, local_q_func=False): self.name = name self.n = len(obs_shape_n) self.agent_index = agent_index self.args = args obs_ph_n = [] act_traj_ph_n = [] intent_ph_n = [] for i in range(self.n): obs_ph_n.append(U.BatchInput(obs_shape_n[i], name="observation"+str(i)).get()) act_traj_ph_n.append(U.BatchInput(act_traj_shape_n[i], name = "action_trajectory"+str(i)).get()) intent_ph_n.append(U.BatchInput(intent_shape[i], name = "intent"+str(i)).get()) self.act_size = act_space_n[0].n self.get_intent, self.i_train, self.i_update, self.i_debug = i_train( scope=self.name, make_obs_ph_n=obs_ph_n, intent_ph_n = intent_ph_n, act_space_n = act_space_n, make_act_traj_ph_n = act_traj_ph_n, make_intent_ph_n =intent_ph_n, i_func = model, i_index = agent_index, output_size = (self.n-1) * self.act_size, optimizer=tf.train.AdamOptimizer(learning_rate=args.lr), grad_norm_clipping=0.5, num_units=args.num_units, reuse = False ) # Create all the functions necessary to train the model self.q_train, self.q_update, self.q_debug = q_train( scope=self.name, make_obs_ph_n=obs_ph_n, act_space_n=act_space_n, make_intent_ph_n = intent_ph_n, q_index=agent_index, q_func=model, optimizer=tf.train.AdamOptimizer(learning_rate=args.lr), grad_norm_clipping=0.5, local_q_func=local_q_func, num_units=args.num_units ) self.act, self.p_train, self.p_update, self.p_debug = p_train( scope=self.name, make_obs_ph_n=obs_ph_n, act_space_n=act_space_n, make_intent_ph_n = intent_ph_n, p_index=agent_index, p_func=model, q_func=model, optimizer=tf.train.AdamOptimizer(learning_rate=args.lr), grad_norm_clipping=0.5, local_q_func=local_q_func, num_units=args.num_units ) # Create experience buffer self.replay_buffer = ReplayBuffer(1e6) self.max_replay_buffer_len = args.batch_size * args.max_episode_len self.replay_sample_index = None
def __init__(self, name, model, obs_shape_n, act_space_n, agent_index, args,
             local_q_func=False):
    self.name = name
    self.n = len(obs_shape_n)
    self.agent_index = agent_index
    self.args = args
    obs_ph_n = []
    for i in range(self.n):
        obs_ph_n.append(
            U.BatchInput(obs_shape_n[i], name="observation" + str(i)).get())

    # Create all the functions necessary to train the model
    self.q_train, self.q_update, self.q_debug = q_train(
        scope=self.name,
        make_obs_ph_n=obs_ph_n,
        act_space_n=act_space_n,
        q_index=agent_index,
        q_func=model,
        optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
        grad_norm_clipping=0.5,
        local_q_func=local_q_func,
        num_units=args.num_units,
        # reuse=tf.compat.v1.AUTO_REUSE,
    )
    self.act, self.p_train, self.p_update, self.p_debug, num_actions = p_train(
        scope=self.name,
        make_obs_ph_n=obs_ph_n,
        act_space_n=act_space_n,
        p_index=agent_index,
        p_func=model,
        q_func=model,
        optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
        grad_norm_clipping=0.5,
        local_q_func=local_q_func,
        num_units=args.num_units,
        # reuse=tf.compat.v1.AUTO_REUSE,
    )

    # Create experience buffer
    self.replay_buffer = ReplayBuffer(1e6, args.batch_size, num_actions,
                                      obs_ph_n[0].shape[1])
    # Start updating as soon as a single batch of transitions is available,
    # rather than waiting for batch_size * max_episode_len transitions.
    self.max_replay_buffer_len = args.batch_size
    self.replay_sample_index = None
def __init__(self, name, before_com_model, channel, after_com_model, critic_mlp_model, obs_shape_n, act_space_n, args, local_q_func=False): self.name = name self.n = len(obs_shape_n) self.args = args obs_ph_n = [] for i in range(self.n): obs_ph_n.append( U.BatchInput(obs_shape_n[i], name="observation_" + str(i)).get()) # Create all the functions necessary to train the model self.q_train, self.q_update, self.q_debug = q_train( scope=self.name, make_obs_ph_n=obs_ph_n, act_space_n=act_space_n, q_func=critic_mlp_model, optimizer=tf.train.AdamOptimizer(learning_rate=args.lr), grad_norm_clipping=0.5, local_q_func=local_q_func, num_units=args.num_units, ) self.act, self.p_train, self.p_update, self.p_debug = p_train( scope=self.name, make_obs_ph_n=obs_ph_n, act_space_n=act_space_n, before_com_func=before_com_model, channel=channel, after_com_func=after_com_model, q_func=critic_mlp_model, optimizer=tf.train.AdamOptimizer(learning_rate=args.lr), grad_norm_clipping=0.5, local_q_func=local_q_func, num_units=args.num_units, beta=args.beta, ibmac_com=args.ibmac_com, ) # Create experience buffer self.replay_buffer = ReplayBuffer(1e6) # self.max_replay_buffer_len = 50 * args.max_episode_len self.max_replay_buffer_len = args.batch_size * args.max_episode_len self.replay_sample_index = None self.message_1_for_record = []
def __init__(self, name, model, obs_shape_n, act_space_n, agent_index, args, local_q_func=False, u_estimation=False): print('in here') self.name = name self.n = 1 #len(obs_shape_n) self.agent_index = agent_index self.args = args obs_ph_n = [] obs_ph_n.append( U.BatchInput(obs_shape_n[agent_index], name="observation0").get()) self.u_estimation = u_estimation # Create all the functions necessary to train the model l = q_train(scope=self.name, make_obs_ph_n=obs_ph_n, act_space_n=act_space_n, q_index=agent_index, q_func=model, u_func=model, optimizer=tf.train.AdamOptimizer(learning_rate=args.lr), grad_norm_clipping=0.5, local_q_func=local_q_func, num_units=args.num_units, u_estimation=self.u_estimation) if self.u_estimation: self.q_train, self.q_update, self.u_update, self.q_debug = l else: self.q_train, self.q_update, self.q_debug = l self.act, self.p_train, self.p_update, self.p_debug = p_train( scope=self.name, make_obs_ph_n=obs_ph_n, act_space_n=act_space_n, p_index=agent_index, p_func=model, q_func=model, optimizer=tf.train.AdamOptimizer(learning_rate=args.lr), grad_norm_clipping=0.5, local_q_func=local_q_func, num_units=args.num_units) # Create experience buffer self.replay_buffer = ReplayBuffer(1e6) self.max_replay_buffer_len = args.batch_size * args.max_episode_len self.replay_sample_index = None
def __init__(self, n_agents, name, model, state_shape, obs_shape_n, act_space_n, agent_index, args, local_q_func=False): self.name = name self.n = len(obs_shape_n) self.agent_index = agent_index self.args = args obs_ph_n = [] state_ph_n = [] for i in range(self.n): obs_ph_n.append(U.BatchInput(obs_shape_n[i], name="observation"+str(i), lstm=args.actor_lstm or args.critic_lstm).get()) state_ph_n.append(U.BatchInput(state_shape, name="state" + str(i)).get()) # Create all the functions necessary to train the model self.q_train, self.q_update, self.q_debug = q_train( n_agents=n_agents, scope=self.name, make_state_ph_n=state_ph_n, make_obs_ph_n=obs_ph_n, act_space_n=act_space_n, q_index=agent_index, q_func=model, optimizer=tf.train.AdamOptimizer(learning_rate=args.lr, epsilon=args.optimizer_epsilon), grad_norm_clipping=0.5, local_q_func=local_q_func, num_units=args.num_units, discrete_action=args.discrete_action, target_update_tau=args.target_update_tau, use_global_state=args.use_global_state, share_weights=args.share_weights ) self.act, self.act_test, self.p_train, self.p_update, self.p_debug = p_train( n_agents = n_agents, scope=self.name, make_state_ph_n=state_ph_n, make_obs_ph_n=obs_ph_n, act_space_n=act_space_n, p_index=agent_index, p_func=model, q_func=model, optimizer=tf.train.AdamOptimizer(learning_rate=args.lr, epsilon=args.optimizer_epsilon), grad_norm_clipping=0.5, local_q_func=local_q_func, num_units=args.num_units, discrete_action=args.discrete_action, target_update_tau=args.target_update_tau, use_global_state=args.use_global_state, share_weights=args.share_weights ) # Create experience buffer self.replay_buffer = ReplayBuffer(1e6) self.max_replay_buffer_len = args.batch_size * args.max_episode_len self.replay_sample_index = None
def __init__(self, name, model_value, model_policy, obs_shape_n, act_space_n,
             agent_index, args, hparams, summary_writer=None,
             local_q_func=False, rngseed=None):
    self.name = name
    self.rngseed = rngseed
    self.n = len(obs_shape_n)
    self.agent_index = agent_index
    self.args = args
    self.hparams = hparams
    obs_ph_n = []
    for i in range(self.n):
        obs_ph_n.append(U.BatchInput(
            obs_shape_n[i], name="observation" + str(i)).get())

    # Create all the functions necessary to train the model
    # train critic
    self.q_train, self.q_update, self.q_debug = q_train(
        scope=self.name,
        make_obs_ph_n=obs_ph_n,
        act_space_n=act_space_n,
        q_index=agent_index,
        q_func=model_value,
        optimizer=tf.train.AdamOptimizer(learning_rate=hparams['learning_rate']),
        grad_norm_clipping=hparams['grad_norm_clipping'],
        local_q_func=local_q_func,
        num_units=args.num_units
    )
    # train policy
    self.act, self.p_train, self.p_update, self.p_debug = p_train(
        scope=self.name,
        make_obs_ph_n=obs_ph_n,
        act_space_n=act_space_n,
        p_index=agent_index,
        p_func=model_policy,
        q_func=model_value,
        optimizer=tf.train.AdamOptimizer(learning_rate=hparams['learning_rate']),
        grad_norm_clipping=hparams['grad_norm_clipping'],
        local_q_func=local_q_func,
        num_units=args.num_units
    )

    # Create experience buffer
    self.replay_buffer = ReplayBuffer(hparams['replay_buffer_len'], self.rngseed)
    # Use a short warm-up when test_saving is enabled; otherwise wait for a
    # full batch of episodes before the first update.
    if hparams.get('test_saving'):
        self.max_replay_buffer_len = 100
    else:
        self.max_replay_buffer_len = hparams['batch_size'] * args.max_episode_len
    self.replay_sample_index = None
    self.summary_writer = summary_writer
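# A minimal example of an hparams dict that satisfies the constructor above;
# the keys are exactly the ones it reads, the values are illustrative only.
example_hparams = {
    'learning_rate': 1e-2,          # Adam step size for critic and policy
    'grad_norm_clipping': 0.5,      # clip gradients to this norm
    'replay_buffer_len': int(1e6),  # capacity of the replay buffer
    'batch_size': 1024,             # minibatch size for updates
    'test_saving': False,           # True -> start updating after only 100 transitions
}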
def __init__(self, name, model, obs_shape_n, act_space_n, agent_index, args, local_q_func=False, reuse=False): self.name = name self.n = len(obs_shape_n) self.agent_index = agent_index self.args = args obs_ph_n = [] for i in range(self.n): obs_ph_n.append( U.BatchInput(obs_shape_n[i], name="observation" + str(i)).get()) # Create all the functions necessary to train the model self.q_train, self.q_update, self.q_debug = q_train( scope=self.name, make_obs_ph_n=obs_ph_n, act_space_n=act_space_n, q_index=agent_index, q_func=model, optimizer=tf.train.AdamOptimizer(learning_rate=args.lr), grad_norm_clipping=0.5, local_q_func=local_q_func, num_units=args.num_units, reuse=reuse) self.act, self.p_train, self.p_update, self.p_debug = p_train( scope=self.name, make_obs_ph_n=obs_ph_n, act_space_n=act_space_n, p_index=agent_index, p_func=model, q_func=model, optimizer=tf.train.AdamOptimizer(learning_rate=args.lr), grad_norm_clipping=0.5, local_q_func=local_q_func, num_units=args.num_units, reuse=reuse, deterministic=args.benchmark and args.deterministic) # Create experience buffer self.replay_buffer = ReplayBuffer( args.num_episodes * args.max_episode_len if args.benchmark and args.save_replay else 1e6) self.max_replay_buffer_len = args.batch_size * args.max_episode_len self.replay_sample_index = None
def __init__(self, name, model, obs_shape_n, act_space_n, agent_index, args): self.name = name self.n = len(obs_shape_n) self.agent_index = agent_index self.args = args self.counter = 0 obs_ph_n = [] for i in range(self.n): obs_ph_n.append(U.BatchInput(obs_shape_n[i], name="observation"+str(i)+"_ag"+str(agent_index)).get()) # Create all the functions necessary to train the model self.q_train, self.q_update, self.q_debug = q_train( scope=self.name, make_obs_ph_n=obs_ph_n,#[lambda name: U.BatchInput(obs_shape, name=name) for obs_shape in obs_shape_n], act_space_n=act_space_n, q_func=model, optimizer=tf.train.AdamOptimizer(learning_rate=args.lr), grad_norm_clipping=0.5 ) self.act, self.p_train, self.p_update, self.p_debug = p_train( scope=self.name, make_obs_ph_n=obs_ph_n,#[lambda name: U.BatchInput(obs_shape, name=name) for obs_shape in obs_shape_n], act_space_n=act_space_n, p_index=0, p_func=model, q_func=model, optimizer=tf.train.AdamOptimizer(learning_rate=args.lr), grad_norm_clipping=0.5 ) # Create experience buffer self.replay_buffer = [ReplayBuffer(1e6) for i in range(self.n)] self.max_replay_buffer_len = args.batch_size * args.max_episode_len
def __init__(self, name, model, obs_shape_n, act_space_n, agent_index, args,
             local_q_func=False):  # local_q_func: whether to train with DDPG instead of MADDPG
    self.name = name
    self.n = len(obs_shape_n)       # total number of agents
    self.agent_index = agent_index  # index of this agent
    self.args = args                # training arguments parsed from the command line
    obs_ph_n = []
    for i in range(self.n):
        # Batch placeholders for every agent's observations,
        # sized according to each agent's observation shape.
        obs_ph_n.append(
            U.BatchInput(obs_shape_n[i], name="observation" + str(i)).get())

    # Create all the functions necessary to train the model:
    # critic training op, target-critic update op, and a dict of
    # session-backed functions returning the Q and target-Q values.
    self.q_train, self.q_update, self.q_debug = q_train(
        scope=self.name,
        make_obs_ph_n=obs_ph_n,
        act_space_n=act_space_n,
        q_index=agent_index,
        q_func=model,
        optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
        grad_norm_clipping=0.5,
        local_q_func=local_q_func,
        num_units=args.num_units
    )
    # Action function, policy training op, target-policy update op, and a dict
    # exposing the policy output and the target policy's actions.
    self.act, self.p_train, self.p_update, self.p_debug = p_train(
        scope=self.name,
        make_obs_ph_n=obs_ph_n,
        act_space_n=act_space_n,
        p_index=agent_index,
        p_func=model,
        q_func=model,
        optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
        grad_norm_clipping=0.5,
        local_q_func=local_q_func,
        num_units=args.num_units)

    # Create experience buffer
    self.replay_buffer = ReplayBuffer(1e6)
    self.max_replay_buffer_len = args.batch_size * args.max_episode_len
    self.replay_sample_index = None
def __init__(self, name, model, obs_shape_n, act_space_n, agent_index, args, local_q_func=False): self.name = name self.n = len(obs_shape_n) self.agent_index = agent_index self.args = args obs_ph_mems = [] for i in range(args.num_groups): # assumes agents have same observation shape obs_ph_mems.append( U.BatchInput(obs_shape_n[i], name="observation" + str(i)).get()) # Create all the functions necessary to train the model self.q_train, self.q_update, self.q_debug = q_train( scope=self.name, make_obs_ph_n=obs_ph_mems, act_space_n=act_space_n, q_index=agent_index, q_func=model, optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=args.lr), grad_norm_clipping=0.5, local_q_func=local_q_func, num_units=args.num_units, num_groups=args.num_groups) self.act, self.p_train, self.p_update, self.p_debug = p_train( scope=self.name, make_obs_ph_n=obs_ph_mems, act_space_n=act_space_n, p_index=agent_index, p_func=model, q_func=model, optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=args.lr), grad_norm_clipping=0.5, local_q_func=local_q_func, num_units=args.num_units, num_groups=args.num_groups) # Create experience buffer self.replay_buffer = ReplayBuffer(1e6) self.max_replay_buffer_len = args.batch_size * args.max_episode_len self.replay_sample_index = None
def __init__(self, name, model, obs_shape_n, act_space_n, agent_index, args,
             local_q_func=False):
    self.name = name                # name of the agent
    self.n = len(obs_shape_n)       # number of agents
    self.agent_index = agent_index  # index of this specific agent
    self.args = args                # hyper-parameter settings
    obs_ph_n = []
    for i in range(self.n):
        # Placeholder for a batch of observation tensors of a given shape and dtype.
        obs_ph_n.append(
            U.BatchInput(obs_shape_n[i], name="observation" + str(i)).get())

    # [Create all the functions necessary to train the model]
    # train:           U.function(inputs=obs_ph_n + act_ph_n + [target_ph], outputs=loss, updates=[optimize_expr])
    # update_target_q: make_update_exp(q_func_vars, target_q_func_vars)
    # q_values:        U.function(obs_ph_n + act_ph_n, q)
    # target_q_values: U.function(obs_ph_n + act_ph_n, target_q)
    self.q_train, self.q_update, self.q_debug = q_train(
        scope=self.name,          # string: "agent_1", "agent_2", ...
        make_obs_ph_n=obs_ph_n,
        act_space_n=act_space_n,  # action spaces of all agents
        q_index=agent_index,      # index of this specific agent
        q_func=model,             # the model defined above
        optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),  # Adam optimizer with the configured learning rate
        grad_norm_clipping=0.5,   # clip gradients above this norm to avoid exploding gradients
        local_q_func=local_q_func,
        num_units=args.num_units  # number of hidden units per layer
    )

    # act:             U.function(inputs=[obs_ph_n[p_index]], outputs=act_sample)
    # train:           U.function(inputs=obs_ph_n + act_ph_n, outputs=loss, updates=[optimize_expr])
    # update_target_p: make_update_exp(p_func_vars, target_p_func_vars)
    # p_values:        U.function([obs_ph_n[p_index]], p)
    # target_act:      U.function(inputs=[obs_ph_n[p_index]], outputs=target_act_sample)
    self.act, self.p_train, self.p_update, self.p_debug = p_train(
        scope=self.name,
        make_obs_ph_n=obs_ph_n,
        act_space_n=act_space_n,
        p_index=agent_index,
        p_func=model,
        q_func=model,
        optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
        grad_norm_clipping=0.5,
        local_q_func=local_q_func,
        num_units=args.num_units
    )

    # Create experience buffer
    self.replay_buffer = ReplayBuffer(1e6)
    self.max_replay_buffer_len = args.batch_size * args.max_episode_len
    self.replay_sample_index = None
def __init__(self, name, model, obs_shape_n, act_space_n, agent_index, args, local_q_func=False): self.name = name self.n = len(obs_shape_n) # 16 self.agent_index = agent_index self.args = args obs_ph_n = [] for i in range(self.n): obs_ph_n.append( U.BatchInput(obs_shape_n[i], name="observation" + str(i)).get()) #obs_ph_n.append(U.BatchInput(obs_shape_n[i], name="observation"+str(i), dtype=tf.uint8).get()) #should we specify uint8 instead of default float? # Create all the functions necessary to train the model self.q_train, self.q_update, self.q_debug = q_train( scope=self.name, make_obs_ph_n=obs_ph_n, act_space_n=act_space_n, q_index=agent_index, q_func=model, # multi-layer perceptron optimizer=tf.train.AdamOptimizer(learning_rate=args.lr), grad_norm_clipping=0.5, local_q_func=local_q_func, # maddpg or ddpg num_units=args.num_units) self.act, self.p_train, self.p_update, self.p_debug = p_train( scope=self.name, make_obs_ph_n=obs_ph_n, act_space_n=act_space_n, p_index=agent_index, p_func=model, q_func=model, optimizer=tf.train.AdamOptimizer(learning_rate=args.lr), grad_norm_clipping=0.5, local_q_func=local_q_func, num_units=args.num_units) # Create experience buffer self.replay_buffer = ReplayBuffer(1e6) self.max_replay_buffer_len = args.batch_size * args.max_episode_len self.replay_sample_index = None
def __init__(self, env, name, model, CNN_model, obs_shape_n, obs_map_shape_n,act_space_n, agent_index, args, local_q_func=False): self.name = name self.n = len(obs_shape_n) self.agent_index = agent_index self.args = args obs_ph_n = [] obs_map_ph_n=[] for i in range(self.n): obs_ph_n.append(U.BatchInput(obs_shape_n[i], name="observation"+str(i)).get()) obs_map_ph_n.append(U.BatchInput(obs_map_shape_n[i], name="observation_map"+str(i)).get()) # Create all the functions necessary to train the model self.q_train, self.q_update, self.q_debug = q_train( scope=self.name, make_obs_ph_n=obs_ph_n, act_space_n=act_space_n, q_index=agent_index, q_func=model, shared_CNN=CNN_model, optimizer=tf.train.AdamOptimizer(learning_rate=args.lr), grad_norm_clipping=0.5, local_q_func=local_q_func, num_units=args.num_units, make_obs_map_ph_n=obs_map_ph_n ) self.act, self.p_train, self.vf_t, self.p_update, self.vf_u, self.p_debug = p_train( scope=self.name, env = env, make_obs_ph_n=obs_ph_n, act_space_n=act_space_n, p_index=agent_index, vf_func=model, shana = GMMPolicy, q_func=model, shared_CNN=CNN_model, optimizer=tf.train.AdamOptimizer(learning_rate=args.lr), grad_norm_clipping=0.5, local_q_func=local_q_func, num_units=args.num_units, make_obs_map_ph_n=obs_map_ph_n ) # Create experience buffer self.replay_buffer = ReplayBuffer(1e6) self.max_replay_buffer_len = args.batch_size * args.max_episode_len self.replay_sample_index = None self.batch_size=args.batch_size
def __init__(self, name, critic_model, policy_model, obs_shape_n, act_space_n, agent_index, args, local_q_func=False): self.name = name self.n = 4 self.agent_index = agent_index self.args = args obs_ph_n = [] for i in range(self.n): obs_ph_n.append(U.BatchInput(obs_shape_n[i], name="observation"+str(i)).get()) # Create all the functions necessary to train the model self.q_train, self.q_update, self.q_debug = q_train( scope = self.name, make_obs_ph_n = obs_ph_n, act_space_n = act_space_n, q_index = agent_index, q_func = critic_model, optimizer = tf.train.AdamOptimizer(learning_rate=args['lr']), grad_norm_clipping = 0.5, local_q_func = local_q_func, num_units = args['num_units'] ) self.act, self.p_train, self.p_update, self.p_debug = p_train( scope = self.name, make_obs_ph_n = obs_ph_n, act_space_n = act_space_n, p_index = agent_index, p_func = policy_model, q_func = critic_model, optimizer = tf.train.AdamOptimizer(learning_rate=args['lr']), grad_norm_clipping = 0.5, local_q_func = local_q_func, num_units = args['num_units'] ) # Create experience buffer self.replay_buffer = ReplayBuffer(1e6) self.max_replay_buffer_len = args['batch_size'] * args['max_episode_len'] self.replay_sample_index = None
def __init__(self, name, learning_rate, obs_shape_n, act_space_n, agent_index, args, local_q_func=False): self.name = name self.learning_rate = learning_rate self.n = len(obs_shape_n) self.agent_index = agent_index self.obs_size = obs_shape_n[agent_index] self.joint_obs_size = np.sum(obs_shape_n) self.act_size = act_space_n[agent_index].n self.act_pdtype_n = [ make_pdtype(act_space) for act_space in act_space_n ] self.joint_act_size = 0 for i_act in act_space_n: self.joint_act_size += i_act.n self.args = args self.actor = Actor(self.obs_size, self.act_size) self.actor_target = Actor(self.obs_size, self.act_size) self.critic = self.build_critic() self.critic_target = self.build_critic() update_target(self.actor, self.actor_target, 0) update_target(self.critic, self.critic_target, 0) #self.actor, self.critic = self.build_model() #self.actor_target, self.critic_target = self.build_model() self.actor_optimizer = self.build_actor_optimizer() # Create experience buffer self.replay_buffer = ReplayBuffer(1e6) self.max_replay_buffer_len = args.batch_size * args.max_episode_len self.replay_sample_index = None gpu = -1 self.device = "/gpu:{}".format(gpu) if gpu >= 0 else "/cpu:0"
def __init__(self, obs_shape_n, act_info_n, agent_index, args, local_q_func=False): self.n = len(obs_shape_n) self.agent_index = agent_index self.args = args self.grad_norm_clipping = 0.5 # Networks self.device = args.device self.vf = Critic( obs_shape_n=obs_shape_n, act_info_n=act_info_n, num_units=args.num_units, q_index=agent_index, local_q_func=local_q_func, ).to(self.device) act_dim, self.pdtype = act_info_n[agent_index] self.pi = MLP(obs_shape_n[agent_index], act_dim, num_units=args.num_units).to(self.device) # Initialize init_params(self.vf) init_params(self.pi) # Target Networks self.pi_targ = deepcopy(self.pi) for p in self.pi_targ.parameters(): p.requires_grad = False self.vf_targ = deepcopy(self.vf) for p in self.vf_targ.parameters(): p.requires_grad = False # Optimizer self.pi_optim = Adam(self.pi.parameters(), lr=args.lr) self.vf_optim = Adam(self.vf.parameters(), lr=args.lr) # Create Replay Buffer self.replay_buffer = ReplayBuffer(1e6) self.max_replay_buffer_len = args.batch_size * args.max_episode_len self.replay_sample_index = None
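# The frozen pi_targ/vf_targ copies above are typically refreshed with a Polyak
# (soft) update after each learning step. That update is not shown here; this is
# a minimal sketch of what it usually looks like in PyTorch, with tau assumed.
import torch

@torch.no_grad()
def soft_update(net, net_targ, tau=0.01):
    """Polyak-average the target network toward the online network."""
    for p, p_targ in zip(net.parameters(), net_targ.parameters()):
        p_targ.mul_(1.0 - tau)
        p_targ.add_(tau * p)

# After each optimizer step, e.g.:
#   soft_update(trainer.pi, trainer.pi_targ)
#   soft_update(trainer.vf, trainer.vf_targ)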
def __init__(self, name, model, state_shape, act_space_n, agent_index, args, local_q_func=False): self.name = name self.n = 1 self.agent_index = agent_index self.args = args obs_ph_n = [] obs_ph_n.append(U.BatchInput(state_shape, name="observation"+str(0)).get()) # Create all the functions necessary to train the model self.q_train, self.q_update, self.q_debug = q_train( scope=self.name, make_obs_ph_n=obs_ph_n, act_space_n=act_space_n, q_index=agent_index, q_func=model, optimizer=tf.train.AdamOptimizer(learning_rate=args.lr), grad_norm_clipping=0.5, local_q_func=local_q_func, num_units=args.num_units ) self.act, self.p_train, self.p_update, self.p_debug = p_train( scope=self.name, make_obs_ph_n=obs_ph_n, act_space_n=act_space_n, p_index=agent_index, p_func=model, q_func=model, optimizer=tf.train.AdamOptimizer(learning_rate=args.lr), grad_norm_clipping=0.5, local_q_func=local_q_func, num_units=args.num_units ) # Create experience buffer self.replay_buffer = ReplayBuffer(1e6) self.min_buffer_size = args.min_buffer_size self.replay_sample_index = None
def __init__(self, pos_x, pos_y, workcap=40, sense_r=1, global_view=True):
    self.pos = [pos_x, pos_y]
    self.old_pos = self.pos
    self.workcap = workcap
    self.worktime = 0
    # TODO: Establish relation between sense_r and sense_p
    self.sense_r = sense_r
    self.sense_p = [(1, 0), (-1, 0), (0, 1), (0, -1)]  # down, up, right, left
    self.global_view = global_view
    self.local_view = None
    self.island = True  # island=False means hovering; island=True means landed and charging
    self.isCharging = False
    self.actions = [(1, 0), (-1, 0), (0, 1), (0, -1)]
    self.staytime = 0
    self.experience = ReplayBuffer(1e6)
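# A small usage sketch of the movement actions above, assuming positions are
# integer grid coordinates and each action tuple is a unit step; the step()
# helper is hypothetical and not part of the class.
def step(agent, action):
    """Apply one of the four unit moves (down, up, right, left)."""
    agent.old_pos = list(agent.pos)
    agent.pos = [agent.pos[0] + action[0], agent.pos[1] + action[1]]

# e.g. move "down" (the first entry of agent.actions):
#   step(agent, agent.actions[0])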
class MADDPGAgentTrainerCCM(AgentTrainer): """ Agent Trainer using MADDPG Algorithm and CCM """ def __init__(self, name, model, obs_shape_n, act_space_n, agent_index, args, role="", local_q_func=False): """ Args: name (str): Name of the agent model (function): MLP Neural Network model for the agent. obs_shape_n (tf.placeholder): Placeholder for the observation space of all agents act_space_n (list): A list of the action spaces for all agents agent_index (int): Agent index number args (argparse.Namespace): Parsed commandline arguments object role (str): Role of the agent i.e. adversary local_q_func (boolean): Flag for using local q function """ super(MADDPGAgentTrainerCCM, self).__init__() self.name = name self.role = role self.n = len(obs_shape_n) self.agent_index = agent_index self.args = args obs_ph_n = [] act_history_ph_n = [] obs_history_ph_n = [] hist = self.args.training_history obs_history_n = [(hist * x[0], ) for x in obs_shape_n] act_history_n = [(hist * act.n, ) for act in act_space_n] # act_history_n = [Discrete(act.n*(3-1)) for act in act_space_n] # for act_space in act_space_n: # act_space.n = act_space.n*3 # if act_history_n[0].n != 15: # print("Line 158") for i in range(self.n): obs_ph_n.append( tf_util.BatchInput(obs_shape_n[i], name="observation" + str(i)).get()) obs_history_ph_n.append( tf_util.BatchInput(obs_history_n[i], name="observationhistory" + str(i)).get()) act_history_ph_n.append( tf_util.BatchInput(act_history_n[i], name="actionhistory" + str(i)).get()) # obs_ph_n = [tf.concat(3*[x],1,name="observation{}".format(i)) for i,x in enumerate(obs_ph_n)] # Create all the functions necessary to train the model self.q_train, self.q_update, self.q_debug = q_train( scope=self.name, make_obs_ph_n=obs_ph_n, act_space_n=act_space_n, make_obs_history_n=obs_history_ph_n, make_act_history_n=act_history_ph_n, q_index=agent_index, q_func=model, optimizer=tf.train.AdamOptimizer(learning_rate=args.lr), grad_norm_clipping=0.5, local_q_func=local_q_func, num_units=args.num_units) self.act, self.p_train, self.p_update, self.p_debug = p_train( scope=self.name, make_obs_ph_n=obs_ph_n, act_space_n=act_space_n, make_obs_history_n=obs_history_ph_n, make_act_history_n=act_history_ph_n, p_index=agent_index, p_func=model, q_func=model, optimizer=tf.train.AdamOptimizer(learning_rate=args.lr), grad_norm_clipping=0.5, local_q_func=local_q_func, num_units=args.num_units) # Create experience buffer self.replay_buffer = ReplayBuffer(1e6) self.max_replay_buffer_len = 4 * args.batch_size * args.max_episode_len self.replay_sample_index = None def action(self, obs): """ Retrieves action for agent from the P network given the observations Args: obs (np.array): Observations of the world for an agent Returns: Action for an agent """ hist = self.args.training_history if len(self.replay_buffer) > (hist + 1): _, _, _, _, _, obs_h, _, _, _, _ = self.replay_buffer.sample_index( [len(self.replay_buffer)], hist) if len(obs_h) > 0: obs_h = obs_h[0] # obs = np.concatenate((obs,ob[0]),0) else: obs_h = np.array((hist) * list(obs)) return self.act(obs[None], obs_h[None])[0] def experience(self, obs, act, rew, new_obs, done, terminal): """ Store transition in the replay buffer. Args: obs (np.array): Observations of the world for an agent act (list): Action for an agent rew (float): Reward for an agent new_obs (np.array): New observations of the world for an agent done (): Done for an agent terminal (boolean): Flag for whether the final episode has been reached. 
""" self.replay_buffer.add(obs, act, rew, new_obs, float(done)) def preupdate(self): """ Reset replay_sample_index to None. """ self.replay_sample_index = None def update(self, agents, steps): """ Update agent networks Args: agents (list): List of MADDPGAgentTrainer objects steps (int): Current training step Returns: (list) Training loss for the agents [q_loss, p_loss, mean_target_q, mean_reward, mean_target_q_next, std_target_q] """ # Replay buffer is not large enough # if len(self.replay_buffer) < self.max_replay_buffer_len: if len(self.replay_buffer) < 12500: return # Only update every 100 steps if not steps % 100 == 0: return self.replay_sample_index = self.replay_buffer.make_index( self.args.batch_size) hist = self.args.training_history # ************************************************************************************************ ccm_loss = np.array([0.0]) ccm_lambda = np.array([self.args.ccm_lambda]) ccm_switch = np.array([0.0]) # ************************************************************************************************ # Collect replay sample from all agents obs_n = [] obs_h_n = [] obs_next_n = [] obs_next_h_n = [] act_n = [] act_h_n = [] index = self.replay_sample_index for i in range(self.n): obs, act, rew, obs_next, done, obs_h, act_h, rew_h, obs_next_h, done_h = agents[i].\ replay_buffer.sample_index(index, history=hist) obs_n.append(obs) obs_h_n.append(obs_h) obs_next_n.append(obs_next) obs_next_h_n.append(obs_next_h) act_n.append(act) act_h_n.append(act_h) _, _, rew, _, done, _, _, rew_h, _, done_h = self.replay_buffer.sample_index( index, history=0) obs_h_n = [[list() for _ in range(len(obs_n[0]))] if len(x) == 0 else x for x in obs_h_n] obs_next_h_n = [ [list() for _ in range(len(obs_next_n[0]))] if len(x) == 0 else x for x in obs_next_h_n ] act_h_n = [[list() for _ in range(len(act_n[0]))] if len(x) == 0 else x for x in act_h_n] # rew = rew.T[0] # done = done.T[0] # train q network # print(*([x + act_n[i][j] for i,xx in enumerate(obs_n) for j,x in enumerate(xx)])) num_sample = 1 target_q = 0.0 target_q_next = 0.0 for i in range(num_sample): target_act_next_n = [ agents[i].p_debug['target_act'](obs_next_n[i], obs_next_h_n[i]) for i in range(self.n) ] target_q_next = self.q_debug['target_q_values']( *(obs_next_n + obs_next_h_n + target_act_next_n + act_h_n)) # TODO: Possible error point target_q += rew + self.args.gamma * (1.0 - done) * target_q_next target_q /= num_sample # TODO: Possible error point q_loss = self.q_train(*(obs_n + obs_h_n + act_n + act_h_n + [target_q])) # Train P network # p_loss = self.p_train(*(obs_n + act_n)) p_loss = self.p_train(*(obs_n + obs_h_n + act_n + act_h_n + [ccm_loss] + [ccm_lambda] + [ccm_switch])) self.p_update() self.q_update() return [ q_loss, p_loss, np.mean(target_q), np.mean(rew), np.mean(target_q_next), np.std(target_q) ] def ccm_update(self, agents, steps): """ CCM Update agent networks Args: agents (list): List of MADDPGAgentTrainer objects steps (int): Current training step Returns: (list) Training loss for the agents [q_loss, p_loss, mean_target_q, mean_reward, mean_target_q_next, std_target_q] """ # Replay buffer is not large enough # if len(self.replay_buffer) < self.max_replay_buffer_len: if len(self.replay_buffer) < 12500: # print("{}/{}".format(len(self.replay_buffer),self.max_replay_buffer_len)) return # Only update every 4 episodes if not steps % (4 * self.args.max_episode_len) == 0: return # Only CCM update for adversaries if not self.role == "adversary": return # batch_ep_size = int(round(self.args.batch_size / 
self.args.max_episode_len)) batch_ep_size = self.args.ccm_pool self.replay_sample_index, self.ccm_episode_index = self.replay_buffer.\ make_episode_index(batch_ep_size, self.args.max_episode_len, shuffle=not self.args.ccm_on_policy) hist = self.args.training_history # Collect replay sample from all agents obs_n = [] obs_h_n = [] obs_next_n = [] obs_next_h_n = [] act_n = [] act_h_n = [] ccm_act_n = [] index = self.replay_sample_index for i in range(self.n): obs, act, rew, obs_next, done, obs_h, act_h, rew_h, obs_next_h, done_h = agents[i].\ replay_buffer.sample_index(index, history=hist) obs_n.append(obs) obs_h_n.append(obs_h) obs_next_n.append(obs_next) obs_next_h_n.append(obs_next_h) act_n.append(act) act_h_n.append(act_h) ccm_act = [] for ep in self.ccm_episode_index: _, act, _, _, _, _, _, _, _, _ = agents[ i].replay_buffer.sample_index(ep) act = np.array(act) ccm_act.append(act[:, 1] - act[:, 2]) ccm_act_n.append(np.array(ccm_act)) # print("Action CCM: {}".format(ccm.get_score(ccm_act_n[1],ccm_act_n[2],Emax=5,tau=1))) # print("Action CCM: {}".format(ccm_act_n)) ccm_loss = np.array([0.0]) ccm_lambda = np.array([self.args.ccm_lambda]) ccm_switch = np.array([1.0]) if self.agent_index != 1: t_start = time.time() # ccm_scores = [ccm.get_score(ccm_act_n[agent_index], ccm_act_n[i], e_max=5, tau=None) # for i in range(len(ccm_act_n)) if i != agent_index] if self.args.specific_leader_ccm is None and self.args.specific_agent_ccm is None: ccm_scores = [ ccm.get_score(ccm_act_n[self.agent_index], ccm_act_n[i], e_max=5, tau=1) for i in range(self.n) if i != self.agent_index and agents[i].role == "adversary" ] elif self.args.specific_agent_ccm is None: if self.agent_index == self.args.specific_leader_ccm: ccm_scores = [ ccm.get_score(ccm_act_n[i], ccm_act_n[self.agent_index], e_max=5, tau=1) for i in range(self.n) if i != self.agent_index and agents[i].role == "adversary" ] else: ccm_scores = [ ccm.get_score(ccm_act_n[self.agent_index], ccm_act_n[i], e_max=5, tau=1) for i in range(self.n) if i == self.args.specific_leader_ccm ] else: ccm_scores = [ ccm.get_score(ccm_act_n[self.agent_index], ccm_act_n[self.args.specific_leader_ccm], e_max=5, tau=1) for i in range(self.n) if i == self.args.specific_leader_ccm ] # ccm_loss = [1*(x[0]-(x[1]-0.01)) for x in ccm_scores] ccm_loss = [x[0] - np.exp(x[1] - 0.01) for x in ccm_scores] ccm_loss = np.array([np.mean(ccm_loss)]) # print("CCM Loop Time at Trial {}: {}".format(steps,time.time() - t_start)) # Original implementation # obs, act, rew, obs_next, done = self.replay_buffer.sample_index(index) # Modified _, _, rew, _, done, _, _, rew_h, _, done_h = self.replay_buffer.sample_index( index, history=0) obs_h_n = [[list() for _ in range(len(obs_n[0]))] if len(x) == 0 else x for x in obs_h_n] obs_next_h_n = [ [list() for _ in range(len(obs_next_n[0]))] if len(x) == 0 else x for x in obs_next_h_n ] act_h_n = [[list() for _ in range(len(act_n[0]))] if len(x) == 0 else x for x in act_h_n] num_sample = 1 target_q = 0.0 target_q_next = 0.0 for i in range(num_sample): target_act_next_n = [ agents[i].p_debug['target_act'](obs_next_n[i], obs_next_h_n[i]) for i in range(self.n) ] target_q_next = self.q_debug['target_q_values']( *(obs_next_n + obs_next_h_n + target_act_next_n + act_h_n)) # TODO: Possible error point target_q += rew + self.args.gamma * (1.0 - done) * target_q_next target_q /= num_sample # TODO: Possible error point q_loss = self.q_train(*(obs_n + obs_h_n + act_n + act_h_n + [target_q])) # Train P network # p_loss = self.p_train(*(obs_n + act_n)) p_loss = 
self.p_train(*(obs_n + obs_h_n + act_n + act_h_n + [ccm_loss] + [ccm_lambda] + [ccm_switch])) self.p_update() # self.q_update() return [ q_loss, p_loss, np.mean(target_q), np.mean(rew), np.mean(target_q_next), np.std(target_q) ]
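# For context, a trainer like the one above is normally driven by an outer loop
# that stores each transition and then calls the update methods. This is a
# minimal sketch under assumed names (env, agents, max_episode_len); it is not
# part of the class itself.
def run_training(env, agents, num_steps, max_episode_len):
    """Collect experience every step; learning itself is gated inside update()."""
    obs_n = env.reset()
    episode_step = 0
    for step in range(num_steps):
        act_n = [agent.action(obs) for agent, obs in zip(agents, obs_n)]
        new_obs_n, rew_n, done_n, _ = env.step(act_n)
        episode_step += 1
        terminal = episode_step >= max_episode_len
        for i, agent in enumerate(agents):
            agent.experience(obs_n[i], act_n[i], rew_n[i], new_obs_n[i],
                             done_n[i], terminal)
        obs_n = new_obs_n
        if any(done_n) or terminal:
            obs_n = env.reset()
            episode_step = 0
        for agent in agents:
            agent.preupdate()
        for agent in agents:
            agent.update(agents, step)       # no-op until the buffer is warm and step % 100 == 0
            agent.ccm_update(agents, step)   # adversaries only, every few episodes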
class MADDPGAgentTrainer(AgentTrainer): def __init__(self, name, model, obs_shape_n, act_space_n, agent_index, args, actor_lr=None, critic_lr=None, gamma=None, num_units=None, rb_size=None, batch_size=None, max_episode_len=None, clip_norm=0.5, local_q_func=False): self.name = name self.n = len(obs_shape_n) self.agent_index = agent_index self.args = args # training parameters self.actor_lr = actor_lr if actor_lr else args.lr self.critic_lr = critic_lr if critic_lr else args.lr self.gamma = gamma if gamma else args.gamma self.num_units = num_units if num_units else args.num_units self.rb_size = rb_size if rb_size else args.rb_size self.batch_size = batch_size if batch_size else args.batch_size self.max_episode_len = max_episode_len if max_episode_len else args.max_episode_len self.clip_norm = clip_norm # TODO: remove after testing import models.config as Config assert actor_lr == Config.maddpg_train_args['actor_lr'] assert critic_lr == Config.maddpg_train_args['critic_lr'] assert gamma == Config.maddpg_train_args['gamma'] assert num_units == Config.maddpg_train_args['num_hidden'] assert rb_size == Config.maddpg_train_args['rb_size'] assert batch_size == Config.maddpg_train_args['batch_size'] assert max_episode_len == Config.maddpg_train_args['nb_rollout_steps'] assert clip_norm == Config.maddpg_train_args['clip_norm'] obs_ph_n = [] for i in range(self.n): obs_ph_n.append( U.BatchInput(obs_shape_n[i], name="observation" + str(i)).get()) # Create all the functions necessary to train the model self.q_train, self.q_update, self.q_debug = q_train( scope=self.name, make_obs_ph_n=obs_ph_n, act_space_n=act_space_n, q_index=agent_index, q_func=model, optimizer=tf.train.AdamOptimizer(learning_rate=self.critic_lr), grad_norm_clipping=self.clip_norm, local_q_func=local_q_func, num_units=self.num_units) self.act, self.p_train, self.p_update, self.p_debug = p_train( scope=self.name, make_obs_ph_n=obs_ph_n, act_space_n=act_space_n, p_index=agent_index, p_func=model, q_func=model, optimizer=tf.train.AdamOptimizer(learning_rate=self.actor_lr), grad_norm_clipping=self.clip_norm, local_q_func=local_q_func, num_units=self.num_units) # Create experience buffer self.replay_buffer = ReplayBuffer(self.rb_size) self.max_replay_buffer_len = self.batch_size * self.max_episode_len self.replay_sample_index = None self.loss_names = [ 'q_loss', 'p_loss', 'mean_target_q', 'mean_rew', 'mean_target_q_next', 'std_target_q' ] def action(self, obs): return self.act(obs[None])[0] def experience(self, obs, act, rew, new_obs, done, terminal): # Store transition in the replay buffer. 
self.replay_buffer.add(obs, act, rew, new_obs, float(done)) def preupdate(self): self.replay_sample_index = None def update(self, agents, t): if len( self.replay_buffer ) < self.max_replay_buffer_len: # replay buffer is not large enough return if not t % 100 == 0: # only update every 100 steps return self.replay_sample_index = self.replay_buffer.make_index( self.batch_size) # collect replay sample from all agents obs_n = [] obs_next_n = [] act_n = [] index = self.replay_sample_index for i in range(self.n): obs, act, rew, obs_next, done = agents[ i].replay_buffer.sample_index(index) obs_n.append(obs) obs_next_n.append(obs_next) act_n.append(act) obs, act, rew, obs_next, done = self.replay_buffer.sample_index(index) # train q network num_sample = 1 act_space = act.shape[-1] target_q = 0.0 for i in range(num_sample): target_act_next_n = [ agents[i].p_debug['target_act'](obs_next_n[i]) for i in range(self.n) ] # flatten multi agent actions and observations act_serial_vals = self.q_debug['act_serial_values']( *(target_act_next_n)) obs_serial_vals = self.q_debug['obs_serial_values'](*(obs_next_n)) assert len(act_serial_vals) == self.batch_size assert len(obs_serial_vals) == self.batch_size # compute L2 normalized partial derivatives of target Q function wrt actions # NOTE: this is done one sample at a time to prevent tf.gradient from summing over all target q values grad_norm_value = [ self.q_debug['grad_norm_value'](*([[obs_serial_vals[j]]] + [[act_serial_vals[j]]])) for j in range(self.batch_size) ] assert len(grad_norm_value) == self.batch_size # scale the raw gradients by alpha # TODO: set alpha during init or compute as function of policy or loss perturb = np.array(grad_norm_value) * 0.01 # update leader actions using gradients for b in range(self.batch_size): # find all the leaders wrt current agent (agent_index) leading_agents = [ [1.0] * act_space if obs_next_n[k][b][2] > obs_next_n[self.agent_index][b][2] else [0.0] * act_space for k in range(self.n) ] # filter perturbations to only apply for leading agents # scale by L2 norm of original actions to prevent the perturb from overwhelming action epsilon = perturb[b].flatten() * np.array( leading_agents).flatten() * np.linalg.norm( act_serial_vals[b], 2) act_serial_vals[b] += epsilon # target_q_next = self.q_debug['target_q_values'](*(obs_next_n + target_act_next_n)) target_q_next = self.q_debug['target_q_values']( *([obs_serial_vals] + [act_serial_vals])) target_q += rew + self.gamma * (1.0 - done) * target_q_next target_q /= num_sample q_loss = self.q_train(*(obs_n + act_n + [target_q])) # get current actions and observations flattened act_serial_vals = self.q_debug['act_serial_values'](*(act_n)) obs_serial_vals = self.q_debug['obs_serial_values'](*(obs_n)) # compute L2 normalized partial derivatives of Q function wrt actions grad_norm_value = [ self.p_debug['grad_norm_value'](*([[obs_serial_vals[j]]] + [[act_serial_vals[j]]])) for j in range(self.batch_size) ] assert len(grad_norm_value) == self.batch_size # scale the raw gradients by alpha perturb = np.array(grad_norm_value) * 0.01 # update leader actions using these perturbations for b in range(self.batch_size): # find all the leaders wrt current agent (agent_index) leading_agents = [ [1.0] * act_space if obs_next_n[k][b][2] > obs_next_n[self.agent_index][b][2] else [0.0] * act_space for k in range(self.n) ] # filter perturbations to only apply for leading agents epsilon = perturb[b].flatten() * np.array(leading_agents).flatten( ) * np.linalg.norm(act_serial_vals[b], 2) epsilon_n = [ 
epsilon[k * act_space:(k * act_space) + act_space] for k in range(self.n) ] # update each agent action for current batch sample "b" for k in range(self.n): act_n[k][b] += epsilon_n[k] # train p network p_loss = self.p_train(*(obs_n + act_n)) self.p_update() self.q_update() return [ q_loss, p_loss, np.mean(target_q), np.mean(rew), np.mean(target_q_next), np.std(target_q) ]
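# The leader-perturbation step above amounts to nudging a leading agent's
# flattened actions along the gradient of Q with respect to those actions,
# rescaled by the L2 norm of the original actions. A stripped-down numpy
# sketch of that arithmetic, with alpha standing in for the hard-coded 0.01:
import numpy as np

def perturb_leader_actions(act_flat, grad_wrt_act, leader_mask, alpha=0.01):
    """Shift leader actions along the Q-gradient, scaled by the action norm.

    act_flat     -- all agents' actions concatenated into one vector
    grad_wrt_act -- dQ/da for that vector (already normalized upstream)
    leader_mask  -- 1.0 for entries belonging to leading agents, else 0.0
    """
    epsilon = alpha * grad_wrt_act * leader_mask * np.linalg.norm(act_flat, 2)
    return act_flat + epsilon

# Example with two agents, two action dimensions each, first agent leading:
acts = np.array([0.2, -0.1, 0.5, 0.3])
grads = np.array([0.5, -0.5, 0.5, 0.5])
mask = np.array([1.0, 1.0, 0.0, 0.0])
print(perturb_leader_actions(acts, grads, mask))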
def train(arglist, PID=None, lock=None):
    start_time = time.time()
    # global replay_buffer
    with U.single_threaded_session() as sess:
        # Create environment
        env = make_env(arglist.scenario, arglist, arglist.benchmark)
        # Create agent networks
        obs_shape_n = [env.observation_space[i].shape for i in range(env.n)]
        # changed by yuan li
        num_adversaries = copy.deepcopy(env.num_adversaries)
        arglist.num_adversaries = copy.deepcopy(num_adversaries)

        # Actor processes: run the environment, send transitions to rank 1,
        # and pull fresh network parameters from rank 0.
        if comm_rank != 0 and comm_rank != 1:
            req = None
            wait_flag = False
            actors = get_agents(env, num_adversaries, obs_shape_n, arglist)
            U.initialize()
            # var_list = [var for var in tf.trainable_variables()]
            # load the model
            var_list_n = []
            for actor in actors:
                var_list_n.extend(actor.get_variable_list())
            saver = tf.train.Saver(var_list=var_list_n, max_to_keep=20)
            if arglist.load_dir != "":
                U.load_state(arglist.load_dir, saver)

            episode_rewards, agent_rewards, final_ep_rewards, final_ep_ag_rewards, agent_info = \
                initialize_variables(env)
            obs_n = env.reset()
            step = 0
            episode_step = 0
            sample_number = 0
            t_start = time.time()
            update_time = 0
            print('Starting iterations...')
            invalid_train, red_win, red_leave, green_win, green_leave = 0, 0, 0, 0, 0
            while True:
                # Opportunistically receive updated parameters from the learner (rank 0).
                if not wait_flag:
                    # req = comm.irecv(350000, source=(comm_rank - 1 + comm_size) % comm_size, tag=11)
                    req = comm.irecv(350000, source=0, tag=11)
                    wait_flag = True
                else:
                    data_recv = req.test()
                    if data_recv[0]:
                        wait_flag = False
                        if data_recv[1] == 'finish':
                            comm.send('finish', dest=1, tag=11)
                            break
                        else:
                            update_start = time.time()
                            i = 0
                            j = 0
                            for var in tf.trainable_variables():
                                if 11 < (i % 24) < 24:
                                    var.load(data_recv[1][j], sess)
                                    j += 1
                                i += 1
                            # for var in var_list:
                            #     var.load(data_recv[1][i], sess)
                            #     i += 1
                            # for i, actor in enumerate(actors):
                            #     actor.load_weights(data_recv[1][i], sess)
                            update_end = time.time()
                            update_time += (update_end - update_start)
                            step += 1
                    else:
                        wait_flag = True

                # get action
                action_n = [agent.action(obs) for agent, obs in zip(actors, obs_n)]
                # environment step
                new_obs_n, rew_n, done_n, info_n = env.step(action_n)
                episode_step += 1
                # changed by liyuan
                done = any(done_n)
                terminal = (episode_step >= arglist.max_episode_len)

                # liyuan: compute the average win rate
                if green_leave_screen(env) or adversary_all_die(env) or adversary_leave_screen(env):
                    terminal = True
                if adversary_all_die(env):
                    green_win += 1
                if green_leave_screen(env):
                    invalid_train += 1
                    green_leave += 1
                if adversary_leave_screen(env):
                    red_leave += 1

                # reward shaping for the adversaries
                if episode_step >= arglist.max_episode_len:
                    for i, agent in enumerate(env.agents):
                        if agent.adversary:
                            rew_n[i] -= 50
                if adversary_all_die(env):
                    for i, agent in enumerate(env.agents):
                        if agent.adversary:
                            rew_n[i] -= 100
                if done:
                    red_win = red_win + 1
                    for i, agent in enumerate(env.agents):
                        if agent.adversary:
                            rew_n[i] += 200
                            rew_n[i] += (arglist.max_episode_len - episode_step) / arglist.max_episode_len

                # send the transition to the replay-buffer process (rank 1)
                data = [obs_n, action_n, rew_n, new_obs_n, done_n]
                comm.send(data, dest=1, tag=11)
                sample_number += 1
                # replay_buffer.add(obs_n, action_n, rew_n, new_obs_n, done_n)

                obs_n = new_obs_n
                for i, rew in enumerate(rew_n):
                    episode_rewards[-1] += rew
                    agent_rewards[i][-1] += rew

                if done or terminal:
                    obs_n = env.reset()
                    episode_step = 0
                    episode_rewards.append(0)
                    for a in agent_rewards:
                        a.append(0)
                    agent_info.append([[]])

                # save model, display training output
                if (terminal or done) and (len(episode_rewards) % arglist.save_rate == 0):
                    if red_win >= 0.8 * arglist.save_rate:
                        temp_dir = arglist.save_dir + "_" + str(len(episode_rewards)) \
                            + "_" + str(red_win) + "_{}".format(PID)
                        U.save_state(temp_dir, saver=saver)
                    # print statement depends on whether or not there are adversaries
                    if num_adversaries == 0:
                        print("Rank {}, steps: {}, episodes: {}, mean episode reward: {}, time: {}".format(
                            comm_rank, sample_number, len(episode_rewards),
                            np.mean(episode_rewards[-arglist.save_rate:]),
                            round(time.time() - t_start, 3)))
                    else:
                        print("Rank {}, steps: {}, episodes: {}, mean episode reward: {}, "
                              "agent episode reward: {}, time: {}".format(
                                  comm_rank, sample_number, len(episode_rewards),
                                  np.mean(episode_rewards[-arglist.save_rate:]),
                                  [np.mean(rew[-arglist.save_rate:]) for rew in agent_rewards],
                                  round(time.time() - t_start, 3)))
                        print("Rank {}, red win: {}, green win: {}, red all leave: {}, green all leave: {}".format(
                            comm_rank, red_win, green_win, red_leave, green_leave))
                        middle_time = time.time()
                        print("sample_number:{}, train_step:{}, update_time:{}, total_time:{}".format(
                            sample_number, step, update_time, middle_time - start_time))
                        mydata = []
                        mydata.append(str(len(episode_rewards)))
                        mydata.append(str(np.mean(episode_rewards[-arglist.save_rate:])))
                        mydata.append(str(np.mean(agent_rewards[0][-arglist.save_rate:])))
                        mydata.append(str(np.mean(agent_rewards[1][-arglist.save_rate:])))
                        mydata.append(str(np.mean(agent_rewards[2][-arglist.save_rate:])))
                        mydata.append(str(red_win))
                        mydata.append(str(round(time.time() - t_start, 3)))
                        out = open('1mydata_{}.csv'.format(comm_rank), 'a', newline='')
                        csv_write = csv.writer(out, dialect='excel')
                        csv_write.writerow(mydata)
                        out.close()
                    if len(episode_rewards) > 3000:
                        U.save_state(arglist.save_dir, saver=saver)
                    invalid_train, red_win, red_leave, green_win, green_leave = 0, 0, 0, 0, 0
                    t_start = time.time()
                    # Keep track of final episode reward
                    final_ep_rewards.append(np.mean(episode_rewards[-arglist.save_rate:]))
                    for rew in agent_rewards:
                        final_ep_ag_rewards.append(np.mean(rew[-arglist.save_rate:]))

            end_time = time.time()
            print("rank{}_time:{}".format(comm_rank, end_time - start_time))
            print("rank{}_update_time:{}".format(comm_rank, update_time))
            print("rank{}_step:{}".format(comm_rank, step))

        # Replay-buffer process: collect transitions from the actor ranks (2-4)
        # and send training batches to the learner (rank 0).
        if comm_rank == 1:
            replay_buffer = ReplayBuffer(1e6)
            wait_flag_1 = False
            wait_flag_2 = False
            wait_flag_3 = False
            req1 = None
            req2 = None
            req3 = None
            sample = 0
            step = 0
            req_list = []
            while True:
                if not wait_flag_1 or not wait_flag_2 or not wait_flag_3:
                    if not wait_flag_1:
                        req1 = comm.irecv(source=2, tag=11)
                        wait_flag_1 = True
                    if not wait_flag_2:
                        req2 = comm.irecv(source=3, tag=11)
                        wait_flag_2 = True
                    if not wait_flag_3:
                        req3 = comm.irecv(source=4, tag=11)
                        wait_flag_3 = True
                else:
                    data_recv_1 = req1.test()
                    data_recv_2 = req2.test()
                    data_recv_3 = req3.test()
                    if data_recv_1[0] or data_recv_2[0] or data_recv_3[0]:
                        if data_recv_1[0]:
                            wait_flag_1 = False
                            if data_recv_1[1] == 'finish':
                                break
                            else:
                                obs_n, action_n, rew_n, new_obs_n, done_n = data_recv_1[1]
                                replay_buffer.add(obs_n, action_n, rew_n, new_obs_n, done_n)
                                sample += 1
                        if data_recv_2[0]:
                            wait_flag_2 = False
                            if data_recv_2[1] == 'finish':
                                break
                            else:
                                obs_n, action_n, rew_n, new_obs_n, done_n = data_recv_2[1]
                                replay_buffer.add(obs_n, action_n, rew_n, new_obs_n, done_n)
                                sample += 1
                        if data_recv_3[0]:
                            wait_flag_3 = False
                            if data_recv_3[1] == 'finish':
                                break
                            else:
                                obs_n, action_n, rew_n, new_obs_n, done_n = data_recv_3[1]
                                replay_buffer.add(obs_n, action_n, rew_n, new_obs_n, done_n)
                                sample += 1
                        '''
                        # time how long it takes to collect 100 samples and send a batch
                        if (sample % 100 == 0) and len(replay_buffer) >= arglist.batch_size * arglist.max_episode_len:
                            start = time.time()
                            replay_sample_index = replay_buffer.make_index(arglist.batch_size)
                            send_data = replay_buffer.sample_index(replay_sample_index)
                            # send_data = (obs_n_a, act_n_a, rew_n_a, obs_next_n_a, done_n_a)
                            comm.send(send_data, dest=(comm_rank + 1) % comm_size, tag=11)
                            sample = 0
                            step += 1
                            end = time.time()
                            print("rank1 send sample time:", end - start)
                        '''
                    else:
                        wait_flag_1 = True
                        wait_flag_2 = True
                        wait_flag_3 = True
                    if (sample // 100 > 0) and len(replay_buffer) >= arglist.batch_size * arglist.max_episode_len:
                        replay_sample_index = replay_buffer.make_index(arglist.batch_size)
                        send_data = replay_buffer.sample_index(replay_sample_index)
                        # send_data = (obs_n_a, act_n_a, rew_n_a, obs_next_n_a, done_n_a)
                        comm.send(send_data, dest=0, tag=11)
                        sample = 0
                        step += 1

            end_time = time.time()
            print("rank1_time:", end_time - start_time)
            print("rank1_step:", step)

        # Learner process: train on batches from rank 1 and broadcast the
        # updated parameters to the actor ranks.
        if comm_rank == 0:
            extract_time = 0
            step = 0
            learners = get_agents(env, num_adversaries, obs_shape_n, arglist)
            var_list_n = []
            for learner in learners:
                var_list_n.extend(learner.get_variable_list())
            U.initialize()
            # var_list = [var for var in tf.trainable_variables()]
            # load the model
            saver = tf.train.Saver(var_list=var_list_n, max_to_keep=20)
            if arglist.load_dir != "":
                U.load_state(arglist.load_dir, saver)
            while True:
                if step >= STEP:
                    for i in range(comm_size - 2):
                        comm.send('finish', dest=(i + 2), tag=11)
                    break
                else:
                    start = time.time()
                    data_recv = comm.recv(source=1, tag=11)
                    for i, agent in enumerate(learners):
                        agent.update(learners, data_recv)
                    param = []
                    extract_start = time.time()
                    i = 0
                    for var in tf.trainable_variables():
                        if 11 < (i % 24) < 24:
                            param.append(sess.run(var))
                        i += 1
                    # for var in var_list:
                    #     param.append(sess.run(var))
                    extract_end = time.time()
                    extract_time += (extract_end - extract_start)
                    for i in range(comm_size - 2):
                        comm.send(param, dest=(i + 2), tag=11)
                    step += 1
                    end = time.time()
                    # print("rank0 train time:{}, extract_time:{}".format(end - start, extract_end - extract_start))

            end_time = time.time()
            print("rank0_time:", end_time - start_time)
            print("rank0_extract_time:", extract_time)
            print("rank0_step:", step)
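# Note: train() above reads comm, comm_rank, comm_size and STEP as module-level
# globals that are not part of this excerpt. The sketch below is one plausible
# way to define them, assuming mpi4py and the process layout hard-coded above
# (rank 0 = learner, rank 1 = replay buffer, ranks 2-4 = actors); the STEP value
# and the launch command are assumptions, not taken from the original source.
from mpi4py import MPI

comm = MPI.COMM_WORLD
comm_rank = comm.Get_rank()   # this process's rank
comm_size = comm.Get_size()   # expected to be 5, matching the hard-coded sources 2, 3, 4
STEP = 100000                 # learner updates to perform before broadcasting 'finish'

# Launched with something like:
#   mpirun -np 5 python <training script> --scenario <scenario name>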
class MADDPGAgentTrainer(AgentTrainer):
    def __init__(self, name, model, obs_shape_n, act_space_n, agent_index, args,
                 safety_layer=None, local_q_func=False):
        self.name = name
        self.n = len(obs_shape_n)
        self.agent_index = agent_index
        self.args = args
        self.safety_layer = safety_layer
        obs_ph_n = []
        for i in range(self.n):
            obs_ph_n.append(U.BatchInput(obs_shape_n[i], name="observation" + str(i)).get())

        # Create all the functions necessary to train the model
        self.q_train, self.q_update, self.q_debug = q_train(
            scope=self.name,
            make_obs_ph_n=obs_ph_n,
            act_space_n=act_space_n,
            q_index=agent_index,
            q_func=model,
            optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
            grad_norm_clipping=0.5,
            local_q_func=local_q_func,
            num_units=args.num_units)
        self.act, self.p_train, self.p_update, self.p_debug = p_train(
            scope=self.name,
            make_obs_ph_n=obs_ph_n,
            act_space_n=act_space_n,
            p_index=agent_index,
            p_func=model,
            q_func=model,
            optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
            grad_norm_clipping=0.5,
            local_q_func=local_q_func,
            num_units=args.num_units)

        # Create experience buffer
        self.replay_buffer = ReplayBuffer(1e6)
        self.max_replay_buffer_len = args.batch_size * args.max_episode_len
        self.replay_sample_index = None

    def action(self, obs, c=None, env=None):
        action = self.act(obs[None])[0]
        if_call = False
        return action, if_call

    def action_real(self, obs, c=None, env=None):
        # get action from DDPG
        action = self.act(obs[None])[0]
        action_real = action
        if_call = False
        dist = np.sqrt(np.sum(np.square(env.agents[0].state.p_pos -
                                        env.world.landmarks[-1].state.p_pos)))
        # call the safety layer only when far enough from the goal landmark
        if self.safety_layer and c is not None and env is not None and dist > 1.5:
            # check for collisions over the next N predicted steps
            collision_flag = False
            env_future = copy.deepcopy(env)
            obs_future = copy.deepcopy(obs)
            trajectory = np.zeros([4, self.safety_layer.UAV_config.N + 1])
            trajectory[0, 0] = obs_future[2]
            trajectory[1, 0] = obs_future[3]
            trajectory[2, 0] = obs_future[4]
            trajectory[3, 0] = obs_future[5]
            for i in range(self.safety_layer.UAV_config.N):
                action_future = [self.act(obs_future[None])[0]]
                # environment step on the copied environment
                new_obs_n, rew_n, done_n, info_n = env_future.step(action_future)
                is_any_collision = []
                for agent in env_future.agents:
                    temp = False
                    for _, landmark in enumerate(env_future.world.landmarks[0:-1]):
                        dist = np.sqrt(np.sum(np.square(agent.state.p_pos - landmark.state.p_pos))) \
                               - (agent.size + landmark.size)
                        if dist <= 0:
                            temp = True
                    is_any_collision.append(temp)
                if is_any_collision[0]:
                    collision_flag = True
                done_future = all(done_n)
                if done_future:
                    break
                obs_future = new_obs_n[0]
                trajectory[0, i + 1] = obs_future[2]
                trajectory[1, i + 1] = obs_future[3]
                trajectory[2, i + 1] = obs_future[4]
                trajectory[3, i + 1] = obs_future[5]
            if not collision_flag:
                return action_real, action, if_call
            action, if_call = self.safety_layer.get_safe_action(obs, action, trajectory)
        return action_real, action, if_call

    def set_safety_layer(self, safety_layer):
        self.safety_layer = safety_layer

    def experience(self, obs, act, rew, new_obs, done, terminal):
        # Store transition in the replay buffer.
        self.replay_buffer.add(obs, act, rew, new_obs, float(done))

    def preupdate(self):
        self.replay_sample_index = None

    def update(self, agents, t):
        if len(self.replay_buffer) < self.max_replay_buffer_len:  # replay buffer is not large enough
            return
        if not t % 100 == 0:  # only update every 100 steps
            return

        self.replay_sample_index = self.replay_buffer.make_index(self.args.batch_size)
        # collect replay sample from all agents
        obs_n = []
        obs_next_n = []
        act_n = []
        index = self.replay_sample_index
        for i in range(self.n):
            obs, act, rew, obs_next, done = agents[i].replay_buffer.sample_index(index)
            obs_n.append(obs)
            obs_next_n.append(obs_next)
            act_n.append(act)
        obs, act, rew, obs_next, done = self.replay_buffer.sample_index(index)

        # train q network
        num_sample = 1
        target_q = 0.0
        for i in range(num_sample):
            target_act_next_n = [agents[i].p_debug['target_act'](obs_next_n[i]) for i in range(self.n)]
            target_q_next = self.q_debug['target_q_values'](*(obs_next_n + target_act_next_n))
            target_q += rew + self.args.gamma * (1.0 - done) * target_q_next
        target_q /= num_sample
        q_loss = self.q_train(*(obs_n + act_n + [target_q]))

        # train p network
        p_loss = self.p_train(*(obs_n + act_n))

        self.p_update()
        self.q_update()

        return [q_loss, p_loss, np.mean(target_q), np.mean(rew), np.mean(target_q_next), np.std(target_q)]
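# action_real() above only assumes two things of the safety layer object: a
# UAV_config.N prediction horizon and a get_safe_action(obs, action, trajectory)
# method returning (safe_action, if_call). The real safety layer is not part of
# this excerpt; the stub below is a hypothetical stand-in that documents the
# expected interface and nothing more.
import numpy as np


class _UAVConfig:
    N = 10  # horizon used when rolling the copied environment forward


class DummySafetyLayer:
    UAV_config = _UAVConfig()

    def get_safe_action(self, obs, action, trajectory):
        # A real implementation would project `action` onto the safe set implied
        # by the predicted `trajectory`; this stub returns it unchanged and
        # reports that the safety layer was invoked.
        return np.asarray(action), True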
class MADDPGAgentTrainer(AgentTrainer):
    def __init__(self, name, model, obs_shape_n, act_space_n, agent_index, args, local_q_func=False):
        self.name = name
        self.n = len(obs_shape_n)
        self.agent_index = agent_index
        self.args = args
        obs_ph_n = []
        for i in range(self.n):
            obs_ph_n.append(U.BatchInput(obs_shape_n[i], name="observation" + str(i)).get())

        # Create all the functions necessary to train the model
        self.q_train, self.q_update, self.q_debug = q_train(
            scope=self.name,
            make_obs_ph_n=obs_ph_n,
            act_space_n=act_space_n,
            q_index=agent_index,
            q_func=model,
            optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
            grad_norm_clipping=0.5,
            local_q_func=local_q_func,
            num_units=args.num_units
        )
        self.act, self.p_train, self.p_update, self.p_debug = p_train(
            scope=self.name,
            make_obs_ph_n=obs_ph_n,
            act_space_n=act_space_n,
            p_index=agent_index,
            p_func=model,
            q_func=model,
            optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
            grad_norm_clipping=0.5,
            local_q_func=local_q_func,
            num_units=args.num_units
        )
        # Create experience buffer
        self.replay_buffer = ReplayBuffer(1e6)
        self.max_replay_buffer_len = args.batch_size * args.max_episode_len
        self.replay_sample_index = None

    def action(self, obs):
        return self.act(obs[None])[0]

    def experience(self, obs, act, rew, new_obs, done, terminal):
        # Store transition in the replay buffer.
        self.replay_buffer.add(obs, act, rew, new_obs, float(done))

    def preupdate(self):
        self.replay_sample_index = None

    def update(self, agents, t):
        if len(self.replay_buffer) < self.max_replay_buffer_len:  # replay buffer is not large enough
            return
        if not t % 100 == 0:  # only update every 100 steps
            return

        self.replay_sample_index = self.replay_buffer.make_index(self.args.batch_size)
        # collect replay sample from all agents
        obs_n = []
        obs_next_n = []
        act_n = []
        index = self.replay_sample_index
        for i in range(self.n):
            obs, act, rew, obs_next, done = agents[i].replay_buffer.sample_index(index)
            obs_n.append(obs)
            obs_next_n.append(obs_next)
            act_n.append(act)
        obs, act, rew, obs_next, done = self.replay_buffer.sample_index(index)

        # train q network
        num_sample = 1
        target_q = 0.0
        for i in range(num_sample):
            target_act_next_n = [agents[i].p_debug['target_act'](obs_next_n[i]) for i in range(self.n)]
            target_q_next = self.q_debug['target_q_values'](*(obs_next_n + target_act_next_n))
            target_q += rew + self.args.gamma * (1.0 - done) * target_q_next
        target_q /= num_sample
        q_loss = self.q_train(*(obs_n + act_n + [target_q]))

        # train p network
        p_loss = self.p_train(*(obs_n + act_n))

        self.p_update()
        self.q_update()

        return [q_loss, p_loss, np.mean(target_q), np.mean(rew), np.mean(target_q_next), np.std(target_q)]
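# Sketch of how MADDPGAgentTrainer instances are typically driven in a serial
# training loop; a minimal illustration, not the project's own training script.
# env, mlp_model and arglist are assumed to exist (they are defined elsewhere in
# this codebase); the loop only uses the trainer API shown above:
# action / experience / preupdate / update. Episode resets are omitted.
import itertools

obs_shape_n = [env.observation_space[i].shape for i in range(env.n)]
trainers = [MADDPGAgentTrainer("agent_%d" % i, mlp_model, obs_shape_n,
                               env.action_space, i, arglist)
            for i in range(env.n)]

obs_n = env.reset()
for t in itertools.count():
    action_n = [agent.action(obs) for agent, obs in zip(trainers, obs_n)]
    new_obs_n, rew_n, done_n, info_n = env.step(action_n)
    for i, agent in enumerate(trainers):
        agent.experience(obs_n[i], action_n[i], rew_n[i], new_obs_n[i],
                         done_n[i], False)
    obs_n = new_obs_n
    for agent in trainers:
        agent.preupdate()
    for agent in trainers:
        agent.update(trainers, t)  # no-op until the buffer is warm and t % 100 == 0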