def __init__(self, env, num_constraints, model, obs_shape, act_space):
    self.name = "safe-layer"
    self._env = env
    self.num_constraints = num_constraints  # = num_landmarks - 1 because the last landmark is the target
    self.max_episode_length = 300
    self.batch_size = 1024
    self.lr = 0.1
    self.steps_per_epoch = 6000
    self.epochs = 250
    self.evaluation_steps = 1500
    self.replay_buffer_size = 1000000
    self.num_units = 10
    self._train_global_step = 0
    self.max_replay_buffer = self.batch_size * self.max_episode_length  # 1024 * 300 = 307200
    self.replay_buffer = ReplayBuffer(self.replay_buffer_size)  # 1e6

    obs_ph = U.BatchInput(obs_shape, name="observation").get()
    c_ph = [
        U.BatchInput([1], name="constraints_value" + str(_)).get()
        for _ in range(self.num_constraints)
    ]
    self.c_next_train, self.c_next_values, self.g_next_values = c_next(
        scope=self.name,
        make_obs_ph=obs_ph,
        act_space=act_space,
        c_ph=c_ph,
        num_constraints=self.num_constraints,
        c_next_func=model,
        optimizer=tf.train.AdamOptimizer(learning_rate=self.lr),
        grad_norm_clipping=0.5,
        num_units=self.num_units,
    )
def __init__(self, name, model, obs_shape_n, act_space_n, act_traj_shape_n,
             intent_shape, agent_index, args, local_q_func=False):
    self.name = name
    self.n = len(obs_shape_n)
    self.agent_index = agent_index
    self.args = args
    obs_ph_n = []
    act_traj_ph_n = []
    intent_ph_n = []
    for i in range(self.n):
        obs_ph_n.append(U.BatchInput(obs_shape_n[i], name="observation" + str(i)).get())
        act_traj_ph_n.append(U.BatchInput(act_traj_shape_n[i], name="action_trajectory" + str(i)).get())
        intent_ph_n.append(U.BatchInput(intent_shape[i], name="intent" + str(i)).get())
    self.act_size = act_space_n[0].n

    self.get_intent, self.i_train, self.i_update, self.i_debug = i_train(
        scope=self.name,
        make_obs_ph_n=obs_ph_n,
        intent_ph_n=intent_ph_n,
        act_space_n=act_space_n,
        make_act_traj_ph_n=act_traj_ph_n,
        make_intent_ph_n=intent_ph_n,
        i_func=model,
        i_index=agent_index,
        output_size=(self.n - 1) * self.act_size,
        optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
        grad_norm_clipping=0.5,
        num_units=args.num_units,
        reuse=False
    )
    # Create all the functions necessary to train the model
    self.q_train, self.q_update, self.q_debug = q_train(
        scope=self.name,
        make_obs_ph_n=obs_ph_n,
        act_space_n=act_space_n,
        make_intent_ph_n=intent_ph_n,
        q_index=agent_index,
        q_func=model,
        optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
        grad_norm_clipping=0.5,
        local_q_func=local_q_func,
        num_units=args.num_units
    )
    self.act, self.p_train, self.p_update, self.p_debug = p_train(
        scope=self.name,
        make_obs_ph_n=obs_ph_n,
        act_space_n=act_space_n,
        make_intent_ph_n=intent_ph_n,
        p_index=agent_index,
        p_func=model,
        q_func=model,
        optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
        grad_norm_clipping=0.5,
        local_q_func=local_q_func,
        num_units=args.num_units
    )
    # Create experience buffer
    self.replay_buffer = ReplayBuffer(1e6)
    self.max_replay_buffer_len = args.batch_size * args.max_episode_len
    self.replay_sample_index = None
def __init__(self, name, model, CNN_model, obs_shape_n, obs_map_shape_n, act_space_n,
             agent_index, args, local_q_func=False):
    self.name = name
    self.n = len(obs_shape_n)
    self.agent_index = agent_index
    self.args = args
    obs_ph_n = []
    obs_map_ph_n = []
    for i in range(self.n):
        obs_ph_n.append(
            U.BatchInput(obs_shape_n[i], name="observation" + str(i)).get())
        obs_map_ph_n.append(
            U.BatchInput(obs_map_shape_n[i], name="observation_map" + str(i)).get())

    # Create all the functions necessary to train the model
    self.q_train, self.q_update, self.q_debug = q_train(
        scope=self.name,
        make_obs_ph_n=obs_ph_n,
        act_space_n=act_space_n,
        q_index=agent_index,
        q_func=model,
        shared_CNN=CNN_model,
        optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
        grad_norm_clipping=0.5,
        local_q_func=local_q_func,
        num_units=args.num_units,
        make_obs_map_ph_n=obs_map_ph_n)
    self.act, self.p_train, self.p_update, self.p_debug = p_train(
        scope=self.name,
        make_obs_ph_n=obs_ph_n,
        act_space_n=act_space_n,
        p_index=agent_index,
        p_func=model,
        q_func=model,
        shared_CNN=CNN_model,
        optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
        grad_norm_clipping=0.5,
        local_q_func=local_q_func,
        num_units=args.num_units,
        make_obs_map_ph_n=obs_map_ph_n)
    # Create experience buffer
    self.replay_buffer = ReplayBuffer(1e6)
    self.max_replay_buffer_len = args.batch_size * args.max_episode_len // 10
    self.batch_size = args.batch_size
    self.replay_sample_index = None
def __init__(self, n_agents, name, model, state_shape, obs_shape_n, act_space_n,
             agent_index, args, local_q_func=False):
    self.name = name
    self.n = len(obs_shape_n)
    self.agent_index = agent_index
    self.args = args
    obs_ph_n = []
    state_ph_n = []
    for i in range(self.n):
        obs_ph_n.append(U.BatchInput(obs_shape_n[i], name="observation" + str(i),
                                     lstm=args.actor_lstm or args.critic_lstm).get())
        state_ph_n.append(U.BatchInput(state_shape, name="state" + str(i)).get())

    # Create all the functions necessary to train the model
    self.q_train, self.q_update, self.q_debug = q_train(
        n_agents=n_agents,
        scope=self.name,
        make_state_ph_n=state_ph_n,
        make_obs_ph_n=obs_ph_n,
        act_space_n=act_space_n,
        q_index=agent_index,
        q_func=model,
        optimizer=tf.train.AdamOptimizer(learning_rate=args.lr, epsilon=args.optimizer_epsilon),
        grad_norm_clipping=0.5,
        local_q_func=local_q_func,
        num_units=args.num_units,
        discrete_action=args.discrete_action,
        target_update_tau=args.target_update_tau,
        use_global_state=args.use_global_state,
        share_weights=args.share_weights
    )
    self.act, self.act_test, self.p_train, self.p_update, self.p_debug = p_train(
        n_agents=n_agents,
        scope=self.name,
        make_state_ph_n=state_ph_n,
        make_obs_ph_n=obs_ph_n,
        act_space_n=act_space_n,
        p_index=agent_index,
        p_func=model,
        q_func=model,
        optimizer=tf.train.AdamOptimizer(learning_rate=args.lr, epsilon=args.optimizer_epsilon),
        grad_norm_clipping=0.5,
        local_q_func=local_q_func,
        num_units=args.num_units,
        discrete_action=args.discrete_action,
        target_update_tau=args.target_update_tau,
        use_global_state=args.use_global_state,
        share_weights=args.share_weights
    )
    # Create experience buffer
    self.replay_buffer = ReplayBuffer(1e6)
    self.max_replay_buffer_len = args.batch_size * args.max_episode_len
    self.replay_sample_index = None
def __init__(self, name, actor_model, critic_mlp_model, obs_shape_n, act_space_n,
             args, local_q_func=False):
    self.name = name
    self.n = len(obs_shape_n)
    self.args = args
    obs_ph_n = []
    messages_ph_n = []
    for i in range(self.n):
        obs_ph_n.append(
            U.BatchInput(obs_shape_n[i], name="observation_" + str(i)).get())
        messages_ph_n.append(
            U.BatchInput((args.dim_message, ), name="message_" + str(i)).get())

    # Create all the functions necessary to train the model
    self.q_train, self.q_update, self.q_debug = q_train(
        scope=self.name,
        make_obs_ph_n=obs_ph_n,
        make_meesages_ph_n=messages_ph_n,
        act_space_n=act_space_n,
        q_func=critic_mlp_model,
        optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
        grad_norm_clipping=0.5,
        local_q_func=local_q_func,
        num_units=args.num_units,
    )
    self.act, self.p_train, self.p_update, self.p_debug = p_train(
        scope=self.name,
        make_obs_ph_n=obs_ph_n,
        make_meesages_ph_n=messages_ph_n,
        act_space_n=act_space_n,
        p_func=actor_model,
        q_func=critic_mlp_model,
        optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
        grad_norm_clipping=0.5,
        local_q_func=local_q_func,
        num_units=args.num_units,
        beta=args.beta,
    )
    # Create experience buffer
    self.replay_buffer = ReplayBuffer(1e6)
    # self.max_replay_buffer_len = 50 * args.max_episode_len
    self.max_replay_buffer_len = args.batch_size * args.max_episode_len
    self.replay_sample_index = None
def __init__(self, name, p_model, model, obs_shape_n, act_space_n, agent_index, args,
             local_q_func=False):
    self.name = name
    self.n = len(obs_shape_n)
    self.agent_index = agent_index
    self.args = args
    pMA_model = p_model(args.num_adversaries, 1, agent_index)
    obs_ph_n = []
    memory_ph_in = []
    for i in range(self.n):
        obs_ph_n.append(U.BatchInput(obs_shape_n[i], name="observation" + str(i)).get())
        if i < args.num_adversaries:
            memory_ph_in.append(U.BatchInput((args.memUnits, ), name="memory_state" + str(i)).get())
    if self.agent_index == 0:
        reuse = False
    else:
        reuse = True

    # Create all the functions necessary to train the model
    self.q_train, self.q_update, self.q_debug = qMA_train(
        scope=self.name,
        make_obs_ph_n=obs_ph_n,
        act_space_n=act_space_n,
        q_index=agent_index,
        q_func=model,
        optimizer=tf.train.AdamOptimizer(learning_rate=args.critic_lr),
        grad_norm_clipping=0.5,
        local_q_func=local_q_func,
        num_units=args.critic_units,
        reuse=reuse
    )
    self.act, self.memory_out, self.p_train, self.p_update, self.p_debug = pMA_train(
        scope=self.name,
        make_obs_ph_n=obs_ph_n,
        make_memory_ph_n=memory_ph_in,
        act_space_n=act_space_n,
        p_index=agent_index,
        p_func=pMA_model.adv_model,
        q_func=model,
        optimizer=tf.train.AdamOptimizer(learning_rate=args.actor_lr),
        grad_norm_clipping=0.5,
        local_q_func=local_q_func,
        critic_units=args.critic_units,
        reuse=reuse
    )
    # Create experience buffer
    self.replay_buffer = MAReplayBuffer(1e6)
    self.max_replay_buffer_len = args.batch_size * args.max_episode_len
    self.replay_sample_index = None
def __init__(self, name, model, obs_shape_n, act_space_n, agent_index, args):
    self.name = name
    self.n = len(obs_shape_n)
    self.agent_index = agent_index
    self.args = args
    self.counter = 0
    obs_ph_n = []
    for i in range(self.n):
        obs_ph_n.append(U.BatchInput(obs_shape_n[i],
                                     name="observation" + str(i) + "_ag" + str(agent_index)).get())

    # Create all the functions necessary to train the model
    self.q_train, self.q_update, self.q_debug = q_train(
        scope=self.name,
        make_obs_ph_n=obs_ph_n,  # [lambda name: U.BatchInput(obs_shape, name=name) for obs_shape in obs_shape_n],
        act_space_n=act_space_n,
        q_func=model,
        optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
        grad_norm_clipping=0.5
    )
    self.act, self.p_train, self.p_update, self.p_debug = p_train(
        scope=self.name,
        make_obs_ph_n=obs_ph_n,  # [lambda name: U.BatchInput(obs_shape, name=name) for obs_shape in obs_shape_n],
        act_space_n=act_space_n,
        p_index=0,
        p_func=model,
        q_func=model,
        optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
        grad_norm_clipping=0.5
    )
    # Create experience buffer
    self.replay_buffer = [ReplayBuffer(1e6) for i in range(self.n)]
    self.max_replay_buffer_len = args.batch_size * args.max_episode_len
def __init__(self, name, model, obs_shape_n, act_space_n, agent_index, args, agent_type,
             local_q_func=False):
    self.name = name
    self.n = 1
    self.agent_index = agent_index
    self.args = args
    self.u_estimation = args.u_estimation
    self.constrained = args.constrained
    self.constraint_type = args.constraint_type
    self.agent_type = agent_type
    if self.agent_type == "good":
        cvar_alpha = args.cvar_alpha_good_agent
    elif self.agent_type == "adversary":
        cvar_alpha = args.cvar_alpha_adv_agent
    obs_ph_n = []
    obs_ph_n.append(
        U.BatchInput(obs_shape_n[agent_index], name="observation0").get())

    # Create all the functions necessary to train the model
    self.q_train, self.q_train2, self.q_train3, self.q_update, self.u_update, self.q_debug = q_train(
        scope=self.name,
        make_obs_ph_n=obs_ph_n,
        act_space_n=act_space_n,
        q_index=agent_index,
        q_func=model,
        u_func=model,
        optimizer=tf.train.AdamOptimizer(learning_rate=args.lr_critic),
        optimizer_lamda=tf.train.AdamOptimizer(learning_rate=args.lr_lamda),
        exp_var_alpha=args.exp_var_alpha,
        cvar_alpha=cvar_alpha,
        cvar_beta=args.cvar_beta,
        grad_norm_clipping=0.5,
        local_q_func=local_q_func,
        num_units=args.num_units,
        u_estimation=self.u_estimation,
        constrained=self.constrained,
        constraint_type=self.constraint_type,
        agent_type=self.agent_type)
    self.act, self.p_train, self.p_update, self.p_debug = p_train(
        scope=self.name,
        make_obs_ph_n=obs_ph_n,
        act_space_n=act_space_n,
        p_index=agent_index,
        p_func=model,
        q_func=model,
        optimizer=tf.train.AdamOptimizer(learning_rate=args.lr_actor),
        grad_norm_clipping=0.5,
        local_q_func=local_q_func,
        num_units=args.num_units)
    # Create experience buffer
    self.replay_buffer = ReplayBuffer(1e6)
    self.max_replay_buffer_len = args.batch_size * args.max_episode_len
    self.replay_sample_index = None
def __init__(self, name, model, obs_shape_n, act_space_n, agent_index, args,
             agent_type="good", local_q_func=False):
    self.name = name
    self.n = len(obs_shape_n)
    self.agent_index = agent_index
    self.args = args
    obs_ph_n = []
    for i in range(self.n):
        obs_ph_n.append(U.BatchInput(obs_shape_n[i], name="observation" + str(i)).get())
    if agent_type == "good":
        self.mic = float(args.good_mic)
    else:
        self.mic = float(args.adv_mic)
    print("MIC for ", agent_type, " agent is ", self.mic)
    self.agent_type = agent_type

    # make a multivariate for each agent.
    self.multivariate_mean = None
    self.multivariate_cov = None
    self.marginal_aprox_lr = 1e-2
    self.action_history = []

    # Create all the functions necessary to train the model
    self.q_train, self.q_update, self.q_debug = q_train(
        scope=self.name,
        make_obs_ph_n=obs_ph_n,
        act_space_n=act_space_n,
        q_index=agent_index,
        q_func=model,
        optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
        mut_inf_coef=self.mic,
        grad_norm_clipping=0.5,
        local_q_func=local_q_func,
        num_units=args.num_units
    )
    self.act, self.p_train, self.p_update, self.p_debug = p_train(
        scope=self.name,
        make_obs_ph_n=obs_ph_n,
        act_space_n=act_space_n,
        p_index=agent_index,
        p_func=model,
        q_func=model,
        optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
        mut_inf_coef=self.mic,
        grad_norm_clipping=0.5,
        local_q_func=local_q_func,
        num_units=args.num_units
    )
    # Create experience buffer
    self.replay_buffer = ReplayBuffer(1e6)
    self.max_replay_buffer_len = args.batch_size * args.max_episode_len
    self.replay_sample_index = None
def __init__(self, name, model, obs_shape_n, act_space_n, agent_index, args, local_q_func=False): self.name = name self.n = len(obs_shape_n) self.agent_index = agent_index self.args = args obs_ph_n = [] for i in range(self.n): obs_ph_n.append( U.BatchInput(obs_shape_n[i], name="observation" + str(i)).get()) # Create all the functions necessary to train the model self.q_train, self.q_update, self.q_debug = q_train( scope=self.name, make_obs_ph_n=obs_ph_n, act_space_n=act_space_n, q_index=agent_index, q_func=model, optimizer=tf.train.AdamOptimizer(learning_rate=args.lr), grad_norm_clipping=0.5, local_q_func=local_q_func, num_units=args.num_units, # reuse = tf.compat.v1.AUTO_REUSE, ) self.act, self.p_train, self.p_update, self.p_debug, num_actions = p_train( scope=self.name, make_obs_ph_n=obs_ph_n, act_space_n=act_space_n, p_index=agent_index, p_func=model, q_func=model, optimizer=tf.train.AdamOptimizer(learning_rate=args.lr), grad_norm_clipping=0.5, local_q_func=local_q_func, num_units=args.num_units, # reuse = tf.compat.v1.AUTO_REUSE, ) # Create experience buffer self.replay_buffer = ReplayBuffer(1e6, args.batch_size, num_actions, obs_ph_n[0].shape[1]) #self.max_replay_buffer_len = args.batch_size * args.max_episode_len self.max_replay_buffer_len = args.batch_size # I mean this is how it should be. This is what we're actually doing... self.replay_sample_index = None
def __init__(self, name, mlp_model, lstm_model, obs_shape_n, act_space_n, agent_index,
             args, local_q_func=False):
    self.name = name
    self.n = len(obs_shape_n)
    self.agent_index = agent_index
    self.args = args
    obs_ph_n = []
    for i in range(self.n):
        obs_ph_n.append(U.BatchInput(obs_shape_n[i], name="observation" + str(i)).get())

    # LSTM placeholders
    p_res = 7
    q_res = 1

    # set up initial states
    self.q_c, self.q_h = create_init_state(num_batches=1, len_sequence=args.num_units)
    self.p_c, self.p_h = create_init_state(num_batches=1, len_sequence=args.num_units)

    # Create all the functions necessary to train the model
    self.q_train, self.q_update, self.q_debug = q_LSTM_train(
        scope=self.name,
        make_obs_ph_n=obs_ph_n,
        act_space_n=act_space_n,
        q_index=agent_index,
        q_func=lstm_model,
        optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
        grad_norm_clipping=0.5,
        local_q_func=local_q_func,
        num_units=args.num_units
    )
    self.act, self.p_train, self.p_update, self.p_debug = p_LSTM_train(
        scope=self.name,
        make_obs_ph_n=obs_ph_n,
        act_space_n=act_space_n,
        p_index=agent_index,
        p_func=lstm_model,
        q_func=lstm_model,
        optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
        grad_norm_clipping=0.5,
        local_q_func=local_q_func,
        num_units=args.num_units,
        q_debug=self.q_debug
    )
    # Create experience buffer
    self.replay_buffer = ReplayBufferLSTM(1e6)
    # self.replay_buffer = PrioritizedReplayBuffer(10000, 0.45)
    # self.max_replay_buffer_len = args.batch_size * args.max_episode_len
    self.max_replay_buffer_len = args.batch_size
    self.replay_sample_index = None

    # Information tracking
    self.tracker = InfoTracker(self.name, self.args)
def __init__(self, name, model, obs_shape_n, act_space_n, agent_index, args,
             local_q_func=False, u_estimation=False):
    print('in here')
    self.name = name
    self.n = 1  # len(obs_shape_n)
    self.agent_index = agent_index
    self.args = args
    obs_ph_n = []
    obs_ph_n.append(
        U.BatchInput(obs_shape_n[agent_index], name="observation0").get())
    self.u_estimation = u_estimation

    # Create all the functions necessary to train the model
    l = q_train(scope=self.name,
                make_obs_ph_n=obs_ph_n,
                act_space_n=act_space_n,
                q_index=agent_index,
                q_func=model,
                u_func=model,
                optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
                grad_norm_clipping=0.5,
                local_q_func=local_q_func,
                num_units=args.num_units,
                u_estimation=self.u_estimation)
    if self.u_estimation:
        self.q_train, self.q_update, self.u_update, self.q_debug = l
    else:
        self.q_train, self.q_update, self.q_debug = l
    self.act, self.p_train, self.p_update, self.p_debug = p_train(
        scope=self.name,
        make_obs_ph_n=obs_ph_n,
        act_space_n=act_space_n,
        p_index=agent_index,
        p_func=model,
        q_func=model,
        optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
        grad_norm_clipping=0.5,
        local_q_func=local_q_func,
        num_units=args.num_units)
    # Create experience buffer
    self.replay_buffer = ReplayBuffer(1e6)
    self.max_replay_buffer_len = args.batch_size * args.max_episode_len
    self.replay_sample_index = None
def __init__(self, name, model, obs_shape_n, act_space_n, agent_index, args,
             local_q_func, policy_name, adversarial):
    self.name = name
    self.scope = self.name + "_" + policy_name
    self.n = len(obs_shape_n)
    self.agent_index = agent_index
    self.args = args
    obs_ph_n = []
    for i in range(self.n):
        obs_ph_n.append(
            U.BatchInput(obs_shape_n[i], name="observation" + str(i)).get())

    # Create all the functions necessary to train the model
    self.q_train, self.q_update, self.q_debug = q_train(
        scope=self.scope,
        make_obs_ph_n=obs_ph_n,
        act_space_n=act_space_n,
        q_index=agent_index,
        q_func=model,
        optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
        adversarial=adversarial,
        adv_eps=args.adv_eps,
        adv_eps_s=args.adv_eps_s,
        num_adversaries=args.num_adversaries,
        grad_norm_clipping=0.5,
        local_q_func=local_q_func,
        num_units=args.num_units)
    self.act, self.p_train, self.p_update, self.p_debug = p_train(
        scope=self.scope,
        make_obs_ph_n=obs_ph_n,
        act_space_n=act_space_n,
        p_index=agent_index,
        p_func=model,
        q_func=model,
        optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
        adversarial=adversarial,
        adv_eps=args.adv_eps,
        adv_eps_s=args.adv_eps_s,
        num_adversaries=args.num_adversaries,
        grad_norm_clipping=0.5,
        local_q_func=local_q_func,
        num_units=args.num_units)
    # Create experience buffer
    self.replay_buffer = ReplayBuffer(1e6)
    self.max_replay_buffer_len = args.batch_size * args.max_episode_len
    self.replay_sample_index = None
    self.policy_name = policy_name
    self.adversarial = adversarial
    self.act_space_n = act_space_n
    self.local_q_func = local_q_func
def __init__(self, name, model_value, model_policy, obs_shape_n, act_space_n,
             agent_index, args, hparams, summary_writer=None, local_q_func=False,
             rngseed=None):
    self.name = name
    self.rngseed = rngseed
    self.n = len(obs_shape_n)
    self.agent_index = agent_index
    self.args = args
    self.hparams = hparams
    obs_ph_n = []
    for i in range(self.n):
        obs_ph_n.append(U.BatchInput(
            obs_shape_n[i], name="observation" + str(i)).get())

    # Create all the functions necessary to train the model
    # train critic
    self.q_train, self.q_update, self.q_debug = q_train(
        scope=self.name,
        make_obs_ph_n=obs_ph_n,
        act_space_n=act_space_n,
        q_index=agent_index,
        q_func=model_value,
        optimizer=tf.train.AdamOptimizer(learning_rate=hparams['learning_rate']),
        grad_norm_clipping=hparams['grad_norm_clipping'],
        local_q_func=local_q_func,
        num_units=args.num_units
    )
    # train policy
    self.act, self.p_train, self.p_update, self.p_debug = p_train(
        scope=self.name,
        make_obs_ph_n=obs_ph_n,
        act_space_n=act_space_n,
        p_index=agent_index,
        p_func=model_policy,
        q_func=model_value,
        optimizer=tf.train.AdamOptimizer(learning_rate=hparams['learning_rate']),
        grad_norm_clipping=hparams['grad_norm_clipping'],
        local_q_func=local_q_func,
        num_units=args.num_units
    )
    # Create experience buffer
    self.replay_buffer = ReplayBuffer(hparams['replay_buffer_len'], self.rngseed)
    # Use a tiny warm-up threshold when test saving is requested; otherwise use the default.
    if hparams.get('test_saving'):
        self.max_replay_buffer_len = 100
    else:
        self.max_replay_buffer_len = hparams['batch_size'] * args.max_episode_len
    self.replay_sample_index = None
    self.summary_writer = summary_writer
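# The constructor above reads several keys from `hparams`; a minimal example dict
# covering every key it touches (the values are illustrative assumptions, not
# defaults taken from the source):
example_hparams = {
    'learning_rate': 1e-2,          # Adam step size shared by actor and critic
    'grad_norm_clipping': 0.5,      # maximum gradient norm
    'replay_buffer_len': int(1e6),  # replay buffer capacity
    'batch_size': 1024,             # minibatch size
    'test_saving': False,           # optional; when truthy the warm-up threshold shrinks to 100
}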
def __init__(self, name, model, obs_shape_n, act_space_n, agent_index, args):
    self.name = name
    self.n = len(obs_shape_n)
    self.agent_index = agent_index
    self.args = args

    # create dummy tensorflow variables to avoid Saver error
    # TODO: remove this or turn into act function
    obs_ph_n = []
    for i in range(self.n):
        obs_ph_n.append(
            U.BatchInput(obs_shape_n[i], name="observation" + str(i)).get())
    with tf.variable_scope(self.name, reuse=None):
        self.dummy_var = U.function(obs_ph_n, outputs=tf.Variable(0))
def __init__(self, name, model, obs_shape_n, act_space_n, agent_index, args,
             local_q_func=False):  # local_q_func: whether to train this agent with (local) DDPG
    self.name = name
    self.n = len(obs_shape_n)        # total number of agents
    self.agent_index = agent_index   # index of this agent
    self.args = args                 # training arguments passed in from the command line
    obs_ph_n = []
    for i in range(self.n):
        # Batch placeholders for environment data: one observation placeholder per agent,
        # sized according to that agent's observation shape.
        obs_ph_n.append(
            U.BatchInput(obs_shape_n[i], name="observation" + str(i)).get())

    # Create all the functions necessary to train the model:
    # the critic training op, the target-network update op, and a dict exposing the
    # (session-wrapped) Q and target-Q values.
    self.q_train, self.q_update, self.q_debug = q_train(
        scope=self.name,
        make_obs_ph_n=obs_ph_n,
        act_space_n=act_space_n,
        q_index=agent_index,
        q_func=model,
        optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
        grad_norm_clipping=0.5,
        local_q_func=local_q_func,
        num_units=args.num_units
    )
    # The action function, the policy training op, the target-policy update op, and a
    # dict exposing the policy output and the target policy's actions.
    self.act, self.p_train, self.p_update, self.p_debug = p_train(
        scope=self.name,
        make_obs_ph_n=obs_ph_n,
        act_space_n=act_space_n,
        p_index=agent_index,
        p_func=model,
        q_func=model,
        optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
        grad_norm_clipping=0.5,
        local_q_func=local_q_func,
        num_units=args.num_units)
    # Create experience buffer
    self.replay_buffer = ReplayBuffer(1e6)
    self.max_replay_buffer_len = args.batch_size * args.max_episode_len
    self.replay_sample_index = None
def __init__(self, name, model, obs_shape_n, act_space_n, agent_index, args,
             local_q_func=False):
    self.name = name                 # name of the agent
    self.n = len(obs_shape_n)        # number of agents
    self.agent_index = agent_index   # index of the specific agent
    self.args = args                 # settings of hyper-parameters
    obs_ph_n = []
    for i in range(self.n):
        # Creates a placeholder for a batch of tensors of a given shape and dtype.
        obs_ph_n.append(U.BatchInput(obs_shape_n[i], name="observation" + str(i)).get())

    # [Create all the functions necessary to train the model]
    # train: U.function(inputs=obs_ph_n + act_ph_n + [target_ph], outputs=loss, updates=[optimize_expr])
    # update_target_q: make_update_exp(q_func_vars, target_q_func_vars)
    # q_values: U.function(obs_ph_n + act_ph_n, q)
    # target_q_values: U.function(obs_ph_n + act_ph_n, target_q)
    self.q_train, self.q_update, self.q_debug = q_train(
        scope=self.name,                 # String: "agent_1" or "agent_2" or ...
        make_obs_ph_n=obs_ph_n,
        act_space_n=act_space_n,         # action spaces of all agents
        q_index=agent_index,             # index of the specific agent
        q_func=model,                    # defined model
        optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),  # Adam (adaptive moment estimation) with the configured learning rate
        grad_norm_clipping=0.5,          # gradient clipping to prevent exploding gradients; norms above this value are clipped
        local_q_func=local_q_func,
        num_units=args.num_units         # number of hidden units per layer
    )
    # act: U.function(inputs=[obs_ph_n[p_index]], outputs=act_sample)
    # train: U.function(inputs=obs_ph_n + act_ph_n, outputs=loss, updates=[optimize_expr])
    # update_target_p: make_update_exp(p_func_vars, target_p_func_vars)
    # p_values: U.function([obs_ph_n[p_index]], p)
    # target_act: U.function(inputs=[obs_ph_n[p_index]], outputs=target_act_sample)
    self.act, self.p_train, self.p_update, self.p_debug = p_train(
        scope=self.name,
        make_obs_ph_n=obs_ph_n,
        act_space_n=act_space_n,
        p_index=agent_index,
        p_func=model,
        q_func=model,
        optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
        grad_norm_clipping=0.5,
        local_q_func=local_q_func,
        num_units=args.num_units
    )
    # Create experience buffer
    self.replay_buffer = ReplayBuffer(1e6)
    self.max_replay_buffer_len = args.batch_size * args.max_episode_len
    self.replay_sample_index = None
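# The comments above describe the callables returned by q_train / p_train. As a hedged
# illustration of how they typically fit together in one MADDPG update step (the
# surrounding names, the batched arrays, and the gamma default are assumptions for this
# sketch, not code taken from the source):
def maddpg_update_step_sketch(agent, agents, obs_n, act_n, rew, obs_next_n, done, gamma=0.95):
    """obs_n, act_n, obs_next_n: lists (one entry per agent) of batched arrays sampled
    from the replay buffers; rew and done: batched arrays for this agent."""
    # 1. Each agent's target policy proposes next actions from its own next observation.
    target_act_next_n = [a.p_debug['target_act'](obs_next_n[i]) for i, a in enumerate(agents)]
    # 2. The centralized target critic scores the joint next observation-action pair.
    target_q_next = agent.q_debug['target_q_values'](*(list(obs_next_n) + target_act_next_n))
    # 3. One-step TD target for the critic.
    target_q = rew + gamma * (1.0 - done) * target_q_next
    # 4. Fit the critic, then the actor, then soft-update both target networks.
    q_loss = agent.q_train(*(list(obs_n) + list(act_n) + [target_q]))
    p_loss = agent.p_train(*(list(obs_n) + list(act_n)))
    agent.p_update()
    agent.q_update()
    return q_loss, p_loss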
def __init__(self, name, model, obs_shape_n, act_space_n, agent_index, args,
             local_q_func=False):
    self.name = name
    self.n = len(obs_shape_n)  # 16
    self.agent_index = agent_index
    self.args = args
    obs_ph_n = []
    for i in range(self.n):
        obs_ph_n.append(
            U.BatchInput(obs_shape_n[i], name="observation" + str(i)).get())
        # obs_ph_n.append(U.BatchInput(obs_shape_n[i], name="observation"+str(i), dtype=tf.uint8).get())
        # should we specify uint8 instead of default float?

    # Create all the functions necessary to train the model
    self.q_train, self.q_update, self.q_debug = q_train(
        scope=self.name,
        make_obs_ph_n=obs_ph_n,
        act_space_n=act_space_n,
        q_index=agent_index,
        q_func=model,  # multi-layer perceptron
        optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
        grad_norm_clipping=0.5,
        local_q_func=local_q_func,  # maddpg or ddpg
        num_units=args.num_units)
    self.act, self.p_train, self.p_update, self.p_debug = p_train(
        scope=self.name,
        make_obs_ph_n=obs_ph_n,
        act_space_n=act_space_n,
        p_index=agent_index,
        p_func=model,
        q_func=model,
        optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
        grad_norm_clipping=0.5,
        local_q_func=local_q_func,
        num_units=args.num_units)
    # Create experience buffer
    self.replay_buffer = ReplayBuffer(1e6)
    self.max_replay_buffer_len = args.batch_size * args.max_episode_len
    self.replay_sample_index = None
def __init__(self, name, model, obs_shape_n, act_space_n, agent_index, args, local_q_func=False): self.name = name self.n = len(obs_shape_n) self.agent_index = agent_index self.args = args obs_ph_n = [] for i in range(self.n): obs_ph_n.append(U.BatchInput(obs_shape_n[i], name="observation"+str(i)).get()) self.act = p_train( scope=self.name, make_obs_ph_n=obs_ph_n, act_space_n=act_space_n, p_index=agent_index, p_func=model, num_units=args.num_units )
def __init__(self, name, model, obs_shape_n, act_space_n, agent_index, args,
             local_q_func=False, reuse=False):
    self.name = name
    self.n = len(obs_shape_n)
    self.agent_index = agent_index
    self.args = args
    obs_ph_n = []
    for i in range(self.n):
        obs_ph_n.append(
            U.BatchInput(obs_shape_n[i], name="observation" + str(i)).get())

    # Create all the functions necessary to train the model
    self.q_train, self.q_update, self.q_debug = q_train(
        scope=self.name,
        make_obs_ph_n=obs_ph_n,
        act_space_n=act_space_n,
        q_index=agent_index,
        q_func=model,
        optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
        grad_norm_clipping=0.5,
        local_q_func=local_q_func,
        num_units=args.num_units,
        reuse=reuse)
    self.act, self.p_train, self.p_update, self.p_debug = p_train(
        scope=self.name,
        make_obs_ph_n=obs_ph_n,
        act_space_n=act_space_n,
        p_index=agent_index,
        p_func=model,
        q_func=model,
        optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
        grad_norm_clipping=0.5,
        local_q_func=local_q_func,
        num_units=args.num_units,
        reuse=reuse,
        deterministic=args.benchmark and args.deterministic)
def __init__(self, name, critic_model, policy_model, obs_shape_n, act_space_n,
             agent_index, args, local_q_func=False):
    self.name = name
    self.n = 4
    self.agent_index = agent_index
    self.args = args
    obs_ph_n = []
    for i in range(self.n):
        obs_ph_n.append(U.BatchInput(obs_shape_n[i], name="observation" + str(i)).get())

    # Create all the functions necessary to train the model
    self.q_train, self.q_update, self.q_debug = q_train(
        scope=self.name,
        make_obs_ph_n=obs_ph_n,
        act_space_n=act_space_n,
        q_index=agent_index,
        q_func=critic_model,
        optimizer=tf.train.AdamOptimizer(learning_rate=args['lr']),
        grad_norm_clipping=0.5,
        local_q_func=local_q_func,
        num_units=args['num_units']
    )
    self.act, self.p_train, self.p_update, self.p_debug = p_train(
        scope=self.name,
        make_obs_ph_n=obs_ph_n,
        act_space_n=act_space_n,
        p_index=agent_index,
        p_func=policy_model,
        q_func=critic_model,
        optimizer=tf.train.AdamOptimizer(learning_rate=args['lr']),
        grad_norm_clipping=0.5,
        local_q_func=local_q_func,
        num_units=args['num_units']
    )
    # Create experience buffer
    self.replay_buffer = ReplayBuffer(1e6)
    self.max_replay_buffer_len = args['batch_size'] * args['max_episode_len']
    self.replay_sample_index = None
def __init__(self, name, model, obs_shape_n, act_space_n, agent_index, args,
             replay_buffer, local_q_func=False):
    self.name = name
    self.n = len(obs_shape_n)
    self.agent_index = agent_index
    self.args = args
    obs_ph_n = []
    for i in range(self.n):
        obs_ph_n.append(U.BatchInput(obs_shape_n[i], name="observation" + str(i)).get())

    # Create all the functions necessary to train the model.
    # Because of the TensorFlow variable scopes used by q_train and p_train,
    # the critic must be created before the actor.
    self.critic = Critic(name, model, obs_ph_n, act_space_n, agent_index, args, local_q_func)
    self.actor = Actor(name, model, obs_ph_n, act_space_n, agent_index, args, local_q_func)

    # Create experience buffer
    self.replay_buffer = replay_buffer  # ReplayBuffer(1e6)
    self.max_replay_buffer_len = args.batch_size * args.max_episode_len
    self.replay_sample_index = None
def __init__(self, name, model, obs_shape_n, act_space_n, agent_index, args, local_q_func=False): self.name = name self.n = len(obs_shape_n) self.agent_index = agent_index self.args = args obs_ph_n = [] for i in range(self.n): obs_ph_n.append( U.BatchInput(obs_shape_n[i], name="observation" + str(i)).get()) # Create all the functions necessary to train the model self.critic = Critic(name, model, obs_ph_n, act_space_n, agent_index, args, local_q_func) self.actor = Actor(name, model, obs_ph_n, act_space_n, agent_index, args, local_q_func)
def __init__(self, name, model, state_shape, act_space_n, agent_index, args,
             local_q_func=False):
    self.name = name
    self.n = 1
    self.agent_index = agent_index
    self.args = args
    obs_ph_n = []
    obs_ph_n.append(U.BatchInput(state_shape, name="observation" + str(0)).get())

    # Create all the functions necessary to train the model
    self.q_train, self.q_update, self.q_debug = q_train(
        scope=self.name,
        make_obs_ph_n=obs_ph_n,
        act_space_n=act_space_n,
        q_index=agent_index,
        q_func=model,
        optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
        grad_norm_clipping=0.5,
        local_q_func=local_q_func,
        num_units=args.num_units
    )
    self.act, self.p_train, self.p_update, self.p_debug = p_train(
        scope=self.name,
        make_obs_ph_n=obs_ph_n,
        act_space_n=act_space_n,
        p_index=agent_index,
        p_func=model,
        q_func=model,
        optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
        grad_norm_clipping=0.5,
        local_q_func=local_q_func,
        num_units=args.num_units
    )
    # Create experience buffer
    self.replay_buffer = ReplayBuffer(1e6)
    self.min_buffer_size = args.min_buffer_size
    self.replay_sample_index = None
def __init__(self, name, model, obs_shape_n, act_space_n, agent_index, args, role="",
             local_q_func=False):
    """
    Args:
        name (str): Name of the agent
        model (function): MLP neural network model for the agent
        obs_shape_n (list): Observation shapes for all agents
        act_space_n (list): A list of the action spaces for all agents
        agent_index (int): Agent index number
        args (argparse.Namespace): Parsed command-line arguments object
        role (str): Role of the agent, e.g. adversary
        local_q_func (bool): Flag for using a local Q function
    """
    super(MADDPGAgentTrainerCCM, self).__init__()
    self.name = name
    self.role = role
    self.n = len(obs_shape_n)
    self.agent_index = agent_index
    self.args = args
    obs_ph_n = []
    act_history_ph_n = []
    obs_history_ph_n = []
    hist = self.args.training_history
    obs_history_n = [(hist * x[0], ) for x in obs_shape_n]
    act_history_n = [(hist * act.n, ) for act in act_space_n]
    # act_history_n = [Discrete(act.n*(3-1)) for act in act_space_n]
    # for act_space in act_space_n:
    #     act_space.n = act_space.n*3
    # if act_history_n[0].n != 15:
    #     print("Line 158")
    for i in range(self.n):
        obs_ph_n.append(
            tf_util.BatchInput(obs_shape_n[i], name="observation" + str(i)).get())
        obs_history_ph_n.append(
            tf_util.BatchInput(obs_history_n[i], name="observationhistory" + str(i)).get())
        act_history_ph_n.append(
            tf_util.BatchInput(act_history_n[i], name="actionhistory" + str(i)).get())
    # obs_ph_n = [tf.concat(3*[x],1,name="observation{}".format(i)) for i,x in enumerate(obs_ph_n)]

    # Create all the functions necessary to train the model
    self.q_train, self.q_update, self.q_debug = q_train(
        scope=self.name,
        make_obs_ph_n=obs_ph_n,
        act_space_n=act_space_n,
        make_obs_history_n=obs_history_ph_n,
        make_act_history_n=act_history_ph_n,
        q_index=agent_index,
        q_func=model,
        optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
        grad_norm_clipping=0.5,
        local_q_func=local_q_func,
        num_units=args.num_units)
    self.act, self.p_train, self.p_update, self.p_debug = p_train(
        scope=self.name,
        make_obs_ph_n=obs_ph_n,
        act_space_n=act_space_n,
        make_obs_history_n=obs_history_ph_n,
        make_act_history_n=act_history_ph_n,
        p_index=agent_index,
        p_func=model,
        q_func=model,
        optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
        grad_norm_clipping=0.5,
        local_q_func=local_q_func,
        num_units=args.num_units)
    # Create experience buffer
    self.replay_buffer = ReplayBuffer(1e6)
    self.max_replay_buffer_len = 4 * args.batch_size * args.max_episode_len
    self.replay_sample_index = None
def __init__(self, name, model, obs_shape_n, act_space_n, agent_index, args,
             actor_lr=None, critic_lr=None, gamma=None, num_units=None, rb_size=None,
             batch_size=None, max_episode_len=None, clip_norm=0.5, local_q_func=False):
    self.name = name
    self.n = len(obs_shape_n)
    self.agent_index = agent_index
    self.args = args
    # training parameters
    self.actor_lr = actor_lr if actor_lr else args.lr
    self.critic_lr = critic_lr if critic_lr else args.lr
    self.gamma = gamma if gamma else args.gamma
    self.num_units = num_units if num_units else args.num_units
    self.rb_size = rb_size if rb_size else args.rb_size
    self.batch_size = batch_size if batch_size else args.batch_size
    self.max_episode_len = max_episode_len if max_episode_len else args.max_episode_len
    self.clip_norm = clip_norm

    # TODO: remove after testing
    import models.config as Config
    assert actor_lr == Config.maddpg_train_args['actor_lr']
    assert critic_lr == Config.maddpg_train_args['critic_lr']
    assert gamma == Config.maddpg_train_args['gamma']
    assert num_units == Config.maddpg_train_args['num_hidden']
    assert rb_size == Config.maddpg_train_args['rb_size']
    assert batch_size == Config.maddpg_train_args['batch_size']
    assert max_episode_len == Config.maddpg_train_args['nb_rollout_steps']
    assert clip_norm == Config.maddpg_train_args['clip_norm']

    obs_ph_n = []
    for i in range(self.n):
        obs_ph_n.append(
            U.BatchInput(obs_shape_n[i], name="observation" + str(i)).get())

    # Create all the functions necessary to train the model
    self.q_train, self.q_update, self.q_debug = q_train(
        scope=self.name,
        make_obs_ph_n=obs_ph_n,
        act_space_n=act_space_n,
        q_index=agent_index,
        q_func=model,
        optimizer=tf.train.AdamOptimizer(learning_rate=self.critic_lr),
        grad_norm_clipping=self.clip_norm,
        local_q_func=local_q_func,
        num_units=self.num_units)
    self.act, self.p_train, self.p_update, self.p_debug = p_train(
        scope=self.name,
        make_obs_ph_n=obs_ph_n,
        act_space_n=act_space_n,
        p_index=agent_index,
        p_func=model,
        q_func=model,
        optimizer=tf.train.AdamOptimizer(learning_rate=self.actor_lr),
        grad_norm_clipping=self.clip_norm,
        local_q_func=local_q_func,
        num_units=self.num_units)
    # Create experience buffer
    self.replay_buffer = ReplayBuffer(self.rb_size)
    self.max_replay_buffer_len = self.batch_size * self.max_episode_len
    self.replay_sample_index = None
    self.loss_names = [
        'q_loss', 'p_loss', 'mean_target_q', 'mean_rew',
        'mean_target_q_next', 'std_target_q'
    ]
def __init__(self, name, p_model, q_model, obs_shape_n, act_space_n, num_adversaries,
             args, local_q_func=False):
    self.name = name
    self.n = len(obs_shape_n)
    self.args = args
    self.neighbor_n = 2
    self.num_adversaries = num_adversaries
    adj_n = []
    obs_ph_n = []
    agent_n = []
    for i in range(self.n):
        obs_ph_n.append(
            U.BatchInput(obs_shape_n[i], name="observation" + str(i)).get())
        adj_n.append(
            U.BatchInput([
                self.neighbor_n,
                num_adversaries if i < num_adversaries else (self.n - num_adversaries)
            ], name="adjacency" + str(i)).get())

    # Create all the functions necessary to train the model
    self.q_train, self.q_update, self.q_values, self.target_q_values = q_train(
        name=self.name,
        scope=self.name,
        make_obs_ph_n=obs_ph_n,
        adj_n=adj_n,
        act_space_n=act_space_n,
        num_adversaries=num_adversaries,
        neighbor_n=self.neighbor_n,
        q_func=q_model,
        agent_n=self.n,
        optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
        grad_norm_clipping=0.5,
        local_q_func=local_q_func,
        num_units=args.num_units)
    self.act, self.p_train, self.p_update, self.p_values, self.target_act = p_train(
        name=self.name,
        scope=self.name,
        make_obs_ph_n=obs_ph_n,
        adj_n=adj_n,
        act_space_n=act_space_n,
        neighbor_n=self.neighbor_n,
        p_index=agent_n,
        p_func=p_model,
        q_func=q_model,
        num_adversaries=self.num_adversaries,
        optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
        grad_norm_clipping=0.5,
        local_q_func=local_q_func,
        num_units=args.num_units,
    )
    # Create experience buffer
    self.replay_buffer = ReplayBuffer(1e6)
    self.max_replay_buffer_len = args.batch_size * args.max_episode_len
    self.replay_sample_index = None
def p_train(name, make_obs_ph_n, adj_n, act_space_n, neighbor_n, p_index, p_func,
            q_func, num_adversaries, optimizer, grad_norm_clipping=None,
            local_q_func=False, num_units=128, scope="trainer", reuse=None):
    with tf.variable_scope(scope, reuse=reuse):
        # create distributions
        act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n]

        # set up placeholders
        obs_ph_n = make_obs_ph_n
        act_ph_n = [
            act_pdtype_n[i].sample_placeholder([None], name="action" + str(i))
            for i in range(len(act_space_n))
        ]
        agent_n = len(obs_ph_n)
        vec_n = U.BatchInput([1, neighbor_n], name="vec").get()

        p_input1 = obs_ph_n[0:num_adversaries] if name == "adversaries" else obs_ph_n[num_adversaries:agent_n]
        p_input2 = adj_n[0:num_adversaries] if name == "adversaries" else adj_n[num_adversaries:agent_n]
        p_input3 = vec_n

        # call the actor network
        # NOTE: the act_space handling here may be incorrect
        p = p_func(p_input1, p_input2, p_input3, neighbor_n,
                   num_adversaries if name == "adversaries" else (agent_n - num_adversaries),
                   5, scope="p_func", num_units=num_units)
        p_func_vars = U.scope_vars(U.absolute_scope_name("p_func"))

        # wrap parameters in distributions
        act_pd = []
        act_sample = []
        for i in (range(0, num_adversaries) if name == "adversaries" else range(num_adversaries, agent_n)):
            act_pd_temp = act_pdtype_n[i].pdfromflat(
                p[i - (0 if name == "adversaries" else num_adversaries)])
            act_pd.append(act_pd_temp)
            act_sample.append(act_pd_temp.sample())

        temp = []
        for i in range(len(act_pd)):
            temp.append(act_pd[i].flatparam())
        # TODO: verify that this regularization is correct
        p_reg = tf.reduce_mean(tf.square(temp))

        act_input_n = act_ph_n + []
        for i in (range(0, num_adversaries) if name == "adversaries" else range(num_adversaries, agent_n)):
            act_input_n[i] = act_sample[i - (0 if name == "adversaries" else num_adversaries)]

        q_input = tf.concat(obs_ph_n + act_input_n, 1)
        q = []
        q_reduce_mean = []
        for a in (range(0, num_adversaries) if name == "adversaries" else range(num_adversaries, agent_n)):
            index = a if name == "adversaries" else a - num_adversaries
            temp = q_func(q_input, 1, scope="q_func_%d" % index, reuse=True,
                          num_units=num_units)[:, 0]
            q.append(temp)
            q_reduce_mean.append(temp)
        pg_loss = -tf.reduce_mean(q)

        loss = pg_loss + p_reg * 1e-3

        optimize_expr = U.minimize_and_clip(optimizer, loss, p_func_vars, grad_norm_clipping)

        # Create callable functions
        train = U.function(inputs=obs_ph_n + act_ph_n + adj_n + [vec_n],
                           outputs=loss,
                           updates=[optimize_expr])
        act = U.function(inputs=p_input1 + (adj_n[0:num_adversaries] if name == "adversaries"
                                            else adj_n[num_adversaries:agent_n]) + [p_input3],
                         outputs=act_sample,
                         list_output=True)
        p_values = U.function(p_input1 + (adj_n[0:num_adversaries] if name == "adversaries"
                                          else adj_n[num_adversaries:agent_n]) + [p_input3],
                              p,
                              list_output=True)

        # target network
        target_p = p_func(p_input1, p_input2, p_input3, neighbor_n,
                          num_adversaries if name == "adversaries" else (agent_n - num_adversaries),
                          5, scope="target_p_func", num_units=num_units)
        target_p_func_vars = U.scope_vars(U.absolute_scope_name("target_p_func"))
        update_target_p = make_update_exp(p_func_vars, target_p_func_vars, central=True)

        target_act_sample = []
        for i in (range(0, num_adversaries) if name == "adversaries" else range(num_adversaries, agent_n)):
            target_act_sample.append(
                act_pdtype_n[i].pdfromflat(
                    target_p[i - (0 if name == "adversaries" else num_adversaries)]).sample())
        target_act = U.function(inputs=p_input1 + (adj_n[0:num_adversaries] if name == "adversaries"
                                                   else adj_n[num_adversaries:agent_n]) + [p_input3],
                                outputs=target_act_sample,
                                list_output=True)

        return act, train, update_target_p, p_values, target_act
def create_obs_ph_n(n_agents, obs_shape_n):
    obs_ph_n = []
    for i in range(n_agents):
        obs_ph_n.append(
            U.BatchInput(obs_shape_n[i], name="observation" + str(i)).get())
    return obs_ph_n
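# This helper factors out the placeholder-building loop repeated at the top of the
# constructors above; a minimal usage sketch (the two observation shapes are made up
# for illustration):
example_obs_shape_n = [(8,), (10,)]
example_obs_ph_n = create_obs_ph_n(n_agents=len(example_obs_shape_n),
                                   obs_shape_n=example_obs_shape_n)
# example_obs_ph_n[i] is the batched placeholder named "observation<i>" that can be
# passed to q_train / p_train as make_obs_ph_n.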
def __init__(self, name, p_policy, p_predict, q_model, obs_shape_n, act_space_n,
             state_shape_n, agent_index, args, local_q_func=False):
    self.name = name
    self.n = len(obs_shape_n)
    self.agent_index = agent_index
    self.args = args
    self.obs_shape = obs_shape_n[agent_index]
    self.state_shape = state_shape_n[agent_index]
    self.p_predict = p_predict
    obs_ph_n = []
    obs_next_n = []
    obs_pred_n = []
    state_ph_n = []
    for i in range(self.n):
        obs_ph_n.append(
            U.BatchInput(obs_shape_n[i], name="observation" + str(i)).get())
        obs_next_n.append(
            U.BatchInput(obs_shape_n[i], name="next_obs" + str(i)).get())
        obs_pred_n.append(
            U.BatchInput(obs_shape_n[i], name="pred_obs" + str(i)).get())
        state_ph_n.append(
            U.BatchInput(state_shape_n[i], name="state" + str(i)).get())

    # Create all the functions necessary to train the critic net.
    # q_train optimizes the Q network on the loss of the sampled batch;
    # q_update soft-updates the target net parameters: θ'i = τθi + (1 − τ)θ'i
    self.q_train, self.q_update, self.q_debug = q_train(
        scope=self.name,
        make_obs_ph_n=obs_ph_n,
        act_space_n=act_space_n,
        q_index=agent_index,
        q_func=q_model,
        optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
        grad_norm_clipping=0.5,
        local_q_func=local_q_func,
        num_units=args.num_units)

    # step returns the action and new recurrent state given the observation and state;
    # p_train optimizes the policy net;
    # p_update soft-updates the target policy net: θ'i = τθi + (1 − τ)θ'i
    self.step, self.predict, self.p_train, self.p_update, self.p_debug = p_train_recurrent(
        scope=self.name,
        make_obs_ph_n=obs_ph_n,
        make_state_ph_n=state_ph_n,
        act_space_n=act_space_n,
        make_obs_next_n=obs_next_n,
        make_obs_pred_n=obs_pred_n,
        p_index=agent_index,
        p_policy=p_policy,
        p_predict=p_predict,
        q_func=q_model,
        optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
        grad_norm_clipping=0.5,
        local_q_func=local_q_func,
        num_units=args.num_units,
        reuse=tf.AUTO_REUSE)

    # Create experience buffer
    self.replay_buffer = ReplayBuffer(1e6)
    self.max_replay_buffer_len = args.batch_size * args.max_episode_len
    self.replay_sample_index = None
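# The comments above reference the soft target update θ'i = τθi + (1 − τ)θ'i, which
# these codebases implement in a make_update_exp-style helper. A minimal TF1-style
# sketch of that idea, written as an assumption about the technique rather than the
# exact helper used here:
import tensorflow as tf

def soft_update_expr_sketch(source_vars, target_vars, tau=0.01):
    """Build a grouped op that moves each target variable toward its source:
    theta_target <- tau * theta_source + (1 - tau) * theta_target."""
    updates = []
    # Sort by variable name so source and target parameters are paired consistently.
    for src, tgt in zip(sorted(source_vars, key=lambda v: v.name),
                        sorted(target_vars, key=lambda v: v.name)):
        updates.append(tgt.assign(tau * src + (1.0 - tau) * tgt))
    return tf.group(*updates)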