def __init__(self, opt, job):
    self.opt = opt
    with tf.Graph().as_default():
        tf.set_random_seed(opt.seed)
        np.random.seed(opt.seed)

        # Inputs to computation graph
        self.x_ph, self.a_ph, self.x2_ph = core.placeholders(opt.obs_dim, None, opt.obs_dim)

        # Main outputs from computation graph
        with tf.variable_scope('main'):
            self.q, _ = core.q_function(self.x_ph, self.x2_ph, opt.hidden_size, opt.act_dim)

        # Set up summary Ops
        self.test_ops, self.test_vars = self.build_summaries()

        self.sess = tf.Session(
            config=tf.ConfigProto(
                device_count={'GPU': 0},
                intra_op_parallelism_threads=1,
                inter_op_parallelism_threads=1))
        self.sess.run(tf.global_variables_initializer())

        if job == "test":
            self.writer = tf.summary.FileWriter(
                opt.summary_dir + "/" + str(datetime.datetime.now()) + "-" +
                opt.env_name + "-" + opt.exp_name +
                "-workers_num:" + str(opt.num_workers) + "%" + str(opt.a_l_ratio),
                self.sess.graph)

        variables_all = tf.contrib.framework.get_variables_to_restore()
        variables_bn = [v for v in variables_all
                        if 'moving_mean' in v.name or 'moving_variance' in v.name]
        self.variables = ray.experimental.tf_utils.TensorFlowVariables(
            self.q, self.sess, input_variables=variables_bn)

def __init__(self, opt, job):
    self.opt = opt
    with tf.Graph().as_default():
        tf.set_random_seed(opt.seed)
        np.random.seed(opt.seed)

        # Inputs to computation graph
        self.x_ph, self.a_ph, self.x2_ph, self.r_ph, self.d_ph = \
            core.placeholders(opt.obs_dim, opt.act_dim, opt.obs_dim, None, None)

        # Main outputs from computation graph
        with tf.variable_scope('main'):
            self.mu, self.pi, logp_pi, logp_pi2, q1, q2, q1_pi, q2_pi = \
                actor_critic(self.x_ph, self.x2_ph, self.a_ph,
                             action_space=opt.ac_kwargs["action_space"])

        # Set up summary Ops
        self.test_ops, self.test_vars = self.build_summaries()

        self.sess = tf.Session(
            config=tf.ConfigProto(device_count={'GPU': 0},
                                  intra_op_parallelism_threads=1,
                                  inter_op_parallelism_threads=1))
        self.sess.run(tf.global_variables_initializer())

        if job == "main":
            self.writer = tf.summary.FileWriter(
                opt.summary_dir + "/" + str(datetime.datetime.now()) + "-" +
                opt.env_name + "-workers_num:" + str(opt.num_workers) + "%" +
                str(opt.a_l_ratio),
                self.sess.graph)

        self.variables = ray.experimental.tf_utils.TensorFlowVariables(
            self.pi, self.sess)

def __init__(self, observation_space, action_space):
    obs_dim = observation_space.shape
    act_dim = action_space.shape

    # Share information about action space with policy architecture
    ac_kwargs = dict()
    ac_kwargs['action_space'] = action_space
    # ac_kwargs['output_activation'] = tf.tanh

    # Inputs to computation graph
    self.x_ph, self.a_ph = core.placeholders_from_spaces(observation_space, action_space)
    self.adv_ph, self.ret_ph, self.logp_old_ph = core.placeholders(None, None, None)

    # Main outputs from computation graph
    self.pi, self.logp, self.logp_pi, self.v = core.mlp_actor_critic(
        self.x_ph, self.a_ph, output_activation=tf.tanh, **ac_kwargs)

    # Need all placeholders in *this* order later (to zip with data from buffer)
    self.all_phs = [self.x_ph, self.a_ph, self.adv_ph, self.ret_ph, self.logp_old_ph]

    # Every step, get: action, value, and logprob
    self.get_action_ops = [self.pi, self.v, self.logp_pi]

    # Experience buffer
    steps_per_epoch = 1000
    self.local_steps_per_epoch = steps_per_epoch
    gamma = 0.99
    lam = 0.97
    self.buf = PPOBuffer(obs_dim, act_dim, self.local_steps_per_epoch, gamma, lam)

    # Count variables
    var_counts = tuple(core.count_vars(scope) for scope in ['pi', 'v'])
    print(var_counts)

    # PPO objectives
    clip_ratio = 0.2
    ratio = tf.exp(self.logp - self.logp_old_ph)  # pi(a|s) / pi_old(a|s)
    min_adv = tf.where(self.adv_ph > 0,
                       (1 + clip_ratio) * self.adv_ph,
                       (1 - clip_ratio) * self.adv_ph)
    self.pi_loss = -tf.reduce_mean(tf.minimum(ratio * self.adv_ph, min_adv))
    self.v_loss = tf.reduce_mean((self.ret_ph - self.v) ** 2)

    # Info (useful to watch during learning)
    self.approx_kl = tf.reduce_mean(self.logp_old_ph - self.logp)  # a sample estimate for KL-divergence, easy to compute
    self.approx_ent = tf.reduce_mean(-self.logp)  # a sample estimate for entropy, also easy to compute
    self.clipped = tf.logical_or(ratio > (1 + clip_ratio), ratio < (1 - clip_ratio))
    self.clipfrac = tf.reduce_mean(tf.cast(self.clipped, tf.float32))

    # Optimizers
    pi_lr = 3e-4
    vf_lr = 1e-3
    pi_optimizer = tf.train.AdadeltaOptimizer(learning_rate=pi_lr)
    vf_optimizer = tf.train.AdadeltaOptimizer(learning_rate=vf_lr)
    self.train_pi = pi_optimizer.minimize(self.pi_loss)
    self.train_v = vf_optimizer.minimize(self.v_loss)

    self.train_pi_iters = 80
    self.train_v_iters = 80
    self.target_kl = 0.01

    self.sess = tf.Session()
    self.sess.run(tf.global_variables_initializer())

def __init__(self):
    self.sess = tf.Session()
    self.memory = replay_buffer(max_length=1e5)
    self.tau = 0.995
    self.gamma = 0.99
    self.state_size = 33
    self.output_size = 4
    self.action_limit = 1.0
    self.hidden = [400, 300]
    self.batch_size = 100
    self.pi_lr = 1e-4
    self.q_lr = 1e-4
    self.noise = OU_noise(self.output_size, 1)

    self.x_ph, self.a_ph, self.x2_ph, self.r_ph, self.d_ph = \
        cr.placeholders(self.state_size, self.output_size, self.state_size, None, None)

    with tf.variable_scope('main'):
        self.pi, self.q, self.q_pi = cr.mlp_actor_critic(
            self.x_ph, self.a_ph, self.hidden,
            activation=tf.nn.relu,
            output_activation=tf.tanh,
            output_size=self.output_size,
            action_limit=self.action_limit)

    with tf.variable_scope('target'):
        self.pi_targ, _, self.q_pi_targ = cr.mlp_actor_critic(
            self.x2_ph, self.a_ph, self.hidden,
            activation=tf.nn.relu,
            output_activation=tf.tanh,
            output_size=self.output_size,
            action_limit=self.action_limit)

    self.target = tf.stop_gradient(
        self.r_ph + self.gamma * (1 - self.d_ph) * self.q_pi_targ)

    self.pi_loss = -tf.reduce_mean(self.q_pi)
    self.v_loss = tf.reduce_mean((self.q - self.target) ** 2) * 0.5

    self.pi_optimizer = tf.train.AdamOptimizer(self.pi_lr)
    self.v_optimizer = tf.train.AdamOptimizer(self.q_lr)
    self.pi_train = self.pi_optimizer.minimize(
        self.pi_loss, var_list=cr.get_vars('main/pi'))
    self.v_train = self.v_optimizer.minimize(
        self.v_loss, var_list=cr.get_vars('main/q'))

    self.target_update = tf.group([
        tf.assign(v_targ, self.tau * v_targ + (1 - self.tau) * v_main)
        for v_main, v_targ in zip(cr.get_vars('main'), cr.get_vars('target'))
    ])
    self.target_init = tf.group([
        tf.assign(v_targ, v_main)
        for v_main, v_targ in zip(cr.get_vars('main'), cr.get_vars('target'))
    ])

    self.sess.run(tf.global_variables_initializer())
    self.sess.run(self.target_init)

def __init__(self, opt, job):
    self.opt = opt
    with tf.Graph().as_default():
        tf.set_random_seed(opt.seed)
        np.random.seed(opt.seed)

        # Inputs to computation graph
        self.x_ph, self.a_ph, self.x2_ph = core.placeholders(
            opt.obs_shape, opt.act_shape, opt.obs_shape)

        # ------
        if opt.alpha == 'auto':
            log_alpha = tf.get_variable('log_alpha', dtype=tf.float32, initializer=0.0)
            alpha_v = tf.exp(log_alpha)
        else:
            alpha_v = opt.alpha
        # ------

        # Main outputs from computation graph
        with tf.variable_scope('main'):
            self.mu, self.pi, logp_pi, logp_pi2, q1, q2, q1_pi, q2_pi, q1_mu, q2_mu = \
                actor_critic(self.x_ph, self.x2_ph, self.a_ph, alpha_v,
                             hidden_sizes=opt.hidden_size,
                             action_space=opt.act_space,
                             phase=False,
                             use_bn=opt.use_bn,
                             coefficent_regularizer=opt.c_regularizer,
                             model=opt.model)

        # Set up summary Ops
        self.test_ops, self.test_vars = self.build_summaries()

        self.sess = tf.Session(config=tf.ConfigProto(
            # device_count={'GPU': 0},
            intra_op_parallelism_threads=1,
            inter_op_parallelism_threads=1))
        self.sess.run(tf.global_variables_initializer())

        if job == "main":
            self.writer = tf.summary.FileWriter(
                opt.summary_dir + "/" + str(datetime.datetime.now()) + "-" +
                opt.env_name + "-" + opt.exp_name +
                "-workers_num:" + str(opt.num_workers) + "%" + str(opt.a_l_ratio),
                self.sess.graph)

        variables_all = tf.contrib.framework.get_variables_to_restore()
        variables_bn = [v for v in variables_all
                        if 'moving_mean' in v.name or 'moving_variance' in v.name]
        self.variables = ray.experimental.tf_utils.TensorFlowVariables(
            self.pi, self.sess, input_variables=variables_bn)

def __init__(self):
    self.sess = tf.Session()
    self.state_size = env_set['state']
    self.output_size = env_set['action']
    self.worker_size = env_set['worker']
    self.gamma = env_set['gamma']
    self.lamda = 0.97
    self.hidden = env_set['hidden']
    self.pi_lr = 0.00025
    self.v_lr = 0.00025
    self.ppo_eps = 0.2
    self.epoch = 10

    self.x_ph, self.a_ph, self.adv_ph, self.target_ph, self.logp_old_ph, self.old_value = \
        cr.placeholders(self.state_size, self.output_size, None, None, None, None)

    self.pi, self.logp, self.logp_pi, self.v = cr.ppo_mlp_actor_critic(
        x=self.x_ph,
        a=self.a_ph,
        hidden=self.hidden,
        activation=tf.nn.relu,
        output_activation=None,
        output_size=self.output_size)

    self.all_phs = [self.x_ph, self.a_ph, self.adv_ph,
                    self.target_ph, self.logp_old_ph, self.old_value]
    self.get_action_ops = [self.pi, self.v, self.logp_pi]

    self.ratio = tf.exp(self.logp - self.logp_old_ph)
    self.min_adv = tf.where(self.adv_ph > 0,
                            (1.0 + self.ppo_eps) * self.adv_ph,
                            (1.0 - self.ppo_eps) * self.adv_ph)
    self.pi_loss = -tf.reduce_mean(
        tf.minimum(self.ratio * self.adv_ph, self.min_adv))

    self.clipped_value_loss = self.old_value + tf.clip_by_value(
        self.v - self.old_value, -self.ppo_eps, self.ppo_eps)
    self.v_loss1 = (self.target_ph - self.clipped_value_loss) ** 2
    self.v_loss2 = (self.target_ph - self.v) ** 2
    self.v_loss = 0.5 * tf.reduce_mean(tf.maximum(self.v_loss1, self.v_loss2))

    self.train_pi = tf.train.AdamOptimizer(self.pi_lr).minimize(self.pi_loss)
    self.train_v = tf.train.AdamOptimizer(self.v_lr).minimize(self.v_loss)

    self.approx_kl = tf.reduce_mean(self.logp_old_ph - self.logp)
    self.approx_ent = tf.reduce_mean(-self.logp)

    self.sess.run(tf.global_variables_initializer())

def __init__(self):
    self.sess = tf.Session()
    self.state_size = env_set['state']
    self.output_size = env_set['action']
    self.worker_size = env_set['worker']
    self.gamma = env_set['gamma']
    self.lamda = 0.95
    self.hidden = env_set['hidden']
    self.pi_lr = env_set['pi_lr']  # 0.00025
    self.v_lr = env_set['q_lr']  # 0.00025
    self.ppo_eps = 0.2
    self.train_pi_iter = 10
    self.train_v_iter = 10
    self.target_kl = 0.1
    self.step_per_epoch = 2048

    self.x_ph, self.a_ph, self.adv_ph, self.target_ph, self.logp_old_ph = \
        cr.placeholders(self.state_size, self.output_size, None, None, None)

    self.pi, self.logp, self.logp_pi, self.v, self.std = cr.ppo_mlp_actor_critic(
        x=self.x_ph,
        a=self.a_ph,
        hidden=self.hidden,
        activation=tf.nn.relu,
        output_activation=None,
        output_size=self.output_size)

    self.all_phs = [self.x_ph, self.a_ph, self.adv_ph, self.target_ph, self.logp_old_ph]
    self.get_action_ops = [self.pi, self.v, self.logp_pi]

    self.ratio = tf.exp(self.logp - self.logp_old_ph)
    self.min_adv = tf.where(self.adv_ph > 0,
                            (1.0 + self.ppo_eps) * self.adv_ph,
                            (1.0 - self.ppo_eps) * self.adv_ph)
    self.pi_loss = -tf.reduce_mean(
        tf.minimum(self.ratio * self.adv_ph, self.min_adv))
    self.v_loss = tf.reduce_mean((self.target_ph - self.v) ** 2)

    self.train_pi = tf.train.AdamOptimizer(self.pi_lr).minimize(self.pi_loss)
    self.train_v = tf.train.AdamOptimizer(self.v_lr).minimize(self.v_loss)

    self.approx_kl = tf.reduce_mean(self.logp_old_ph - self.logp)
    self.approx_ent = tf.reduce_mean(-self.logp)

    self.sess.run(tf.global_variables_initializer())

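# The tf.where construction used for `min_adv` in the PPO agents above is a compact
# way of writing the usual clipped surrogate: for positive advantages the clip only
# binds from above at (1 + eps), for negative ones only from below at (1 - eps).
# A standalone NumPy sketch (illustrative only, not part of any agent; the helper
# names below are hypothetical) checking that the two forms agree:
import numpy as np

def clipped_surrogate(ratio, adv, eps=0.2):
    # standard PPO form: clip the ratio, then take the pessimistic minimum
    return -np.mean(np.minimum(ratio * adv, np.clip(ratio, 1.0 - eps, 1.0 + eps) * adv))

def where_form(ratio, adv, eps=0.2):
    # form used above: pre-select the binding clip edge from the sign of adv
    min_adv = np.where(adv > 0, (1.0 + eps) * adv, (1.0 - eps) * adv)
    return -np.mean(np.minimum(ratio * adv, min_adv))

ratio = np.exp(np.random.randn(1000))
adv = np.random.randn(1000)
assert np.isclose(clipped_surrogate(ratio, adv), where_form(ratio, adv))
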
def ppo(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0,
        gru_units=256, trials_per_epoch=100, episodes_per_trial=2, n=100,
        epochs=100, gamma=0.99, clip_ratio=0.2, pi_lr=3e-4, vf_lr=1e-3,
        train_pi_iters=1000, train_v_iters=80, lam=0.97, max_ep_len=1000,
        target_kl=0.01, logger_kwargs=dict(), save_freq=10):
    """
    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: A function which takes in placeholder symbols
            for state, ``x_ph``, and action, ``a_ph``, and returns the main
            outputs from the agent's Tensorflow computation graph:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``pi``       (batch, act_dim)  | Samples actions from policy given
                                           | states.
            ``logp``     (batch,)          | Gives log probability, according to
                                           | the policy, of taking actions ``a_ph``
                                           | in states ``x_ph``.
            ``logp_pi``  (batch,)          | Gives log probability, according to
                                           | the policy, of the action sampled by
                                           | ``pi``.
            ``v``        (batch,)          | Gives the value estimate for states
                                           | in ``x_ph``. (Critical: make sure
                                           | to flatten this!)
            ===========  ================  ======================================

        ac_kwargs (dict): Any kwargs appropriate for the actor_critic
            function you provided to PPO.

        seed (int): Seed for random number generators.

        trials_per_epoch (int): Number of trials collected per epoch; each trial
            is a sequence of ``episodes_per_trial`` episodes that share the
            recurrent state.

        epochs (int): Number of epochs of interaction (equivalent to
            number of policy updates) to perform.

        gamma (float): Discount factor. (Always between 0 and 1.)

        clip_ratio (float): Hyperparameter for clipping in the policy objective.
            Roughly: how far can the new policy go from the old policy while
            still profiting (improving the objective function)? The new policy
            can still go farther than the clip_ratio says, but it doesn't help
            on the objective anymore. (Usually small, 0.1 to 0.3.)

        pi_lr (float): Learning rate for policy optimizer.

        vf_lr (float): Learning rate for value function optimizer.

        train_pi_iters (int): Maximum number of gradient descent steps to take
            on policy loss per epoch. (Early stopping may cause optimizer
            to take fewer than this.)

        train_v_iters (int): Number of gradient descent steps to take on
            value function per epoch.

        lam (float): Lambda for GAE-Lambda. (Always between 0 and 1,
            close to 1.)

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        target_kl (float): Roughly what KL divergence we think is appropriate
            between new and old policies after an update. This will get used
            for early stopping. (Usually small, 0.01 or 0.05.)

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.
    """
    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    tf.set_random_seed(seed)
    np.random.seed(seed)

    env = env_fn()
    obs_dim = env.observation_space.shape
    act_dim = env.action_space.shape

    # Share information about action space with policy architecture
    ac_kwargs['action_space'] = env.action_space

    # Inputs to computation graph
    raw_input_ph = tf.placeholder(dtype=tf.float32, shape=obs_dim, name='raw_input_ph')
    rescale_image_op = tf.image.resize_images(raw_input_ph, [30, 40])
    max_seq_len_ph = tf.placeholder(dtype=tf.int32, shape=(), name='max_seq_len_ph')
    seq_len_ph = tf.placeholder(dtype=tf.int32, shape=(None,))

    # Because we pad zeros at the end of every sequence of length less than max
    # length, we need to mask these zeros out when computing the loss
    seq_len_mask_ph = tf.placeholder(
        dtype=tf.int32, shape=(trials_per_epoch, episodes_per_trial * max_ep_len))

    # rescaled_image_in_ph is a placeholder because we want to be able to pass
    # values to this node manually
    rescaled_image_in_ph = tf.placeholder(
        dtype=tf.float32, shape=[None, 30, 40, 3], name='rescaled_image_in_ph')

    a_ph = core.placeholders_from_spaces(env.action_space)[0]

    conv1 = slim.conv2d(activation_fn=tf.nn.relu, inputs=rescaled_image_in_ph,
                        num_outputs=16, kernel_size=[5, 5], stride=2)
    image_out = slim.flatten(
        slim.conv2d(activation_fn=tf.nn.relu, inputs=conv1,
                    num_outputs=16, kernel_size=[5, 5], stride=2))

    rew_ph, adv_ph, ret_ph, logp_old_ph = core.placeholders(1, None, None, None)
    rnn_state_ph = tf.placeholder(tf.float32, [None, gru_units], name='pi_rnn_state_ph')

    # Main outputs from computation graph
    action_encoder_matrix = np.load(r'encoder.npy')
    pi, logp, logp_pi, v, rnn_state, logits, seq_len_vec, tmp_vec = actor_critic(
        image_out, a_ph, rew_ph, rnn_state_ph, gru_units, max_seq_len_ph,
        action_encoder_matrix, seq_len=seq_len_ph, action_space=env.action_space)

    # Need all placeholders in *this* order later (to zip with data from buffer)
    all_phs = [rescaled_image_in_ph, a_ph, adv_ph, ret_ph, logp_old_ph, rew_ph]

    # Every step, get: action, value, and logprob
    get_action_ops = [pi, v, logp_pi, rnn_state, logits]

    # Experience buffer
    buffer_size = trials_per_epoch * episodes_per_trial * max_ep_len
    buf = PPOBuffer(rescaled_image_in_ph.get_shape().as_list()[1:], act_dim,
                    buffer_size, trials_per_epoch, gamma, lam)

    # Count variables
    var_counts = tuple(core.count_vars(scope) for scope in ['pi', 'v'])
    logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts)

    # PPO objectives
    ratio = tf.exp(logp - logp_old_ph)  # pi(a|s) / pi_old(a|s)
    min_adv = tf.where(adv_ph > 0, (1 + clip_ratio) * adv_ph, (1 - clip_ratio) * adv_ph)

    # Need to mask out the padded zeros when computing the loss
    sequence_mask = tf.sequence_mask(seq_len_ph, episodes_per_trial * max_ep_len)
    # Convert bool tensor to a float tensor of 1s and 0s
    sequence_mask = tf.where(
        sequence_mask,
        np.ones(dtype=np.float32, shape=(trials_per_epoch, episodes_per_trial * max_ep_len)),
        np.zeros(dtype=np.float32, shape=(trials_per_epoch, episodes_per_trial * max_ep_len)))

    # Need to reshape because ratio is a 1-D vector (a concatenation of all
    # sequences); reshape for masking, then reshape back
    pi_loss_vec = tf.multiply(
        sequence_mask,
        tf.reshape(tf.minimum(ratio * adv_ph, min_adv), tf.shape(sequence_mask)))
    pi_loss = -tf.reduce_mean(tf.reshape(pi_loss_vec, tf.shape(ratio)))

    v_loss_vec = tf.multiply(
        sequence_mask, tf.reshape((ret_ph - v) ** 2, tf.shape(sequence_mask)))
    v_loss = tf.reduce_mean(tf.reshape(v_loss_vec, tf.shape(v)))

    # Info (useful to watch during learning)
    approx_kl = tf.reduce_mean(logp_old_ph - logp)  # a sample estimate for KL-divergence, easy to compute
    approx_ent = tf.reduce_mean(-logp)  # a sample estimate for entropy, also easy to compute
    clipped = tf.logical_or(ratio > (1 + clip_ratio), ratio < (1 - clip_ratio))
    clipfrac = tf.reduce_mean(tf.cast(clipped, tf.float32))

    # Optimizers
    train_pi = MpiAdamOptimizer(learning_rate=pi_lr).minimize(pi_loss)
    train_v = MpiAdamOptimizer(learning_rate=vf_lr).minimize(v_loss)
    train = MpiAdamOptimizer(learning_rate=1e-4).minimize(
        pi_loss + 0.01 * v_loss - 0.001 * approx_ent)

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    # Sync params across processes
    sess.run(sync_all_params())

    # Setup model saving
    logger.setup_tf_saver(sess, inputs={'rescaled_image_in': rescaled_image_in_ph},
                          outputs={'pi': pi, 'v': v})

    def update():
        print(f'Start updating at {datetime.now()}')
        inputs = {k: v for k, v in zip(all_phs, buf.get())}
        inputs[rnn_state_ph] = np.zeros((trials_per_epoch, gru_units), np.float32)
        inputs[max_seq_len_ph] = int(episodes_per_trial * max_ep_len)
        inputs[seq_len_ph] = buf.seq_len_buf

        pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent], feed_dict=inputs)
        buf.reset()

        # Training
        print(f'sequence length = {sess.run(seq_len_vec, feed_dict=inputs)}')
        for i in range(train_pi_iters):
            _, kl, pi_loss_i, v_loss_i, ent = sess.run(
                [train_pi, approx_kl, pi_loss, v_loss, approx_ent], feed_dict=inputs)
            print(f'i: {i}, pi_loss: {pi_loss_i}, v_loss: {v_loss_i}, entropy: {ent}')
        logger.store(StopIter=i)

        # Log changes from update
        pi_l_new, v_l_new, kl, cf = sess.run(
            [pi_loss, v_loss, approx_kl, clipfrac], feed_dict=inputs)
        logger.store(LossPi=pi_l_old, LossV=v_l_old, KL=kl, Entropy=ent, ClipFrac=cf,
                     DeltaLossPi=(pi_l_new - pi_l_old),
                     DeltaLossV=(v_l_new - v_l_old))
        print(f'Updating finished at {datetime.now()}')

    start_time = time.time()
    o, r, d, ep_ret, ep_len = env.reset(), np.zeros(1), False, 0, 0

    def recenter_rgb(image, min=0.0, max=255.0):
        """Return an image with RGB values re-centered to [-1, 1]."""
        mid = (min + max) / 2.0
        return np.apply_along_axis(func1d=lambda x: (x - mid) / mid, axis=2, arr=image)

    o_rescaled = recenter_rgb(sess.run(rescale_image_op, feed_dict={raw_input_ph: o}))

    # Main loop: collect experience in env and update/log each epoch
    for epoch in range(epochs):
        for trial in range(trials_per_epoch):
            # TODO: tweak settings to match the paper
            # TODO: find a way to generate mazes
            last_a = np.array(0)
            last_r = np.array(r)
            last_rnn_state = np.zeros((1, gru_units), np.float32)
            step_counter = 0
            for episode in range(episodes_per_trial):
                o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
                o_rescaled = recenter_rgb(
                    sess.run(rescale_image_op, feed_dict={raw_input_ph: o}))
                action_dict = defaultdict(int)
                # dirty hard coding to make it print in order
                action_dict[0] = 0
                action_dict[1] = 0
                action_dict[2] = 0
                for step in range(max_ep_len):
                    a, v_t, logp_t, rnn_state_t, logits_t = sess.run(
                        get_action_ops,
                        feed_dict={
                            rescaled_image_in_ph: np.expand_dims(o_rescaled, 0),
                            a_ph: last_a.reshape(-1,),
                            rew_ph: last_r.reshape(-1, 1),
                            rnn_state_ph: last_rnn_state,
                            # v_rnn_state_ph: last_v_rnn_state,
                            max_seq_len_ph: 1,
                            seq_len_ph: [1]})
                    action_dict[a[0]] += 1

                    # save and log
                    buf.store(o_rescaled, a, r, v_t, logp_t)
                    logger.store(VVals=v_t)

                    o, r, d, _ = env.step(a[0])
                    step_counter += 1
                    o_rescaled = recenter_rgb(
                        sess.run(rescale_image_op, feed_dict={raw_input_ph: o}))
                    ep_ret += r
                    ep_len += 1

                    last_a = a[0]
                    last_r = np.array(r)
                    last_rnn_state = rnn_state_t

                    terminal = d or (ep_len == max_ep_len)
                    if terminal or (step == n - 1):
                        if not terminal:
                            print('Warning: trajectory cut off by epoch at %d steps.' % ep_len)
                        # if trajectory didn't reach terminal state, bootstrap value target
                        last_val = r if d else sess.run(
                            v,
                            feed_dict={
                                rescaled_image_in_ph: np.expand_dims(o_rescaled, 0),
                                a_ph: last_a.reshape(-1,),
                                rew_ph: last_r.reshape(-1, 1),
                                rnn_state_ph: last_rnn_state,
                                max_seq_len_ph: 1,
                                seq_len_ph: [1]})
                        buf.finish_path(last_val)
                        logger.store(EpRet=ep_ret, EpLen=ep_len)
                        print(f'episode terminated with {step} steps. '
                              f'epoch:{epoch} trial:{trial} episode:{episode}')
                        break
                print(action_dict)
            if step_counter < episodes_per_trial * max_ep_len:
                # pad zeros to sequence buffer after each trial
                buf.pad_zeros(episodes_per_trial * max_ep_len - step_counter)
            buf.seq_len_buf[trial] = step_counter

        # Save model
        if (epoch % save_freq == 0) or (epoch == epochs - 1):
            logger.save_state({'env': env}, None)

        # Perform PPO update!
        update()

        # Log info about epoch
        logger.log_tabular('Epoch', epoch)
        logger.log_tabular('EpRet', with_min_and_max=True)
        logger.log_tabular('EpLen', average_only=True)
        logger.log_tabular('VVals', with_min_and_max=True)
        logger.log_tabular('TotalEnvInteracts',
                           (epoch + 1) * trials_per_epoch * episodes_per_trial * max_ep_len)
        logger.log_tabular('LossPi', average_only=True)
        logger.log_tabular('LossV', average_only=True)
        logger.log_tabular('DeltaLossPi', average_only=True)
        logger.log_tabular('DeltaLossV', average_only=True)
        logger.log_tabular('Entropy', average_only=True)
        logger.log_tabular('KL', average_only=True)
        logger.log_tabular('ClipFrac', average_only=True)
        logger.log_tabular('StopIter', average_only=True)
        logger.log_tabular('Time', time.time() - start_time)
        logger.dump_tabular()

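# A minimal usage sketch for the recurrent PPO routine above. `make_maze_env` is a
# hypothetical Gym-compatible environment factory (the trainer also expects an
# 'encoder.npy' action-encoder matrix on disk); the actor_critic and logger kwargs
# follow the spinup-style conventions assumed by the function signature.
if __name__ == '__main__':
    ppo(env_fn=make_maze_env,
        actor_critic=core.mlp_actor_critic,
        gru_units=256,
        trials_per_epoch=100,
        episodes_per_trial=2,
        epochs=100,
        gamma=0.99,
        clip_ratio=0.2,
        logger_kwargs=dict(output_dir='/tmp/rl2_ppo', exp_name='rl2_ppo'))
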
def __init__(self):
    self.sess = tf.Session()
    self.state_size = 33
    self.output_size = 4
    self.tau = 0.995
    self.gamma = 0.99
    self.hidden = [400, 300]
    self.batch_size = 64
    self.pi_lr = 1e-3
    self.q_lr = 1e-3
    self.action_limit = 1.0
    self.memory = replay_buffer(1e5)
    self.target_noise = 0.2
    self.noise = OU_noise(self.output_size, 1)
    self.noise_clip = 0.1

    self.x_ph, self.a_ph, self.x2_ph, self.r_ph, self.d_ph = \
        cr.placeholders(self.state_size, self.output_size, self.state_size, None, None)

    with tf.variable_scope('main'):
        self.pi, self.q1, self.q2, self.q1_pi = cr.td3_mlp_actor_critic(
            x=self.x_ph, a=self.a_ph, hidden=self.hidden,
            activation=tf.nn.relu, output_activation=tf.tanh,
            output_size=self.output_size, action_limit=self.action_limit)

    with tf.variable_scope('target'):
        self.pi_targ, _, _, _ = cr.td3_mlp_actor_critic(
            x=self.x2_ph, a=self.a_ph, hidden=self.hidden,
            activation=tf.nn.relu, output_activation=tf.tanh,
            output_size=self.output_size, action_limit=self.action_limit)

    with tf.variable_scope('target', reuse=True):
        # Target policy smoothing: add clipped noise to the target action
        self.eps = tf.random_normal(tf.shape(self.pi_targ), stddev=self.target_noise)
        self.epsilon = tf.clip_by_value(self.eps, -self.noise_clip, self.noise_clip)
        self.a_prev = self.pi_targ + self.epsilon
        self.a2 = tf.clip_by_value(self.a_prev, -self.action_limit, self.action_limit)
        _, self.q1_targ, self.q2_targ, self.q1_pi_targ = cr.td3_mlp_actor_critic(
            x=self.x2_ph, a=self.a2, hidden=self.hidden,
            activation=tf.nn.relu, output_activation=tf.tanh,
            output_size=self.output_size, action_limit=self.action_limit)

    self.pi_params = cr.get_vars('main/pi')
    self.q_params = cr.get_vars('main/q')

    # Clipped double-Q target
    self.min_q_targ = tf.minimum(self.q1_targ, self.q2_targ)
    self.backup = tf.stop_gradient(
        self.r_ph + self.gamma * (1 - self.d_ph) * self.min_q_targ)

    self.pi_loss = -tf.reduce_mean(self.q1_pi)
    self.q1_loss = tf.reduce_mean((self.q1 - self.backup) ** 2)
    self.q2_loss = tf.reduce_mean((self.q2 - self.backup) ** 2)
    self.v_loss = self.q1_loss + self.q2_loss

    self.pi_optimizer = tf.train.AdamOptimizer(self.pi_lr)
    self.q_optimizer = tf.train.AdamOptimizer(self.q_lr)
    self.pi_train = self.pi_optimizer.minimize(self.pi_loss, var_list=self.pi_params)
    self.v_train = self.q_optimizer.minimize(self.v_loss, var_list=self.q_params)

    self.target_update = tf.group([
        tf.assign(v_targ, self.tau * v_targ + (1 - self.tau) * v_main)
        for v_main, v_targ in zip(cr.get_vars('main'), cr.get_vars('target'))
    ])
    self.target_init = tf.group([
        tf.assign(v_targ, v_main)
        for v_main, v_targ in zip(cr.get_vars('main'), cr.get_vars('target'))
    ])

    self.sess.run(tf.global_variables_initializer())
    self.sess.run(self.target_init)

def ddpg(env_config, ac_type, ac_kwargs, rb_type, rb_kwargs, gamma, lr, polyak,
         batch_size, epochs, start_steps, steps_per_epoch, inc_ep, max_ep_len,
         test_max_ep_len, number_of_tests_per_epoch, act_noise, logger_kwargs, seed):
    logger = EpochLogger(**logger_kwargs)
    configs = locals().copy()
    configs.pop("logger")
    logger.save_config(configs)

    tf.set_random_seed(seed)
    np.random.seed(seed)

    env, test_env = make_env(env_config), make_env(env_config)
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]

    # Action limit for clamping: critically, assumes all dimensions share the same bound!
    act_high = env.action_space.high

    # Inputs to computation graph
    x_ph, a_ph, x2_ph, r_ph, d_ph = core.placeholders(obs_dim, act_dim, obs_dim, None, None)

    actor_critic = core.get_ddpg_actor_critic(ac_type)

    # Main outputs from computation graph
    with tf.variable_scope('main'):
        pi, q, q_pi = actor_critic(x_ph, a_ph, **ac_kwargs)

    # Target networks
    with tf.variable_scope('target'):
        pi_targ, _, q_pi_targ = actor_critic(x2_ph, a_ph, **ac_kwargs)

    # Experience buffer
    RB = get_replay_buffer(rb_type)
    replay_buffer = RB(obs_dim, act_dim, **rb_kwargs)

    # Count variables
    var_counts = tuple(core.count_vars(scope) for scope in ['main/pi', 'main/q', 'main'])
    print('\nNumber of parameters: \t pi: %d, \t q: %d, \t total: %d\n' % var_counts)

    # Bellman backup for Q function
    backup = tf.stop_gradient(r_ph + gamma * (1 - d_ph) * q_pi_targ)

    # DDPG losses
    pi_loss = -tf.reduce_mean(q_pi)
    q_loss = tf.reduce_mean((q - backup) ** 2)

    # Separate train ops for pi, q
    pi_optimizer = tf.train.AdamOptimizer(learning_rate=lr)
    q_optimizer = tf.train.AdamOptimizer(learning_rate=lr)
    train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi'))
    train_q_op = q_optimizer.minimize(q_loss, var_list=get_vars('main/q'))

    # Polyak averaging for target variables
    target_update = tf.group([
        tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main)
        for v_main, v_targ in zip(get_vars('main'), get_vars('target'))
    ])

    # Initializing targets to match main variables
    target_init = tf.group([
        tf.assign(v_targ, v_main)
        for v_main, v_targ in zip(get_vars('main'), get_vars('target'))
    ])

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    sess.run(target_init)

    def get_action(o, noise_scale):
        pi_a = sess.run(pi, feed_dict={x_ph: o.reshape(1, -1)})[0]
        pi_a += noise_scale * np.random.randn(act_dim)
        pi_a = np.clip(pi_a, 0, 1)
        real_a = pi_a * act_high
        return pi_a, real_a

    def test_agent(n=10):
        test_actions = []
        for j in range(n):
            test_actions_ep = []
            o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0
            while not (d or (ep_len == test_max_ep_len)):
                # Take deterministic actions at test time (noise_scale=0)
                _, real_a = get_action(o, 0)
                test_actions_ep.append(real_a)
                o, r, d, _ = test_env.step(real_a)
                ep_ret += r
                ep_len += 1
            logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)
            test_actions.append(test_actions_ep)
        return test_actions

    start_time = time.time()
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
    total_steps = steps_per_epoch * epochs

    actions = []
    epoch_actions = []
    rewards = []
    rets = []
    test_rets = []
    max_ret = None

    # Main loop: collect experience in env and update/log each epoch
    for t in range(total_steps):
        """
        Until start_steps have elapsed, randomly sample actions
        from a uniform distribution for better exploration. Afterwards,
        use the learned policy (with some noise, via act_noise).
        """
        if t > start_steps:
            pi_a, real_a = get_action(o, act_noise)
        else:
            # Random exploration in the normalized [0, 1] policy action space,
            # matching the convention used by get_action above
            pi_a = np.random.random_sample(act_dim)
            real_a = pi_a * act_high

        # Step the env
        o2, r, d, _ = env.step(real_a)
        ep_ret += r
        ep_len += 1
        epoch_actions.append(pi_a)

        # Ignore the "done" signal if it comes from hitting the time
        # horizon (that is, when it's an artificial terminal signal
        # that isn't based on the agent's state)
        d = False if ep_len == max_ep_len else d

        # Store experience to replay buffer
        replay_buffer.store(o, pi_a, r, o2, d)

        # Super critical, easy to overlook step: make sure to update
        # most recent observation!
        o = o2

        if d or (ep_len == max_ep_len):
            """
            Perform all DDPG updates at the end of the trajectory,
            in accordance with tuning done by TD3 paper authors.
            """
            for _ in range(ep_len):
                batch = replay_buffer.sample_batch(batch_size)
                feed_dict = {
                    x_ph: batch['obs1'],
                    x2_ph: batch['obs2'],
                    a_ph: batch['acts'],
                    r_ph: batch['rews'],
                    d_ph: batch['done']
                }

                # Q-learning update
                outs = sess.run([q_loss, q, train_q_op], feed_dict)
                logger.store(LossQ=outs[0], QVals=outs[1])

                # Policy update
                outs = sess.run([pi_loss, train_pi_op, target_update], feed_dict)
                logger.store(LossPi=outs[0])

            logger.store(EpRet=ep_ret, EpLen=ep_len)
            actions.append(np.mean(epoch_actions))
            epoch_actions = []
            rewards.append(ep_ret)
            o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

        # End of epoch wrap-up
        if (t + 1) % steps_per_epoch == 0:
            epoch = (t + 1) // steps_per_epoch

            # Test the performance of the deterministic version of the agent.
            test_actions = test_agent(number_of_tests_per_epoch)

            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            ret = logger.log_tabular('EpRet', average_only=True)[0]
            test_ret = logger.log_tabular('TestEpRet', average_only=True)[0]
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('TestEpLen', average_only=True)
            logger.log_tabular('QVals', average_only=True)
            logger.log_tabular('LossPi', average_only=True)
            logger.log_tabular('LossQ', average_only=True)
            logger.log_tabular('Time', time.time() - start_time)
            logger.dump_tabular()

            rets.append(ret)
            test_rets.append(test_ret)

            if max_ret is None or test_ret > max_ret:
                max_ret = test_ret
                best_test_actions = test_actions

            max_ep_len += inc_ep
            util.plot_actions(test_actions, act_high,
                              logger.output_dir + '/actions%s.png' % epoch)

            logger.save_state(
                {
                    "actions": actions,
                    "rewards": rewards,
                    "best_test_actions": best_test_actions,
                    "rets": rets,
                    "test_rets": test_rets,
                    "max_ret": max_ret
                }, None)
            util.plot_actions(best_test_actions, act_high,
                              logger.output_dir + '/best_test_actions.png')
            logger.log("max ret: %f" % max_ret)

def iac(env_config, ac_type, ac_kwargs, rb_type, rb_kwargs, gamma, lr, polyak,
        batch_size, epochs, start_steps, steps_per_epoch, inc_ep, max_ep_len,
        test_max_ep_len, number_of_tests_per_epoch, q_pi_sample_size, z_dim, z_type,
        act_noise, test_without_state, logger_kwargs, seed):
    logger = EpochLogger(**logger_kwargs)
    configs = locals().copy()
    configs.pop("logger")
    logger.save_config(configs)

    tf.set_random_seed(seed)
    np.random.seed(seed)

    env, test_env = make_env(env_config), make_env(env_config)
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]
    act_high = env.action_space.high

    # Inputs to computation graph
    x_ph, a_ph, z_ph, x2_ph, r_ph, d_ph = core.placeholders(
        obs_dim, act_dim, z_dim, obs_dim, None, None)

    actor_critic = core.get_iac_actor_critic(ac_type)

    # Main outputs from computation graph
    with tf.variable_scope('main'):
        pi, q1, q2, q1_pi, q2_pi, v = actor_critic(x_ph, a_ph, z_ph, **ac_kwargs)

    # Target networks
    with tf.variable_scope('target'):
        _, _, _, _, _, v_targ = actor_critic(x2_ph, a_ph, z_ph, **ac_kwargs)

    # Experience buffer
    RB = get_replay_buffer(rb_type)
    replay_buffer = RB(obs_dim, act_dim, **rb_kwargs)

    # Count variables
    var_counts = tuple(core.count_vars(scope)
                       for scope in ['main/pi', 'main/q', 'main/v', 'main'])
    print('\nNumber of parameters: \t pi: %d, \t q: %d, \t v: %d, \t total: %d\n'
          % var_counts)

    # Bellman backup for Q and V functions
    q_backup = tf.stop_gradient(r_ph + gamma * (1 - d_ph) * v_targ)
    min_q_pi = tf.minimum(q1_pi, q2_pi)
    v_backup = tf.stop_gradient(min_q_pi)

    # TD3-style losses
    pi_loss = -tf.reduce_mean(q1_pi)
    q1_loss = 0.5 * tf.reduce_mean((q1 - q_backup) ** 2)
    q2_loss = 0.5 * tf.reduce_mean((q2 - q_backup) ** 2)
    v_loss = 0.5 * tf.reduce_mean((v - v_backup) ** 2)
    value_loss = q1_loss + q2_loss + v_loss

    # Separate train ops for pi, q
    policy_optimizer = tf.train.AdamOptimizer(learning_rate=lr)
    value_optimizer = tf.train.AdamOptimizer(learning_rate=lr)
    train_policy_op = policy_optimizer.minimize(pi_loss, var_list=get_vars('main/pi'))
    if ac_kwargs["pi_separate"]:
        train_policy_emb_op = policy_optimizer.minimize(
            pi_loss, var_list=get_vars('main/pi/emb'))
        train_policy_d_op = policy_optimizer.minimize(
            pi_loss, var_list=get_vars('main/pi/d'))
    train_value_op = value_optimizer.minimize(
        value_loss, var_list=get_vars('main/q') + get_vars('main/v'))

    # Polyak averaging for target variables
    target_update = tf.group([
        tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main)
        for v_main, v_targ in zip(get_vars('main'), get_vars('target'))
    ])

    # Initializing targets to match main variables
    target_init = tf.group([
        tf.assign(v_targ, v_main)
        for v_main, v_targ in zip(get_vars('main'), get_vars('target'))
    ])

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    sess.run(target_init)

    def sample_z(size):
        if z_type == "uniform":
            return np.random.random_sample(size=size)
        elif z_type == "gaussian":
            return np.random.normal(size=size)
        else:
            raise Exception("z_type error")

    def get_action(o, noise_scale):
        pi_a = sess.run(pi, feed_dict={
            x_ph: o.reshape(1, -1),
            z_ph: sample_z((1, z_dim))
        })[0]
        pi_a += noise_scale * np.random.randn(act_dim)
        pi_a = np.clip(pi_a, 0, 1)
        real_a = pi_a * act_high
        return pi_a, real_a

    def test_agent(n=10):
        test_actions = []
        for j in range(n):
            test_actions_ep = []
            o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0
            while not (d or (ep_len == test_max_ep_len)):
                # Take deterministic actions at test time (noise_scale=0)
                if test_without_state:
                    _, real_a = get_action(np.zeros(o.shape), 0)
                else:
                    _, real_a = get_action(o, 0)
                test_actions_ep.append(real_a)
                o, r, d, _ = test_env.step(real_a)
                ep_ret += r
                ep_len += 1
            logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)
            test_actions.append(test_actions_ep)
        return test_actions

    start_time = time.time()
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
    total_steps = steps_per_epoch * epochs

    rewards = []
    rets = []
    test_rets = []
    max_ret = None

    # Main loop: collect experience in env and update/log each epoch
    for t in range(total_steps):
        """
        Until start_steps have elapsed, randomly sample actions
        from a uniform distribution for better exploration. Afterwards,
        use the learned policy (with some noise, via act_noise).
        """
        if t > start_steps:
            pi_a, real_a = get_action(o, act_noise)
        else:
            # Random exploration in the normalized [0, 1] policy action space,
            # matching the convention used by get_action above
            pi_a = np.random.random_sample(act_dim)
            real_a = pi_a * act_high

        # Step the env
        o2, r, d, _ = env.step(real_a)
        ep_ret += r
        ep_len += 1

        # Ignore the "done" signal if it comes from hitting the time
        # horizon (that is, when it's an artificial terminal signal
        # that isn't based on the agent's state)
        d = False if ep_len == max_ep_len else d

        # Store experience to replay buffer
        replay_buffer.store(o, pi_a, r, o2, d)

        # Super critical, easy to overlook step: make sure to update
        # most recent observation!
        o = o2

        if d or (ep_len == max_ep_len):
            for _ in range(ep_len):
                batch = replay_buffer.sample_batch(batch_size)
                feed_dict = {
                    x_ph: batch['obs1'],
                    x2_ph: batch['obs2'],
                    a_ph: batch['acts'],
                    r_ph: batch['rews'],
                    d_ph: batch['done']
                }
                feed_dict[z_ph] = sample_z((batch_size, z_dim))

                # Policy Learning update
                for key in feed_dict:
                    feed_dict[key] = np.repeat(feed_dict[key], q_pi_sample_size, axis=0)
                feed_dict[z_ph] = sample_z((batch_size * q_pi_sample_size, z_dim))
                if ac_kwargs["pi_separate"]:
                    if len(rewards) % 2 == 0:
                        outs = sess.run([pi_loss, train_policy_emb_op], feed_dict)
                    else:
                        outs = sess.run([pi_loss, train_policy_d_op], feed_dict)
                else:
                    outs = sess.run([pi_loss, train_policy_op], feed_dict)
                logger.store(LossPi=outs[0])

                # Q-learning update
                outs = sess.run([q1_loss, v_loss, q1, v, train_value_op], feed_dict)
                logger.store(LossQ=outs[0], LossV=outs[1], ValueQ=outs[2], ValueV=outs[3])

            logger.store(EpRet=ep_ret, EpLen=ep_len)
            rewards.append(ep_ret)
            o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

        # End of epoch wrap-up
        if (t + 1) % steps_per_epoch == 0:
            epoch = (t + 1) // steps_per_epoch

            # Test the performance of the deterministic version of the agent.
            test_actions = test_agent(number_of_tests_per_epoch)

            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            ret = logger.log_tabular('EpRet', average_only=True)[0]
            test_ret = logger.log_tabular('TestEpRet', average_only=True)[0]
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('TestEpLen', average_only=True)
            logger.log_tabular('LossPi', average_only=True)
            logger.log_tabular('LossQ', average_only=True)
            logger.log_tabular('LossV', average_only=True)
            logger.log_tabular('ValueQ', average_only=True)
            logger.log_tabular('ValueV', average_only=True)
            logger.log_tabular('Time', time.time() - start_time)
            logger.dump_tabular()

            rets.append(ret)
            test_rets.append(test_ret)

            if max_ret is None or test_ret > max_ret:
                max_ret = test_ret
                best_test_actions = test_actions

            max_ep_len += inc_ep
            sess.run(target_update, feed_dict)

            logger.save_state(
                {
                    "rewards": rewards,
                    "best_test_actions": best_test_actions,
                    "rets": rets,
                    "test_rets": test_rets,
                    "max_ret": max_ret
                }, None)
            util.plot_actions(best_test_actions, act_high,
                              logger.output_dir + '/best_test_actions.png')
            logger.log("max ret: %f" % max_ret)

def td3(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0,
        steps_per_epoch=5000, epochs=100, replay_size=int(1e6), gamma=0.99,
        polyak=0.995, pi_lr=1e-3, q_lr=1e-3, batch_size=100, start_steps=10000,
        act_noise=0.1, target_noise=0.2, noise_clip=0.5, policy_delay=2,
        max_ep_len=1000, logger_kwargs=dict(), save_freq=1):
    """
    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: A function which takes in placeholder symbols
            for state, ``x_ph``, and action, ``a_ph``, and returns the main
            outputs from the agent's Tensorflow computation graph:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``pi``       (batch, act_dim)  | Deterministically computes actions
                                           | from policy given states.
            ``q1``       (batch,)          | Gives one estimate of Q* for
                                           | states in ``x_ph`` and actions in
                                           | ``a_ph``.
            ``q2``       (batch,)          | Gives another estimate of Q* for
                                           | states in ``x_ph`` and actions in
                                           | ``a_ph``.
            ``q1_pi``    (batch,)          | Gives the composition of ``q1`` and
                                           | ``pi`` for states in ``x_ph``:
                                           | q1(x, pi(x)).
            ===========  ================  ======================================

        ac_kwargs (dict): Any kwargs appropriate for the actor_critic
            function you provided to TD3.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs)
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs to run and train agent.

        replay_size (int): Maximum length of replay buffer.

        gamma (float): Discount factor. (Always between 0 and 1.)

        polyak (float): Interpolation factor in polyak averaging for target
            networks. Target networks are updated towards main networks
            according to:

            .. math:: \\theta_{\\text{targ}} \\leftarrow
                \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta

            where :math:`\\rho` is polyak. (Always between 0 and 1, usually
            close to 1.)

        pi_lr (float): Learning rate for policy.

        q_lr (float): Learning rate for Q-networks.

        batch_size (int): Minibatch size for SGD.

        start_steps (int): Number of steps for uniform-random action selection,
            before running real policy. Helps exploration.

        act_noise (float): Stddev for Gaussian exploration noise added to
            policy at training time. (At test time, no noise is added.)

        target_noise (float): Stddev for smoothing noise added to target policy.

        noise_clip (float): Limit for absolute value of target policy
            smoothing noise.

        policy_delay (int): Policy will only be updated once every
            policy_delay times for each update of the Q-networks.

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.
    """
    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    tf.set_random_seed(seed)
    np.random.seed(seed)

    env, test_env = env_fn(), env_fn()
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]

    # Action limit for clamping: critically, assumes all dimensions share the same bound!
    act_limit = env.action_space.high[0]

    # Share information about action space with policy architecture
    ac_kwargs['action_space'] = env.action_space

    # Inputs to computation graph
    x_ph, a_ph, x2_ph, r_ph, d_ph = core.placeholders(obs_dim, act_dim, obs_dim, None, None)

    # Main outputs from computation graph
    with tf.variable_scope('main'):
        pi, q1, q2, q1_pi = actor_critic(x_ph, a_ph, **ac_kwargs)

    # Target policy network
    with tf.variable_scope('target'):
        pi_targ, _, _, _ = actor_critic(x2_ph, a_ph, **ac_kwargs)

    # Target Q networks
    with tf.variable_scope('target', reuse=True):
        # Target policy smoothing, by adding clipped noise to target actions
        epsilon = tf.random_normal(tf.shape(pi_targ), stddev=target_noise)
        epsilon = tf.clip_by_value(epsilon, -noise_clip, noise_clip)
        a2 = pi_targ + epsilon
        a2 = tf.clip_by_value(a2, -act_limit, act_limit)

        # Target Q-values, using action from target policy
        _, q1_targ, q2_targ, _ = actor_critic(x2_ph, a2, **ac_kwargs)

    # Experience buffer
    replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size)

    # Count variables
    var_counts = tuple(core.count_vars(scope)
                       for scope in ['main/pi', 'main/q1', 'main/q2', 'main'])
    print('\nNumber of parameters: \t pi: %d, \t q1: %d, \t q2: %d, \t total: %d\n'
          % var_counts)

    # Bellman backup for Q functions, using Clipped Double-Q targets
    min_q_targ = tf.minimum(q1_targ, q2_targ)
    backup = tf.stop_gradient(r_ph + gamma * (1 - d_ph) * min_q_targ)

    # TD3 losses
    pi_loss = -tf.reduce_mean(q1_pi)
    q1_loss = tf.reduce_mean((q1 - backup) ** 2)
    q2_loss = tf.reduce_mean((q2 - backup) ** 2)
    q_loss = q1_loss + q2_loss

    # Separate train ops for pi, q
    pi_optimizer = tf.train.AdamOptimizer(learning_rate=pi_lr)
    q_optimizer = tf.train.AdamOptimizer(learning_rate=q_lr)
    train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi'))
    train_q_op = q_optimizer.minimize(q_loss, var_list=get_vars('main/q'))

    # Polyak averaging for target variables
    target_update = tf.group([
        tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main)
        for v_main, v_targ in zip(get_vars('main'), get_vars('target'))
    ])

    # Initializing targets to match main variables
    target_init = tf.group([
        tf.assign(v_targ, v_main)
        for v_main, v_targ in zip(get_vars('main'), get_vars('target'))
    ])

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    sess.run(target_init)

    # Setup model saving
    logger.setup_tf_saver(sess, inputs={'x': x_ph, 'a': a_ph},
                          outputs={'pi': pi, 'q1': q1, 'q2': q2})

    def get_action(o, noise_scale):
        a = sess.run(pi, feed_dict={x_ph: o.reshape(1, -1)})[0]
        a += noise_scale * np.random.randn(act_dim)
        return np.clip(a, -act_limit, act_limit)

    def test_agent(n=10):
        for j in range(n):
            o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0
            while not (d or (ep_len == max_ep_len)):
                # Take deterministic actions at test time (noise_scale=0)
                o, r, d, _ = test_env.step(get_action(o, 0))
                ep_ret += r
                ep_len += 1
            logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)

    start_time = time.time()
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
    total_steps = steps_per_epoch * epochs

    # Main loop: collect experience in env and update/log each epoch
    for t in range(total_steps):
        """
        Until start_steps have elapsed, randomly sample actions
        from a uniform distribution for better exploration. Afterwards,
        use the learned policy (with some noise, via act_noise).
        """
        if t > start_steps:
            a = get_action(o, act_noise)
        else:
            a = env.action_space.sample()

        # Step the env
        o2, r, d, _ = env.step(a)
        ep_ret += r
        ep_len += 1

        # Ignore the "done" signal if it comes from hitting the time
        # horizon (that is, when it's an artificial terminal signal
        # that isn't based on the agent's state)
        d = False if ep_len == max_ep_len else d

        # Store experience to replay buffer
        replay_buffer.store(o, a, r, o2, d)

        # Super critical, easy to overlook step: make sure to update
        # most recent observation!
        o = o2

        if d or (ep_len == max_ep_len):
            """
            Perform all TD3 updates at the end of the trajectory
            (in accordance with source code of TD3 published by
            original authors).
            """
            for j in range(ep_len):
                batch = replay_buffer.sample_batch(batch_size)
                feed_dict = {
                    x_ph: batch['obs1'],
                    x2_ph: batch['obs2'],
                    a_ph: batch['acts'],
                    r_ph: batch['rews'],
                    d_ph: batch['done']
                }
                q_step_ops = [q_loss, q1, q2, train_q_op]
                outs = sess.run(q_step_ops, feed_dict)
                logger.store(LossQ=outs[0], Q1Vals=outs[1], Q2Vals=outs[2])

                if j % policy_delay == 0:
                    # Delayed policy update
                    outs = sess.run([pi_loss, train_pi_op, target_update], feed_dict)
                    logger.store(LossPi=outs[0])

            logger.store(EpRet=ep_ret, EpLen=ep_len)
            o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

        # End of epoch wrap-up
        if t > 0 and t % steps_per_epoch == 0:
            epoch = t // steps_per_epoch

            # Save model
            if (epoch % save_freq == 0) or (epoch == epochs - 1):
                logger.save_state({'env': env}, None)

            # Test the performance of the deterministic version of the agent.
            test_agent()

            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('EpRet', with_min_and_max=True)
            logger.log_tabular('TestEpRet', with_min_and_max=True)
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('TestEpLen', average_only=True)
            logger.log_tabular('TotalEnvInteracts', t)
            logger.log_tabular('Q1Vals', with_min_and_max=True)
            logger.log_tabular('Q2Vals', with_min_and_max=True)
            logger.log_tabular('LossPi', average_only=True)
            logger.log_tabular('LossQ', average_only=True)
            logger.log_tabular('Time', time.time() - start_time)
            logger.dump_tabular()

def asac_v2(actor_critic=core.mlp_actor_critic, seed=0, ac_kwargs=dict(),
            steps_per_epoch=5000, epochs=200, replay_size=int(1e6), gamma=0.99,
            polyak=0.995, lr=0.001, alpha_start=0.2, batch_size=100,
            start_steps=10000, max_ep_len=1000, logger_kwargs=dict(), save_freq=1,
            loss_threshold=0.0001, delta=0.02, sample_step=2000):
    alpha = Alpha(alpha_start=alpha_start, delta=delta)
    alpha_t = alpha()

    tf.set_random_seed(seed)
    np.random.seed(seed)

    env = baxter()
    obs_dim = env.obs_dim
    act_dim = env.act_dim

    # Action limit for clamping: critically, assumes all dimensions share the same bound!
    act_limit = 0.1

    # Share information about action space with policy architecture
    # Inputs to computation graph
    # x_ph, a_ph, x2_ph, r_ph, d_ph, ret_ph = core.placeholders(obs_dim, act_dim, obs_dim, None, None, None)
    x_ph, a_ph, x2_ph, r_ph, d_ph = core.placeholders(obs_dim, act_dim, obs_dim, None, None)
    alpha_ph = core.scale_holder()

    # Main outputs from computation graph
    # R, R_next = return_estimate(x_ph, x2_ph, **ac_kwargs)
    with tf.variable_scope('main'):
        mu, pi, logp_pi, q1, q2, q1_pi, q2_pi, v, Q, Q_pi, R = actor_critic(
            x_ph, a_ph, **ac_kwargs)

    # Target value network
    with tf.variable_scope('target'):
        _, _, _, _, _, _, _, v_targ, _, _, R_targ = actor_critic(x2_ph, a_ph, **ac_kwargs)

    # Experience buffer
    replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size)

    # Count variables
    var_counts = tuple(core.count_vars(scope) for scope in [
        'main/pi', 'main/q1', 'main/q2', 'main/v', 'main/Q', 'main/R', 'main'
    ])
    print(('\nNumber of parameters: \t pi: %d, \t' +
           'q1: %d, \t q2: %d, \t v: %d, \t Q: %d, \t R: %d, \t total: %d\n') % var_counts)

    # Min Double-Q:
    min_q_pi = tf.minimum(q1_pi, q2_pi)

    # Targets for Q and V regression
    q_backup = tf.stop_gradient(r_ph + gamma * (1 - d_ph) * v_targ)
    v_backup = tf.stop_gradient(min_q_pi - alpha_ph * logp_pi)
    Q_backup = tf.stop_gradient(r_ph + gamma * (1 - d_ph) * R_targ)
    R_backup = tf.stop_gradient(Q_pi)
    adv = Q_pi - R
    dQ = Q_backup * (R - Q)

    pi_loss = tf.reduce_mean(alpha_ph * logp_pi - q1_pi)
    q1_loss = 0.5 * tf.reduce_mean((q_backup - q1) ** 2)
    q2_loss = 0.5 * tf.reduce_mean((q_backup - q2) ** 2)
    v_loss = 0.5 * tf.reduce_mean((v_backup - v) ** 2)
    Q_loss = 0.5 * tf.reduce_mean((Q_backup - Q) ** 2)
    R_loss = 0.5 * tf.reduce_mean((R_backup - R) ** 2)
    value_loss = q1_loss + q2_loss + v_loss + Q_loss + R_loss

    # Policy train op
    # (has to be separate from value train op, because q1_pi appears in pi_loss)
    pi_optimizer = tf.train.AdamOptimizer(learning_rate=lr)
    train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi'))

    # Value train op
    # (control dep of train_pi_op because sess.run otherwise evaluates in nondeterministic order)
    value_optimizer = tf.train.AdamOptimizer(learning_rate=lr)
    value_params = (get_vars('main/q') + get_vars('main/v') +
                    get_vars('main/Q') + get_vars('main/R'))
    with tf.control_dependencies([train_pi_op]):
        train_value_op = value_optimizer.minimize(value_loss, var_list=value_params)

    """
    R_optimizer = tf.train.AdamOptimizer(learning_rate=lr)
    train_R_op = R_optimizer.minimize(R_loss, var_list=get_vars('R'))
    """

    # Polyak averaging for target variables
    # (control flow because sess.run otherwise evaluates in nondeterministic order)
    with tf.control_dependencies([train_value_op]):
        target_update = tf.group([
            tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main)
            for v_main, v_targ in zip(get_vars('main'), get_vars('target'))
        ])

    # All ops to call during one training step
    step_ops = [
        pi_loss, q1_loss, q2_loss, v_loss, q1, q2, v, logp_pi, train_pi_op,
        train_value_op, target_update, R_loss, Q_loss, v_targ
    ]

    # Initializing targets to match main variables
    target_init = tf.group([
        tf.assign(v_targ, v_main)
        for v_main, v_targ in zip(get_vars('main'), get_vars('target'))
    ])

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    sess.run(tf.global_variables_initializer())
    sess.run(target_init)

    # Setup model saving

    def get_action(o, deterministic=False):
        act_op = mu if deterministic else pi
        return sess.run(act_op, feed_dict={x_ph: o.reshape(1, -1)})

    start_time = time.time()
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
    total_steps = steps_per_epoch * epochs

    # Main loop: collect experience in env and update/log each epoch
    obs1_epi = np.zeros([2 * max_ep_len, obs_dim], dtype=np.float32)
    obs2_epi = np.zeros([2 * max_ep_len, obs_dim], dtype=np.float32)
    act_epi = np.zeros([2 * max_ep_len, act_dim], dtype=np.float32)
    rew_epi = np.zeros([2 * max_ep_len], dtype=np.float32)
    done_epi = np.zeros([2 * max_ep_len], dtype=np.float32)
    ptr_epi = 0
    alpha_update = False
    epi_num = 0

    for t in range(total_steps):
        """
        Until start_steps have elapsed, randomly sample actions
        from a uniform distribution for better exploration. Afterwards,
        use the learned policy.
        """
        if t > start_steps:
            a = get_action(o["feature"])
        else:
            a = 0.1 - np.random.sample(act_dim) * 0.2

        # Step the env
        o2, r = env.step(a)
        ep_ret += r
        ep_len += 1

        # Ignore the "done" signal if it comes from hitting the time
        # horizon (that is, when it's an artificial terminal signal
        # that isn't based on the agent's state)
        d = False if ep_len == max_ep_len else d

        # Store experience to replay buffer
        replay_buffer.store(o["feature"], a, r, o2["feature"], d)
        obs1_epi[ptr_epi] = o["feature"]
        obs2_epi[ptr_epi] = o2["feature"]
        act_epi[ptr_epi] = a
        rew_epi[ptr_epi] = r
        done_epi[ptr_epi] = d
        ptr_epi += 1

        # Super critical, easy to overlook step: make sure to update
        # most recent observation!
        o = o2

        if d or (ep_len == max_ep_len):
            epi_num += 1
            print("epi : {}, alpha : {}, return : {}".format(epi_num, alpha_t, ep_ret))
            """
            Perform all SAC updates at the end of the trajectory.
            This is a slight difference from the SAC specified in the
            original paper.
            """
            """
            rew_epi[ptr_epi] = sess.run(R, feed_dict={x_ph: [o]})[0]
            rets_epi = scipy.signal.lfilter([1], [1, float(-gamma)], rew_epi[::-1], axis=0)[::-1]
            rets_epi = rets_epi[:-1]
            """
            """
            v_epi = sess.run(R, feed_dict={x_ph: obs_epi})
            q_epi, adv_epi = sess.run([Q, adv], feed_dict={x_ph: obs_epi[:-1], a_ph: act_epi})
            rets_epi = rew_epi + gamma*v_epi[1:]
            if t > start_steps:
                alpha.update_alpha(adv_epi, np.mean(rets_epi*(v_epi[:-1]-q_epi)) > 0)
                alpha_t = alpha()
                print("{} {}".format(np.mean(rets_epi*(v_epi[:-1]-q_epi)), alpha_t))
            """
            if ptr_epi >= max_ep_len:
                feed_dict = {
                    x_ph: obs1_epi[:ptr_epi],
                    x2_ph: obs2_epi[:ptr_epi],
                    a_ph: act_epi[:ptr_epi],
                    r_ph: rew_epi[:ptr_epi],
                    d_ph: done_epi[:ptr_epi]
                }
                adv_epi, Q_epi, R_epi = sess.run([adv, Q, R], feed_dict)
                R_next_epi = sess.run(R, feed_dict={x_ph: obs2_epi[:ptr_epi]})
                dQ_epi = (rew_epi[:ptr_epi] +
                          gamma * (1 - done_epi[:ptr_epi]) * R_next_epi) * (R_epi - Q_epi)
                """
                ret_epi = np.zeros([ptr_epi], dtype=np.float32)
                for i in np.arange(ptr_epi)[::-1]:
                    if i == ptr_epi - 1:
                        R_next_epi = sess.run(R, feed_dict={x_ph: [obs2_epi[i]]})[0]
                        ret_epi[i] = rew_epi[i] + gamma*(1 - done_epi[i])*R_next_epi
                    else:
                        ret_epi[i] = rew_epi[i] + gamma*(1 - done_epi[i])*ret_epi[i+1]
                dQ_epi = ret_epi * (R_epi - Q_epi)
                """
                if t > start_steps:
                    alpha.update_alpha(adv_epi, np.mean(dQ_epi) > 0)
                    alpha_t = alpha()
                    print("{} {}".format(np.mean(dQ_epi), alpha_t))
                obs1_epi = np.zeros([max_ep_len * 2, obs_dim], dtype=np.float32)
                obs2_epi = np.zeros([max_ep_len * 2, obs_dim], dtype=np.float32)
                act_epi = np.zeros([max_ep_len * 2, act_dim], dtype=np.float32)
                rew_epi = np.zeros([max_ep_len * 2], dtype=np.float32)
                done_epi = np.zeros([max_ep_len * 2], dtype=np.float32)
                ptr_epi = 0

            """
            batch = replay_buffer.sample_batch(1000)
            feed_dict = {x_ph: batch['obs1'], x2_ph: batch['obs2'], a_ph: batch['acts'],
                         r_ph: batch['rews'], d_ph: batch['done'], alpha_ph: alpha_t}
            dQ_epi = sess.run(dQ, feed_dict)
            """
            for j in range(ep_len):
                batch = replay_buffer.sample_batch(batch_size)
                feed_dict = {
                    x_ph: batch['obs1'],
                    x2_ph: batch['obs2'],
                    a_ph: batch['acts'],
                    r_ph: batch['rews'],
                    d_ph: batch['done'],
                    alpha_ph: alpha_t
                }
                outs = sess.run(step_ops, feed_dict)

            o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

def s2vg(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, steps_per_epoch=1000, epochs=100, replay_size=int(1e6), gamma=0.99, polyak=0.995, model_lr=3e-4, value_lr=1e-3, pi_lr=3e-4, alpha=0.4, batch_size=100, start_steps=1000,max_ep_len=1000, save_freq=1, train_model_epoch=1, test_freq=10, exp_name='',env_name='',save_epoch=100): tf.set_random_seed(seed) np.random.seed(seed) env, test_env = env_fn(), env_fn() obs_dim = env.observation_space.shape[0] act_dim = env.action_space.shape[0] act_limit = env.action_space.high[0] ac_kwargs['action_space'] = env.action_space x_ph, a_ph, x2_ph, r_ph, d_ph = core.placeholders(obs_dim, act_dim, obs_dim, None, None) with tf.variable_scope('main'): mu, pi, logp_pi, q1, q2, q1_pi, q2_pi, v = actor_critic(x_ph, a_ph, **ac_kwargs) transition , r_rm, transition_pi ,r_rm_pi, v_prime = core.reward_dynamic_model(x_ph, a_ph, pi, **ac_kwargs) # Target value network for updates with tf.variable_scope('target'): _, _, _, _, _, _, _,v_targ = actor_critic(x2_ph, a_ph, **ac_kwargs) replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size) # TD3 style Q function updates min_q_pi = tf.minimum(q1_pi, q2_pi) q_backup = tf.stop_gradient(r_ph + gamma*(1-d_ph)*v_targ) v_backup = tf.stop_gradient(min_q_pi - alpha * logp_pi) r_backup = r_ph transition_backup = x2_ph r_loss = 0.5 * tf.reduce_mean((r_backup-r_rm)**2) transition_loss = 0.5 * tf.reduce_mean((transition_backup - transition)**2) model_loss = r_loss+transition_loss q1_loss = 0.5 * tf.reduce_mean((q_backup - q1)**2) q2_loss = 0.5 * tf.reduce_mean((q_backup - q2)**2) v_loss = 0.5 * tf.reduce_mean((v_backup - v)**2) value_loss = q1_loss + q2_loss + v_loss pi_loss = r_rm_pi - alpha*logp_pi + gamma*(1-d_ph)*v_prime # model train op model_optimizer = tf.train.AdamOptimizer(learning_rate=model_lr) model_params = get_vars('main/dm') + get_vars('main/rm') train_model_op = model_optimizer.minimize(model_loss, var_list=model_params) # policy train op pi_optimizer = tf.train.AdamOptimizer(learning_rate=pi_lr) with tf.control_dependencies([train_model_op]): train_pi_op = pi_optimizer.minimize(-pi_loss, var_list=get_vars('main/pi')) # Value train op value_optimizer = tf.train.AdamOptimizer(learning_rate=value_lr) value_params = get_vars('main/q') + get_vars('main/v') with tf.control_dependencies([train_pi_op]): train_value_op = value_optimizer.minimize(value_loss, var_list=value_params) with tf.control_dependencies([train_value_op]): target_update = tf.group([tf.assign(v_targ, polyak*v_targ + (1-polyak)*v_main) for v_main, v_targ in zip(get_vars('main'), get_vars('target'))]) step_ops = [pi_loss, q1_loss, q2_loss, v_loss, q1, q2, v, logp_pi, train_pi_op, train_value_op, target_update] target_init = tf.group([tf.assign(v_targ, v_main) for v_main, v_targ in zip(get_vars('main'), get_vars('target'))]) saver = tf.compat.v1.train.Saver() sess = tf.Session() sess.run(tf.global_variables_initializer()) sess.run(target_init) def get_action(o, deterministic=False): act_op = mu if deterministic else pi return sess.run(act_op, feed_dict={x_ph: o.reshape(1,-1)})[0] def test_agent(epoch,n=1): global sess, mu, pi, q1, q2, q1_pi, q2_pi total_reward = 0 for j in range(n): # repeat n times o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0 while not(d or (ep_len == max_ep_len)): o, r, d, _ = test_env.step(get_action(o, True)) ep_ret += r ep_len += 1 total_reward += ep_ret print('The '+str(epoch)+' epoch is finished!') print('The test reward is '+str(total_reward/n)) return total_reward/n o, r, d, 
ep_ret, ep_len = env.reset(), 0, False, 0, 0 total_steps = steps_per_epoch * epochs reward_recorder = [] for t in range(total_steps): """ The algorithm would take total_steps totally in the training """ if t > start_steps: a = get_action(o) else: a = env.action_space.sample() o2, r, d, _ = env.step(a) ep_ret += r ep_len += 1 d = False if ep_len==max_ep_len else d replay_buffer.store(o, a, r, o2, d) o = o2 if t // steps_per_epoch > train_model_epoch: # train 5 steps of Q, V, and pi. # train 1 step of model for j in range(5): batch = replay_buffer.sample_batch(batch_size) feed_dict = {x_ph: batch['obs1'], x2_ph: batch['obs2'], a_ph: batch['acts'], r_ph: batch['rews'], d_ph: batch['done']} _ = sess.run(step_ops, feed_dict) outs = sess.run(train_model_op, feed_dict) else: # pretrain the model batch = replay_buffer.sample_batch(batch_size) feed_dict = {x_ph: batch['obs1'], x2_ph: batch['obs2'], a_ph: batch['acts'], r_ph: batch['rews'], d_ph: batch['done'], } outs = sess.run(train_model_op, feed_dict) if d or (ep_len == max_ep_len): o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 if t > 0 and t % steps_per_epoch == 0: epoch = t // steps_per_epoch if epoch > train_model_epoch and epoch % test_freq == 0: # test the agent when we reach the test_freq, save the experiment result reward_test = test_agent(epoch) reward_recorder.append(reward_test) reward_nparray = np.asarray(reward_recorder) np.save(str(exp_name)+'_'+str(env_name)+'_'+str(save_freq)+'.npy',reward_nparray) if epoch % save_epoch == 0: # save the model saver.save(sess, str(exp_name)+'_'+str(env_name),global_step=epoch)
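A minimal usage sketch for the s2vg routine above, assuming a Gym environment factory and the local core module are importable; the environment name and hyperparameter values are illustrative only and not taken from the original source.

# Hypothetical usage sketch (not part of the original source).
import gym

def make_pendulum():
    return gym.make('Pendulum-v0')

if __name__ == '__main__':
    s2vg(make_pendulum,
         steps_per_epoch=1000,
         epochs=50,
         train_model_epoch=1,   # pretrain the reward/dynamics model for one epoch
         test_freq=10,
         exp_name='s2vg_demo',
         env_name='Pendulum-v0')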
def add_place_holders(self):
    self.x_ph, self.a_ph = core.placeholders(self.obs_dim, self.act_dim)
def sac(env_name='Ant-v2', actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, steps_per_epoch=5000, epochs=100, replay_size=int(1e6), gamma=0.99, polyak=0.995, lr=1e-3, alpha=0.2, batch_size=100, start_steps=10000, max_ep_len=1000, save_freq=1): """ Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: A function which takes in placeholder symbols for state, ``x_ph``, and action, ``a_ph``, and returns the main outputs from the agent's Tensorflow computation graph: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``mu`` (batch, act_dim) | Computes mean actions from policy | given states. ``pi`` (batch, act_dim) | Samples actions from policy given | states. ``logp_pi`` (batch,) | Gives log probability, according to | the policy, of the action sampled by | ``pi``. Critical: must be differentiable | with respect to policy parameters all | the way through action sampling. ``q1`` (batch,) | Gives one estimate of Q* for | states in ``x_ph`` and actions in | ``a_ph``. ``q2`` (batch,) | Gives another estimate of Q* for | states in ``x_ph`` and actions in | ``a_ph``. ``q1_pi`` (batch,) | Gives the composition of ``q1`` and | ``pi`` for states in ``x_ph``: | q1(x, pi(x)). ``q2_pi`` (batch,) | Gives the composition of ``q2`` and | ``pi`` for states in ``x_ph``: | q2(x, pi(x)). ``v`` (batch,) | Gives the value estimate for states | in ``x_ph``. =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the actor_critic function you provided to SAC. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs to run and train agent. replay_size (int): Maximum length of replay buffer. gamma (float): Discount factor. (Always between 0 and 1.) polyak (float): Interpolation factor in polyak averaging for target networks. Target networks are updated towards main networks according to: .. math:: \\theta_{\\text{targ}} \\leftarrow \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta where :math:`\\rho` is polyak. (Always between 0 and 1, usually close to 1.) lr (float): Learning rate (used for both policy and value learning). alpha (float): Entropy regularization coefficient. (Equivalent to inverse of reward scale in the original SAC paper.) batch_size (int): Minibatch size for SGD. start_steps (int): Number of steps for uniform-random action selection, before running real policy. Helps exploration. max_ep_len (int): Maximum length of trajectory / episode / rollout. logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. """ tf.set_random_seed(seed) np.random.seed(seed) env = gym.make(env_name) obs_dim = env.observation_space.shape[0] act_dim = env.action_space.shape[0] # Action limit for clamping: critically, assumes all dimensions share the same bound! 
act_limit = env.action_space.high[0] # Share information about action space with policy architecture ac_kwargs['action_space'] = env.action_space # Inputs to computation graph x_ph, a_ph, x2_ph, r_ph, d_ph = core.placeholders(obs_dim, act_dim, obs_dim, None, None) # Main outputs from computation graph with tf.variable_scope('main'): mu, pi, logp_pi, q1, q2, q1_pi, q2_pi, v = actor_critic( x_ph, a_ph, **ac_kwargs) # Target value network with tf.variable_scope('target'): _, _, _, _, _, _, _, v_targ = actor_critic(x2_ph, a_ph, **ac_kwargs) # Experience buffer replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size) # Count variables var_counts = tuple( core.count_vars(scope) for scope in ['main/pi', 'main/q1', 'main/q2', 'main/v', 'main']) print(('\nNumber of parameters: \t pi: %d, \t' + \ 'q1: %d, \t q2: %d, \t v: %d, \t total: %d\n') % var_counts) # Min Double-Q: min_q_pi = tf.minimum(q1_pi, q2_pi) # Targets for Q and V regression q_backup = tf.stop_gradient(r_ph + gamma * (1 - d_ph) * v_targ) v_backup = tf.stop_gradient(min_q_pi - alpha * logp_pi) # Soft actor-critic losses pi_loss = tf.reduce_mean(alpha * logp_pi - q1_pi) q1_loss = 0.5 * tf.reduce_mean((q_backup - q1)**2) q2_loss = 0.5 * tf.reduce_mean((q_backup - q2)**2) v_loss = 0.5 * tf.reduce_mean((v_backup - v)**2) value_loss = q1_loss + q2_loss + v_loss # Policy train op # (has to be separate from value train op, because q1_pi appears in pi_loss) pi_optimizer = tf.train.AdamOptimizer(learning_rate=lr) train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi')) # Value train op # (control dep of train_pi_op because sess.run otherwise evaluates in nondeterministic order) value_optimizer = tf.train.AdamOptimizer(learning_rate=lr) value_params = get_vars('main/q') + get_vars('main/v') with tf.control_dependencies([train_pi_op]): train_value_op = value_optimizer.minimize(value_loss, var_list=value_params) # Polyak averaging for target variables # (control flow because sess.run otherwise evaluates in nondeterministic order) with tf.control_dependencies([train_value_op]): target_update = tf.group([ tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main) for v_main, v_targ in zip(get_vars('main'), get_vars('target')) ]) # All ops to call during one training step step_ops = [ pi_loss, q1_loss, q2_loss, v_loss, q1, q2, v, logp_pi, train_pi_op, train_value_op, target_update ] # Initializing targets to match main variables target_init = tf.group([ tf.assign(v_targ, v_main) for v_main, v_targ in zip(get_vars('main'), get_vars('target')) ]) sess = tf.Session() sess.run(tf.global_variables_initializer()) sess.run(target_init) # Setup model saving def get_action(o, deterministic=False): act_op = mu if deterministic else pi return sess.run(act_op, feed_dict={x_ph: o.reshape(1, -1)}) o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 total_steps = steps_per_epoch * epochs tf.summary.FileWriter('./log/', graph=tf.get_default_graph()) replay_buffer = ReplayBuffer(obs_dim=env.observation_space.shape[0], act_dim=env.action_space.shape[0], size=replay_size) episode = 0 for t in range(total_steps): """ Until start_steps have elapsed, randomly sample actions from a uniform distribution for better exploration. Afterwards, use the learned policy. 
""" if t > start_steps: a = get_action(o)[0] else: a = np.clip(env.action_space.sample(), -1, 1) # Step the env env.render() o2, r, d, _ = env.step(a) ep_ret += r ep_len += 1 # Ignore the "done" signal if it comes from hitting the time # horizon (that is, when it's an artificial terminal signal # that isn't based on the agent's state) d = False if ep_len == max_ep_len else d # Store experience to replay buffer replay_buffer.store(o, a, r, o2, d) o = o2 if d or (ep_len == max_ep_len): """ Perform all SAC updates at the end of the trajectory. This is a slight difference from the SAC specified in the original paper. """ episode += 1 for j in range(ep_len): batch = replay_buffer.sample_batch(batch_size) feed_dict = { x_ph: batch['obs1'], x2_ph: batch['obs2'], a_ph: batch['acts'], r_ph: batch['rews'], d_ph: batch['done'], } outs = sess.run(step_ops, feed_dict) print("episode %d, reward %d" % (episode, ep_ret)) o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 sess.close()
def vpg(env_config, ac_type, ac_kwargs, gamma, lam, epochs, steps_per_epoch, lr, train_v_iters, max_ep_len, logger_kwargs, seed): logger = EpochLogger(**logger_kwargs) configs = locals().copy() configs.pop("logger") logger.save_config(configs) tf.set_random_seed(seed) np.random.seed(seed) env = make_env(env_config) obs_dim = env.observation_space.shape[0] act_dim = env.action_space.shape[0] obs_ph, a_ph, adv_ph, ret_ph, logp_old_ph = core.placeholders( obs_dim, act_dim, None, None, None) actor_critic = gaussian_mlp_actor_critic pi, logp, logp_pi, v = actor_critic(obs_ph, a_ph, **ac_kwargs) all_phs = [obs_ph, a_ph, adv_ph, ret_ph, logp_old_ph] get_action_ops = [pi, v, logp_pi] # Experience buffer buf = VPGBuffer(obs_dim, act_dim, steps_per_epoch, gamma, lam) # VPG objectives pi_loss = -tf.reduce_mean(logp * adv_ph) v_loss = tf.reduce_mean((ret_ph - v)**2) # Info (useful to watch during learning) approx_kl = tf.reduce_mean( logp_old_ph - logp) # a sample estimate for KL-divergence, easy to compute approx_ent = tf.reduce_mean( -logp) # a sample estimate for entropy, also easy to compute # Optimizers train_pi = tf.train.AdamOptimizer(learning_rate=lr).minimize(pi_loss) train_v = tf.train.AdamOptimizer(learning_rate=lr).minimize(v_loss) sess = tf.Session() sess.run(tf.global_variables_initializer()) def update(): buffer_data = buf.get() #util.plot_adv(data[0] * act_high, data[1], logger.output_dir + "/ep_adv%s.png" % epoch) inputs = {k: v for k, v in zip(all_phs, buffer_data)} pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent], feed_dict=inputs) sess.run(train_pi, feed_dict=inputs) # Training for _ in range(train_v_iters): sess.run(train_v, feed_dict=inputs) # Log changes from update pi_l_new, v_l_new, kl, v_new = sess.run( [pi_loss, v_loss, approx_kl, v], feed_dict=inputs) logger.store(LossPi=pi_l_old, LossV=v_l_old, KL=kl, Entropy=ent, DeltaLossPi=(pi_l_new - pi_l_old), DeltaLossV=(v_l_new - v_l_old)) start_time = time.time() o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 real_action = env.action_space.default() # Main loop: collect experience in env and update/log each epoch for epoch in range(epochs): for t in range(steps_per_epoch): a, v_t, logp_t = sess.run(get_action_ops, feed_dict={obs_ph: o.reshape(1, -1)}) buf.store(o, a, r, v_t, logp_t) logger.store(VVals=v_t) delta = np.exp(a[0]) delta = np.clip(delta, 0.9, 1.1) real_action = env.action_space.clip(real_action * delta) o, r, d, _ = env.step(real_action) ep_ret += r ep_len += 1 if ep_len == max_ep_len or t == steps_per_epoch - 1: last_val = sess.run(v, feed_dict={obs_ph: o.reshape(1, -1)}) #print(last_val) buf.finish_path(last_val) logger.store(EpRet=ep_ret, EpLen=ep_len) o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 real_action = env.action_space.default() # Perform PPO update! update() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('VVals', with_min_and_max=True) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossV', average_only=True) logger.log_tabular('DeltaLossPi', average_only=True) logger.log_tabular('DeltaLossV', average_only=True) logger.log_tabular('Entropy', average_only=True) logger.log_tabular('KL', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular()
def __init__(self, opt, job): self.opt = opt with tf.Graph().as_default(): tf.set_random_seed(opt.seed) np.random.seed(opt.seed) # Inputs to computation graph self.x_ph, self.a_ph, self.x2_ph = core.placeholders( opt.obs_shape, opt.act_shape, opt.obs_shape) self.r_ph, self.d_ph, self.logp_pi_ph = core.placeholders( (opt.Ln, ), (opt.Ln, ), (opt.Ln, )) # ------ if opt.alpha == 'auto': log_alpha = tf.get_variable('log_alpha', dtype=tf.float32, initializer=0.0) alpha_v = tf.exp(log_alpha) else: alpha_v = opt.alpha # ------ # Main outputs from computation graph with tf.variable_scope('main'): mu, pi, logp_pi, self.logp_pi2, q1, q2, q1_pi, q2_pi, q1_mu, q2_mu \ = actor_critic(self.x_ph, self.x2_ph, self.a_ph, alpha_v, use_bn=opt.use_bn, phase=True, coefficent_regularizer=opt.c_regularizer, hidden_sizes=opt.hidden_size, action_space=opt.act_space, model=opt.model) # Target value network with tf.variable_scope('target'): _, _, logp_pi_, _, _, _, q1_pi_, q2_pi_, q1_mu_, q2_mu_ \ = actor_critic(self.x2_ph, self.x2_ph, self.a_ph, alpha_v, use_bn=opt.use_bn, phase=True, coefficent_regularizer=opt.c_regularizer, hidden_sizes=opt.hidden_size, action_space=opt.act_space, model=opt.model) # Count variables var_counts = tuple( core.count_vars(scope) for scope in ['main/pi', 'main/q1', 'main/q2', 'main']) print(('\nNumber of parameters: \t pi: %d, \t' + 'q1: %d, \t q2: %d, \t total: %d\n') % var_counts) # ------ if isinstance(alpha_v, tf.Tensor): alpha_loss = tf.reduce_mean( -log_alpha * tf.stop_gradient(logp_pi_ + opt.target_entropy)) alpha_optimizer = tf.train.AdamOptimizer( learning_rate=opt.lr, name='alpha_optimizer') train_alpha_op = alpha_optimizer.minimize(loss=alpha_loss, var_list=[log_alpha]) # ------ # Min Double-Q: if opt.use_max: min_q_pi = tf.minimum(q1_mu_, q2_mu_) else: min_q_pi = tf.minimum(q1_pi_, q2_pi_) # x2 # get rid of abnormal explosion # min_q_pi = tf.clip_by_value(min_q_pi, -300.0, 900.0) #### n-step backup q_backup = tf.stop_gradient(min_q_pi) for step_i in reversed(range(opt.Ln)): q_backup = self.r_ph[:, step_i] + \ opt.gamma * (1 - self.d_ph[:, step_i]) * (-alpha_v * self.logp_pi_ph[:, step_i] + q_backup) #### # Soft actor-critic losses q1_loss = 0.5 * tf.reduce_mean((q_backup - q1)**2) q2_loss = 0.5 * tf.reduce_mean((q_backup - q2)**2) self.value_loss = q1_loss + q2_loss value_optimizer = tf.train.AdamOptimizer(learning_rate=opt.lr) value_params = get_vars('main/q') bn_update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) with tf.control_dependencies(bn_update_ops): train_value_op = value_optimizer.minimize( self.value_loss, var_list=value_params) # Polyak averaging for target variables # (control flow because sess.run otherwise evaluates in nondeterministic order) with tf.control_dependencies([train_value_op]): target_update = tf.group([ tf.assign(v_targ, opt.polyak * v_targ + (1 - opt.polyak) * v_main) for v_main, v_targ in zip(get_vars('main'), get_vars('target')) ]) # All ops to call during one training step if isinstance(alpha_v, Number): self.step_ops = [ q1_loss, q2_loss, q1, q2, logp_pi_, tf.identity(alpha_v), train_value_op, target_update ] else: self.step_ops = [ q1_loss, q2_loss, q1, q2, logp_pi_, alpha_v, train_value_op, target_update, train_alpha_op ] # Initializing targets to match main variables self.target_init = tf.group([ tf.assign(v_targ, v_main) for v_main, v_targ in zip(get_vars('main'), get_vars('target')) ]) if job == "learner": config = tf.ConfigProto() config.gpu_options.per_process_gpu_memory_fraction = opt.gpu_fraction 
config.inter_op_parallelism_threads = 1 config.intra_op_parallelism_threads = 1 self.sess = tf.Session(config=config) else: self.sess = tf.Session(config=tf.ConfigProto( # device_count={'GPU': 0}, intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)) self.sess.run(tf.global_variables_initializer()) if job == "learner": # Set up summary Ops self.train_ops, self.train_vars = self.build_summaries() self.writer = tf.summary.FileWriter( opt.summary_dir + "/" + "^^^^^^^^^^" + str(datetime.datetime.now()) + opt.env_name + "-" + opt.exp_name + "-workers_num:" + str(opt.num_workers) + "%" + str(opt.a_l_ratio), self.sess.graph) self.variables = ray.experimental.tf_utils.TensorFlowVariables( self.value_loss, self.sess)
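To make the reversed n-step recursion above explicit, here is a small numpy sketch of the entropy-augmented backup; array names mirror the Ln-length placeholders (r, d, logp_pi) and the numbers are illustrative.

import numpy as np

def nstep_soft_backup(r, d, logp_pi, bootstrap_q, gamma=0.99, alpha=0.2):
    # Mirrors the reversed loop in the graph:
    # q <- r_i + gamma * (1 - d_i) * (-alpha * logp_i + q), for i = Ln-1 ... 0
    q = bootstrap_q
    for i in reversed(range(len(r))):
        q = r[i] + gamma * (1.0 - d[i]) * (-alpha * logp_pi[i] + q)
    return q

# Example with a 3-step segment (illustrative numbers only)
r = np.array([1.0, 0.5, 0.2])
d = np.array([0.0, 0.0, 0.0])
logp_pi = np.array([-1.2, -0.9, -1.1])
print(nstep_soft_backup(r, d, logp_pi, bootstrap_q=10.0))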
def __init__(self): self.sess = tf.Session() self.state_size = env_set['state'] self.output_size = env_set['action'] self.worker_size = env_set['worker'] self.tau = 0.995 self.gamma = env_set['gamma'] self.hidden = env_set['hidden'] self.batch_size = 64 self.pi_lr = env_set['pi_lr'] self.q_lr = env_set['q_lr'] self.action_limit = 1.0 self.memory = replay_buffer(env_set['mem_size']) self.target_noise = 0.2 self.noise_clip = 0.5 self.x_ph, self.a_ph, self.x2_ph, self.r_ph, self.d_ph = \ cr.placeholders(self.state_size, self.output_size, self.state_size, None, None) with tf.variable_scope('main'): self.pi, self.q1, self.q2, self.q1_pi, _ = cr.td3_mlp_actor_critic( x=self.x_ph, a=self.a_ph, hidden=self.hidden, activation=tf.nn.relu, output_activation=tf.tanh, output_size=self.output_size, action_limit=self.action_limit, ) with tf.variable_scope('target'): self.pi_targ, self.q1_double_targ, self.q2_double_targ, self.q1_pi_targ, self.q2_pi_targ = cr.td3_mlp_actor_critic( x=self.x2_ph, a=self.a_ph, hidden=self.hidden, activation=tf.nn.relu, output_activation=tf.tanh, output_size=self.output_size, action_limit=self.action_limit, pi_q_noise=self.target_noise, noise_clip=self.noise_clip) self.pi_params = cr.get_vars('main/pi') self.q_params = cr.get_vars('main/q') self.min_q_targ = tf.minimum(self.q1_pi_targ, self.q2_pi_targ) #self.min_q_targ = tf.minimum(self.q1_double_targ,self.q2_double_targ) self.backup = tf.stop_gradient(self.r_ph + self.gamma * (1 - self.d_ph) * self.min_q_targ) self.pi_loss = -tf.reduce_mean(self.q1_pi) self.q1_loss = tf.reduce_mean((self.q1 - self.backup)**2) self.q2_loss = tf.reduce_mean((self.q2 - self.backup)**2) self.v_loss = self.q1_loss + self.q2_loss self.value_optimizer = tf.train.AdamOptimizer(self.q_lr) self.train_value_op = self.value_optimizer.minimize( self.v_loss, var_list=self.q_params) self.pi_optimizer = tf.train.AdamOptimizer(self.pi_lr) with tf.control_dependencies([self.train_value_op]): self.train_pi_op = self.pi_optimizer.minimize( self.pi_loss, var_list=self.pi_params) with tf.control_dependencies([self.train_pi_op]): self.target_update = tf.group([ tf.assign(v_targ, self.tau * v_targ + (1 - self.tau) * v_main) for v_main, v_targ in zip(cr.get_vars('main'), cr.get_vars('target')) ]) self.step_ops = [ self.pi_loss, self.v_loss, self.train_pi_op, self.train_value_op, self.target_update ] self.value_ops = [self.v_loss, self.train_value_op] self.target_init = tf.group([ tf.assign(v_targ, v_main) for v_main, v_targ in zip( cr.get_vars('main'), cr.get_vars('target')) ]) self.sess.run(tf.global_variables_initializer()) self.sess.run(self.target_init) self.saver = tf.train.Saver()
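A numpy sketch of the clipped double-Q backup assembled above. The target-policy smoothing itself happens inside cr.td3_mlp_actor_critic (via the pi_q_noise and noise_clip arguments), so the second helper is only a guess at what that wrapper does with the defaults used here.

import numpy as np

def td3_backup(r, d, q1_pi_targ, q2_pi_targ, gamma=0.99):
    # Clipped double-Q target: y = r + gamma * (1 - d) * min(Q1', Q2')
    return r + gamma * (1.0 - d) * np.minimum(q1_pi_targ, q2_pi_targ)

def smoothed_target_action(pi_targ, act_limit=1.0, noise_std=0.2, noise_clip=0.5):
    # Assumed target-policy smoothing: add clipped Gaussian noise to the
    # target action, then clip the result to the action limit.
    noise = np.clip(noise_std * np.random.randn(*pi_targ.shape), -noise_clip, noise_clip)
    return np.clip(pi_targ + noise, -act_limit, act_limit)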
def ddpg(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, n_episodes=10000, replay_size=int(1e6), gamma=0.99, show_steps=50, polyak=0.995, pi_lr=1e-3, q_lr=1e-3, batch_size=100, start_steps=10000, act_noise=0.1, max_ep_len=200, logger_kwargs=dict(), save_freq=1): tf.set_random_seed(seed) np.random.seed(seed) env, test_env = env_fn(), env_fn() obs_dim = env.observation_space.shape[0] act_dim = env.action_space.shape[0] # Action limit for clamping: critically, assumes all dimensions share the same bound! act_limit = env.action_space.high[0] # Share information about action space with policy architecture ac_kwargs['action_space'] = env.action_space # Inputs to computation graph x_ph, a_ph, x2_ph, r_ph, d_ph = core.placeholders(obs_dim, act_dim, obs_dim, None, None) # Main outputs from computation graph with tf.variable_scope('main'): pi, q, q_pi = actor_critic(x_ph, a_ph, **ac_kwargs) # Target networks with tf.variable_scope('target'): # Note that the action placeholder going to actor_critic here is # irrelevant, because we only need q_targ(s, pi_targ(s)). pi_targ, _, q_pi_targ = actor_critic(x2_ph, a_ph, **ac_kwargs) # Experience buffer replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size) # Count variables var_counts = tuple( core.count_vars(scope) for scope in ['main/pi', 'main/q', 'main']) print('\nNumber of parameters: \t pi: %d, \t q: %d, \t total: %d\n' % var_counts) # Bellman backup for Q function backup = tf.stop_gradient(r_ph + gamma * (1 - d_ph) * q_pi_targ) # DDPG losses pi_loss = -tf.reduce_mean(q_pi) q_loss = tf.reduce_mean((q - backup)**2) # Separate train ops for pi, q pi_optimizer = tf.train.AdamOptimizer(learning_rate=pi_lr) q_optimizer = tf.train.AdamOptimizer(learning_rate=q_lr) train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi')) train_q_op = q_optimizer.minimize(q_loss, var_list=get_vars('main/q')) # Polyak averaging for target variables target_update = tf.group([ tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main) for v_main, v_targ in zip(get_vars('main'), get_vars('target')) ]) # Initializing targets to match main variables target_init = tf.group([ tf.assign(v_targ, v_main) for v_main, v_targ in zip(get_vars('main'), get_vars('target')) ]) sess = tf.Session() sess.run(tf.global_variables_initializer()) sess.run(target_init) def get_action(o, noise_scale): a = sess.run(pi, feed_dict={x_ph: o.reshape(1, -1)})[0] a += noise_scale * np.random.randn(act_dim) return np.clip(a, -act_limit, act_limit) def test_agent(n=5): for j in range(n): o, r, d, ep_ret, ep_len, ep_cost = test_env.reset( ), 0, False, 0, 0, 0 while not (d or (ep_len == 5 * max_ep_len)): # Take deterministic actions at test time (noise_scale=0) test_env.render() a = get_action(o, 0) o, r, d, _, c = test_env.step(a + 0.5 * np.random.rand(), 1) ep_ret += (r - c) ep_len += 1 ep_cost += c test_env.close() print( "\n avg reward {} and episode length {} over {} trials, cost/step {}" .format(ep_ret / n, ep_len / n, n, ep_cost / ep_len)) start_time = time.time() o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 for t in range(start_steps): a = env.action_space.sample() o2, r, d, _, c = env.step(a, 1) r -= c replay_buffer.store(o, a, r, o2, d) o = o2 if d: o = env.reset() fails = 0 # Main loop: collect experience in env and update/log each epoch for t in itertools.count(): """ Until start_steps have elapsed, randomly sample actions from a uniform distribution for better exploration. 
Afterwards, use the learned policy (with some noise, via act_noise). """ a = get_action(o, act_noise) # Step the env o2, r, d, _, c = env.step(a, 1) r -= c ep_ret += r ep_len += 1 # Ignore the "done" signal if it comes from hitting the time # horizon (that is, when it's an artificial terminal signal # that isn't based on the agent's state) d = False if ep_len == max_ep_len else d # Store experience to replay buffer replay_buffer.store(o, a, r, o2, d) # Super critical, easy to overlook step: make sure to update # most recent observation! o = o2 print("\rSteps {:3}, fails {}".format(t, fails), end="") if t % max_ep_len == 0: """ Perform all DDPG updates every max_ep_len steps, in accordance with tuning done by TD3 paper authors. """ for _ in range(max_ep_len): batch = replay_buffer.sample_batch(batch_size) feed_dict = { x_ph: batch['obs1'], x2_ph: batch['obs2'], a_ph: batch['acts'], r_ph: batch['rews'], d_ph: batch['done'] } # Q-learning update outs = sess.run([q_loss, q, train_q_op], feed_dict) # Policy update outs = sess.run([pi_loss, train_pi_op, target_update], feed_dict) if d: o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 fails += 1 # End of epoch wrap-up if t > 0 and t % (show_steps * max_ep_len) == 0: # Test the performance of the deterministic version of the agent. test_agent()
def __init__(self): self.sess = tf.Session() self.state_size = env_set['state'] self.output_size = env_set['action'] self.worker_size = env_set['worker'] self.support_size = 8 self.target_update_tau = 0.995 self.gamma = 0.99 self.hidden = env_set['hidden'] self.batch_size = 64 self.pi_lr = 1e-4 self.q_lr = 1e-3 self.action_limit = 1.0 self.memory = replay_buffer(env_set['mem_size']) self.target_noise = 0.2 self.noise_clip = 0.1 self.x_ph, self.a_ph, self.tau_ph,self.x2_ph, self.r_ph, self.d_ph = \ cr.placeholders(self.state_size, self.output_size, self.support_size,self.state_size, None, None) with tf.variable_scope('main'): self.pi, self.q, self.q_pi = cr.dipg_mlp_actor_critic( x=self.x_ph, a=self.a_ph, tau=self.tau_ph, hidden=self.hidden, activation=tf.nn.relu, output_activation=tf.tanh, output_size=self.output_size, action_limit=self.action_limit) with tf.variable_scope('target'): _, _, self.q_pi_targ = cr.dipg_mlp_actor_critic( x=self.x2_ph, a=self.a_ph, tau=self.tau_ph, hidden=self.hidden, activation=tf.nn.relu, output_activation=tf.tanh, output_size=self.output_size, action_limit=self.action_limit, pi_q_noise=self.target_noise) self.pi_params = cr.get_vars('main/pi') self.q_params = cr.get_vars('main/q') self.backup = tf.stop_gradient(tf.tile(tf.expand_dims(self.r_ph,axis=1),[1,self.support_size])\ + self.gamma*tf.tile(tf.expand_dims(1-self.d_ph,axis=1),[1,self.support_size])*self.q_pi_targ) self.pi_loss = -tf.reduce_mean(tf.reduce_mean(self.q_pi)) self.clip_tau = 5e-2 theta_loss_tile = tf.tile(tf.expand_dims(self.q, axis=2), [1, 1, self.support_size]) logit_valid_tile = tf.tile(tf.expand_dims(self.backup, axis=1), [1, self.support_size, 1]) Huber_loss = tf.losses.huber_loss(logit_valid_tile, theta_loss_tile, reduction=tf.losses.Reduction.NONE) tau = tf.tile(tf.expand_dims(self.tau_ph, axis=2), [1, 1, self.support_size]) bellman_errors = logit_valid_tile - theta_loss_tile Loss = ( tf.abs(tau - tf.stop_gradient(tf.to_float(bellman_errors < 0))) * Huber_loss) self.v_loss = tf.reduce_mean( tf.reduce_sum(tf.reduce_mean(Loss, axis=1))) self.pi_optimizer = tf.train.AdamOptimizer(self.pi_lr) grad = self.pi_optimizer.compute_gradients(self.pi_loss, var_list=self.pi_params) grad = [(gr / self.support_size, var) for gr, var in grad] self.train_pi_op = self.pi_optimizer.apply_gradients(grad) self.value_optimizer = tf.train.AdamOptimizer(self.q_lr) with tf.control_dependencies([self.train_pi_op]): self.train_value_op = self.value_optimizer.minimize( self.v_loss, var_list=self.q_params) with tf.control_dependencies([self.train_value_op]): self.target_update = tf.group([ tf.assign( v_targ, self.target_update_tau * v_targ + (1 - self.target_update_tau) * v_main) for v_main, v_targ in zip(cr.get_vars('main'), cr.get_vars('target')) ]) self.step_ops = [ self.pi_loss, self.v_loss, self.train_pi_op, self.train_value_op, self.target_update ] self.target_init = tf.group([ tf.assign(v_targ, v_main) for v_main, v_targ in zip( cr.get_vars('main'), cr.get_vars('target')) ]) self.sess.run(tf.global_variables_initializer()) self.sess.run(self.target_init)
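The tiled Huber/indicator expressions above implement a quantile-regression critic loss; the following numpy sketch shows the asymmetric weighting for a single transition, with the reduction order simplified relative to the graph.

import numpy as np

def huber(u, kappa=1.0):
    return np.where(np.abs(u) <= kappa, 0.5 * u ** 2, kappa * (np.abs(u) - 0.5 * kappa))

def quantile_huber_loss(pred_quantiles, target_samples, taus):
    # pred_quantiles: (N,) predicted quantile values at fractions taus: (N,)
    # target_samples: (N',) samples of the Bellman target distribution
    u = target_samples[None, :] - pred_quantiles[:, None]          # pairwise Bellman errors
    weight = np.abs(taus[:, None] - (u < 0.0).astype(np.float64))  # |tau - 1{u < 0}|
    return np.sum(np.mean(weight * huber(u), axis=1))

# Illustrative call with N = N' = 4 quantiles
taus = np.array([0.125, 0.375, 0.625, 0.875])
print(quantile_huber_loss(np.zeros(4), np.array([0.5, 1.0, 1.5, 2.0]), taus))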
def ppo(env_config, ac_type, ac_kwargs, clip_ratio, epochs, steps_per_epoch, optimizer, lr, train_pi_iters, max_ep_len, target_kl, logger_kwargs, seed): logger = EpochLogger(**logger_kwargs) configs = locals().copy() configs.pop("logger") logger.save_config(configs) tf.set_random_seed(seed) np.random.seed(seed) env = make_env(env_config) obs_dim = env.observation_space.shape[0] act_dim = env.action_space.shape[0] act_high = env.action_space.high obs_ph, a_ph, adv_ph, logp_old_ph = core.placeholders( obs_dim, act_dim, None, None) all_phs = [obs_ph, a_ph, adv_ph, logp_old_ph] actor_critic = get_ppo_actor_critic(ac_type) pi, logp, logp_pi = actor_critic(obs_ph, a_ph, **ac_kwargs) # Experience buffer buf = PPOBuffer(obs_dim, act_dim, steps_per_epoch) # PPO objectives ratio = tf.exp(logp - logp_old_ph) # pi(a|s) / pi_old(a|s) min_adv = tf.where(adv_ph > 0, (1 + clip_ratio) * adv_ph, (1 - clip_ratio) * adv_ph) # Info (useful to watch during learning) approx_kl = tf.reduce_mean( logp_old_ph - logp) # a sample estimate for KL-divergence, easy to compute approx_ent = tf.reduce_mean( -logp) # a sample estimate for entropy, also easy to compute clipped = tf.logical_or(ratio > (1 + clip_ratio), ratio < (1 - clip_ratio)) clipfrac = tf.reduce_mean(tf.cast(clipped, tf.float32)) pi_loss = -tf.reduce_mean(tf.minimum(ratio * adv_ph, min_adv)) # Optimizers if optimizer == "adam": train_pi = tf.train.AdamOptimizer(learning_rate=lr).minimize(pi_loss) elif optimizer == "sgd": train_pi = tf.train.GradientDescentOptimizer( learning_rate=lr).minimize(pi_loss) sess = tf.Session() sess.run(tf.global_variables_initializer()) def update(): print(sess.run(tf.trainable_variables())) data = buf.get() #util.plot_adv(data[0] * act_high, data[1], logger.output_dir + "/ep_adv%s.png" % epoch) inputs = {k: v for k, v in zip(all_phs, data[:4])} pi_l_old, ent = sess.run([pi_loss, approx_ent], feed_dict=inputs) # Training for i in range(train_pi_iters): _, kl = sess.run([train_pi, approx_kl], feed_dict=inputs) if kl > 1.5 * target_kl: logger.log( 'Early stopping at step %d due to reaching max kl.' 
% i) break logger.store(StopIter=i) # Log changes from update pi_l_new, kl, cf = sess.run([pi_loss, approx_kl, clipfrac], feed_dict=inputs) logger.store(LossPi=pi_l_old, KL=kl, Entropy=ent, ClipFrac=cf, DeltaLossPi=(pi_l_new - pi_l_old)) start_time = time.time() o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 real_action = env.action_space.default() o, r, d, _ = env.step(real_action) episode_actions = [] episode_obs = [] episode_actions.append(real_action) episode_obs.append(o) print(tf.trainable_variables()) # Main loop: collect experience in env and update/log each epoch for epoch in range(epochs): episode_count = 0 ep_actions = [] for t in range(steps_per_epoch): a, logp_t = sess.run([pi, logp_pi], feed_dict={obs_ph: o.reshape(1, -1)}) delta = np.exp(a[0]) delta = np.clip(delta, 0.95, 1.05) real_action = env.action_space.clip(real_action * delta) o, r, d, _ = env.step(real_action) buf.store(o, a, r, logp_t) ep_actions.append(real_action) episode_actions.append(real_action) episode_obs.append(o) ep_ret += r ep_len += 1 if ep_len == max_ep_len or t == steps_per_epoch - 1: buf.finish_path() logger.store(EpRet=ep_ret, EpLen=ep_len) o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 real_action = env.action_space.default() o, r, d, _ = env.step(real_action) util.plot_seq_obs_and_actions( episode_obs, episode_actions, act_high, logger.output_dir + '/episode_actions_%d_%d.png' % (epoch, episode_count)) episode_count += 1 episode_actions = [] episode_obs = [] episode_actions.append(real_action) episode_obs.append(o) # Perform PPO update! update() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('DeltaLossPi', average_only=True) logger.log_tabular('Entropy', average_only=True) logger.log_tabular('KL', average_only=True) logger.log_tabular('ClipFrac', average_only=True) logger.log_tabular('StopIter', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular() util.plot_actions(ep_actions, act_high, logger.output_dir + '/ep_actions%d.png' % epoch)
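For reference, the clipped surrogate assembled in the graph above corresponds to this small numpy function; names follow the placeholders (logp, logp_old, adv) and clip_ratio matches the argument of the same name.

import numpy as np

def ppo_clip_loss(logp, logp_old, adv, clip_ratio=0.2):
    # ratio = pi(a|s) / pi_old(a|s); the clip side depends on the sign of the advantage
    ratio = np.exp(logp - logp_old)
    min_adv = np.where(adv > 0, (1 + clip_ratio) * adv, (1 - clip_ratio) * adv)
    return -np.mean(np.minimum(ratio * adv, min_adv))

# Illustrative numbers only
print(ppo_clip_loss(np.array([-1.0, -0.5]), np.array([-1.1, -0.7]), np.array([0.8, -0.3])))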
def __init__(self): self.sess = tf.Session() self.state_size = 33 self.output_size = 4 self.tau = 0.995 self.gamma = 0.99 self.hidden = [400, 300] self.batch_size = 64 self.pi_lr = 1e-3 self.q_lr = 1e-3 self.action_limit = 1.0 self.memory = replay_buffer(1e5) self.target_noise = 0.2 self.noise_clip = 0.1 self.alpha = 1e-5 self.num_worker = 20 self.noise = OU_noise(self.output_size, self.num_worker) self.x_ph, self.a_ph, self.x2_ph, self.r_ph, self.d_ph = \ cr.placeholders(self.state_size, self.output_size, self.state_size, None, None) with tf.variable_scope('main'): self.mu, self.pi, self.logp_pi, self.q1, self.q2, self.q1_pi, self.q2_pi, self.v = \ cr.sac_mlp_actor_critic( x=self.x_ph, a=self.a_ph, hidden=self.hidden, activation=tf.nn.relu, output_activation=tf.tanh, output_size=self.output_size, action_limit=self.action_limit ) with tf.variable_scope('target'): _, _, _, _, _, _, _, self.v_targ = \ cr.sac_mlp_actor_critic( x=self.x2_ph, a=self.a_ph, hidden=self.hidden, activation=tf.nn.relu, output_activation=tf.tanh, output_size=self.output_size, action_limit=self.action_limit ) self.pi_params = cr.get_vars('main/pi') self.value_params = cr.get_vars('main/q') + cr.get_vars('main/v') self.min_q_pi = tf.minimum(self.q1_pi, self.q2_pi) self.q_backup = tf.stop_gradient(self.r_ph + self.gamma * (1 - self.d_ph) * self.v_targ) self.v_backup = tf.stop_gradient(self.min_q_pi - self.alpha * self.logp_pi) self.pi_loss = tf.reduce_mean(self.alpha * self.logp_pi - self.q1_pi) self.q1_loss = 0.5 * tf.reduce_mean((self.q_backup - self.q1)**2) self.q2_loss = 0.5 * tf.reduce_mean((self.q_backup - self.q2)**2) self.v_loss = 0.5 * tf.reduce_mean((self.v_backup - self.v)**2) self.value_loss = self.q1_loss + self.q2_loss + self.v_loss self.pi_optimizer = tf.train.AdamOptimizer(self.pi_lr) self.train_pi_op = self.pi_optimizer.minimize(self.pi_loss, var_list=self.pi_params) self.value_optimizer = tf.train.AdamOptimizer(self.q_lr) with tf.control_dependencies([self.train_pi_op]): self.train_value_op = self.value_optimizer.minimize( self.value_loss, var_list=self.value_params) with tf.control_dependencies([self.train_value_op]): self.target_update = tf.group([ tf.assign(v_targ, self.tau * v_targ + (1 - self.tau) * v_main) for v_main, v_targ in zip(cr.get_vars('main'), cr.get_vars('target')) ]) self.step_ops = [ self.pi_loss, self.q1_loss, self.q2_loss, self.v_loss, self.q1, self.q2, self.v, self.logp_pi, self.train_pi_op, self.train_value_op, self.target_update ] self.target_init = tf.group([ tf.assign(v_targ, v_main) for v_main, v_targ in zip( cr.get_vars('main'), cr.get_vars('target')) ]) self.sess.run(tf.global_variables_initializer()) self.sess.run(self.target_init)
def ddpg(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, control_policy=ControlPolicy, n_episodes=10000, replay_size=int(1e6), gamma=0.99, show_steps=50, polyak=0.995, pi_lr=1e-3, q_lr=1e-3, batch_size=100, start_steps=10000, act_noise=0.1, max_ep_len=200, logger_kwargs=dict(), save_freq=1): tf.set_random_seed(seed) np.random.seed(seed) env, test_env = env_fn(), env_fn() obs_dim = env.observation_space.shape[0] act_dim = env.action_space.shape[0] ctrl_pol = control_policy(env) # Action limit for clamping: critically, assumes all dimensions share the same bound! act_limit = env.action_space.high[0] # Share information about action space with policy architecture ac_kwargs['action_space'] = env.action_space # Inputs to computation graph x_ph, a_ph, x2_ph, r_ph, d_ph = core.placeholders(obs_dim, act_dim, obs_dim, None, None) # Main outputs from computation graph with tf.variable_scope('main'): pi, q, q_pi = actor_critic(x_ph, a_ph, **ac_kwargs) # Target networks with tf.variable_scope('target'): # Note that the action placeholder going to actor_critic here is # irrelevant, because we only need q_targ(s, pi_targ(s)). pi_targ, _, q_pi_targ = actor_critic(x2_ph, a_ph, **ac_kwargs) # Experience buffer replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size) # Count variables var_counts = tuple( core.count_vars(scope) for scope in ['main/pi', 'main/q', 'main']) print('\nNumber of parameters: \t pi: %d, \t q: %d, \t total: %d\n' % var_counts) # Bellman backup for Q function backup = tf.stop_gradient(r_ph + gamma * (1 - d_ph) * q_pi_targ) # DDPG losses pi_loss = -tf.reduce_mean(q_pi) q_loss = tf.reduce_mean((q - backup)**2) # Separate train ops for pi, q pi_optimizer = tf.train.AdamOptimizer(learning_rate=pi_lr) q_optimizer = tf.train.AdamOptimizer(learning_rate=q_lr) train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi')) train_q_op = q_optimizer.minimize(q_loss, var_list=get_vars('main/q')) # Polyak averaging for target variables target_update = tf.group([ tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main) for v_main, v_targ in zip(get_vars('main'), get_vars('target')) ]) # Initializing targets to match main variables target_init = tf.group([ tf.assign(v_targ, v_main) for v_main, v_targ in zip(get_vars('main'), get_vars('target')) ]) sess = tf.Session() sess.run(tf.global_variables_initializer()) sess.run(target_init) def get_action(o, noise_scale): a = sess.run(pi, feed_dict={x_ph: o.reshape(1, -1)})[0] a += noise_scale * np.random.randn(act_dim) return np.clip(a, -act_limit, act_limit) def test_agent(n=5): tot_len, tot_ret = 0, 0 cost, cost_ctrl = 0, 0 for j in range(n): o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0 o_ctrl = np.array(o) while not (d or (ep_len == 5 * max_ep_len)): # Take deterministic actions at test time (noise_scale=0) test_env.render() a_ctrl = np.array([ctrl_pol.predict(o_ctrl)]) o_ctrl, _, _, info = test_env.step(a_ctrl, 0) cost_ctrl += info["cost"] a = get_action(o, 0) o, r, d, info = test_env.step(a, 1) cost += info["cost"] ep_len += 1 tot_len += ep_len test_env.close() print( "\n avg reward {:.5} and episode length {} over {} trials, cost/step rl/lqr {:.5}/{:.5}" .format((tot_len - cost) / n, tot_len / n, n, cost / tot_len, cost_ctrl / tot_len)) o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 o_ctrl = np.array(o) #env.state[0] for t in range(start_steps): #a = env.action_space.sample() a = np.array([ctrl_pol.predict(o)]) o2, r, d, info = env.step(a, 1) r -= info["cost"] 
replay_buffer.store(o, a, r, o2, d) o = o2 if d: o = env.reset() fails = 0 takeover = False cost, cost_ctrl = 0, 0 retrain_steps = 0 show = False # Setup plotting # times = [] # plt.ion() # fig, ax = plt.subplots() # plot = ax.plot([], []) # costs = [] # plot_ctrl = ax.plot([], []) # ctrl_costs = [] # ax.legend(["ddpg cost", "lqr cost"]) # ax.set_xlabel("time") # ax.set_ylabel("cost") # Main loop: collect experience in env and update/log each epoch for t in itertools.count(): """ Until start_steps have elapsed, randomly sample actions from a uniform distribution for better exploration. Afterwards, use the learned policy (with some noise, via act_noise). """ if show > 0: env.render(takeover=takeover) # Step lqr a_ctrl = np.array([ctrl_pol.predict(o_ctrl)]) o_ctrl, _, _, info = env.step(a_ctrl, 0) cost_ctrl += info["cost"] # Step ddpg scaler = min(1, 0.1 + t / 100000) takeover = np.abs(o[2]) > 0.5 * scaler or np.abs(o[0]) > 0.7 * scaler # takeover = False if takeover: a = np.array([ctrl_pol.predict(o)]) else: a = get_action(o, act_noise) o2, r, d, info = env.step(a, 1) cost += info["cost"] r -= info["cost"] retrain_steps += 1 ep_len += 1 # Ignore the "done" signal if it comes from hitting the time # horizon (that is, when it's an artificial terminal signal # that isn't based on the agent's state) # d = False if t==max_ep_len else d # Store experience to replay buffer replay_buffer.store(o, a, r, o2, d) # Super critical, easy to overlook step: make sure to update # most recent observation! o = o2 print( "\rSteps {:5}, fails {:3}, ep_len {:5}, disturbance {:7.3}, cost rl/lqr {:7.3}/{:7.3}" .format(t, fails, ep_len, info["disturbance"] if info["push"] else 0.0, cost / retrain_steps, cost_ctrl / retrain_steps), end="") if np.random.rand() * max_ep_len < 1: """ Perform all DDPG updates at the end of the trajectory, in accordance with tuning done by TD3 paper authors. """ for _ in range(max_ep_len): batch = replay_buffer.sample_batch(batch_size) feed_dict = { x_ph: batch['obs1'], x2_ph: batch['obs2'], a_ph: batch['acts'], r_ph: batch['rews'], d_ph: batch['done'] } # Q-learning update outs = sess.run([q_loss, q, train_q_op], feed_dict) # Policy update outs = sess.run([pi_loss, train_pi_op, target_update], feed_dict) # cost /= retrain_steps # cost_ctrl /= retrain_steps # costs.append(cost) # ctrl_costs.append(cost_ctrl) # times.append(0.02 * (t + start_steps)) # ax.plot(times, costs, 'r-', times, ctrl_costs, 'b--') # fig.canvas.draw() # plt.pause(0.005) cost = 0 cost_ctrl = 0 retrain_steps = 0 show -= 1 env.state[0] = np.array(env.state[1]) o_ctrl = env.state[0] print() if d: o, r, d, ep_len = env.reset(), 0, False, 0 o_ctrl = np.array(o) fails += 1 # End of epoch wrap-up if t > 0 and t % (show_steps * max_ep_len) == 0: # Test the performance of the deterministic version of the agent. test_agent() show = 5
def asac(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, steps_per_epoch=5000, epochs=200, replay_size=int(1e6), gamma=0.99, polyak=0.995, lr=5e-4, alpha_start=0.2, batch_size=100, start_steps=10000, max_ep_len=1000, logger_kwargs=dict(), save_freq=1, loss_threshold=0.0001, delta=0.02, sample_step=2000): alpha = Alpha(alpha_start=alpha_start, delta=delta) alpha_t = alpha() logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) tf.set_random_seed(seed) np.random.seed(seed) env, test_env = env_fn(), env_fn() obs_dim = env.observation_space.shape[0] act_dim = env.action_space.shape[0] # Action limit for clamping: critically, assumes all dimensions share the same bound! act_limit = env.action_space.high[0] # Share information about action space with policy architecture ac_kwargs['action_space'] = env.action_space # Inputs to computation graph #x_ph, a_ph, x2_ph, r_ph, d_ph, ret_ph = core.placeholders(obs_dim, act_dim, obs_dim, None, None, None) x_ph, a_ph, x2_ph, r_ph, d_ph = core.placeholders(obs_dim, act_dim, obs_dim, None, None) alpha_ph = core.scale_holder() # Main outputs from computation graph #R, R_next = return_estimate(x_ph, x2_ph, **ac_kwargs) with tf.variable_scope('main'): mu, pi, logp_pi, q1, q2, q1_pi, q2_pi, v, Q, Q_pi, R = actor_critic(x_ph, a_ph, **ac_kwargs) # Target value network with tf.variable_scope('target'): _,_,_,_,_,_,_,v_targ, _, _, R_targ = actor_critic(x2_ph, a_ph, **ac_kwargs) # Experience buffer replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size) # Count variables var_counts = tuple(core.count_vars(scope) for scope in ['main/pi', 'main/q1', 'main/q2', 'main/v', 'main/Q', 'main/R', 'main']) print(('\nNumber of parameters: \t pi: %d, \t' + \ 'q1: %d, \t q2: %d, \t v: %d, \t Q: %d, \t R: %d, \t total: %d\n')%var_counts) # Min Double-Q: min_q_pi = tf.minimum(q1_pi, q2_pi) # Targets for Q and V regression q_backup = tf.stop_gradient(r_ph + gamma*(1 - d_ph)*v_targ) v_backup = tf.stop_gradient(min_q_pi - alpha_ph *logp_pi) Q_backup = tf.stop_gradient(r_ph + gamma*(1 - d_ph)*R_targ) R_backup = tf.stop_gradient(Q_pi) adv = Q_pi - R pi_loss = tf.reduce_mean(alpha_ph * logp_pi - q1_pi) q1_loss = 0.5 * tf.reduce_mean((q_backup - q1) ** 2) q2_loss = 0.5 * tf.reduce_mean((q_backup - q2) ** 2) v_loss = 0.5 * tf.reduce_mean((v_backup - v)**2) Q_loss = 0.5*tf.reduce_mean((Q_backup - Q)**2) R_loss = 0.5*tf.reduce_mean((R_backup - R)**2) value_loss = q1_loss + q2_loss + v_loss + Q_loss + R_loss # Policy train op # (has to be separate from value train op, because q1_pi appears in pi_loss) pi_optimizer = tf.train.AdamOptimizer(learning_rate=lr) train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi')) # Value train op # (control dep of train_pi_op because sess.run otherwise evaluates in nondeterministic order) value_optimizer = tf.train.AdamOptimizer(learning_rate=lr) value_params = get_vars('main/q') + get_vars('main/v') + get_vars('main/Q') + get_vars('main/R') with tf.control_dependencies([train_pi_op]): train_value_op = value_optimizer.minimize(value_loss, var_list=value_params) """ R_optimizer = tf.train.AdamOptimizer(learning_rate=lr) train_R_op = R_optimizer.minimize(R_loss, var_list=get_vars('R')) """ # Polyak averaging for target variables # (control flow because sess.run otherwise evaluates in nondeterministic order) with tf.control_dependencies([train_value_op]): target_update = tf.group([tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main) for v_main, v_targ in zip(get_vars('main'), 
get_vars('target'))]) # All ops to call during one training step step_ops = [pi_loss, q1_loss, q2_loss, v_loss, q1, q2, v, logp_pi, train_pi_op, train_value_op, target_update, R_loss, Q_loss] # Initializing targets to match main variables target_init = tf.group([tf.assign(v_targ, v_main) for v_main, v_targ in zip(get_vars('main'), get_vars('target'))]) config = tf.ConfigProto(inter_op_parallelism_threads=30,intra_op_parallelism_threads=5) config.gpu_options.allow_growth = True sess = tf.Session(config=config) sess.run(tf.global_variables_initializer()) sess.run(target_init) # Setup model saving logger.setup_tf_saver(sess, inputs={'x': x_ph, 'a': a_ph}, outputs={'mu': mu, 'pi': pi, 'q1': q1, 'q2': q2, 'v': v, 'Q': Q, 'R': R}) def get_action(o, deterministic=False): act_op = mu if deterministic else pi return sess.run(act_op, feed_dict={x_ph: o.reshape(1, -1)}) def test_agent(n=10): global sess, mu, pi, q1, q2, q1_pi, q2_pi for j in range(n): o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0 while not (d or (ep_len == max_ep_len)): # Take deterministic actions at test time o, r, d, _ = test_env.step(get_action(o, True)) ep_ret += r ep_len += 1 logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) start_time = time.time() o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 ret_est = sess.run(R, feed_dict={x_ph: [o]})[0] total_steps = steps_per_epoch * epochs counter = 0 ret_epi = [] obs_epi = [] loss_old = 10000 # Main loop: collect experience in env and update/log each epoch for t in range(total_steps): """ Until start_steps have elapsed, randomly sample actions from a uniform distribution for better exploration. Afterwards, use the learned policy. """ if t > start_steps: a = get_action(o) else: a = env.action_space.sample() # Step the env o2, r, d, _ = env.step(a) ep_ret += r ep_len += 1 # Ignore the "done" signal if it comes from hitting the time # horizon (that is, when it's an artificial terminal signal # that isn't based on the agent's state) d = False if ep_len == max_ep_len else d # Store experience to replay buffer replay_buffer.store(o, a, r, o2, d) # Super critical, easy to overlook step: make sure to update # most recent observation! o = o2 if d or (ep_len == max_ep_len): """ Perform all SAC updates at the end of the trajectory. This is a slight difference from the SAC specified in the original paper. 
""" for j in range(ep_len): batch = replay_buffer.sample_batch(batch_size) feed_dict = {x_ph: batch['obs1'], x2_ph: batch['obs2'], a_ph: batch['acts'], r_ph: batch['rews'], d_ph: batch['done'], alpha_ph: alpha_t } outs = sess.run(step_ops, feed_dict) logger.store(LossPi=outs[0], LossQ1=outs[1], LossQ2=outs[2], LossV=outs[3], Q1Vals=outs[4], Q2Vals=outs[5], VVals=outs[6], LogPi=outs[7], LossR=outs[11]) counter += 1 logger.store(EpRet=ep_ret, EpLen=ep_len) o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 ret_est = sess.run(R, feed_dict={x_ph: [o]})[0] logger.store(RetEst=ret_est) if counter >= 1000: loss_new, _ = logger.get_stats('LossPi') counter = 0 if (loss_old - loss_new)/np.absolute(loss_old) < loss_threshold and t > start_steps: rho_s = np.zeros([sample_step, obs_dim], dtype=np.float32) rho_ptr = 0 for sample_t in range(sample_step): a = get_action(o) o2, r, d, _ = env.step(a) ep_len += 1 d = False if ep_len == max_ep_len else d rho_s[rho_ptr] = o o = o2 if d or (ep_len == max_ep_len): o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 advantages = sess.run(adv, feed_dict={x_ph: rho_s}) alpha.update_alpha(advantages) #alpha.update_alpha(rho_q-rho_v) alpha_t = alpha() print(alpha_t) o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 loss_old = 10000 else: loss_old = loss_new # End of epoch wrap-up if t > 0 and t % steps_per_epoch == 0: epoch = t // steps_per_epoch # Save model if (epoch % save_freq == 0) or (epoch == epochs - 1): logger.save_state({'env': env}, None) # Test the performance of the deterministic version of the agent. test_agent() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EntCoeff', alpha_t) logger.log_tabular('RetEst', average_only=True) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('TestEpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('TestEpLen', average_only=True) logger.log_tabular('TotalEnvInteracts', t) logger.log_tabular('Q1Vals', with_min_and_max=True) logger.log_tabular('Q2Vals', with_min_and_max=True) logger.log_tabular('VVals', with_min_and_max=True) logger.log_tabular('LogPi', with_min_and_max=True) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossQ1', average_only=True) logger.log_tabular('LossQ2', average_only=True) logger.log_tabular('LossV', average_only=True) logger.log_tabular('LossR', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular()
def __init__(self, opt, job): self.opt = opt with tf.Graph().as_default(): tf.set_random_seed(opt.seed) np.random.seed(opt.seed) # Inputs to computation graph self.x_ph, self.a_ph, self.x2_ph, self.r_ph, self.d_ph = core.placeholders(opt.obs_dim, None, opt.obs_dim, None, None) # Main outputs from computation graph with tf.variable_scope('main'): self.q, self.q_x2 = core.q_function(self.x_ph, self.x2_ph, opt.hidden_size, opt.act_dim) # Target value network with tf.variable_scope('target'): self.q_next, _ = core.q_function(self.x2_ph, self.x2_ph, opt.hidden_size, opt.act_dim) # Count variables var_counts = tuple(core.count_vars(scope) for scope in ['main']) print('\nNumber of parameters: total: %d\n' % var_counts) a_one_hot = tf.one_hot(tf.cast(self.a_ph, tf.int32), depth=opt.act_dim) q_value = tf.reduce_sum(self.q * a_one_hot, axis=1) # DDQN online_q_x2_a_one_hot = tf.one_hot(tf.argmax(self.q_x2, axis=1), depth=opt.act_dim) q_target = tf.reduce_sum(self.q_next * online_q_x2_a_one_hot, axis=1) # DQN # q_target = tf.reduce_max(self.q_next, axis=1) # Bellman backup for Q functions, using Clipped Double-Q targets q_backup = tf.stop_gradient(self.r_ph + opt.gamma * (1 - self.d_ph) * q_target) # q losses q_loss = 0.5 * tf.reduce_mean((q_backup - q_value) ** 2) # Value train op # (control dep of train_pi_op because sess.run otherwise evaluates in nondeterministic order) value_optimizer = tf.train.AdamOptimizer(learning_rate=opt.lr) value_params = get_vars('main/q') train_value_op = value_optimizer.minimize(q_loss, var_list=value_params) # Polyak averaging for target variables # (control flow because sess.run otherwise evaluates in nondeterministic order) with tf.control_dependencies([train_value_op]): target_update = tf.group([tf.assign(v_targ, opt.polyak * v_targ + (1 - opt.polyak) * v_main) for v_main, v_targ in zip(get_vars('main'), get_vars('target'))]) # All ops to call during one training step self.step_ops = [q_loss, self.q, train_value_op, target_update] # Initializing targets to match main variables self.target_init = tf.group([tf.assign(v_targ, v_main) for v_main, v_targ in zip(get_vars('main'), get_vars('target'))]) if job == "learner": config = tf.ConfigProto() config.gpu_options.per_process_gpu_memory_fraction = opt.gpu_fraction config.inter_op_parallelism_threads = 1 config.intra_op_parallelism_threads = 1 self.sess = tf.Session(config=config) else: self.sess = tf.Session( config=tf.ConfigProto( # device_count={'GPU': 0}, intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)) self.sess.run(tf.global_variables_initializer()) if job == "learner": # Set up summary Ops self.train_ops, self.train_vars = self.build_summaries() self.writer = tf.summary.FileWriter( opt.summary_dir + "/" + "^^^^^^^^^^" + str(datetime.datetime.now()) + opt.env_name + "-" + opt.exp_name + "-workers_num:" + str(opt.num_workers) + "%" + str(opt.a_l_ratio), self.sess.graph) self.variables = ray.experimental.tf_utils.TensorFlowVariables( q_loss, self.sess)
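A numpy sketch contrasting the Double-DQN backup used above (the online net selects a', the target net evaluates it) with the commented-out vanilla DQN backup; the Q arrays are assumed to have shape (batch, act_dim).

import numpy as np

def ddqn_backup(r, d, q_next_online, q_next_target, gamma=0.99):
    # Double DQN: a* = argmax_a Q_online(s', a), evaluated by the target net.
    a_star = np.argmax(q_next_online, axis=1)
    q_eval = q_next_target[np.arange(len(a_star)), a_star]
    return r + gamma * (1.0 - d) * q_eval

def dqn_backup(r, d, q_next_target, gamma=0.99):
    # Vanilla DQN (the commented-out alternative): max over the target net.
    return r + gamma * (1.0 - d) * np.max(q_next_target, axis=1)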
def ddpg(env_name, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0,
         steps_per_epoch=5000, epochs=100, replay_size=int(1e6), gamma=0.99,
         polyak=0.995, pi_lr=1e-3, q_lr=1e-3, batch_size=100, start_steps=10000,
         act_noise=0.1, max_ep_len=1000, logger_kwargs=dict(), save_freq=1,
         test=False):
    """
    Args:
        env_name (str): Name of a registered Gym environment; it is passed to
            ``gym.make`` to build both the training and test environments.

        actor_critic: A function which takes in placeholder symbols
            for state, ``x_ph``, and action, ``a_ph``, and returns the main
            outputs from the agent's Tensorflow computation graph:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``pi``       (batch, act_dim)  | Deterministically computes actions
                                           | from policy given states.
            ``q``        (batch,)          | Gives the current estimate of Q* for
                                           | states in ``x_ph`` and actions in
                                           | ``a_ph``.
            ``q_pi``     (batch,)          | Gives the composition of ``q`` and
                                           | ``pi`` for states in ``x_ph``:
                                           | q(x, pi(x)).
            ===========  ================  ======================================

        ac_kwargs (dict): Any kwargs appropriate for the actor_critic
            function you provided to DDPG.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs)
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs to run and train agent.

        replay_size (int): Maximum length of replay buffer.

        gamma (float): Discount factor. (Always between 0 and 1.)

        polyak (float): Interpolation factor in polyak averaging for target
            networks. Target networks are updated towards main networks
            according to:

            .. math:: \\theta_{\\text{targ}} \\leftarrow
                \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta

            where :math:`\\rho` is polyak. (Always between 0 and 1, usually
            close to 1.)

        pi_lr (float): Learning rate for policy.

        q_lr (float): Learning rate for the Q-network.

        batch_size (int): Minibatch size for SGD.

        start_steps (int): Number of steps for uniform-random action selection,
            before running real policy. Helps exploration.

        act_noise (float): Stddev for Gaussian exploration noise added to
            policy at training time. (At test time, no noise is added.)

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.

        test (bool): If True, skip training, restore the latest checkpoint and
            run the deterministic policy for evaluation episodes.
    """
    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    tf.set_random_seed(seed)
    np.random.seed(seed)

    env, test_env = gym.make(env_name), gym.make(env_name)
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]

    # Action limit for clamping: critically, assumes all dimensions share the same bound!
    act_limit = env.action_space.high[0]

    # Share information about action space with policy architecture
    ac_kwargs['action_space'] = env.action_space

    # Inputs to computation graph
    x_ph, a_ph, x2_ph, r_ph, d_ph = core.placeholders(obs_dim, act_dim, obs_dim, None, None)

    # Main outputs from computation graph
    with tf.variable_scope('main'):
        pi, q, q_pi = actor_critic(x_ph, a_ph, **ac_kwargs)

    # Target networks
    with tf.variable_scope('target'):
        # Note that the action placeholder going to actor_critic here is
        # irrelevant, because we only need q_targ(s, pi_targ(s)).
        pi_targ, _, q_pi_targ = actor_critic(x2_ph, a_ph, **ac_kwargs)

    # Experience buffer
    replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size)

    # Count variables
    var_counts = tuple(core.count_vars(scope) for scope in ['main/pi', 'main/q', 'main'])
    print('\nNumber of parameters: \t pi: %d, \t q: %d, \t total: %d\n' % var_counts)

    # Bellman backup for Q function
    backup = tf.stop_gradient(r_ph + gamma * (1 - d_ph) * q_pi_targ)

    # DDPG losses
    pi_loss = -tf.reduce_mean(q_pi)
    q_loss = tf.reduce_mean((q - backup)**2)

    # Separate train ops for pi, q
    pi_optimizer = tf.train.AdamOptimizer(learning_rate=pi_lr)
    q_optimizer = tf.train.AdamOptimizer(learning_rate=q_lr)
    train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi'))
    train_q_op = q_optimizer.minimize(q_loss, var_list=get_vars('main/q'))

    # Polyak averaging for target variables
    target_update = tf.group([
        tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main)
        for v_main, v_targ in zip(get_vars('main'), get_vars('target'))
    ])

    # Initializing targets to match main variables
    target_init = tf.group([
        tf.assign(v_targ, v_main)
        for v_main, v_targ in zip(get_vars('main'), get_vars('target'))
    ])

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    sess.run(target_init)

    # Setup model saving
    logger.setup_tf_saver(sess, inputs={'x': x_ph, 'a': a_ph}, outputs={'pi': pi, 'q': q})

    saver = tf.train.Saver()
    save_path = './saved_model/' + env_name + '/test'

    def get_action(o, noise_scale):
        a = sess.run(pi, feed_dict={x_ph: o.reshape(1, -1)})[0]
        a += noise_scale * np.random.randn(act_dim)
        return np.clip(a, -act_limit, act_limit)

    def test_agent(n=10):
        for j in range(n):
            o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0
            while not (d or (ep_len == max_ep_len)):
                # Take deterministic actions at test time (noise_scale=0)
                o, r, d, _ = test_env.step(get_action(o, 0))
                ep_ret += r
                ep_len += 1
            logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)

    def save(saver, sess):
        if not os.path.exists('./saved_model/' + env_name):
            os.mkdir('./saved_model/' + env_name)
        ckpt_path = saver.save(sess, save_path)
        # print('Save ckpt file: {}'.format(ckpt_path))

    def load(saver, sess):
        if os.path.exists('./saved_model/' + env_name):
            saver.restore(sess, save_path)
            print('Load model complete.')
        else:
            print('There is no saved model.')

    if test is False:
        start_time = time.time()
        o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
        total_steps = steps_per_epoch * epochs

        # Main loop: collect experience in env and update/log each epoch
        for t in range(total_steps):

            # Until start_steps have elapsed, randomly sample actions from a
            # uniform distribution for better exploration. Afterwards, use the
            # learned policy (with some noise, via act_noise).
            if t > start_steps:
                a = get_action(o, act_noise)
            else:
                a = env.action_space.sample()

            # Step the env
            o2, r, d, _ = env.step(a)
            ep_ret += r
            ep_len += 1

            # Ignore the "done" signal if it comes from hitting the time
            # horizon (that is, when it's an artificial terminal signal
            # that isn't based on the agent's state)
            d = False if ep_len == max_ep_len else d

            # Store experience to replay buffer
            replay_buffer.store(o, a, r, o2, d)

            # Super critical, easy to overlook step: make sure to update
            # most recent observation!
            o = o2

            if d or (ep_len == max_ep_len):
                # Perform all DDPG updates at the end of the trajectory,
                # in accordance with tuning done by TD3 paper authors.
                for _ in range(ep_len):
                    batch = replay_buffer.sample_batch(batch_size)
                    feed_dict = {
                        x_ph: batch['obs1'],
                        x2_ph: batch['obs2'],
                        a_ph: batch['acts'],
                        r_ph: batch['rews'],
                        d_ph: batch['done']
                    }

                    # Q-learning update
                    outs = sess.run([q_loss, q, train_q_op], feed_dict)
                    logger.store(LossQ=outs[0], QVals=outs[1])

                    # Policy update
                    outs = sess.run([pi_loss, train_pi_op, target_update], feed_dict)
                    logger.store(LossPi=outs[0])

                logger.store(EpRet=ep_ret, EpLen=ep_len)
                o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

            # End of epoch wrap-up
            if t > 0 and t % steps_per_epoch == 0:
                epoch = t // steps_per_epoch

                # Save model
                if (epoch % save_freq == 0) or (epoch == epochs - 1):
                    # logger.save_state({'env': env}, None)
                    save(saver, sess)

                # Test the performance of the deterministic version of the agent.
                test_agent()

                # Log info about epoch
                logger.log_tabular('Epoch', epoch)
                logger.log_tabular('EpRet', with_min_and_max=True)
                logger.log_tabular('TestEpRet', with_min_and_max=True)
                logger.log_tabular('EpLen', average_only=True)
                logger.log_tabular('TestEpLen', average_only=True)
                logger.log_tabular('TotalEnvInteracts', t)
                logger.log_tabular('QVals', with_min_and_max=True)
                logger.log_tabular('LossPi', average_only=True)
                logger.log_tabular('LossQ', average_only=True)
                logger.log_tabular('Time', time.time() - start_time)
                logger.dump_tabular()
        # save(saver, sess)
    else:
        load(saver, sess)
        test_logger = EpochLogger()
        o, r, d, ep_ret, ep_len, n = env.reset(), 0, False, 0, 0, 0
        num_episodes = 100
        render = True
        # With max_ep_len = 0 the (ep_len == max_ep_len) check below can never
        # trigger, so evaluation episodes end only when the env reports done.
        max_ep_len = 0
        while n < num_episodes:
            if render:
                env.render()
                time.sleep(1e-3)
            a = get_action(o, 0)
            o, r, d, _ = env.step(a)
            ep_ret += r
            ep_len += 1
            if d or (ep_len == max_ep_len):
                test_logger.store(EpRet=ep_ret, EpLen=ep_len)
                print('Episode %d \t EpRet %.3f \t EpLen %d' % (n, ep_ret, ep_len))
                o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
                n += 1
        test_logger.log_tabular('EpRet', with_min_and_max=True)
        test_logger.log_tabular('EpLen', average_only=True)
        test_logger.dump_tabular()
def __init__(self):
    self.sess = tf.Session()
    self.state_size = env_set['state']
    self.output_size = env_set['action']
    self.worker_size = env_set['worker']
    self.support_size = 8
    self.target_update_tau = 0.995
    self.gamma = 0.99
    self.hidden = env_set['hidden']
    self.batch_size = 64
    self.pi_lr = 1e-4
    self.q_lr = 1e-3
    self.action_limit = 1.0
    self.memory = replay_buffer(env_set['mem_size'])
    self.target_noise = 0.2
    self.noise_clip = 0.1
    self.alpha = 1e-5

    self.x_ph, self.a_ph, self.tau_ph, self.x2_ph, self.r_ph, self.d_ph = \
        cr.placeholders(self.state_size, self.output_size, self.support_size,
                        self.state_size, None, None)

    with tf.variable_scope('main'):
        self.pi, self.logp_pi, self.q1, self.q2, self.q1_pi, self.q2_pi, self.v = \
            cr.dipg_sac_mlp_actor_critic(
                x=self.x_ph,
                a=self.a_ph,
                tau=self.tau_ph,
                hidden=self.hidden,
                activation=tf.nn.relu,
                output_activation=tf.tanh,
                output_size=self.output_size)

    with tf.variable_scope('target'):
        _, _, _, _, _, _, self.v_targ = \
            cr.dipg_sac_mlp_actor_critic(
                x=self.x2_ph,
                a=self.a_ph,
                tau=self.tau_ph,
                hidden=self.hidden,
                activation=tf.nn.relu,
                output_activation=tf.tanh,
                output_size=self.output_size)

    self.pi_params = cr.get_vars('main/pi')
    self.value_params = cr.get_vars('main/q') + cr.get_vars('main/v')

    # Clipped double-Q on the distributional critics: use whichever critic has
    # the smaller mean value for the current policy's actions.
    self.min_q = tf.where(tf.less(tf.reduce_mean(self.q1_pi), tf.reduce_mean(self.q2_pi)),
                          self.q1_pi, self.q2_pi)

    # Distributional Bellman backups, broadcast across the quantile support.
    self.q_backup = tf.stop_gradient(
        tf.tile(tf.expand_dims(self.r_ph, axis=1), [1, self.support_size])
        + self.gamma * tf.tile(tf.expand_dims(1 - self.d_ph, axis=1), [1, self.support_size]) * self.v_targ)
    self.v_backup = tf.stop_gradient(
        self.min_q
        - self.alpha * tf.tile(tf.expand_dims(self.logp_pi, axis=1), [1, self.support_size]))

    self.pi_loss = tf.reduce_mean(self.alpha * self.logp_pi
                                  - tf.reduce_mean(self.q1_pi * tf.square(self.tau_ph)))

    # Quantile-regression weights: tau for positive TD errors, (1 - tau) for
    # negative ones.
    tau = self.tau_ph
    inv_tau = 1 - tau
    tau = tf.tile(tf.expand_dims(tau, axis=1), [1, self.support_size, 1])
    inv_tau = tf.tile(tf.expand_dims(inv_tau, axis=1), [1, self.support_size, 1])

    logit_valid_tile = tf.tile(tf.expand_dims(self.q_backup, axis=1), [1, self.support_size, 1])

    # Q1 critic loss. Note: despite the name, Huber_loss here is an element-wise
    # squared error (mean_squared_error with Reduction.NONE).
    theta_loss_tile = tf.tile(tf.expand_dims(self.q1, axis=2), [1, 1, self.support_size])
    Huber_loss = tf.losses.mean_squared_error(logit_valid_tile, theta_loss_tile,
                                              reduction=tf.losses.Reduction.NONE)
    error_loss = logit_valid_tile - theta_loss_tile
    Loss = tf.where(tf.less(error_loss, 0.0), inv_tau * Huber_loss, tau * Huber_loss)
    self.q1_loss = 0.5 * tf.reduce_mean(tf.reduce_sum(tf.reduce_mean(Loss, axis=2), axis=1))

    # Q2 critic loss
    theta_loss_tile = tf.tile(tf.expand_dims(self.q2, axis=2), [1, 1, self.support_size])
    Huber_loss = tf.losses.mean_squared_error(logit_valid_tile, theta_loss_tile,
                                              reduction=tf.losses.Reduction.NONE)
    error_loss = logit_valid_tile - theta_loss_tile
    Loss = tf.where(tf.less(error_loss, 0.0), inv_tau * Huber_loss, tau * Huber_loss)
    self.q2_loss = 0.5 * tf.reduce_mean(tf.reduce_sum(tf.reduce_mean(Loss, axis=2), axis=1))

    # Value-function loss against the entropy-regularized backup
    theta_loss_tile = tf.tile(tf.expand_dims(self.v, axis=2), [1, 1, self.support_size])
    logit_valid_tile = tf.tile(tf.expand_dims(self.v_backup, axis=1), [1, self.support_size, 1])
    Huber_loss = tf.losses.mean_squared_error(logit_valid_tile, theta_loss_tile,
                                              reduction=tf.losses.Reduction.NONE)
    error_loss = logit_valid_tile - theta_loss_tile
    Loss = tf.where(tf.less(error_loss, 0.0), inv_tau * Huber_loss, tau * Huber_loss)
    self.v_loss = 0.5 * tf.reduce_mean(tf.reduce_sum(tf.reduce_mean(Loss, axis=2), axis=1))

    self.value_loss = self.q1_loss + self.q2_loss + self.v_loss

    self.pi_optimizer = tf.train.AdamOptimizer(self.pi_lr)
    self.train_pi_op = self.pi_optimizer.minimize(self.pi_loss, var_list=self.pi_params)

    self.value_optimizer = tf.train.AdamOptimizer(self.q_lr)
    with tf.control_dependencies([self.train_pi_op]):
        self.train_value_op = self.value_optimizer.minimize(self.value_loss,
                                                            var_list=self.value_params)
    with tf.control_dependencies([self.train_value_op]):
        self.target_update = tf.group([
            tf.assign(v_targ, self.target_update_tau * v_targ + (1 - self.target_update_tau) * v_main)
            for v_main, v_targ in zip(cr.get_vars('main'), cr.get_vars('target'))
        ])

    self.step_ops = [self.pi_loss, self.value_loss, self.train_pi_op,
                     self.train_value_op, self.target_update]
    # Only the value head is read from the target network, so only main/v is
    # hard-copied into target/v at initialization.
    self.target_init = tf.group([
        tf.assign(v_targ, v_main)
        for v_main, v_targ in zip(cr.get_vars('main/v'), cr.get_vars('target/v'))
    ])

    self.sess.run(tf.global_variables_initializer())
    self.sess.run(self.target_init)
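# The expand_dims/tile block above spells out a pairwise quantile-regression
# loss: every predicted quantile is compared against every sample of the
# Bellman target, and squared errors are weighted by tau for positive errors
# and (1 - tau) for negative ones. Because the tensor reshaping obscures the
# idea, here is a minimal NumPy sketch for a single transition. The names
# (pred_quantiles, target_samples, taus) are illustrative, and the axis and
# reduction conventions are simplified rather than an exact copy of the graph.
import numpy as np


def quantile_regression_loss(pred_quantiles, target_samples, taus):
    # Pairwise TD errors: target_j - prediction_i, shape (support, support).
    errors = target_samples[None, :] - pred_quantiles[:, None]
    squared = errors ** 2
    # Asymmetric weights: tau_i where the error is positive, (1 - tau_i) otherwise.
    weights = np.where(errors < 0.0, 1.0 - taus[:, None], taus[:, None])
    # Average over target samples, sum over predicted quantiles.
    return 0.5 * np.sum(np.mean(weights * squared, axis=1))


# Example with 8 quantile midpoints.
taus = (np.arange(8) + 0.5) / 8
pred = np.linspace(-1.0, 1.0, 8)
target = pred + 0.1
print(quantile_regression_loss(pred, target, taus))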
def __init__(self):
    self.sess = tf.Session()
    self.state_size = env_set['state']
    self.output_size = env_set['action']
    self.worker_size = env_set['worker']
    self.support_size = 64
    self.tau = 0.995
    self.gamma = env_set['gamma']
    self.hidden = env_set['hidden']
    self.batch_size = env_set['batch_size']
    self.pi_lr = env_set['pi_lr']
    self.q_lr = env_set['q_lr']
    self.action_limit = 1.0
    self.memory = replay_buffer(env_set['mem_size'])
    self.kappa = 1.0
    self.risk_factor = -1.0
    self.random_risk = False
    self.target_noise = 0.2
    self.noise_clip = 0.5
    tf.set_random_seed(10)

    self.x_ph, self.a_ph, self.x2_ph, self.r_ph, self.d_ph = \
        cr.placeholders(self.state_size, self.output_size, self.state_size, None, None)
    self.risk_factor_ph = tf.placeholder(tf.float32)

    with tf.variable_scope('main'):
        self.pi, self.q1, self.q2, self.q1_pi, self.q2_pi = cr.dqpg_td3_actor_critic(
            x=self.x_ph,
            a=self.a_ph,
            hidden=self.hidden,
            activation=tf.nn.relu,
            output_activation=tf.tanh,
            output_size=self.output_size,
            action_limit=self.action_limit,
            support_size=self.support_size)

    with tf.variable_scope('target'):
        _, _, _, self.q1_pi_targ, self.q2_pi_targ = cr.dqpg_td3_actor_critic(
            x=self.x2_ph,
            a=self.a_ph,
            hidden=self.hidden,
            activation=tf.nn.relu,
            output_activation=tf.tanh,
            output_size=self.output_size,
            action_limit=self.action_limit,
            support_size=self.support_size,
            pi_q_noise=self.target_noise,
            noise_clip=self.noise_clip)

    self.pi_params = cr.get_vars('main/pi')
    self.q_params = cr.get_vars('main/q')

    self.min_q_targ = tf.minimum(self.q1_pi_targ, self.q2_pi_targ)
    self.backup = tf.stop_gradient(
        tf.expand_dims(self.r_ph, axis=1)
        + self.gamma * tf.expand_dims(1 - self.d_ph, axis=1) * self.min_q_targ)

    self.quantile_weight = 1.0 - self.risk_factor_ph * \
        (2.0 * tf.reshape(tf.range(0.5 / self.support_size, 1, 1 / self.support_size),
                          [1, self.support_size]) - 1.0)
    self.pi_loss = -tf.reduce_mean(tf.reduce_mean(self.q1_pi * self.quantile_weight))

    logit_valid_tile = tf.tile(tf.expand_dims(self.backup, axis=1), [1, self.support_size, 1])
    tau = tf.reshape(tf.range(0.5 / self.support_size, 1, 1 / self.support_size),
                     [1, self.support_size])
    tau = tf.tile(tf.expand_dims(tau, axis=2), [1, 1, self.support_size])

    theta_loss_tile = tf.tile(tf.expand_dims(self.q1, axis=2), [1, 1, self.support_size])
    # Huber_loss = tf.compat.v1.losses.huber_loss(logit_valid_tile, theta_loss_tile, reduction=tf.losses.Reduction.NONE, delta=self.kappa) / self.kappa
    bellman_errors = logit_valid_tile - theta_loss_tile
    Logcosh = bellman_errors + tf.math.softplus(-2. * bellman_errors) - tf.log(2.)
    Loss = tf.abs(tau - tf.stop_gradient(tf.to_float(bellman_errors < 0))) * Logcosh
    self.v1_loss = tf.reduce_mean(tf.reduce_sum(tf.reduce_mean(Loss, axis=1), axis=1))

    theta_loss_tile = tf.tile(tf.expand_dims(self.q2, axis=2), [1, 1, self.support_size])
    # Huber_loss = tf.compat.v1.losses.huber_loss(logit_valid_tile, theta_loss_tile, reduction=tf.losses.Reduction.NONE, delta=self.kappa) / self.kappa
    bellman_errors = logit_valid_tile - theta_loss_tile
    Logcosh = bellman_errors + tf.math.softplus(-2. * bellman_errors) - tf.log(2.)
    Loss = tf.abs(tau - tf.stop_gradient(tf.to_float(bellman_errors < 0))) * Logcosh
    self.v2_loss = tf.reduce_mean(tf.reduce_sum(tf.reduce_mean(Loss, axis=1), axis=1))

    self.v_loss = self.v1_loss + self.v2_loss

    self.value_optimizer = tf.train.AdamOptimizer(self.q_lr)
    self.train_value_op = self.value_optimizer.minimize(self.v_loss, var_list=self.q_params)
    self.pi_optimizer = tf.train.AdamOptimizer(self.pi_lr)
    with tf.control_dependencies([self.train_value_op]):
        self.train_pi_op = self.pi_optimizer.minimize(self.pi_loss, var_list=self.pi_params)
    with tf.control_dependencies([self.train_pi_op]):
        self.target_update = tf.group([
            tf.assign(v_targ, self.tau * v_targ + (1 - self.tau) * v_main)
            for v_main, v_targ in zip(cr.get_vars('main'), cr.get_vars('target'))
        ])

    self.step_ops = [self.pi_loss, self.v_loss, self.train_pi_op,
                     self.train_value_op, self.target_update]
    self.value_ops = [self.v_loss, self.train_value_op]
    self.target_init = tf.group([
        tf.assign(v_targ, v_main)
        for v_main, v_targ in zip(cr.get_vars('main'), cr.get_vars('target'))
    ])

    self.sess.run(tf.global_variables_initializer())
    self.sess.run(self.target_init)
    print(self.sess.run(self.quantile_weight,
                        feed_dict={self.risk_factor_ph: self.risk_factor}))
    self.saver = tf.train.Saver()
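# quantile_weight above defines a risk distortion over the quantile support:
# with midpoints tau_i = (i + 0.5) / support_size, quantile i is weighted by
# 1 - risk_factor * (2 * tau_i - 1) in the policy loss. A small NumPy sketch
# (the risk_weights helper is illustrative, not part of this code) makes the
# effect of risk_factor concrete.
import numpy as np


def risk_weights(support_size, risk_factor):
    # Quantile midpoints 0.5/N, 1.5/N, ..., (N - 0.5)/N, as in tf.range above.
    taus = (np.arange(support_size) + 0.5) / support_size
    return 1.0 - risk_factor * (2.0 * taus - 1.0)


# A positive risk_factor up-weights low quantiles (pessimistic about returns),
# a negative one (like the default -1.0 above) up-weights high quantiles, and
# the weights always average to 1, so risk_factor = 0 recovers a plain mean.
print(risk_weights(8, +1.0))   # [1.875 1.625 ... 0.375 0.125]
print(risk_weights(8, -1.0))   # [0.125 0.375 ... 1.625 1.875]
print(risk_weights(8, 0.0))    # all ones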
def vpg(env_fn, actor_critic, ac_kwargs=dict(),   # ac_kwargs holds the network-architecture parameters
        seed=0, steps_per_epoch=4000, epochs=50,
        gamma=0.99, lam=0.97,                      # discount factor and GAE-lambda
        pi_lr=3e-4, vf_lr=1e-3,                    # learning rates
        train_v_iters=80, max_ep_len=1000,
        logger_kwargs=dict(), save_freq=10):
    """
    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: A function which takes in placeholder symbols
            for state, ``x_ph``, and action, ``a_ph``, and returns the main
            outputs from the agent's Tensorflow computation graph:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``pi``       (batch, act_dim)  | Samples actions from policy given
                                           | states.
            ``logp``     (batch,)          | Gives log probability, according to
                                           | the policy, of taking actions ``a_ph``
                                           | in states ``x_ph``.
            ``logp_pi``  (batch,)          | Gives log probability, according to
                                           | the policy, of the action sampled by
                                           | ``pi``.
            ``v``        (batch,)          | Gives the value estimate for states
                                           | in ``x_ph``. (Critical: make sure
                                           | to flatten this!)
            ===========  ================  ======================================

        ac_kwargs (dict): Any kwargs appropriate for the actor_critic
            function you provided to VPG.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs)
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs of interaction (equivalent to
            number of policy updates) to perform.

        gamma (float): Discount factor. (Always between 0 and 1.)

        pi_lr (float): Learning rate for policy optimizer.

        vf_lr (float): Learning rate for value function optimizer.

        train_v_iters (int): Number of gradient descent steps to take on
            value function per epoch.

        lam (float): Lambda for GAE-Lambda. (Always between 0 and 1,
            close to 1.)

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.
    """
    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    seed += 10000 * proc_id()
    tf.set_random_seed(seed)
    np.random.seed(seed)

    env = env_fn()
    obs_dim = env.observation_space.shape
    act_dim = env.action_space.shape

    # Share information about action space with policy architecture
    ac_kwargs['action_space'] = env.action_space

    # Inputs to computation graph
    x_ph, a_ph = core.placeholders_from_spaces(env.observation_space, env.action_space)
    adv_ph, ret_ph, logp_old_ph = core.placeholders(None, None, None)

    # Main outputs from computation graph
    pi, logp, logp_pi, v = actor_critic(x_ph, a_ph, **ac_kwargs)

    # Need all placeholders in *this* order later (to zip with data from buffer)
    all_phs = [x_ph, a_ph, adv_ph, ret_ph, logp_old_ph]

    # Every step, get: action, value, and logprob
    get_action_ops = [pi, v, logp_pi]

    # Experience buffer (num_procs() is the number of parallel processes, one per CPU)
    local_steps_per_epoch = int(steps_per_epoch / num_procs())
    buf = VPGBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam)

    # Count variables
    var_counts = tuple(core.count_vars(scope) for scope in ['pi', 'v'])
    logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts)

    # VPG objectives
    pi_loss = -tf.reduce_mean(logp * adv_ph)
    v_loss = tf.reduce_mean((ret_ph - v)**2)

    # Info (useful to watch during learning)
    approx_kl = tf.reduce_mean(logp_old_ph - logp)  # a sample estimate for KL-divergence, easy to compute
    approx_ent = tf.reduce_mean(-logp)              # a sample estimate for entropy, also easy to compute

    # Optimizers
    train_pi = MpiAdamOptimizer(learning_rate=pi_lr).minimize(pi_loss)
    train_v = MpiAdamOptimizer(learning_rate=vf_lr).minimize(v_loss)

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    # Sync params across processes
    sess.run(sync_all_params())

    # Setup model saving
    logger.setup_tf_saver(sess, inputs={'x': x_ph}, outputs={'pi': pi, 'v': v})

    def update():
        inputs = {k: v for k, v in zip(all_phs, buf.get())}
        pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent], feed_dict=inputs)

        # Debug: compare fresh log-probs against the stored ones
        print(sess.run([logp, logp_old_ph, tf.reduce_mean(approx_kl),
                        tf.reduce_mean(logp - logp_old_ph)], feed_dict=inputs))

        # Policy gradient step
        sess.run(train_pi, feed_dict=inputs)

        # Value function learning
        for _ in range(train_v_iters):
            sess.run(train_v, feed_dict=inputs)

        # Log changes from update
        pi_l_new, v_l_new, kl = sess.run([pi_loss, v_loss, approx_kl], feed_dict=inputs)
        logger.store(LossPi=pi_l_old, LossV=v_l_old, KL=kl, Entropy=ent,
                     DeltaLossPi=(pi_l_new - pi_l_old),
                     DeltaLossV=(v_l_new - v_l_old))

    start_time = time.time()
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

    # Main loop: collect experience in env and update/log each epoch
    for epoch in range(epochs):
        for t in range(local_steps_per_epoch):
            a, v_t, logp_t = sess.run(get_action_ops, feed_dict={x_ph: o.reshape(1, -1)})

            if epoch == epochs - 1:
                env.render()

            # save and log
            buf.store(o, a, r, v_t, logp_t)
            logger.store(VVals=v_t)

            o, r, d, _ = env.step(a[0])
            ep_ret += r
            ep_len += 1

            terminal = d or (ep_len == max_ep_len)
            if terminal or (t == local_steps_per_epoch - 1):
                if not terminal:
                    print('Warning: trajectory cut off by epoch at %d steps.' % ep_len)
                # if trajectory didn't reach terminal state, bootstrap value target
                last_val = r if d else sess.run(v, feed_dict={x_ph: o.reshape(1, -1)})
                buf.finish_path(last_val)
                if terminal:
                    # only save EpRet / EpLen if trajectory finished
                    logger.store(EpRet=ep_ret, EpLen=ep_len)
                o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

        # Save model
        if (epoch % save_freq == 0) or (epoch == epochs - 1):
            logger.save_state({'env': env}, None)

        # Perform VPG update!
        update()

        # Log info about epoch
        logger.log_tabular('Epoch', epoch)
        logger.log_tabular('EpRet', with_min_and_max=True)
        logger.log_tabular('EpLen', average_only=True)
        logger.log_tabular('VVals', with_min_and_max=True)
        logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch)
        logger.log_tabular('LossPi', average_only=True)
        logger.log_tabular('LossV', average_only=True)
        logger.log_tabular('DeltaLossPi', average_only=True)
        logger.log_tabular('DeltaLossV', average_only=True)
        logger.log_tabular('Entropy', average_only=True)
        logger.log_tabular('KL', average_only=True)
        logger.log_tabular('Time', time.time() - start_time)
        logger.dump_tabular()
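# VPGBuffer is not shown in this file, but the gamma and lam arguments only
# matter through the advantage computation it performs when finish_path is
# called. The following is a minimal sketch, assuming a Spinning-Up-style
# buffer, of how GAE-Lambda advantages and rewards-to-go are typically derived
# for one finished trajectory; discount_cumsum and finish_path_example are
# illustrative helpers, not the actual buffer code.
import numpy as np
import scipy.signal


def discount_cumsum(x, discount):
    # y[t] = x[t] + discount * x[t+1] + discount^2 * x[t+2] + ...
    return scipy.signal.lfilter([1], [1, float(-discount)], x[::-1], axis=0)[::-1]


def finish_path_example(rews, vals, gamma=0.99, lam=0.97, last_val=0.0):
    # last_val is 0 for a true terminal state, or V(s_T) when the trajectory
    # was cut off and the value target has to be bootstrapped.
    rews = np.append(rews, last_val)
    vals = np.append(vals, last_val)
    # TD residuals: delta_t = r_t + gamma * V(s_{t+1}) - V(s_t)
    deltas = rews[:-1] + gamma * vals[1:] - vals[:-1]
    adv = discount_cumsum(deltas, gamma * lam)   # GAE-Lambda advantages
    ret = discount_cumsum(rews, gamma)[:-1]      # rewards-to-go (value targets)
    return adv, ret


adv, ret = finish_path_example(rews=[1.0, 0.0, 1.0], vals=[0.5, 0.4, 0.6])
print(adv, ret)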