def add_set_get_pi_params(self, pi_name='policy_network'):
    """Create flat get/set symbols for the policy parameters.

    Collects the variables under ``pi_name``, builds a flattened gradient
    of ``self.loss`` w.r.t. them, and exposes:

    - ``self.get_pi_params``: flat concatenation of the current params,
    - ``self.set_pi_params``: op assigning params from ``self.v_ph``,
      a placeholder shaped like the flat gradient.

    Args:
        pi_name: variable scope holding the policy network's variables.
    """
    params = core.get_vars(scope=pi_name)
    print(params)
    grads = core.flat_grad(self.loss, params)
    print(grads)

    self.pi_params = params
    self.gradient = grads
    # Placeholder sized to the flattened parameter vector, fed when
    # overwriting the policy weights from the outside.
    self.v_ph = tf.placeholder(tf.float32, shape=grads.shape)

    # Symbols for reading and writing the parameters as one flat vector.
    self.get_pi_params = core.flat_concat(params)
    self.set_pi_params = core.assign_params_from_flat(self.v_ph, params)
def __init__(self):
    """Build a DDPG actor-critic graph with target networks and start a
    TF session.

    Hyper-parameters are hard-coded for a 33-dim state / 4-dim action
    task (presumably the Unity Reacher environment — TODO confirm).
    """
    self.sess = tf.Session()
    # Replay buffer capped at 1e5 transitions.
    self.memory = replay_buffer(max_length=1e5)
    self.tau = 0.995          # polyak coefficient for target updates
    self.gamma = 0.99         # discount factor
    self.state_size = 33
    self.output_size = 4
    self.action_limit = 1.0   # tanh output scaled to [-1, 1]
    self.hidden = [400, 300]  # hidden layer widths for actor and critic
    self.batch_size = 100
    self.pi_lr = 1e-4         # actor learning rate
    self.q_lr = 1e-4          # critic learning rate
    # Ornstein-Uhlenbeck exploration noise over the action dimensions.
    self.noise = OU_noise(self.output_size, 1)

    # Placeholders: state, action, next state, reward, done flag.
    self.x_ph, self.a_ph, self.x2_ph, self.r_ph, self.d_ph = \
        cr.placeholders(self.state_size, self.output_size,
                        self.state_size, None, None)

    # Online actor/critic networks.
    with tf.variable_scope('main'):
        self.pi, self.q, self.q_pi = cr.mlp_actor_critic(
            self.x_ph,
            self.a_ph,
            self.hidden,
            activation=tf.nn.relu,
            output_activation=tf.tanh,
            output_size=self.output_size,
            action_limit=self.action_limit)
    # Target networks, evaluated on the *next* state for the backup.
    with tf.variable_scope('target'):
        self.pi_targ, _, self.q_pi_targ = cr.mlp_actor_critic(
            self.x2_ph,
            self.a_ph,
            self.hidden,
            activation=tf.nn.relu,
            output_activation=tf.tanh,
            output_size=self.output_size,
            action_limit=self.action_limit)

    # Bellman backup r + gamma * (1 - d) * Q'(s', pi'(s')); stop_gradient
    # keeps the target out of the critic's gradient path.
    self.target = tf.stop_gradient(self.r_ph + self.gamma *
                                   (1 - self.d_ph) * self.q_pi_targ)
    # Actor maximizes Q(s, pi(s)); critic minimizes 0.5 * MSE to the backup.
    self.pi_loss = -tf.reduce_mean(self.q_pi)
    self.v_loss = tf.reduce_mean((self.q - self.target)**2) * 0.5

    self.pi_optimizer = tf.train.AdamOptimizer(self.pi_lr)
    self.v_optimizer = tf.train.AdamOptimizer(self.q_lr)
    self.pi_train = self.pi_optimizer.minimize(
        self.pi_loss, var_list=cr.get_vars('main/pi'))
    self.v_train = self.v_optimizer.minimize(
        self.v_loss, var_list=cr.get_vars('main/q'))

    # Polyak averaging: slowly track the online weights.
    self.target_update = tf.group([
        tf.assign(v_targ, self.tau * v_targ + (1 - self.tau) * v_main)
        for v_main, v_targ in zip(cr.get_vars('main'),
                                  cr.get_vars('target'))
    ])
    # One-time hard copy so both networks start identical.
    self.target_init = tf.group([
        tf.assign(v_targ, v_main) for v_main, v_targ in zip(
            cr.get_vars('main'), cr.get_vars('target'))
    ])

    self.sess.run(tf.global_variables_initializer())
    self.sess.run(self.target_init)
def add_optimizer_op(self):
    """Create the training op that minimizes ``self.loss``.

    When ``self.config.use_sgd`` is set, builds plain gradient descent
    whose learning rate is fed through ``self.sgd_lr_placeholder`` so it
    can be decayed manually (the running rate is kept in ``self.sgd_lr``).
    Otherwise builds a fixed-rate Adam op in ``self.train_op``.
    """
    # The branch matters: do not create 2 train ops on the same loss,
    # otherwise it messes up the TF graph.
    if self.config.use_sgd:  # fixed: truthiness instead of `is True`
        # Learning rate is fed at run time to allow manual scheduling.
        self.sgd_lr_placeholder = tf.placeholder(dtype=tf.float32)
        sgd_optimizer = tf.train.GradientDescentOptimizer(
            learning_rate=self.sgd_lr_placeholder)
        # Restrict updates to the policy network's variables.
        self.sgd_train_op = sgd_optimizer.minimize(
            self.loss, var_list=core.get_vars(scope='policy_network'))
        print("LR={}".format(self.lr))
        self.sgd_lr = copy.copy(self.lr)  # starting learning rate
    else:
        self.train_op = tf.train.AdamOptimizer(self.lr).minimize(self.loss)
def __init__(self):
    """Build a distributional (quantile-based) deterministic actor-critic
    graph with target networks and start a TF session.

    The critic outputs ``support_size`` quantile estimates per (s, a);
    the critic loss is the quantile-weighted Huber loss computed below.
    Environment dimensions come from the module-level ``env_set`` dict.
    """
    self.sess = tf.Session()
    self.state_size = env_set['state']
    self.output_size = env_set['action']
    self.worker_size = env_set['worker']
    self.support_size = 8              # number of quantile atoms
    self.target_update_tau = 0.995     # polyak coefficient
    self.gamma = 0.99                  # discount factor
    self.hidden = env_set['hidden']
    self.batch_size = 64
    self.pi_lr = 1e-4
    self.q_lr = 1e-3
    self.action_limit = 1.0
    self.memory = replay_buffer(env_set['mem_size'])
    self.target_noise = 0.2            # smoothing noise on the target policy
    self.noise_clip = 0.1

    # Placeholders: state, action, quantile fractions tau, next state,
    # reward, done flag.
    self.x_ph, self.a_ph, self.tau_ph, self.x2_ph, self.r_ph, self.d_ph = \
        cr.placeholders(self.state_size, self.output_size,
                        self.support_size, self.state_size, None, None)

    # Online networks.
    with tf.variable_scope('main'):
        self.pi, self.q, self.q_pi = cr.dipg_mlp_actor_critic(
            x=self.x_ph,
            a=self.a_ph,
            tau=self.tau_ph,
            hidden=self.hidden,
            activation=tf.nn.relu,
            output_activation=tf.tanh,
            output_size=self.output_size,
            action_limit=self.action_limit)
    # Target networks; pi_q_noise adds smoothing noise to the target
    # policy's action inside the critic evaluation.
    with tf.variable_scope('target'):
        _, _, self.q_pi_targ = cr.dipg_mlp_actor_critic(
            x=self.x2_ph,
            a=self.a_ph,
            tau=self.tau_ph,
            hidden=self.hidden,
            activation=tf.nn.relu,
            output_activation=tf.tanh,
            output_size=self.output_size,
            action_limit=self.action_limit,
            pi_q_noise=self.target_noise)

    self.pi_params = cr.get_vars('main/pi')
    self.q_params = cr.get_vars('main/q')

    # Distributional Bellman backup: broadcast scalar r and (1 - d)
    # across the support dimension, shape (batch, support_size).
    self.backup = tf.stop_gradient(
        tf.tile(tf.expand_dims(self.r_ph, axis=1), [1, self.support_size])
        + self.gamma * tf.tile(tf.expand_dims(1 - self.d_ph, axis=1),
                               [1, self.support_size]) * self.q_pi_targ)

    # NOTE(review): the outer reduce_mean is a no-op — the inner
    # reduce_mean already yields a scalar.
    self.pi_loss = -tf.reduce_mean(tf.reduce_mean(self.q_pi))

    # NOTE(review): clip_tau is assigned but not used in this method.
    self.clip_tau = 5e-2

    # Pairwise quantile loss: tile predictions and targets so every
    # predicted quantile is compared against every target quantile,
    # shapes (batch, support, support).
    theta_loss_tile = tf.tile(tf.expand_dims(self.q, axis=2),
                              [1, 1, self.support_size])
    logit_valid_tile = tf.tile(tf.expand_dims(self.backup, axis=1),
                               [1, self.support_size, 1])
    Huber_loss = tf.losses.huber_loss(logit_valid_tile, theta_loss_tile,
                                      reduction=tf.losses.Reduction.NONE)
    tau = tf.tile(tf.expand_dims(self.tau_ph, axis=2),
                  [1, 1, self.support_size])
    bellman_errors = logit_valid_tile - theta_loss_tile
    # Quantile-regression weighting: |tau - 1{error < 0}| * Huber.
    Loss = (
        tf.abs(tau - tf.stop_gradient(tf.to_float(bellman_errors < 0))) *
        Huber_loss)
    self.v_loss = tf.reduce_mean(
        tf.reduce_sum(tf.reduce_mean(Loss, axis=1)))

    # Actor gradients are averaged over the support dimension by
    # dividing each gradient by support_size before applying.
    self.pi_optimizer = tf.train.AdamOptimizer(self.pi_lr)
    grad = self.pi_optimizer.compute_gradients(self.pi_loss,
                                               var_list=self.pi_params)
    grad = [(gr / self.support_size, var) for gr, var in grad]
    self.train_pi_op = self.pi_optimizer.apply_gradients(grad)

    self.value_optimizer = tf.train.AdamOptimizer(self.q_lr)
    # Control dependencies force a deterministic pi -> value -> target
    # update order when all ops run in one sess.run call.
    with tf.control_dependencies([self.train_pi_op]):
        self.train_value_op = self.value_optimizer.minimize(
            self.v_loss, var_list=self.q_params)
    with tf.control_dependencies([self.train_value_op]):
        self.target_update = tf.group([
            tf.assign(
                v_targ,
                self.target_update_tau * v_targ +
                (1 - self.target_update_tau) * v_main)
            for v_main, v_targ in zip(cr.get_vars('main'),
                                      cr.get_vars('target'))
        ])

    # Ops executed together on every training step.
    self.step_ops = [
        self.pi_loss, self.v_loss, self.train_pi_op, self.train_value_op,
        self.target_update
    ]
    # One-time hard copy so target starts equal to main.
    self.target_init = tf.group([
        tf.assign(v_targ, v_main) for v_main, v_targ in zip(
            cr.get_vars('main'), cr.get_vars('target'))
    ])

    self.sess.run(tf.global_variables_initializer())
    self.sess.run(self.target_init)
def ddpg(env_config, ac_type, ac_kwargs, rb_type, rb_kwargs, gamma, lr,
         polyak, batch_size, epochs, start_steps, steps_per_epoch, inc_ep,
         max_ep_len, test_max_ep_len, number_of_tests_per_epoch, act_noise,
         logger_kwargs, seed):
    """Train a DDPG agent on an environment built by ``make_env``.

    The policy outputs normalized actions in [0, 1]; the environment is
    stepped with ``pi_a * act_high``. Updates happen at the end of each
    episode (one update per environment step taken), and per-epoch test
    rollouts, action plots, and the best-so-far test actions are logged.

    Args:
        env_config: config passed to ``make_env`` for train and test envs.
        ac_type / ac_kwargs: actor-critic constructor selector and kwargs.
        rb_type / rb_kwargs: replay buffer selector and kwargs.
        gamma: discount factor.
        lr: learning rate shared by actor and critic Adam optimizers.
        polyak: target-network averaging coefficient.
        batch_size: minibatch size per update.
        epochs / steps_per_epoch: training length and epoch granularity.
        start_steps: steps of uniform-random exploration before the policy
            is used.
        inc_ep: amount max_ep_len grows after every epoch.
        max_ep_len / test_max_ep_len: episode caps for train / test.
        number_of_tests_per_epoch: deterministic test rollouts per epoch.
        act_noise: stddev of Gaussian exploration noise on pi_a.
        logger_kwargs: EpochLogger kwargs.
        seed: RNG seed for TF and NumPy.
    """
    logger = EpochLogger(**logger_kwargs)
    configs = locals().copy()
    configs.pop("logger")  # logger itself is not serializable config
    logger.save_config(configs)

    tf.set_random_seed(seed)
    np.random.seed(seed)

    env, test_env = make_env(env_config), make_env(env_config)
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]

    # Action limit for clamping: critically, assumes all dimensions share
    # the same bound!
    act_high = env.action_space.high

    # Inputs to computation graph
    x_ph, a_ph, x2_ph, r_ph, d_ph = core.placeholders(
        obs_dim, act_dim, obs_dim, None, None)

    actor_critic = core.get_ddpg_actor_critic(ac_type)
    # Main outputs from computation graph
    with tf.variable_scope('main'):
        pi, q, q_pi = actor_critic(x_ph, a_ph, **ac_kwargs)

    # Target networks (evaluated on the next state for the backup)
    with tf.variable_scope('target'):
        pi_targ, _, q_pi_targ = actor_critic(x2_ph, a_ph, **ac_kwargs)

    # Experience buffer
    RB = get_replay_buffer(rb_type)
    replay_buffer = RB(obs_dim, act_dim, **rb_kwargs)

    # Count variables
    var_counts = tuple(
        core.count_vars(scope) for scope in ['main/pi', 'main/q', 'main'])
    print('\nNumber of parameters: \t pi: %d, \t q: %d, \t total: %d\n' %
          var_counts)

    # Bellman backup for Q function
    backup = tf.stop_gradient(r_ph + gamma * (1 - d_ph) * q_pi_targ)

    # DDPG losses
    pi_loss = -tf.reduce_mean(q_pi)
    q_loss = tf.reduce_mean((q - backup)**2)

    # Separate train ops for pi, q
    pi_optimizer = tf.train.AdamOptimizer(learning_rate=lr)
    q_optimizer = tf.train.AdamOptimizer(learning_rate=lr)
    train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi'))
    train_q_op = q_optimizer.minimize(q_loss, var_list=get_vars('main/q'))

    # Polyak averaging for target variables
    target_update = tf.group([
        tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main)
        for v_main, v_targ in zip(get_vars('main'), get_vars('target'))
    ])

    # Initializing targets to match main variables
    target_init = tf.group([
        tf.assign(v_targ, v_main)
        for v_main, v_targ in zip(get_vars('main'), get_vars('target'))
    ])

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    sess.run(target_init)

    def get_action(o, noise_scale):
        # Normalized policy action in [0, 1]; env consumes pi_a * act_high.
        pi_a = sess.run(pi, feed_dict={x_ph: o.reshape(1, -1)})[0]
        pi_a += noise_scale * np.random.randn(act_dim)
        pi_a = np.clip(pi_a, 0, 1)
        real_a = pi_a * act_high
        return pi_a, real_a

    def test_agent(n=10):
        # Deterministic rollouts; returns the per-episode action traces
        # for plotting.
        test_actions = []
        for j in range(n):
            test_actions_ep = []
            o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0
            while not (d or (ep_len == test_max_ep_len)):
                # Take deterministic actions at test time (noise_scale=0)
                _, real_a = get_action(o, 0)
                test_actions_ep.append(real_a)
                o, r, d, _ = test_env.step(real_a)
                ep_ret += r
                ep_len += 1
            logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)
            test_actions.append(test_actions_ep)
        return test_actions

    start_time = time.time()
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
    total_steps = steps_per_epoch * epochs

    actions = []
    epoch_actions = []
    rewards = []
    rets = []
    test_rets = []
    max_ret = None

    # Main loop: collect experience in env and update/log each epoch
    for t in range(total_steps):
        # Until start_steps have elapsed, randomly sample actions from a
        # uniform distribution for better exploration. Afterwards, use the
        # learned policy (with some noise, via act_noise).
        if t > start_steps:
            pi_a, real_a = get_action(o, act_noise)
        else:
            # BUG FIX: sample() returns a single action array — the
            # original tried to unpack it into two names. Sample the real
            # action and derive the normalized one, mirroring get_action.
            real_a = env.action_space.sample()
            pi_a = real_a / act_high

        # Step the env
        o2, r, d, _ = env.step(real_a)
        ep_ret += r
        ep_len += 1
        epoch_actions.append(pi_a)

        # Ignore the "done" signal if it comes from hitting the time
        # horizon (that is, when it's an artificial terminal signal
        # that isn't based on the agent's state)
        d = False if ep_len == max_ep_len else d

        # Store experience to replay buffer
        replay_buffer.store(o, pi_a, r, o2, d)

        # Super critical, easy to overlook step: make sure to update
        # most recent observation!
        o = o2

        if d or (ep_len == max_ep_len):
            # Perform all DDPG updates at the end of the trajectory, in
            # accordance with tuning done by TD3 paper authors.
            for _ in range(ep_len):
                batch = replay_buffer.sample_batch(batch_size)
                feed_dict = {
                    x_ph: batch['obs1'],
                    x2_ph: batch['obs2'],
                    a_ph: batch['acts'],
                    r_ph: batch['rews'],
                    d_ph: batch['done']
                }
                # Q-learning update
                outs = sess.run([q_loss, q, train_q_op], feed_dict)
                logger.store(LossQ=outs[0], QVals=outs[1])
                # Policy update
                outs = sess.run([pi_loss, train_pi_op, target_update],
                                feed_dict)
                logger.store(LossPi=outs[0])
            logger.store(EpRet=ep_ret, EpLen=ep_len)
            actions.append(np.mean(epoch_actions))
            epoch_actions = []
            rewards.append(ep_ret)
            o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

        # End of epoch wrap-up
        if (t + 1) % steps_per_epoch == 0:
            epoch = (t + 1) // steps_per_epoch

            # Test the performance of the deterministic version of the agent.
            test_actions = test_agent(number_of_tests_per_epoch)

            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            ret = logger.log_tabular('EpRet', average_only=True)
            test_ret = logger.log_tabular('TestEpRet', average_only=True)[0]
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('TestEpLen', average_only=True)
            logger.log_tabular('QVals', average_only=True)
            logger.log_tabular('LossPi', average_only=True)
            logger.log_tabular('LossQ', average_only=True)
            logger.log_tabular('Time', time.time() - start_time)
            logger.dump_tabular()

            rets.append(ret)
            test_rets.append(test_ret)

            # Track the best deterministic test return seen so far.
            if max_ret is None or test_ret > max_ret:
                max_ret = test_ret
                best_test_actions = test_actions

            # Episodes are allowed to grow longer as training progresses.
            max_ep_len += inc_ep
            util.plot_actions(test_actions, act_high,
                              logger.output_dir + '/actions%s.png' % epoch)
            logger.save_state(
                {
                    "actions": actions,
                    "rewards": rewards,
                    "best_test_actions": best_test_actions,
                    "rets": rets,
                    "test_rets": test_rets,
                    "max_ret": max_ret
                }, None)
            util.plot_actions(best_test_actions, act_high,
                              logger.output_dir + '/best_test_actions.png')
            logger.log("max ret: %f" % max_ret)
def __init__(self):
    """Build a TD3 (twin delayed DDPG) graph with target networks and
    start a TF session.

    Uses clipped double-Q learning: the backup takes the minimum of the
    two target critics, evaluated at a smoothed target-policy action.
    Dimensions are hard-coded for a 33-dim state / 4-dim action task.
    """
    self.sess = tf.Session()
    self.state_size = 33
    self.output_size = 4
    self.tau = 0.995             # polyak coefficient for target updates
    self.gamma = 0.99            # discount factor
    self.hidden = [400, 300]
    self.batch_size = 64
    self.pi_lr = 1e-3
    self.q_lr = 1e-3
    self.action_limit = 1.0
    self.memory = replay_buffer(1e5)
    self.target_noise = 0.2      # stddev of target policy smoothing noise
    self.noise = OU_noise(self.output_size, 1)
    self.noise_clip = 0.1        # smoothing noise is clipped to this range

    # Placeholders: state, action, next state, reward, done flag.
    self.x_ph, self.a_ph, self.x2_ph, self.r_ph, self.d_ph = \
        cr.placeholders(self.state_size, self.output_size,
                        self.state_size, None, None)

    # Online networks: policy, twin critics, and Q1(s, pi(s)).
    with tf.variable_scope('main'):
        self.pi, self.q1, self.q2, self.q1_pi = cr.td3_mlp_actor_critic(
            x=self.x_ph,
            a=self.a_ph,
            hidden=self.hidden,
            activation=tf.nn.relu,
            output_activation=tf.tanh,
            output_size=self.output_size,
            action_limit=self.action_limit)
    # Target policy on the next state.
    with tf.variable_scope('target'):
        self.pi_targ, _, _, _ = cr.td3_mlp_actor_critic(
            x=self.x2_ph,
            a=self.a_ph,
            hidden=self.hidden,
            activation=tf.nn.relu,
            output_activation=tf.tanh,
            output_size=self.output_size,
            action_limit=self.action_limit)
    # Target policy smoothing: add clipped Gaussian noise to the target
    # action, then re-clip to the action bounds; evaluate target critics
    # at that smoothed action (scope reused so weights are shared).
    with tf.variable_scope('target', reuse=True):
        self.eps = tf.random_normal(tf.shape(self.pi_targ),
                                    stddev=self.target_noise)
        self.epsilon = tf.clip_by_value(self.eps, -self.noise_clip,
                                        self.noise_clip)
        self.a_prev = self.pi_targ + self.epsilon
        self.a2 = tf.clip_by_value(self.a_prev, -self.action_limit,
                                   self.action_limit)
        _, self.q1_targ, self.q2_targ, self.q1_pi_targ = \
            cr.td3_mlp_actor_critic(
                x=self.x2_ph,
                a=self.a2,
                hidden=self.hidden,
                activation=tf.nn.relu,
                output_activation=tf.tanh,
                output_size=self.output_size,
                action_limit=self.action_limit)

    self.pi_params = cr.get_vars('main/pi')
    self.q_params = cr.get_vars('main/q')

    # Clipped double-Q backup: min of the twin target critics.
    self.min_q_targ = tf.minimum(self.q1_targ, self.q2_targ)
    self.backup = tf.stop_gradient(
        self.r_ph + self.gamma * (1 - self.d_ph) * self.min_q_targ)

    # Actor maximizes Q1(s, pi(s)); critics regress onto the backup.
    self.pi_loss = -tf.reduce_mean(self.q1_pi)
    self.q1_loss = tf.reduce_mean((self.q1 - self.backup)**2)
    self.q2_loss = tf.reduce_mean((self.q2 - self.backup)**2)
    self.v_loss = self.q1_loss + self.q2_loss

    self.pi_optimizer = tf.train.AdamOptimizer(self.pi_lr)
    self.q_optimizer = tf.train.AdamOptimizer(self.q_lr)
    self.pi_train = self.pi_optimizer.minimize(self.pi_loss,
                                               var_list=self.pi_params)
    # BUG FIX: the critic loss was previously minimized with
    # self.pi_optimizer, leaving self.q_optimizer (and q_lr) unused.
    self.v_train = self.q_optimizer.minimize(self.v_loss,
                                             var_list=self.q_params)

    # Polyak averaging: slowly track the online weights.
    self.target_update = tf.group([
        tf.assign(v_targ, self.tau * v_targ + (1 - self.tau) * v_main)
        for v_main, v_targ in zip(cr.get_vars('main'),
                                  cr.get_vars('target'))
    ])
    # One-time hard copy so both networks start identical.
    self.target_init = tf.group([
        tf.assign(v_targ, v_main) for v_main, v_targ in zip(
            cr.get_vars('main'), cr.get_vars('target'))
    ])

    self.sess.run(tf.global_variables_initializer())
    self.sess.run(self.target_init)
def __init__(self, opt, job):
    """Build a SAC learner/worker graph for distributed (Ray) training.

    Supports a fixed entropy temperature (``opt.alpha`` a number) or an
    automatically tuned one (``opt.alpha == 'auto'``), in which case a
    learnable ``log_alpha`` is trained toward a target entropy.

    Args:
        opt: options object (seed, dims, lr, gamma, polyak, alpha,
            gpu_fraction, ac_kwargs, action_space, ...).
        job: "learner" gets a GPU-enabled session; anything else gets a
            CPU-only, single-threaded session.
    """
    self.opt = opt
    with tf.Graph().as_default():
        tf.set_random_seed(opt.seed)
        np.random.seed(opt.seed)

        # Inputs to computation graph: obs, action, next obs, reward, done.
        self.x_ph, self.a_ph, self.x2_ph, self.r_ph, self.d_ph = \
            core.placeholders(opt.obs_dim, opt.act_dim, opt.obs_dim,
                              None, None)

        # Main outputs from computation graph.
        with tf.variable_scope('main'):
            self.mu, self.pi, logp_pi, logp_pi2, q1, q2, q1_pi, q2_pi = \
                actor_critic(self.x_ph, self.x2_ph, self.a_ph,
                             action_space=opt.ac_kwargs["action_space"])

        # Target network, evaluated on the next state.
        with tf.variable_scope('target'):
            _, _, logp_pi_, _, _, _, q1_pi_, q2_pi_ = \
                actor_critic(self.x2_ph, self.x2_ph, self.a_ph,
                             action_space=opt.ac_kwargs["action_space"])

        # Count variables
        var_counts = tuple(
            core.count_vars(scope)
            for scope in ['main/pi', 'main/q1', 'main/q2', 'main'])
        print(('\nNumber of parameters: \t pi: %d, \t' +
               'q1: %d, \t q2: %d, \t total: %d\n') % var_counts)

        # Entropy temperature: learned when 'auto', otherwise fixed.
        if opt.alpha == 'auto':
            target_entropy = (-np.prod(opt.action_space.shape))

            log_alpha = tf.get_variable('log_alpha',
                                        dtype=tf.float32,
                                        initializer=0.0)
            alpha = tf.exp(log_alpha)

            alpha_loss = tf.reduce_mean(
                -log_alpha * tf.stop_gradient(logp_pi + target_entropy))

            alpha_optimizer = tf.train.AdamOptimizer(
                learning_rate=opt.lr, name='alpha_optimizer')
            train_alpha_op = alpha_optimizer.minimize(loss=alpha_loss,
                                                      var_list=[log_alpha])
            # BUG FIX: the losses below previously used the string
            # opt.alpha ('auto') directly, which cannot multiply a tensor.
            alpha_coef = alpha
        else:
            alpha_coef = opt.alpha

        # Min Double-Q (from the target critics):
        min_q_pi = tf.minimum(q1_pi_, q2_pi_)

        # Targets for Q regression (soft value backup).
        v_backup = tf.stop_gradient(min_q_pi - alpha_coef * logp_pi2)
        q_backup = self.r_ph + opt.gamma * (1 - self.d_ph) * v_backup

        # Soft actor-critic losses.
        pi_loss = tf.reduce_mean(alpha_coef * logp_pi - q1_pi)
        q1_loss = 0.5 * tf.reduce_mean((q_backup - q1)**2)
        q2_loss = 0.5 * tf.reduce_mean((q_backup - q2)**2)
        self.value_loss = q1_loss + q2_loss

        # Policy train op
        # (has to be separate from value train op, because q1_pi appears
        # in pi_loss)
        pi_optimizer = tf.train.AdamOptimizer(learning_rate=opt.lr)
        train_pi_op = pi_optimizer.minimize(pi_loss,
                                            var_list=get_vars('main/pi'))

        # Value train op
        # (control dep of train_pi_op because sess.run otherwise
        # evaluates in nondeterministic order)
        value_optimizer = tf.train.AdamOptimizer(learning_rate=opt.lr)
        value_params = get_vars('main/q')
        with tf.control_dependencies([train_pi_op]):
            train_value_op = value_optimizer.minimize(
                self.value_loss, var_list=value_params)

        # Polyak averaging for target variables
        # (control flow because sess.run otherwise evaluates in
        # nondeterministic order)
        with tf.control_dependencies([train_value_op]):
            self.target_update = tf.group([
                tf.assign(v_targ,
                          opt.polyak * v_targ + (1 - opt.polyak) * v_main)
                for v_main, v_targ in zip(get_vars('main'),
                                          get_vars('target'))
            ])

        # All ops to call during one training step; the auto-alpha case
        # additionally fetches the current alpha tensor and trains it.
        if isinstance(opt.alpha, Number):
            self.step_ops = [
                pi_loss, q1_loss, q2_loss, q1, q2, logp_pi,
                tf.identity(opt.alpha), train_pi_op, train_value_op,
                self.target_update
            ]
        else:
            self.step_ops = [
                pi_loss, q1_loss, q2_loss, q1, q2, logp_pi, alpha_coef,
                train_pi_op, train_value_op, self.target_update,
                train_alpha_op
            ]

        # Initializing targets to match main variables
        self.target_init = tf.group([
            tf.assign(v_targ, v_main)
            for v_main, v_targ in zip(get_vars('main'), get_vars('target'))
        ])

        if job == "learner":
            config = tf.ConfigProto()
            config.gpu_options.per_process_gpu_memory_fraction = \
                opt.gpu_fraction
            config.inter_op_parallelism_threads = 1
            config.intra_op_parallelism_threads = 1
            self.sess = tf.Session(config=config)
        else:
            # Workers run CPU-only and single-threaded.
            self.sess = tf.Session(
                config=tf.ConfigProto(device_count={'GPU': 0},
                                      intra_op_parallelism_threads=1,
                                      inter_op_parallelism_threads=1))

        self.sess.run(tf.global_variables_initializer())

        # Flat weight accessor used by Ray to ship parameters around.
        self.variables = ray.experimental.tf_utils.TensorFlowVariables(
            self.value_loss, self.sess)
def __init__(self, opt, job):
    """Build a Double-DQN graph for distributed (Ray) training.

    Args:
        opt: options object (seed, obs_dim, act_dim, hidden_size, lr,
            gamma, polyak, gpu_fraction, summary_dir, ...).
        job: "learner" gets a GPU session plus TensorBoard summaries;
            anything else gets a single-threaded session.
    """
    self.opt = opt
    with tf.Graph().as_default():
        tf.set_random_seed(opt.seed)
        np.random.seed(opt.seed)

        # Inputs to computation graph: obs, action index (scalar per
        # sample), next obs, reward, done flag.
        self.x_ph, self.a_ph, self.x2_ph, self.r_ph, self.d_ph = \
            core.placeholders(opt.obs_dim, None, opt.obs_dim, None, None)

        # Main outputs from computation graph: Q(s, .) and Q(s', .) from
        # the online network.
        with tf.variable_scope('main'):
            self.q, self.q_x2 = core.q_function(self.x_ph, self.x2_ph,
                                                opt.hidden_size,
                                                opt.act_dim)

        # Target network evaluated on the next state.
        with tf.variable_scope('target'):
            self.q_next, _ = core.q_function(self.x2_ph, self.x2_ph,
                                             opt.hidden_size, opt.act_dim)

        # Count variables
        var_counts = tuple(core.count_vars(scope) for scope in ['main'])
        print('\nNumber of parameters: total: %d\n' % var_counts)

        # Q-value of the action actually taken, via one-hot masking.
        a_one_hot = tf.one_hot(tf.cast(self.a_ph, tf.int32),
                               depth=opt.act_dim)
        q_value = tf.reduce_sum(self.q * a_one_hot, axis=1)

        # DDQN: online net selects the argmax action on s', the target
        # net evaluates it.
        online_q_x2_a_one_hot = tf.one_hot(tf.argmax(self.q_x2, axis=1),
                                           depth=opt.act_dim)
        q_target = tf.reduce_sum(self.q_next * online_q_x2_a_one_hot,
                                 axis=1)

        # DQN (plain max over the target net, kept for reference):
        # q_target = tf.reduce_max(self.q_next, axis=1)

        # Bellman backup; stop_gradient keeps the target fixed.
        q_backup = tf.stop_gradient(self.r_ph + opt.gamma *
                                    (1 - self.d_ph) * q_target)

        # q losses
        q_loss = 0.5 * tf.reduce_mean((q_backup - q_value) ** 2)

        # Value train op
        value_optimizer = tf.train.AdamOptimizer(learning_rate=opt.lr)
        value_params = get_vars('main/q')
        train_value_op = value_optimizer.minimize(q_loss,
                                                  var_list=value_params)

        # Polyak averaging for target variables
        # (control flow because sess.run otherwise evaluates in
        # nondeterministic order)
        with tf.control_dependencies([train_value_op]):
            target_update = tf.group([
                tf.assign(v_targ,
                          opt.polyak * v_targ + (1 - opt.polyak) * v_main)
                for v_main, v_targ in zip(get_vars('main'),
                                          get_vars('target'))])

        # All ops to call during one training step
        self.step_ops = [q_loss, self.q, train_value_op, target_update]

        # Initializing targets to match main variables
        self.target_init = tf.group([
            tf.assign(v_targ, v_main)
            for v_main, v_targ in zip(get_vars('main'),
                                      get_vars('target'))])

        if job == "learner":
            config = tf.ConfigProto()
            config.gpu_options.per_process_gpu_memory_fraction = \
                opt.gpu_fraction
            config.inter_op_parallelism_threads = 1
            config.intra_op_parallelism_threads = 1
            self.sess = tf.Session(config=config)
        else:
            self.sess = tf.Session(
                config=tf.ConfigProto(
                    # device_count={'GPU': 0},
                    intra_op_parallelism_threads=1,
                    inter_op_parallelism_threads=1))

        self.sess.run(tf.global_variables_initializer())

        if job == "learner":
            # Set up summary Ops for TensorBoard.
            self.train_ops, self.train_vars = self.build_summaries()
            self.writer = tf.summary.FileWriter(
                opt.summary_dir + "/" + "^^^^^^^^^^" +
                str(datetime.datetime.now()) + opt.env_name + "-" +
                opt.exp_name + "-workers_num:" + str(opt.num_workers) +
                "%" + str(opt.a_l_ratio), self.sess.graph)

        # Flat weight accessor used by Ray to ship parameters around.
        self.variables = ray.experimental.tf_utils.TensorFlowVariables(
            q_loss, self.sess)
def asac(env_fn,
         actor_critic=core.mlp_actor_critic,
         ac_kwargs=dict(),
         seed=0,
         steps_per_epoch=5000,
         epochs=200,
         replay_size=int(1e6),
         gamma=0.99,
         polyak=0.995,
         lr=5e-4,
         alpha_start=0.2,
         batch_size=100,
         start_steps=10000,
         max_ep_len=1000,
         logger_kwargs=dict(),
         save_freq=1,
         loss_threshold=0.0001,
         delta=0.02,
         sample_step=2000):
    """Soft actor-critic with an adaptively tuned entropy temperature.

    In addition to the usual SAC networks (pi, q1, q2, v) this variant
    learns an on-policy action-value ``Q`` and return estimate ``R``;
    their difference (an advantage) drives the ``Alpha`` object that
    adjusts the entropy coefficient whenever the policy loss plateaus.

    Args:
        env_fn: zero-arg callable creating the environment.
        actor_critic: graph constructor returning
            (mu, pi, logp_pi, q1, q2, q1_pi, q2_pi, v, Q, Q_pi, R).
        ac_kwargs: extra kwargs for actor_critic.
        seed: RNG seed.
        steps_per_epoch / epochs: training length and epoch granularity.
        replay_size: replay buffer capacity.
        gamma / polyak / lr / batch_size / start_steps / max_ep_len /
            logger_kwargs / save_freq: standard SAC hyper-parameters.
        alpha_start / delta: initial temperature and its adjustment step.
        loss_threshold: relative policy-loss improvement below which the
            temperature adaptation is triggered.
        sample_step: number of on-policy states gathered for the
            advantage estimate used to update alpha.
    """
    # Adaptive entropy coefficient; alpha() returns its current value.
    alpha = Alpha(alpha_start=alpha_start, delta=delta)
    alpha_t = alpha()
    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    tf.set_random_seed(seed)
    np.random.seed(seed)

    env, test_env = env_fn(), env_fn()
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]

    # Action limit for clamping: critically, assumes all dimensions
    # share the same bound!
    act_limit = env.action_space.high[0]

    # Share information about action space with policy architecture
    ac_kwargs['action_space'] = env.action_space

    # Inputs to computation graph
    x_ph, a_ph, x2_ph, r_ph, d_ph = core.placeholders(
        obs_dim, act_dim, obs_dim, None, None)
    # Scalar placeholder for the current entropy coefficient.
    alpha_ph = core.scale_holder()

    # Main outputs from computation graph
    with tf.variable_scope('main'):
        mu, pi, logp_pi, q1, q2, q1_pi, q2_pi, v, Q, Q_pi, R = \
            actor_critic(x_ph, a_ph, **ac_kwargs)

    # Target value network (only v and R targets are used).
    with tf.variable_scope('target'):
        _, _, _, _, _, _, _, v_targ, _, _, R_targ = \
            actor_critic(x2_ph, a_ph, **ac_kwargs)

    # Experience buffer
    replay_buffer = ReplayBuffer(obs_dim=obs_dim,
                                 act_dim=act_dim,
                                 size=replay_size)

    # Count variables
    var_counts = tuple(
        core.count_vars(scope) for scope in
        ['main/pi', 'main/q1', 'main/q2', 'main/v', 'main/Q', 'main/R',
         'main'])
    print(('\nNumber of parameters: \t pi: %d, \t' +
           'q1: %d, \t q2: %d, \t v: %d, \t Q: %d, \t R: %d, \t total: %d\n')
          % var_counts)

    # Min Double-Q:
    min_q_pi = tf.minimum(q1_pi, q2_pi)

    # Targets for Q and V regression
    q_backup = tf.stop_gradient(r_ph + gamma * (1 - d_ph) * v_targ)
    v_backup = tf.stop_gradient(min_q_pi - alpha_ph * logp_pi)
    Q_backup = tf.stop_gradient(r_ph + gamma * (1 - d_ph) * R_targ)
    R_backup = tf.stop_gradient(Q_pi)
    # Advantage estimate used for adapting alpha.
    adv = Q_pi - R

    pi_loss = tf.reduce_mean(alpha_ph * logp_pi - q1_pi)
    q1_loss = 0.5 * tf.reduce_mean((q_backup - q1) ** 2)
    q2_loss = 0.5 * tf.reduce_mean((q_backup - q2) ** 2)
    v_loss = 0.5 * tf.reduce_mean((v_backup - v)**2)
    Q_loss = 0.5 * tf.reduce_mean((Q_backup - Q)**2)
    R_loss = 0.5 * tf.reduce_mean((R_backup - R)**2)
    value_loss = q1_loss + q2_loss + v_loss + Q_loss + R_loss

    # Policy train op
    # (has to be separate from value train op, because q1_pi appears in
    # pi_loss)
    pi_optimizer = tf.train.AdamOptimizer(learning_rate=lr)
    train_pi_op = pi_optimizer.minimize(pi_loss,
                                        var_list=get_vars('main/pi'))

    # Value train op
    # (control dep of train_pi_op because sess.run otherwise evaluates
    # in nondeterministic order)
    value_optimizer = tf.train.AdamOptimizer(learning_rate=lr)
    value_params = (get_vars('main/q') + get_vars('main/v') +
                    get_vars('main/Q') + get_vars('main/R'))
    with tf.control_dependencies([train_pi_op]):
        train_value_op = value_optimizer.minimize(value_loss,
                                                  var_list=value_params)

    # Polyak averaging for target variables
    # (control flow because sess.run otherwise evaluates in
    # nondeterministic order)
    with tf.control_dependencies([train_value_op]):
        target_update = tf.group([
            tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main)
            for v_main, v_targ in zip(get_vars('main'),
                                      get_vars('target'))])

    # All ops to call during one training step
    step_ops = [pi_loss, q1_loss, q2_loss, v_loss, q1, q2, v, logp_pi,
                train_pi_op, train_value_op, target_update, R_loss, Q_loss]

    # Initializing targets to match main variables
    target_init = tf.group([
        tf.assign(v_targ, v_main)
        for v_main, v_targ in zip(get_vars('main'), get_vars('target'))])

    config = tf.ConfigProto(inter_op_parallelism_threads=30,
                            intra_op_parallelism_threads=5)
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    sess.run(tf.global_variables_initializer())
    sess.run(target_init)

    # Setup model saving
    logger.setup_tf_saver(sess,
                          inputs={'x': x_ph, 'a': a_ph},
                          outputs={'mu': mu, 'pi': pi, 'q1': q1, 'q2': q2,
                                   'v': v, 'Q': Q, 'R': R})

    def get_action(o, deterministic=False):
        # mu is the deterministic mean action, pi the stochastic sample.
        act_op = mu if deterministic else pi
        return sess.run(act_op, feed_dict={x_ph: o.reshape(1, -1)})

    def test_agent(n=10):
        for j in range(n):
            o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0
            while not (d or (ep_len == max_ep_len)):
                # Take deterministic actions at test time
                o, r, d, _ = test_env.step(get_action(o, True))
                ep_ret += r
                ep_len += 1
            logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)

    start_time = time.time()
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
    ret_est = sess.run(R, feed_dict={x_ph: [o]})[0]
    total_steps = steps_per_epoch * epochs

    counter = 0
    loss_old = 10000

    # Main loop: collect experience in env and update/log each epoch
    for t in range(total_steps):
        # Until start_steps have elapsed, randomly sample actions from a
        # uniform distribution for better exploration. Afterwards, use
        # the learned policy.
        if t > start_steps:
            a = get_action(o)
        else:
            a = env.action_space.sample()

        # Step the env
        o2, r, d, _ = env.step(a)
        ep_ret += r
        ep_len += 1

        # Ignore the "done" signal if it comes from hitting the time
        # horizon (that is, when it's an artificial terminal signal
        # that isn't based on the agent's state)
        d = False if ep_len == max_ep_len else d

        # Store experience to replay buffer
        replay_buffer.store(o, a, r, o2, d)

        # Super critical, easy to overlook step: make sure to update
        # most recent observation!
        o = o2

        if d or (ep_len == max_ep_len):
            # Perform all SAC updates at the end of the trajectory.
            # This is a slight difference from the SAC specified in the
            # original paper.
            for j in range(ep_len):
                batch = replay_buffer.sample_batch(batch_size)
                feed_dict = {x_ph: batch['obs1'],
                             x2_ph: batch['obs2'],
                             a_ph: batch['acts'],
                             r_ph: batch['rews'],
                             d_ph: batch['done'],
                             alpha_ph: alpha_t
                             }
                outs = sess.run(step_ops, feed_dict)
                logger.store(LossPi=outs[0], LossQ1=outs[1],
                             LossQ2=outs[2], LossV=outs[3],
                             Q1Vals=outs[4], Q2Vals=outs[5],
                             VVals=outs[6], LogPi=outs[7],
                             LossR=outs[11])
                counter += 1
            logger.store(EpRet=ep_ret, EpLen=ep_len)
            o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
            ret_est = sess.run(R, feed_dict={x_ph: [o]})[0]
            logger.store(RetEst=ret_est)

            # Adapt alpha once enough updates have accumulated and the
            # policy loss has stopped improving.
            if counter >= 1000:
                loss_new, _ = logger.get_stats('LossPi')
                counter = 0
                if (loss_old - loss_new) / np.absolute(loss_old) < \
                        loss_threshold and t > start_steps:
                    # Collect sample_step on-policy states for the
                    # advantage estimate.
                    rho_s = np.zeros([sample_step, obs_dim],
                                     dtype=np.float32)
                    rho_ptr = 0
                    for sample_t in range(sample_step):
                        a = get_action(o)
                        o2, r, d, _ = env.step(a)
                        ep_len += 1
                        d = False if ep_len == max_ep_len else d
                        rho_s[rho_ptr] = o
                        # BUG FIX: rho_ptr was never advanced, so every
                        # state overwrote row 0 and the advantage was
                        # computed over zero rows.
                        rho_ptr += 1
                        o = o2
                        if d or (ep_len == max_ep_len):
                            o, r, d, ep_ret, ep_len = \
                                env.reset(), 0, False, 0, 0
                    advantages = sess.run(adv, feed_dict={x_ph: rho_s})
                    alpha.update_alpha(advantages)
                    alpha_t = alpha()
                    print(alpha_t)
                    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
                    loss_old = 10000
                else:
                    loss_old = loss_new

        # End of epoch wrap-up
        if t > 0 and t % steps_per_epoch == 0:
            epoch = t // steps_per_epoch

            # Save model
            if (epoch % save_freq == 0) or (epoch == epochs - 1):
                logger.save_state({'env': env}, None)

            # Test the performance of the deterministic version of the
            # agent.
            test_agent()

            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('EntCoeff', alpha_t)
            logger.log_tabular('RetEst', average_only=True)
            logger.log_tabular('EpRet', with_min_and_max=True)
            logger.log_tabular('TestEpRet', with_min_and_max=True)
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('TestEpLen', average_only=True)
            logger.log_tabular('TotalEnvInteracts', t)
            logger.log_tabular('Q1Vals', with_min_and_max=True)
            logger.log_tabular('Q2Vals', with_min_and_max=True)
            logger.log_tabular('VVals', with_min_and_max=True)
            logger.log_tabular('LogPi', with_min_and_max=True)
            logger.log_tabular('LossPi', average_only=True)
            logger.log_tabular('LossQ1', average_only=True)
            logger.log_tabular('LossQ2', average_only=True)
            logger.log_tabular('LossV', average_only=True)
            logger.log_tabular('LossR', average_only=True)
            logger.log_tabular('Time', time.time() - start_time)
            logger.dump_tabular()
def ddpg(env_name,
         actor_critic=core.mlp_actor_critic,
         ac_kwargs=None,
         seed=0,
         steps_per_epoch=5000,
         epochs=100,
         replay_size=int(1e6),
         gamma=0.99,
         polyak=0.995,
         pi_lr=1e-3,
         q_lr=1e-3,
         batch_size=100,
         start_steps=10000,
         act_noise=0.1,
         max_ep_len=1000,
         logger_kwargs=None,
         save_freq=1,
         test=False):
    """Deep Deterministic Policy Gradient (DDPG).

    Trains a deterministic policy and a Q-function on a Gym environment,
    or (with ``test=True``) restores a saved model and runs rendered
    evaluation episodes.

    Args:
        env_name (str): Name of a registered Gym environment. Two copies
            are created with ``gym.make`` (one training, one testing).
        actor_critic: A function which takes in placeholder symbols for
            state, ``x_ph``, and action, ``a_ph``, and returns the main
            outputs from the agent's Tensorflow computation graph:
            ``pi`` (batch, act_dim) deterministic actions from the policy,
            ``q`` (batch,) the estimate of Q* for ``x_ph``/``a_ph``, and
            ``q_pi`` (batch,) the composition q(x, pi(x)).
        ac_kwargs (dict or None): Any kwargs appropriate for the
            actor_critic function you provided to DDPG. ``None`` means
            no extra kwargs.
        seed (int): Seed for random number generators.
        steps_per_epoch (int): Number of steps of interaction
            (state-action pairs) for the agent and the environment in
            each epoch.
        epochs (int): Number of epochs to run and train agent.
        replay_size (int): Maximum length of replay buffer.
        gamma (float): Discount factor. (Always between 0 and 1.)
        polyak (float): Interpolation factor in polyak averaging for
            target networks: theta_targ <- polyak * theta_targ +
            (1 - polyak) * theta. (Usually close to 1.)
        pi_lr (float): Learning rate for policy.
        q_lr (float): Learning rate for Q-networks.
        batch_size (int): Minibatch size for SGD.
        start_steps (int): Number of steps for uniform-random action
            selection, before running real policy. Helps exploration.
        act_noise (float): Stddev for Gaussian exploration noise added to
            policy at training time. (At test time, no noise is added.)
        max_ep_len (int): Maximum length of trajectory / episode / rollout.
        logger_kwargs (dict or None): Keyword args for EpochLogger.
        save_freq (int): How often (in terms of gap between epochs) to
            save the current policy and value function.
        test (bool): When True, skip training; load the saved model and
            run 100 rendered evaluation episodes instead.
    """
    # Avoid the shared-mutable-default pitfall: the previous
    # `ac_kwargs=dict()` default object was mutated below
    # ('action_space' key), leaking state across calls.
    ac_kwargs = dict(ac_kwargs) if ac_kwargs else {}
    logger_kwargs = dict(logger_kwargs) if logger_kwargs else {}

    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    tf.set_random_seed(seed)
    np.random.seed(seed)

    env, test_env = gym.make(env_name), gym.make(env_name)
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]

    # Action limit for clamping: critically, assumes all dimensions
    # share the same bound!
    act_limit = env.action_space.high[0]

    # Share information about action space with policy architecture
    ac_kwargs['action_space'] = env.action_space

    # Inputs to computation graph
    x_ph, a_ph, x2_ph, r_ph, d_ph = core.placeholders(
        obs_dim, act_dim, obs_dim, None, None)

    # Main outputs from computation graph
    with tf.variable_scope('main'):
        pi, q, q_pi = actor_critic(x_ph, a_ph, **ac_kwargs)

    # Target networks. Note that the action placeholder going to
    # actor_critic here is irrelevant, because we only need
    # q_targ(s, pi_targ(s)).
    with tf.variable_scope('target'):
        pi_targ, _, q_pi_targ = actor_critic(x2_ph, a_ph, **ac_kwargs)

    # Experience buffer
    replay_buffer = ReplayBuffer(obs_dim=obs_dim,
                                 act_dim=act_dim,
                                 size=replay_size)

    # Count variables
    var_counts = tuple(
        core.count_vars(scope) for scope in ['main/pi', 'main/q', 'main'])
    print('\nNumber of parameters: \t pi: %d, \t q: %d, \t total: %d\n' %
          var_counts)

    # Bellman backup for Q function
    backup = tf.stop_gradient(r_ph + gamma * (1 - d_ph) * q_pi_targ)

    # DDPG losses
    pi_loss = -tf.reduce_mean(q_pi)
    q_loss = tf.reduce_mean((q - backup)**2)

    # Separate train ops for pi, q
    pi_optimizer = tf.train.AdamOptimizer(learning_rate=pi_lr)
    q_optimizer = tf.train.AdamOptimizer(learning_rate=q_lr)
    train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi'))
    train_q_op = q_optimizer.minimize(q_loss, var_list=get_vars('main/q'))

    # Polyak averaging for target variables
    target_update = tf.group([
        tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main)
        for v_main, v_targ in zip(get_vars('main'), get_vars('target'))
    ])

    # Initializing targets to match main variables
    target_init = tf.group([
        tf.assign(v_targ, v_main)
        for v_main, v_targ in zip(get_vars('main'), get_vars('target'))
    ])

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    sess.run(target_init)

    # Setup model saving
    logger.setup_tf_saver(sess,
                          inputs={
                              'x': x_ph,
                              'a': a_ph
                          },
                          outputs={
                              'pi': pi,
                              'q': q
                          })
    saver = tf.train.Saver()
    save_path = './saved_model/' + env_name + '/test'

    def get_action(o, noise_scale):
        # Deterministic action from pi, plus Gaussian exploration noise,
        # clipped to the action bound.
        a = sess.run(pi, feed_dict={x_ph: o.reshape(1, -1)})[0]
        a += noise_scale * np.random.randn(act_dim)
        return np.clip(a, -act_limit, act_limit)

    def test_agent(n=10):
        # Run n deterministic (noise-free) evaluation episodes.
        for j in range(n):
            o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0
            while not (d or (ep_len == max_ep_len)):
                # Take deterministic actions at test time (noise_scale=0)
                o, r, d, _ = test_env.step(get_action(o, 0))
                ep_ret += r
                ep_len += 1
            logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)

    def save(saver, sess):
        # Checkpoint the session. os.makedirs (instead of os.mkdir) so the
        # intermediate './saved_model' directory is created when missing.
        if not os.path.exists('./saved_model/' + env_name):
            os.makedirs('./saved_model/' + env_name)
        ckpt_path = saver.save(sess, save_path)
        #print('Save ckpt file: {}'.format(ckpt_path))

    def load(saver, sess):
        # Restore a previously saved checkpoint, if one exists.
        if os.path.exists('./saved_model/' + env_name):
            saver.restore(sess, save_path)
            print('Load model complete.')
        else:
            print('There is no saved model.')

    if test is False:
        start_time = time.time()
        o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
        total_steps = steps_per_epoch * epochs

        # Main loop: collect experience in env and update/log each epoch
        for t in range(total_steps):
            # Until start_steps have elapsed, randomly sample actions from
            # a uniform distribution for better exploration. Afterwards,
            # use the learned policy (with some noise, via act_noise).
            if t > start_steps:
                a = get_action(o, act_noise)
            else:
                a = env.action_space.sample()

            # Step the env
            o2, r, d, _ = env.step(a)
            ep_ret += r
            ep_len += 1

            # Ignore the "done" signal if it comes from hitting the time
            # horizon (that is, when it's an artificial terminal signal
            # that isn't based on the agent's state)
            d = False if ep_len == max_ep_len else d

            # Store experience to replay buffer
            replay_buffer.store(o, a, r, o2, d)

            # Super critical, easy to overlook step: make sure to update
            # most recent observation!
            o = o2

            if d or (ep_len == max_ep_len):
                # Perform all DDPG updates at the end of the trajectory,
                # in accordance with tuning done by TD3 paper authors.
                for _ in range(ep_len):
                    batch = replay_buffer.sample_batch(batch_size)
                    feed_dict = {
                        x_ph: batch['obs1'],
                        x2_ph: batch['obs2'],
                        a_ph: batch['acts'],
                        r_ph: batch['rews'],
                        d_ph: batch['done']
                    }

                    # Q-learning update
                    outs = sess.run([q_loss, q, train_q_op], feed_dict)
                    logger.store(LossQ=outs[0], QVals=outs[1])

                    # Policy update
                    outs = sess.run([pi_loss, train_pi_op, target_update],
                                    feed_dict)
                    logger.store(LossPi=outs[0])

                logger.store(EpRet=ep_ret, EpLen=ep_len)
                o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

            # End of epoch wrap-up
            if t > 0 and t % steps_per_epoch == 0:
                epoch = t // steps_per_epoch

                # Save model
                if (epoch % save_freq == 0) or (epoch == epochs - 1):
                    #logger.save_state({'env': env}, None)
                    save(saver, sess)

                # Test the performance of the deterministic version of
                # the agent.
                test_agent()

                # Log info about epoch
                logger.log_tabular('Epoch', epoch)
                logger.log_tabular('EpRet', with_min_and_max=True)
                logger.log_tabular('TestEpRet', with_min_and_max=True)
                logger.log_tabular('EpLen', average_only=True)
                logger.log_tabular('TestEpLen', average_only=True)
                logger.log_tabular('TotalEnvInteracts', t)
                logger.log_tabular('QVals', with_min_and_max=True)
                logger.log_tabular('LossPi', average_only=True)
                logger.log_tabular('LossQ', average_only=True)
                logger.log_tabular('Time', time.time() - start_time)
                logger.dump_tabular()
        #save(saver, sess)
    else:
        # Evaluation mode: restore the saved model and run rendered episodes.
        load(saver, sess)
        test_logger = EpochLogger()
        o, r, d, ep_ret, ep_len, n = env.reset(), 0, False, 0, 0, 0
        num_episodes = 100
        render = True
        # max_ep_len = 0 disables the length cutoff below (ep_len is always
        # >= 1 when checked), so evaluation episodes end only on done.
        max_ep_len = 0
        while n < num_episodes:
            if render:
                env.render()
                time.sleep(1e-3)
            a = get_action(o, 0)
            o, r, d, _ = env.step(a)
            ep_ret += r
            ep_len += 1
            if d or (ep_len == max_ep_len):
                test_logger.store(EpRet=ep_ret, EpLen=ep_len)
                print('Episode %d \t EpRet %.3f \t EpLen %d' %
                      (n, ep_ret, ep_len))
                o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
                n += 1
        test_logger.log_tabular('EpRet', with_min_and_max=True)
        test_logger.log_tabular('EpLen', average_only=True)
        test_logger.dump_tabular()
def __init__(self):
    """Build the SAC (soft actor-critic) computation graph and session.

    Creates the actor (mu, pi, logp_pi), twin critics, value network and a
    polyak-averaged target copy of the value network, the SAC losses and
    chained train ops; then initializes all variables and hard-syncs the
    target network to the main one.
    """
    self.sess = tf.Session()
    # Fixed hyper-parameters. The 33-dim state / 4-dim action sizes look
    # tailored to a specific environment (e.g. Unity Reacher) —
    # TODO confirm against the caller.
    self.state_size = 33
    self.output_size = 4
    self.tau = 0.995           # polyak coefficient for target updates
    self.gamma = 0.99          # discount factor
    self.hidden = [400, 300]   # hidden layer sizes for all networks
    self.batch_size = 64
    self.pi_lr = 1e-3          # policy learning rate
    self.q_lr = 1e-3           # critic/value learning rate
    self.action_limit = 1.0
    self.memory = replay_buffer(1e5)
    # NOTE(review): target_noise / noise_clip are TD3-style settings that
    # are stored but never referenced in this graph — possibly leftovers.
    self.target_noise = 0.2
    self.noise_clip = 0.1
    self.alpha = 1e-5          # entropy coefficient (fixed, not tuned)
    self.num_worker = 20
    self.noise = OU_noise(self.output_size, self.num_worker)

    # Placeholders: state, action, next state, reward, done.
    self.x_ph, self.a_ph, self.x2_ph, self.r_ph, self.d_ph = \
        cr.placeholders(self.state_size, self.output_size, self.state_size,
                        None, None)

    # Main networks: deterministic mean, stochastic policy and its
    # log-prob, twin Q heads (on a_ph and on pi), and state value v.
    with tf.variable_scope('main'):
        self.mu, self.pi, self.logp_pi, self.q1, self.q2, self.q1_pi, self.q2_pi, self.v = \
            cr.sac_mlp_actor_critic(
                x=self.x_ph,
                a=self.a_ph,
                hidden=self.hidden,
                activation=tf.nn.relu,
                output_activation=tf.tanh,
                output_size=self.output_size,
                action_limit=self.action_limit
            )

    # Target network: only the value head v_targ (fed with the next state)
    # is used, for the Q backup below.
    with tf.variable_scope('target'):
        _, _, _, _, _, _, _, self.v_targ = \
            cr.sac_mlp_actor_critic(
                x=self.x2_ph,
                a=self.a_ph,
                hidden=self.hidden,
                activation=tf.nn.relu,
                output_activation=tf.tanh,
                output_size=self.output_size,
                action_limit=self.action_limit
            )

    self.pi_params = cr.get_vars('main/pi')
    self.value_params = cr.get_vars('main/q') + cr.get_vars('main/v')

    # Clipped double-Q and SAC regression targets (gradients stopped so
    # the targets are treated as constants).
    self.min_q_pi = tf.minimum(self.q1_pi, self.q2_pi)
    self.q_backup = tf.stop_gradient(self.r_ph + self.gamma *
                                     (1 - self.d_ph) * self.v_targ)
    self.v_backup = tf.stop_gradient(self.min_q_pi -
                                     self.alpha * self.logp_pi)

    # Losses. The policy objective uses q1_pi (not min_q_pi), matching the
    # original SpinningUp SAC formulation; value losses are halved MSEs.
    self.pi_loss = tf.reduce_mean(self.alpha * self.logp_pi - self.q1_pi)
    self.q1_loss = 0.5 * tf.reduce_mean((self.q_backup - self.q1)**2)
    self.q2_loss = 0.5 * tf.reduce_mean((self.q_backup - self.q2)**2)
    self.v_loss = 0.5 * tf.reduce_mean((self.v_backup - self.v)**2)
    self.value_loss = self.q1_loss + self.q2_loss + self.v_loss

    # Train ops, chained with control dependencies so one sess.run executes
    # policy update -> value update -> target update, in that order.
    self.pi_optimizer = tf.train.AdamOptimizer(self.pi_lr)
    self.train_pi_op = self.pi_optimizer.minimize(self.pi_loss,
                                                  var_list=self.pi_params)
    self.value_optimizer = tf.train.AdamOptimizer(self.q_lr)
    with tf.control_dependencies([self.train_pi_op]):
        self.train_value_op = self.value_optimizer.minimize(
            self.value_loss, var_list=self.value_params)
    with tf.control_dependencies([self.train_value_op]):
        self.target_update = tf.group([
            tf.assign(v_targ, self.tau * v_targ + (1 - self.tau) * v_main)
            for v_main, v_targ in zip(cr.get_vars('main'),
                                      cr.get_vars('target'))
        ])

    # Everything a single training step fetches and runs.
    self.step_ops = [
        self.pi_loss, self.q1_loss, self.q2_loss, self.v_loss, self.q1,
        self.q2, self.v, self.logp_pi, self.train_pi_op,
        self.train_value_op, self.target_update
    ]

    # Hard copy main -> target once at startup.
    self.target_init = tf.group([
        tf.assign(v_targ, v_main) for v_main, v_targ in zip(
            cr.get_vars('main'), cr.get_vars('target'))
    ])

    self.sess.run(tf.global_variables_initializer())
    self.sess.run(self.target_init)
def __init__(self, state_dim, act_dim, hidden_sizes, gamma=0.99,
             polyak=0.995, lr=1e-3, alpha=0.2,
             log_dir='/home/tomas/catkin_ws/src/kuka_push/scripts/logs'):
    """Soft Actor-Critic agent with automatic entropy-coefficient tuning.

    Builds the TF1 computation graph (actor, twin critics, value function
    and its polyak-averaged target), the SAC losses, the alpha-tuning loss,
    all train ops and TensorBoard summaries, then starts a session and
    hard-syncs the target network.

    Args:
        state_dim (int): Dimension of the observation vector.
        act_dim (int): Dimension of the action vector.
        hidden_sizes: Hidden layer sizes for the actor-critic networks.
        gamma (float): Discount factor, between 0 and 1.
        polyak (float): Target-network interpolation factor (close to 1).
        lr (float): Learning rate shared by all optimizers.
        alpha (float): Initial entropy coefficient; tuned during training
            via the learned ``log_alpha`` variable.
        log_dir (str): Directory for TensorBoard event files (default kept
            for backward compatibility with the original hard-coded path).
    """
    """ Placeholders initialization """
    self.state_t_ph = tf.placeholder(shape=(None, state_dim),
                                     dtype=tf.float32, name="State_t_ph")
    self.state_t1_ph = tf.placeholder(shape=(None, state_dim),
                                      dtype=tf.float32, name="State_t1_ph")
    self.act_ph = tf.placeholder(shape=(None, act_dim),
                                 dtype=tf.float32, name="Actions_ph")
    self.ret_ph = tf.placeholder(shape=(None,),
                                 dtype=tf.float32, name="Rewards_ph")
    self.done_ph = tf.placeholder(shape=(None,),
                                  dtype=tf.float32, name="Done_ph")

    self.gamma = gamma
    self.lr = lr
    self.polyak = polyak

    """ alpha tuning """
    # FIX: the tuned alpha must be defined BEFORE the losses. Previously
    # the losses captured the fixed float `alpha` and `alpha_train_op`
    # updated a variable that influenced nothing.
    self.target_entropy = -act_dim  # standard SAC heuristic: -|A|
    # log_alpha is initialized so that exp(log_alpha) == the requested
    # initial alpha, preserving the starting behavior.
    self.log_alpha = tf.compat.v1.get_variable(
        'log_alpha', dtype=tf.float32, initializer=float(np.log(alpha)))
    self.alpha = tf.exp(self.log_alpha, name="alpha")

    with tf.variable_scope("main"):
        self.mean, self.pi, self.logp_pi, self.q1, \
            self.q2, self.q1_pi, self.q2_pi, self.v = core.mlp_actor_critic(
                self.state_t_ph, self.act_ph, hidden_sizes)

    with tf.variable_scope("target"):
        _, _, _, _, _, _, _, self.v_targ = core.mlp_actor_critic(
            self.state_t1_ph, self.act_ph, hidden_sizes)

    # Min Double-Q:
    self.min_q_pi = tf.minimum(self.q1_pi, self.q2_pi)

    # Targets for Q and V regression
    self.q_backup = tf.stop_gradient(self.ret_ph + self.gamma *
                                     (1 - self.done_ph) * self.v_targ)
    self.v_backup = tf.stop_gradient(self.min_q_pi -
                                     self.alpha * self.logp_pi)

    # Soft actor-critic losses (now using the learned alpha tensor)
    self.pi_loss = tf.reduce_mean(self.alpha * self.logp_pi - self.min_q_pi,
                                  name="pi_loss")
    self.q1_loss = 0.5 * tf.reduce_mean((self.q_backup - self.q1) ** 2,
                                        name="q1_loss")
    self.q2_loss = 0.5 * tf.reduce_mean((self.q_backup - self.q2) ** 2,
                                        name="q2_loss")
    self.v_loss = 0.5 * tf.reduce_mean((self.v_backup - self.v) ** 2,
                                       name="v_loss")
    self.value_loss = self.q1_loss + self.q2_loss + self.v_loss

    # Policy train op
    # (has to be separate from value train op, because q1_pi appears in
    # pi_loss)
    self.pi_optimizer = tf.train.AdamOptimizer(learning_rate=self.lr)
    self.train_pi_op = self.pi_optimizer.minimize(
        self.pi_loss, var_list=get_vars('main/pi'), name="train_pi_op")

    # Value train op
    # (control dep of train_pi_op because sess.run otherwise evaluates in
    # nondeterministic order)
    self.value_optimizer = tf.train.AdamOptimizer(learning_rate=self.lr)
    self.value_params = get_vars('main/q') + get_vars('main/v')
    with tf.control_dependencies([self.train_pi_op]):
        self.train_value_op = self.value_optimizer.minimize(
            self.value_loss, var_list=self.value_params,
            name="train_value_op")

    # Polyak averaging for target variables. FIX: plain local names are
    # used as the comprehension variables — the original bound
    # `self.v_main`/`self.v_targ`, which clobbered the `self.v_targ`
    # network-output tensor with a tf.Variable.
    with tf.control_dependencies([self.train_value_op]):
        self.target_update = tf.group(
            [tf.assign(v_targ,
                       self.polyak * v_targ + (1 - self.polyak) * v_main)
             for v_main, v_targ in zip(get_vars('main'), get_vars('target'))],
            name="target_update")

    # Entropy-coefficient loss: pushes alpha so that policy entropy tracks
    # target_entropy.
    self.alpha_loss = -tf.reduce_mean(
        self.log_alpha * tf.stop_gradient(self.logp_pi + self.target_entropy),
        name="alpha_loss")
    self.alpha_optimizer = tf.compat.v1.train.AdamOptimizer(
        self.lr, name='alpha_optimizer')
    self.alpha_train_op = self.alpha_optimizer.minimize(
        loss=self.alpha_loss, var_list=[self.log_alpha],
        name="alpha_train_op")
    """-------------"""

    # All ops to call during one training step
    self.step_ops = [self.pi_loss, self.q1_loss, self.q2_loss, self.v_loss,
                     self.q1, self.q2, self.v, self.logp_pi,
                     self.train_pi_op, self.train_value_op,
                     self.target_update, self.alpha_loss,
                     self.alpha_train_op]

    # Initializing targets to match main variables (local loop names, see
    # the fix note above)
    self.target_init = tf.group(
        [tf.assign(v_targ, v_main)
         for v_main, v_targ in zip(get_vars('main'), get_vars('target'))])

    """For saving purposes"""
    tf.identity(self.pi, name="pi")
    tf.identity(self.mean, name="mean")
    tf.identity(self.q1, name="q1")
    tf.identity(self.q2, name="q2")
    tf.identity(self.v, name="v")
    tf.identity(self.logp_pi, name="logp_pi")

    """ Tensorboard summaries """
    self.summary_reward_ph = tf.placeholder(tf.float32,
                                            name="reward_scalar_ph")
    self.summary_reward = tf.summary.scalar("Learning performance",
                                            self.summary_reward_ph)

    """ For saving trained model """
    self.saver = tf.train.Saver()

    self.sess = tf.Session()
    self.sess.run(tf.global_variables_initializer())
    self.sess.run(self.target_init)
    self.file_writer = tf.summary.FileWriter(log_dir, self.sess.graph)
def ddpg(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(),
         seed=0, n_episodes=10000, replay_size=int(1e6), gamma=0.99,
         show_steps=50, polyak=0.995, pi_lr=1e-3, q_lr=1e-3, batch_size=100,
         start_steps=10000, act_noise=0.1, max_ep_len=200,
         logger_kwargs=dict(), save_freq=1):
    """DDPG on a cost-augmented environment.

    The environment's ``step(a, flag)`` returns ``(o2, r, d, info, c)``;
    the cost ``c`` is subtracted from the reward before storage. Training
    runs indefinitely (``itertools.count``), evaluating every
    ``show_steps * max_ep_len`` steps.

    NOTE(review): ``ac_kwargs``/``logger_kwargs`` are mutable dict defaults
    and ``ac_kwargs`` is mutated below — state leaks across calls.
    NOTE(review): ``n_episodes``, ``logger_kwargs`` and ``save_freq`` are
    accepted but never used.
    """
    tf.set_random_seed(seed)
    np.random.seed(seed)

    env, test_env = env_fn(), env_fn()
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]

    # Action limit for clamping: critically, assumes all dimensions share
    # the same bound!
    act_limit = env.action_space.high[0]

    # Share information about action space with policy architecture
    ac_kwargs['action_space'] = env.action_space

    # Inputs to computation graph
    x_ph, a_ph, x2_ph, r_ph, d_ph = core.placeholders(
        obs_dim, act_dim, obs_dim, None, None)

    # Main outputs from computation graph
    with tf.variable_scope('main'):
        pi, q, q_pi = actor_critic(x_ph, a_ph, **ac_kwargs)

    # Target networks
    with tf.variable_scope('target'):
        # Note that the action placeholder going to actor_critic here is
        # irrelevant, because we only need q_targ(s, pi_targ(s)).
        pi_targ, _, q_pi_targ = actor_critic(x2_ph, a_ph, **ac_kwargs)

    # Experience buffer
    replay_buffer = ReplayBuffer(obs_dim=obs_dim,
                                 act_dim=act_dim,
                                 size=replay_size)

    # Count variables
    var_counts = tuple(
        core.count_vars(scope) for scope in ['main/pi', 'main/q', 'main'])
    print('\nNumber of parameters: \t pi: %d, \t q: %d, \t total: %d\n' %
          var_counts)

    # Bellman backup for Q function
    backup = tf.stop_gradient(r_ph + gamma * (1 - d_ph) * q_pi_targ)

    # DDPG losses
    pi_loss = -tf.reduce_mean(q_pi)
    q_loss = tf.reduce_mean((q - backup)**2)

    # Separate train ops for pi, q
    pi_optimizer = tf.train.AdamOptimizer(learning_rate=pi_lr)
    q_optimizer = tf.train.AdamOptimizer(learning_rate=q_lr)
    train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi'))
    train_q_op = q_optimizer.minimize(q_loss, var_list=get_vars('main/q'))

    # Polyak averaging for target variables
    target_update = tf.group([
        tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main)
        for v_main, v_targ in zip(get_vars('main'), get_vars('target'))
    ])

    # Initializing targets to match main variables
    target_init = tf.group([
        tf.assign(v_targ, v_main)
        for v_main, v_targ in zip(get_vars('main'), get_vars('target'))
    ])

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    sess.run(target_init)

    def get_action(o, noise_scale):
        # Deterministic action from pi, plus Gaussian exploration noise,
        # clipped to the action bound.
        a = sess.run(pi, feed_dict={x_ph: o.reshape(1, -1)})[0]
        a += noise_scale * np.random.randn(act_dim)
        return np.clip(a, -act_limit, act_limit)

    def test_agent(n=5):
        # Rendered evaluation; episodes may run up to 5x the training
        # horizon. A uniform perturbation (0.5 * rand) is added to the
        # action — presumably a robustness probe; TODO confirm intent.
        for j in range(n):
            o, r, d, ep_ret, ep_len, ep_cost = test_env.reset(
            ), 0, False, 0, 0, 0
            while not (d or (ep_len == 5 * max_ep_len)):
                # Take deterministic actions at test time (noise_scale=0)
                test_env.render()
                a = get_action(o, 0)
                o, r, d, _, c = test_env.step(a + 0.5 * np.random.rand(), 1)
                ep_ret += (r - c)
                ep_len += 1
                ep_cost += c
        test_env.close()
        # NOTE(review): ep_ret/ep_len/ep_cost are reset every episode, so
        # this summary reflects only the LAST episode divided by n, not a
        # true average over the n trials.
        print(
            "\n avg reward {} and episode length {} over {} trials, cost/step {}"
            .format(ep_ret / n, ep_len / n, n, ep_cost / ep_len))

    start_time = time.time()
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

    # Warm-up: fill the replay buffer with uniformly random actions.
    for t in range(start_steps):
        a = env.action_space.sample()
        o2, r, d, _, c = env.step(a, 1)
        r -= c
        replay_buffer.store(o, a, r, o2, d)
        o = o2
        if d:
            o = env.reset()

    fails = 0
    # Main loop: collect experience in env and update/log each epoch
    for t in itertools.count():
        """
        Until start_steps have elapsed, randomly sample actions
        from a uniform distribution for better exploration. Afterwards,
        use the learned policy (with some noise, via act_noise).
        """
        a = get_action(o, act_noise)

        # Step the env
        o2, r, d, _, c = env.step(a, 1)
        r -= c
        ep_ret += r
        ep_len += 1

        # Ignore the "done" signal if it comes from hitting the time
        # horizon (that is, when it's an artificial terminal signal
        # that isn't based on the agent's state)
        # NOTE(review): this compares the GLOBAL step counter t (never
        # reset) against max_ep_len instead of ep_len — the condition is
        # true at most once; the sibling ddpg above uses ep_len. Confirm
        # whether ep_len was intended.
        d = False if t == max_ep_len else d

        # Store experience to replay buffer
        replay_buffer.store(o, a, r, o2, d)

        # Super critical, easy to overlook step: make sure to update
        # most recent observation!
        o = o2
        print("\rSteps {:3}, fails {}".format(t, fails), end="")

        if t % max_ep_len == 0:
            """
            Perform all DDPG updates at the end of the trajectory,
            in accordance with tuning done by TD3 paper authors.
            """
            for _ in range(max_ep_len):
                batch = replay_buffer.sample_batch(batch_size)
                feed_dict = {
                    x_ph: batch['obs1'],
                    x2_ph: batch['obs2'],
                    a_ph: batch['acts'],
                    r_ph: batch['rews'],
                    d_ph: batch['done']
                }

                # Q-learning update
                outs = sess.run([q_loss, q, train_q_op], feed_dict)

                # Policy update
                outs = sess.run([pi_loss, train_pi_op, target_update],
                                feed_dict)

        if d:
            # Episode ended on a true terminal: count it as a failure and
            # restart.
            o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
            fails += 1

        # End of epoch wrap-up
        if t > 0 and t % (show_steps * max_ep_len) == 0:
            # Test the performance of the deterministic version of the
            # agent.
            test_agent()
def ddpg(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(),
         seed=0, control_policy=ControlPolicy, n_episodes=10000,
         replay_size=int(1e6), gamma=0.99, show_steps=50, polyak=0.995,
         pi_lr=1e-3, q_lr=1e-3, batch_size=100, start_steps=10000,
         act_noise=0.1, max_ep_len=200, logger_kwargs=dict(), save_freq=1):
    """DDPG trained alongside a classical control policy (e.g. LQR).

    A `control_policy` instance acts on a parallel copy of the state
    (flag 0 to ``env.step``) for comparison, and also takes over data
    collection whenever the observed state leaves a slowly-growing safe
    region ("takeover"). Costs from ``info["cost"]`` are subtracted from
    rewards. Runs indefinitely via ``itertools.count``.

    NOTE(review): ``ac_kwargs``/``logger_kwargs`` are mutable dict
    defaults and ``ac_kwargs`` is mutated below; ``n_episodes``,
    ``logger_kwargs`` and ``save_freq`` are accepted but unused.
    """
    tf.set_random_seed(seed)
    np.random.seed(seed)

    env, test_env = env_fn(), env_fn()
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]
    # Classical controller built from the environment (LQR-style;
    # exposes .predict(state)).
    ctrl_pol = control_policy(env)

    # Action limit for clamping: critically, assumes all dimensions share
    # the same bound!
    act_limit = env.action_space.high[0]

    # Share information about action space with policy architecture
    ac_kwargs['action_space'] = env.action_space

    # Inputs to computation graph
    x_ph, a_ph, x2_ph, r_ph, d_ph = core.placeholders(
        obs_dim, act_dim, obs_dim, None, None)

    # Main outputs from computation graph
    with tf.variable_scope('main'):
        pi, q, q_pi = actor_critic(x_ph, a_ph, **ac_kwargs)

    # Target networks
    with tf.variable_scope('target'):
        # Note that the action placeholder going to actor_critic here is
        # irrelevant, because we only need q_targ(s, pi_targ(s)).
        pi_targ, _, q_pi_targ = actor_critic(x2_ph, a_ph, **ac_kwargs)

    # Experience buffer
    replay_buffer = ReplayBuffer(obs_dim=obs_dim,
                                 act_dim=act_dim,
                                 size=replay_size)

    # Count variables
    var_counts = tuple(
        core.count_vars(scope) for scope in ['main/pi', 'main/q', 'main'])
    print('\nNumber of parameters: \t pi: %d, \t q: %d, \t total: %d\n' %
          var_counts)

    # Bellman backup for Q function
    backup = tf.stop_gradient(r_ph + gamma * (1 - d_ph) * q_pi_targ)

    # DDPG losses
    pi_loss = -tf.reduce_mean(q_pi)
    q_loss = tf.reduce_mean((q - backup)**2)

    # Separate train ops for pi, q
    pi_optimizer = tf.train.AdamOptimizer(learning_rate=pi_lr)
    q_optimizer = tf.train.AdamOptimizer(learning_rate=q_lr)
    train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi'))
    train_q_op = q_optimizer.minimize(q_loss, var_list=get_vars('main/q'))

    # Polyak averaging for target variables
    target_update = tf.group([
        tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main)
        for v_main, v_targ in zip(get_vars('main'), get_vars('target'))
    ])

    # Initializing targets to match main variables
    target_init = tf.group([
        tf.assign(v_targ, v_main)
        for v_main, v_targ in zip(get_vars('main'), get_vars('target'))
    ])

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    sess.run(target_init)

    def get_action(o, noise_scale):
        # Deterministic action from pi, plus Gaussian exploration noise,
        # clipped to the action bound.
        a = sess.run(pi, feed_dict={x_ph: o.reshape(1, -1)})[0]
        a += noise_scale * np.random.randn(act_dim)
        return np.clip(a, -act_limit, act_limit)

    def test_agent(n=5):
        # Rendered evaluation comparing the RL policy (flag 1) against the
        # classical controller (flag 0) on parallel state copies.
        tot_len, tot_ret = 0, 0
        cost, cost_ctrl = 0, 0
        for j in range(n):
            o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0
            o_ctrl = np.array(o)
            while not (d or (ep_len == 5 * max_ep_len)):
                # Take deterministic actions at test time (noise_scale=0)
                test_env.render()
                a_ctrl = np.array([ctrl_pol.predict(o_ctrl)])
                o_ctrl, _, _, info = test_env.step(a_ctrl, 0)
                cost_ctrl += info["cost"]
                a = get_action(o, 0)
                o, r, d, info = test_env.step(a, 1)
                cost += info["cost"]
                ep_len += 1
            tot_len += ep_len
        test_env.close()
        print(
            "\n avg reward {:.5} and episode length {} over {} trials, cost/step rl/lqr {:.5}/{:.5}"
            .format((tot_len - cost) / n, tot_len / n, n, cost / tot_len,
                    cost_ctrl / tot_len))

    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
    o_ctrl = np.array(o)  #env.state[0]

    # Warm-up: fill the buffer using the classical controller's actions.
    for t in range(start_steps):
        #a = env.action_space.sample()
        a = np.array([ctrl_pol.predict(o)])
        o2, r, d, info = env.step(a, 1)
        r -= info["cost"]
        replay_buffer.store(o, a, r, o2, d)
        o = o2
        if d:
            o = env.reset()

    fails = 0
    takeover = False
    cost, cost_ctrl = 0, 0
    retrain_steps = 0
    # `show` starts as False (== 0) and is decremented as an int below;
    # it is set to 5 after each evaluation to render a few update rounds.
    show = False
    # Setup plotting
    # times = []
    # plt.ion()
    # fig, ax = plt.subplots()
    # plot = ax.plot([], [])
    # costs = []
    # plot_ctrl = ax.plot([], [])
    # ctrl_costs = []
    # ax.legend(["ddpg cost", "lqr cost"])
    # ax.set_xlabel("time")
    # ax.set_ylabel("cost")

    # Main loop: collect experience in env and update/log each epoch
    for t in itertools.count():
        """
        Until start_steps have elapsed, randomly sample actions
        from a uniform distribution for better exploration. Afterwards,
        use the learned policy (with some noise, via act_noise).
        """
        if show > 0:
            env.render(takeover=takeover)

        # Step lqr
        a_ctrl = np.array([ctrl_pol.predict(o_ctrl)])
        o_ctrl, _, _, info = env.step(a_ctrl, 0)
        cost_ctrl += info["cost"]

        # Step ddpg. The safe region grows with training time; outside it
        # the classical controller takes over data collection.
        scaler = min(1, 0.1 + t / 100000)
        takeover = np.abs(o[2]) > 0.5 * scaler or np.abs(o[0]) > 0.7 * scaler
        # takeover = False
        if takeover:
            a = np.array([ctrl_pol.predict(o)])
        else:
            a = get_action(o, act_noise)
        o2, r, d, info = env.step(a, 1)
        cost += info["cost"]
        r -= info["cost"]
        retrain_steps += 1
        ep_len += 1

        # Ignore the "done" signal if it comes from hitting the time
        # horizon (that is, when it's an artificial terminal signal
        # that isn't based on the agent's state)
        # d = False if t==max_ep_len else d

        # Store experience to replay buffer
        replay_buffer.store(o, a, r, o2, d)

        # Super critical, easy to overlook step: make sure to update
        # most recent observation!
        o = o2
        print(
            "\rSteps {:5}, fails {:3}, ep_len {:5}, disturbance {:7.3}, cost rl/lqr {:7.3}/{:7.3}"
            .format(t, fails, ep_len,
                    info["disturbance"] if info["push"] else 0.0,
                    cost / retrain_steps, cost_ctrl / retrain_steps),
            end="")

        # Update with probability 1/max_ep_len each step (on average once
        # per episode-length worth of steps).
        if np.random.rand() * max_ep_len < 1:
            """
            Perform all DDPG updates at the end of the trajectory,
            in accordance with tuning done by TD3 paper authors.
            """
            for _ in range(max_ep_len):
                batch = replay_buffer.sample_batch(batch_size)
                feed_dict = {
                    x_ph: batch['obs1'],
                    x2_ph: batch['obs2'],
                    a_ph: batch['acts'],
                    r_ph: batch['rews'],
                    d_ph: batch['done']
                }

                # Q-learning update
                outs = sess.run([q_loss, q, train_q_op], feed_dict)

                # Policy update
                outs = sess.run([pi_loss, train_pi_op, target_update],
                                feed_dict)

            # cost /= retrain_steps
            # cost_ctrl /= retrain_steps
            # costs.append(cost)
            # ctrl_costs.append(cost_ctrl)
            # times.append(0.02 * (t + start_steps))
            # ax.plot(times, costs, 'r-', times, ctrl_costs, 'b--')
            # fig.canvas.draw()
            # plt.pause(0.005)
            cost = 0
            cost_ctrl = 0
            retrain_steps = 0
            show -= 1
            # Re-sync the controller's parallel state copy to the real
            # state (presumably env.state holds [ctrl_state, rl_state] —
            # TODO confirm against the env implementation).
            env.state[0] = np.array(env.state[1])
            o_ctrl = env.state[0]
            print()

        if d:
            # True terminal: count a failure and restart both state copies.
            o, r, d, ep_len = env.reset(), 0, False, 0
            o_ctrl = np.array(o)
            fails += 1

        # End of epoch wrap-up
        if t > 0 and t % (show_steps * max_ep_len) == 0:
            # Test the performance of the deterministic version of the
            # agent.
            test_agent()
            show = 5
def __init__(self):
    """Distributional (quantile) soft actor-critic graph builder.

    Each critic / value head outputs ``support_size`` quantile atoms
    instead of a scalar; they are trained with an asymmetric
    quantile-Huber regression against tiled distributional Bellman
    targets. Configuration comes from the module-level ``env_set`` dict.
    Builds the full TF1 graph, starts a session and hard-syncs the target
    value network.
    """
    self.sess = tf.Session()
    # Environment / training hyper-parameters from the env_set config.
    self.state_size = env_set['state']
    self.output_size = env_set['action']
    self.worker_size = env_set['worker']
    self.support_size = 8           # quantile atoms per value estimate
    self.target_update_tau = 0.995  # polyak coefficient for target updates
    self.gamma = 0.99               # discount factor
    self.hidden = env_set['hidden']
    self.batch_size = 64
    self.pi_lr = 1e-4               # policy learning rate
    self.q_lr = 1e-3                # critic/value learning rate
    self.action_limit = 1.0
    self.memory = replay_buffer(env_set['mem_size'])
    # NOTE(review): target_noise / noise_clip are stored but never used in
    # this graph — possibly TD3 leftovers.
    self.target_noise = 0.2
    self.noise_clip = 0.1
    self.alpha = 1e-5               # entropy coefficient (fixed)

    # Placeholders: state, action, quantile fractions tau, next state,
    # reward, done.
    self.x_ph, self.a_ph, self.tau_ph, self.x2_ph, self.r_ph, self.d_ph = \
        cr.placeholders(self.state_size, self.output_size,
                        self.support_size, self.state_size, None, None)

    with tf.variable_scope('main'):
        self.pi, self.logp_pi, self.q1, self.q2, self.q1_pi, self.q2_pi, self.v = \
            cr.dipg_sac_mlp_actor_critic(
                x=self.x_ph,
                a=self.a_ph,
                tau=self.tau_ph,
                hidden=self.hidden,
                activation=tf.nn.relu,
                output_activation=tf.tanh,
                output_size=self.output_size
            )
    # Target network: only the value head v_targ is used, for the Q backup.
    with tf.variable_scope('target'):
        _, _, _, _, _, _, self.v_targ = cr.dipg_sac_mlp_actor_critic(
            x=self.x2_ph,
            a=self.a_ph,
            tau=self.tau_ph,
            hidden=self.hidden,
            activation=tf.nn.relu,
            output_activation=tf.tanh,
            output_size=self.output_size
        )

    self.pi_params = cr.get_vars('main/pi')
    self.value_params = cr.get_vars('main/q') + cr.get_vars('main/v')

    # Double-Q on distributions: take whichever critic has the smaller
    # mean quantile estimate.
    self.min_q = tf.where(
        tf.less(tf.reduce_mean(self.q1_pi), tf.reduce_mean(self.q2_pi)),
        self.q1_pi, self.q2_pi)

    # Distributional Bellman targets, broadcast across the support
    # dimension; gradients stopped so targets are constants.
    self.q_backup = tf.stop_gradient(
        tf.tile(tf.expand_dims(self.r_ph, axis=1), [1, self.support_size])
        + self.gamma * tf.tile(tf.expand_dims(1 - self.d_ph, axis=1),
                               [1, self.support_size]) * self.v_targ)
    self.v_backup = tf.stop_gradient(
        self.min_q - self.alpha * tf.tile(
            tf.expand_dims(self.logp_pi, axis=1), [1, self.support_size]))

    # Policy loss: entropy term minus a tau^2-weighted mean of the q1
    # quantiles.
    self.pi_loss = tf.reduce_mean(
        self.alpha * self.logp_pi
        - tf.reduce_mean(self.q1_pi * tf.square(self.tau_ph)))

    # Quantile-regression weights: tau for underestimation errors,
    # (1 - tau) for overestimation errors.
    tau = self.tau_ph
    inv_tau = 1 - tau
    tau = tf.tile(tf.expand_dims(tau, axis=1), [1, self.support_size, 1])
    inv_tau = tf.tile(tf.expand_dims(inv_tau, axis=1),
                      [1, self.support_size, 1])

    def quantile_huber_loss(theta, target):
        # Pairwise (atom x atom) squared-error matrix between estimated
        # quantiles `theta` and target quantiles `target`, weighted
        # asymmetrically by the quantile fraction.
        theta_tile = tf.tile(tf.expand_dims(theta, axis=2),
                             [1, 1, self.support_size])
        target_tile = tf.tile(tf.expand_dims(target, axis=1),
                              [1, self.support_size, 1])
        huber = tf.losses.mean_squared_error(
            target_tile, theta_tile, reduction=tf.losses.Reduction.NONE)
        error = target_tile - theta_tile
        # FIX: negative errors are weighted by (1 - tau). The original
        # computed inv_tau but never used it, weighting both branches of
        # tf.where identically (the negative branch with bare `huber`),
        # which breaks the asymmetry quantile regression relies on.
        loss = tf.where(tf.less(error, 0.0), inv_tau * huber, tau * huber)
        return 0.5 * tf.reduce_mean(
            tf.reduce_sum(tf.reduce_mean(loss, axis=2), axis=1))

    self.q1_loss = quantile_huber_loss(self.q1, self.q_backup)
    self.q2_loss = quantile_huber_loss(self.q2, self.q_backup)
    self.v_loss = quantile_huber_loss(self.v, self.v_backup)
    self.value_loss = self.q1_loss + self.q2_loss + self.v_loss

    # Train ops, chained with control dependencies so one sess.run executes
    # policy update -> value update -> target update, in that order.
    self.pi_optimizer = tf.train.AdamOptimizer(self.pi_lr)
    self.train_pi_op = self.pi_optimizer.minimize(self.pi_loss,
                                                  var_list=self.pi_params)
    self.value_optimizer = tf.train.AdamOptimizer(self.q_lr)
    with tf.control_dependencies([self.train_pi_op]):
        self.train_value_op = self.value_optimizer.minimize(
            self.value_loss, var_list=self.value_params)
    with tf.control_dependencies([self.train_value_op]):
        self.target_update = tf.group([
            tf.assign(v_targ,
                      self.target_update_tau * v_targ +
                      (1 - self.target_update_tau) * v_main)
            for v_main, v_targ in zip(cr.get_vars('main'),
                                      cr.get_vars('target'))
        ])

    self.step_ops = [
        self.pi_loss, self.value_loss, self.train_pi_op,
        self.train_value_op, self.target_update
    ]

    # Hard initial sync. NOTE(review): only the value sub-network
    # ('main/v' -> 'target/v') is copied here, while target_update above
    # polyak-averages every target variable — confirm this asymmetry is
    # intentional (only v_targ is consumed by the backup).
    self.target_init = tf.group([
        tf.assign(v_targ, v_main) for v_main, v_targ in zip(
            cr.get_vars('main/v'), cr.get_vars('target/v'))
    ])

    self.sess.run(tf.global_variables_initializer())
    self.sess.run(self.target_init)
def iac(env_config, ac_type, ac_kwargs, rb_type, rb_kwargs, gamma, lr, polyak,
        batch_size, epochs, start_steps, steps_per_epoch, inc_ep, max_ep_len,
        test_max_ep_len, number_of_tests_per_epoch, q_pi_sample_size, z_dim,
        z_type, act_noise, test_without_state, logger_kwargs, seed):
    """Train an implicit actor-critic (IAC) agent with a latent noise input ``z``.

    Builds a TD3-style twin-Q + state-value graph whose actor and critics
    additionally condition on a sampled latent vector ``z`` (uniform or
    Gaussian, see ``z_type``), then runs the usual collect/update loop.

    Args:
        env_config: Passed to ``make_env`` to build train and test envs.
        ac_type: Key selecting the actor-critic builder via
            ``core.get_iac_actor_critic``.
        ac_kwargs (dict): Kwargs for the actor-critic builder; must contain
            the boolean key ``"pi_separate"`` (split embedding/decision
            policy updates).
        rb_type / rb_kwargs: Replay-buffer class key and kwargs.
        gamma (float): Discount factor.
        lr (float): Learning rate shared by policy and value optimizers.
        polyak (float): Target-network averaging coefficient.
        batch_size (int): Minibatch size (later repeated q_pi_sample_size times).
        epochs, steps_per_epoch, start_steps: Loop schedule.
        inc_ep (int): Amount added to ``max_ep_len`` after every epoch.
        max_ep_len / test_max_ep_len (int): Episode caps for train / test.
        number_of_tests_per_epoch (int): Test episodes per epoch.
        q_pi_sample_size (int): How many z-samples per transition during updates.
        z_dim (int): Latent dimension; z_type (str): "uniform" or "gaussian".
        act_noise (float): Stddev of exploration noise added to pi output.
        test_without_state (bool): If True, test policy sees a zero observation.
        logger_kwargs (dict): Kwargs for EpochLogger.
        seed (int): RNG seed for tf and numpy.
    """
    logger = EpochLogger(**logger_kwargs)
    # save_config(locals()) would capture the logger itself; drop it first.
    configs = locals().copy()
    configs.pop("logger")
    logger.save_config(configs)

    tf.set_random_seed(seed)
    np.random.seed(seed)

    env, test_env = make_env(env_config), make_env(env_config)
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]
    # Per-dimension upper bound; policy outputs in [0, 1] and is scaled by this.
    act_high = env.action_space.high

    # Inputs to computation graph
    x_ph, a_ph, z_ph, x2_ph, r_ph, d_ph = core.placeholders(
        obs_dim, act_dim, z_dim, obs_dim, None, None)

    actor_critic = core.get_iac_actor_critic(ac_type)

    # Main outputs from computation graph
    with tf.variable_scope('main'):
        pi, q1, q2, q1_pi, q2_pi, v = actor_critic(x_ph, a_ph, z_ph,
                                                   **ac_kwargs)

    # Target networks (only the value head v_targ is used for the backup).
    with tf.variable_scope('target'):
        _, _, _, _, _, v_targ = actor_critic(x2_ph, a_ph, z_ph, **ac_kwargs)

    # Experience buffer
    RB = get_replay_buffer(rb_type)
    replay_buffer = RB(obs_dim, act_dim, **rb_kwargs)

    # Count variables
    var_counts = tuple(
        core.count_vars(scope)
        for scope in ['main/pi', 'main/q', 'main/v', 'main'])
    print(
        '\nNumber of parameters: \t pi: %d, \t q: %d, \t v: %d, \t total: %d\n'
        % var_counts)

    # Bellman backup for Q and V function.
    # Q target bootstraps from the *target* value network; V target is the
    # clipped double-Q estimate under the current policy (SAC-style).
    q_backup = tf.stop_gradient(r_ph + gamma * (1 - d_ph) * v_targ)
    min_q_pi = tf.minimum(q1_pi, q2_pi)
    v_backup = tf.stop_gradient(min_q_pi)

    # TD3 losses
    pi_loss = -tf.reduce_mean(q1_pi)
    q1_loss = 0.5 * tf.reduce_mean((q1 - q_backup)**2)
    q2_loss = 0.5 * tf.reduce_mean((q2 - q_backup)**2)
    v_loss = 0.5 * tf.reduce_mean((v - v_backup)**2)
    value_loss = q1_loss + q2_loss + v_loss

    # Separate train ops for pi, q
    policy_optimizer = tf.train.AdamOptimizer(learning_rate=lr)
    value_optimizer = tf.train.AdamOptimizer(learning_rate=lr)
    train_policy_op = policy_optimizer.minimize(pi_loss,
                                                var_list=get_vars('main/pi'))
    if ac_kwargs["pi_separate"]:
        # Optional alternating updates: embedding subnet vs decision subnet.
        train_policy_emb_op = policy_optimizer.minimize(
            pi_loss, var_list=get_vars('main/pi/emb'))
        train_policy_d_op = policy_optimizer.minimize(
            pi_loss, var_list=get_vars('main/pi/d'))
    train_value_op = value_optimizer.minimize(
        value_loss, var_list=get_vars('main/q') + get_vars('main/v'))

    # Polyak averaging for target variables
    target_update = tf.group([
        tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main)
        for v_main, v_targ in zip(get_vars('main'), get_vars('target'))
    ])

    # Initializing targets to match main variables
    target_init = tf.group([
        tf.assign(v_targ, v_main)
        for v_main, v_targ in zip(get_vars('main'), get_vars('target'))
    ])

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    sess.run(target_init)

    def sample_z(size):
        # Draw a latent noise batch of the given shape.
        if z_type == "uniform":
            return np.random.random_sample(size=size)
        elif z_type == "gaussian":
            return np.random.normal(size=size)
        else:
            raise Exception("z_type error")

    def get_action(o, noise_scale):
        # Returns (pi_a, real_a): raw policy action in [0, 1] and the
        # env-scaled action real_a = pi_a * act_high.
        pi_a = sess.run(pi,
                        feed_dict={
                            x_ph: o.reshape(1, -1),
                            z_ph: sample_z((1, z_dim))
                        })[0]
        pi_a += noise_scale * np.random.randn(act_dim)
        pi_a = np.clip(pi_a, 0, 1)
        real_a = pi_a * act_high
        return pi_a, real_a

    def test_agent(n=10):
        # Runs n deterministic test episodes; returns the per-episode action
        # traces for later plotting.
        test_actions = []
        for j in range(n):
            test_actions_ep = []
            o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0
            while not (d or (ep_len == test_max_ep_len)):
                # Take deterministic actions at test time (noise_scale=0)
                if test_without_state:
                    _, real_a = get_action(np.zeros(o.shape), 0)
                else:
                    _, real_a = get_action(o, 0)
                test_actions_ep.append(real_a)
                o, r, d, _ = test_env.step(real_a)
                ep_ret += r
                ep_len += 1
            logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)
            test_actions.append(test_actions_ep)
        return test_actions

    start_time = time.time()
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
    total_steps = steps_per_epoch * epochs

    rewards = []
    rets = []
    test_rets = []
    max_ret = None

    # Main loop: collect experience in env and update/log each epoch
    for t in range(total_steps):
        """
        Until start_steps have elapsed, randomly sample actions
        from a uniform distribution for better exploration. Afterwards,
        use the learned policy (with some noise, via act_noise).
        """
        if t > start_steps:
            pi_a, real_a = get_action(o, act_noise)
        else:
            # NOTE(review): action_space.sample() normally returns a single
            # array; unpacking it into (pi_a, real_a) only works if the env's
            # sample() returns a 2-tuple — confirm against make_env.
            pi_a, real_a = env.action_space.sample()

        # Step the env
        o2, r, d, _ = env.step(real_a)
        ep_ret += r
        ep_len += 1

        # Ignore the "done" signal if it comes from hitting the time
        # horizon (that is, when it's an artificial terminal signal
        # that isn't based on the agent's state)
        d = False if ep_len == max_ep_len else d

        # Store experience to replay buffer (raw pi_a, not the scaled action).
        replay_buffer.store(o, pi_a, r, o2, d)

        # Super critical, easy to overlook step: make sure to update
        # most recent observation!
        o = o2

        if d or (ep_len == max_ep_len):
            # One gradient step per environment step taken this episode.
            for _ in range(ep_len):
                batch = replay_buffer.sample_batch(batch_size)
                feed_dict = {
                    x_ph: batch['obs1'],
                    x2_ph: batch['obs2'],
                    a_ph: batch['acts'],
                    r_ph: batch['rews'],
                    d_ph: batch['done']
                }
                feed_dict[z_ph] = sample_z((batch_size, z_dim))
                # Policy Learning update: every transition is repeated
                # q_pi_sample_size times, each copy with a fresh z sample.
                for key in feed_dict:
                    feed_dict[key] = np.repeat(feed_dict[key],
                                               q_pi_sample_size,
                                               axis=0)
                feed_dict[z_ph] = sample_z(
                    (batch_size * q_pi_sample_size, z_dim))
                if ac_kwargs["pi_separate"]:
                    # Alternate embedding/decision updates per episode.
                    if len(rewards) % 2 == 0:
                        outs = sess.run([pi_loss, train_policy_emb_op],
                                        feed_dict)
                    else:
                        outs = sess.run([pi_loss, train_policy_d_op],
                                        feed_dict)
                else:
                    outs = sess.run([pi_loss, train_policy_op], feed_dict)
                logger.store(LossPi=outs[0])

                # Q-learning update
                outs = sess.run([q1_loss, v_loss, q1, v, train_value_op],
                                feed_dict)
                logger.store(LossQ=outs[0],
                             LossV=outs[1],
                             ValueQ=outs[2],
                             ValueV=outs[3])

            logger.store(EpRet=ep_ret, EpLen=ep_len)
            rewards.append(ep_ret)
            o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

        # End of epoch wrap-up
        if (t + 1) % steps_per_epoch == 0:
            epoch = (t + 1) // steps_per_epoch

            # Test the performance of the deterministic version of the agent.
            test_actions = test_agent(number_of_tests_per_epoch)

            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            ret = logger.log_tabular('EpRet', average_only=True)[0]
            test_ret = logger.log_tabular('TestEpRet', average_only=True)[0]
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('TestEpLen', average_only=True)
            logger.log_tabular('LossPi', average_only=True)
            logger.log_tabular('LossQ', average_only=True)
            logger.log_tabular('LossV', average_only=True)
            logger.log_tabular('ValueQ', average_only=True)
            logger.log_tabular('ValueV', average_only=True)
            logger.log_tabular('Time', time.time() - start_time)
            logger.dump_tabular()

            rets.append(ret)
            test_rets.append(test_ret)

            # Track the best test return and its action traces.
            if max_ret is None or test_ret > max_ret:
                max_ret = test_ret
                best_test_actions = test_actions

            # Episode cap grows each epoch (curriculum-style lengthening).
            max_ep_len += inc_ep
            # NOTE(review): target update runs once per epoch here (not per
            # gradient step) and reuses the last feed_dict — confirm intended.
            sess.run(target_update, feed_dict)

            logger.save_state(
                {
                    "rewards": rewards,
                    "best_test_actions": best_test_actions,
                    "rets": rets,
                    "test_rets": test_rets,
                    "max_ret": max_ret
                }, None)
            util.plot_actions(best_test_actions, act_high,
                              logger.output_dir + '/best_test_actions.png')
            logger.log("max ret: %f" % max_ret)
def __init__(self, opt, job):
    """Build the n-step SAC computation graph for a learner or worker process.

    Args:
        opt: Options object; fields read here include seed, obs_shape,
            act_shape, Ln (n-step length), alpha ('auto' or a number),
            use_bn, c_regularizer, hidden_size, act_space, model, lr,
            target_entropy, use_max, gamma, polyak, gpu_fraction,
            summary_dir, env_name, exp_name, num_workers, a_l_ratio.
        job (str): "learner" builds summaries/FileWriter and a GPU session;
            anything else gets a CPU-limited session.
    """
    self.opt = opt
    with tf.Graph().as_default():
        tf.set_random_seed(opt.seed)
        np.random.seed(opt.seed)

        # Inputs to computation graph: current/next observations and action,
        # plus per-step reward/done/logp sequences of length opt.Ln for the
        # n-step backup below.
        self.x_ph, self.a_ph, self.x2_ph = core.placeholders(
            opt.obs_shape, opt.act_shape, opt.obs_shape)
        self.r_ph, self.d_ph, self.logp_pi_ph = core.placeholders(
            (opt.Ln, ), (opt.Ln, ), (opt.Ln, ))

        # ------
        # Entropy temperature: either a learned variable (alpha == 'auto')
        # or the fixed number supplied in opt.alpha.
        if opt.alpha == 'auto':
            log_alpha = tf.get_variable('log_alpha',
                                        dtype=tf.float32,
                                        initializer=0.0)
            alpha_v = tf.exp(log_alpha)
        else:
            alpha_v = opt.alpha
        # ------

        # Main outputs from computation graph
        with tf.variable_scope('main'):
            mu, pi, logp_pi, self.logp_pi2, q1, q2, q1_pi, q2_pi, q1_mu, q2_mu \
                = actor_critic(self.x_ph, self.x2_ph, self.a_ph, alpha_v,
                               use_bn=opt.use_bn, phase=True,
                               coefficent_regularizer=opt.c_regularizer,
                               hidden_sizes=opt.hidden_size,
                               action_space=opt.act_space,
                               model=opt.model)

        # Target value network (note: fed x2 for both observation inputs).
        with tf.variable_scope('target'):
            _, _, logp_pi_, _, _, _, q1_pi_, q2_pi_, q1_mu_, q2_mu_ \
                = actor_critic(self.x2_ph, self.x2_ph, self.a_ph, alpha_v,
                               use_bn=opt.use_bn, phase=True,
                               coefficent_regularizer=opt.c_regularizer,
                               hidden_sizes=opt.hidden_size,
                               action_space=opt.act_space,
                               model=opt.model)

        # Count variables
        var_counts = tuple(
            core.count_vars(scope)
            for scope in ['main/pi', 'main/q1', 'main/q2', 'main'])
        print(('\nNumber of parameters: \t pi: %d, \t' +
               'q1: %d, \t q2: %d, \t total: %d\n') % var_counts)

        # ------
        # Automatic temperature tuning (only when alpha is a tf.Tensor).
        if isinstance(alpha_v, tf.Tensor):
            alpha_loss = tf.reduce_mean(
                -log_alpha * tf.stop_gradient(logp_pi_ + opt.target_entropy))
            alpha_optimizer = tf.train.AdamOptimizer(
                learning_rate=opt.lr, name='alpha_optimizer')
            train_alpha_op = alpha_optimizer.minimize(loss=alpha_loss,
                                                      var_list=[log_alpha])
        # ------

        # Min Double-Q: bootstrap from the mean action's Q (use_max) or the
        # sampled action's Q, in both cases the min of the two target critics.
        if opt.use_max:
            min_q_pi = tf.minimum(q1_mu_, q2_mu_)
        else:
            min_q_pi = tf.minimum(q1_pi_, q2_pi_)  # x2

        # get rid of abnormal explosion
        # min_q_pi = tf.clip_by_value(min_q_pi, -300.0, 900.0)

        #### n-step backup: fold the stored per-step rewards, done flags,
        #### and entropy terms backwards from the bootstrap value.
        q_backup = tf.stop_gradient(min_q_pi)
        for step_i in reversed(range(opt.Ln)):
            q_backup = self.r_ph[:, step_i] + \
                opt.gamma * (1 - self.d_ph[:, step_i]) * \
                (-alpha_v * self.logp_pi_ph[:, step_i] + q_backup)
        ####

        # Soft actor-critic losses
        q1_loss = 0.5 * tf.reduce_mean((q_backup - q1)**2)
        q2_loss = 0.5 * tf.reduce_mean((q_backup - q2)**2)
        self.value_loss = q1_loss + q2_loss

        value_optimizer = tf.train.AdamOptimizer(learning_rate=opt.lr)
        value_params = get_vars('main/q')

        # Ensure batch-norm moving statistics update with each train step.
        bn_update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(bn_update_ops):
            train_value_op = value_optimizer.minimize(self.value_loss,
                                                      var_list=value_params)

        # Polyak averaging for target variables
        # (control flow because sess.run otherwise evaluates in nondeterministic order)
        with tf.control_dependencies([train_value_op]):
            target_update = tf.group([
                tf.assign(v_targ,
                          opt.polyak * v_targ + (1 - opt.polyak) * v_main)
                for v_main, v_targ in zip(get_vars('main'),
                                          get_vars('target'))
            ])

        # All ops to call during one training step
        if isinstance(alpha_v, Number):
            self.step_ops = [
                q1_loss, q2_loss, q1, q2, logp_pi_,
                tf.identity(alpha_v), train_value_op, target_update
            ]
        else:
            self.step_ops = [
                q1_loss, q2_loss, q1, q2, logp_pi_, alpha_v, train_value_op,
                target_update, train_alpha_op
            ]

        # Initializing targets to match main variables
        self.target_init = tf.group([
            tf.assign(v_targ, v_main)
            for v_main, v_targ in zip(get_vars('main'), get_vars('target'))
        ])

        if job == "learner":
            # Learner gets a GPU-fraction-limited, single-threaded session.
            config = tf.ConfigProto()
            config.gpu_options.per_process_gpu_memory_fraction = opt.gpu_fraction
            config.inter_op_parallelism_threads = 1
            config.intra_op_parallelism_threads = 1
            self.sess = tf.Session(config=config)
        else:
            # Workers run single-threaded to avoid oversubscribing cores.
            self.sess = tf.Session(config=tf.ConfigProto(
                # device_count={'GPU': 0},
                intra_op_parallelism_threads=1,
                inter_op_parallelism_threads=1))

        self.sess.run(tf.global_variables_initializer())

        if job == "learner":
            # Set up summary Ops
            self.train_ops, self.train_vars = self.build_summaries()
            self.writer = tf.summary.FileWriter(
                opt.summary_dir + "/" + "^^^^^^^^^^" +
                str(datetime.datetime.now()) + opt.env_name + "-" +
                opt.exp_name + "-workers_num:" + str(opt.num_workers) + "%" +
                str(opt.a_l_ratio), self.sess.graph)

        # Ray helper for getting/setting flat network weights across
        # processes (keyed off the value-loss subgraph).
        self.variables = ray.experimental.tf_utils.TensorFlowVariables(
            self.value_loss, self.sess)
def maxsqn(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(),
           seed=0, steps_per_epoch=5000, epochs=200, replay_size=int(5e5),
           gamma=0.99, polyak=0.995, lr=1e-3, alpha=0.2, batch_size=200,
           start_steps=1000, max_ep_len=1000, logger_kwargs=dict(),
           save_freq=1):
    """Soft Q-learning for discrete actions (MaxSQN) with twin critics.

    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: A function which takes in placeholder symbols
            for state, ``x_ph``, and action, ``a_ph``, and returns the main
            outputs from the agent's Tensorflow computation graph:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``mu``       (batch, act_dim)  | Computes mean actions from policy
                                           | given states.
            ``pi``       (batch, act_dim)  | Samples actions from policy given
                                           | states.
            ``logp_pi``  (batch,)          | Gives log probability, according to
                                           | the policy, of the action sampled by
                                           | ``pi``. Critical: must be differentiable
                                           | with respect to policy parameters all
                                           | the way through action sampling.
            ``q1``       (batch,)          | Gives one estimate of Q* for
                                           | states in ``x_ph`` and actions in
                                           | ``a_ph``.
            ``q2``       (batch,)          | Gives another estimate of Q* for
                                           | states in ``x_ph`` and actions in
                                           | ``a_ph``.
            ``q1_pi``    (batch,)          | Gives the composition of ``q1`` and
                                           | ``pi`` for states in ``x_ph``:
                                           | q1(x, pi(x)).
            ``q2_pi``    (batch,)          | Gives the composition of ``q2`` and
                                           | ``pi`` for states in ``x_ph``:
                                           | q2(x, pi(x)).
            ===========  ================  ======================================

        ac_kwargs (dict): Any kwargs appropriate for the actor_critic
            function you provided to SAC.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action
            pairs) for the agent and the environment in each epoch.

        epochs (int): Number of epochs to run and train agent.

        replay_size (int): Maximum length of replay buffer.

        gamma (float): Discount factor. (Always between 0 and 1.)

        polyak (float): Interpolation factor in polyak averaging for target
            networks.

        lr (float): Learning rate (used for policy/value/alpha learning).

        alpha (float/'auto'): Entropy regularization coefficient, or 'auto'
            to learn the temperature automatically.

        batch_size (int): Minibatch size for SGD.

        start_steps (int): Number of steps for uniform-random action
            selection, before running real policy. Helps exploration.

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.
    """
    # print(max_ep_len,type(max_ep_len))
    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    tf.set_random_seed(seed)
    np.random.seed(seed)

    # NOTE(review): env_fn takes an argument here (3 for train, 1 for test) —
    # presumably an env variant/difficulty selector; confirm against env_fn.
    env, test_env = env_fn(3), env_fn(1)
    obs_dim = env.observation_space.shape[0]
    obs_space = env.observation_space
    act_dim = env.action_space.n  # discrete action count
    act_space = env.action_space

    # Share information about action space with policy architecture
    ac_kwargs['action_space'] = env.action_space

    # Inputs to computation graph (prob_pi_a_ph holds the behavior policy's
    # probability of the stored action, for the importance ratio below).
    x_ph, a_ph, x2_ph, r_ph, prob_pi_a_ph, d_ph = core.placeholders_from_space(
        obs_space, act_space, obs_space, None, None, None)

    ######
    # Learned temperature when alpha == 'auto'.
    if alpha == 'auto':
        # target_entropy = (-np.prod(env.action_space.n))
        # target_entropy = (np.prod(env.action_space.n))/4/10
        target_entropy = 0.4

        log_alpha = tf.get_variable('log_alpha',
                                    dtype=tf.float32,
                                    initializer=0.0)
        alpha = tf.exp(log_alpha)
    ######

    # Main outputs from computation graph
    with tf.variable_scope('main'):
        mu, pi, q1, q2, v1_x2, v2_x2, pi_log, pi_log_x2 = actor_critic(
            x_ph, x2_ph, a_ph, alpha, **ac_kwargs)

    # Target value network
    with tf.variable_scope('target'):
        mu_, pi_, q1_, q2_, v1_x2_, v2_x2_, pi_log_, pi_log_x2_ = actor_critic(
            x_ph, x2_ph, a_ph, alpha, **ac_kwargs)

    # Experience buffer: discrete actions are stored as a single index.
    if isinstance(act_space, Box):
        a_dim = act_dim
    elif isinstance(act_space, Discrete):
        a_dim = 1
    replay_buffer = ReplayBuffer(obs_dim=obs_dim,
                                 act_dim=a_dim,
                                 size=replay_size)

    # Count variables
    var_counts = tuple(
        core.count_vars(scope)
        for scope in ['main/pi', 'main/q1', 'main/q2', 'main'])
    print(('\nNumber of parameters: \t pi: %d, \t' + \
           'q1: %d, \t q2: %d, \t total: %d\n')%var_counts)

    ######
    # Policy entropy term of the *target* network (negative entropy):
    # sum_a p(a) * log p(a).
    logp_pi_ = tf.reduce_sum(tf.exp(pi_log_) * pi_log_, axis=1)
    if isinstance(alpha, tf.Tensor):
        alpha_loss = tf.reduce_mean(
            -log_alpha * tf.stop_gradient(logp_pi_ + target_entropy))
        alpha_optimizer = tf.train.AdamOptimizer(learning_rate=lr,
                                                 name='alpha_optimizer')
        train_alpha_op = alpha_optimizer.minimize(loss=alpha_loss,
                                                  var_list=[log_alpha])
    ######

    # Min Double-Q:
    # # scheme 111111
    # min_q_pi = tf.minimum(v1_x2_, v2_x2_)
    # # min_q_pi = tf.minimum(q1_pi_, q2_pi_)
    # # min_q_pi = tf.minimum(q1_mu_, q2_mu_)
    # v_backup = min_q_pi - alpha * pi_log_x2
    # v_backup = tf.reduce_sum(tf.exp(pi_log_x2)*v_backup, axis=1)

    # scheme 222222: soft state value at x2 = E_pi[min(Q1,Q2)] + alpha*H(pi),
    # with the expectation taken under the *current* policy's probabilities.
    min_q_pi = tf.minimum(
        tf.reduce_sum(tf.exp(pi_log_x2) * v1_x2_, axis=1),
        tf.reduce_sum(tf.exp(pi_log_x2) * v2_x2_, axis=1))
    v_backup = min_q_pi - alpha * tf.reduce_sum(
        tf.exp(pi_log_x2) * pi_log_x2, axis=1)

    # # scheme 333333
    # min_q_pi = tf.minimum(v1_x2_, v2_x2_)
    # v_backup = min_q_pi - alpha * pi_log_x2
    # v_backup = tf.reduce_max(v_backup, axis=1)

    v_backup = tf.stop_gradient(v_backup)
    q_backup = r_ph + gamma * (1 - d_ph) * v_backup

    # Soft actor-critic losses, weighted by a clipped importance ratio of
    # current-policy probability over the stored behavior probability.
    a_one_hot = tf.one_hot(a_ph[..., 0], depth=act_dim)
    prob_pi_a_cur = tf.reduce_sum(tf.exp(pi_log) * a_one_hot, axis=1)
    pi_ratio = tf.stop_gradient(
        tf.clip_by_value(prob_pi_a_cur / prob_pi_a_ph, 0.2, 1.2))  # 0.2, 1.2
    # pi_ratio = 1.0
    q1_loss = 0.5 * tf.reduce_mean(pi_ratio * (q_backup - q1)**2)
    q2_loss = 0.5 * tf.reduce_mean(pi_ratio * (q_backup - q2)**2)
    value_loss = q1_loss + q2_loss

    # # Policy train op
    # # (has to be separate from value train op, because q1_pi appears in pi_loss)
    # pi_optimizer = tf.train.AdamOptimizer(learning_rate=lr)
    # train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi'))

    # Value train op
    # (control dep of train_pi_op because sess.run otherwise evaluates in nondeterministic order)
    value_optimizer = tf.train.AdamOptimizer(learning_rate=lr)
    value_params = get_vars('main/q')
    #with tf.control_dependencies([train_pi_op]):
    train_value_op = value_optimizer.minimize(value_loss,
                                              var_list=value_params)

    # Polyak averaging for target variables
    # (control flow because sess.run otherwise evaluates in nondeterministic order)
    with tf.control_dependencies([train_value_op]):
        target_update = tf.group([
            tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main)
            for v_main, v_targ in zip(get_vars('main'), get_vars('target'))
        ])

    # All ops to call during one training step
    if isinstance(alpha, Number):
        step_ops = [
            q1_loss, q2_loss, q1, q2, logp_pi_,
            tf.identity(alpha), train_value_op, target_update
        ]
    else:
        step_ops = [
            q1_loss, q2_loss, q1, q2, logp_pi_, alpha, train_value_op,
            target_update, train_alpha_op
        ]

    # Initializing targets to match main variables
    target_init = tf.group([
        tf.assign(v_targ, v_main)
        for v_main, v_targ in zip(get_vars('main'), get_vars('target'))
    ])

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    sess.run(target_init)

    # Setup model saving
    logger.setup_tf_saver(sess,
                          inputs={
                              'x': x_ph,
                              'a': a_ph
                          },
                          outputs={
                              'mu': mu,
                              'pi': pi,
                              'q1': q1,
                              'q2': q2
                          })

    def get_action(o, deterministic=False):
        # Greedy action (mu) when deterministic, otherwise a policy sample.
        act_op = mu if deterministic else pi
        return sess.run(act_op,
                        feed_dict={x_ph: np.expand_dims(o, axis=0)})[0]

    def get_pi_log(o):
        # Log-probabilities over all actions for a single observation.
        return sess.run(pi_log,
                        feed_dict={x_ph: np.expand_dims(o, axis=0)})[0]

    # def get_logp_pi(o):
    #     return sess.run(logp_pi, feed_dict={x_ph: np.expand_dims(o, axis=0)})[0]

    def test_agent(n=20):
        # n: number of tests
        global sess, mu, pi, q1, q2, q1_pi, q2_pi
        for j in range(n):
            o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0
            while not (d or (ep_len == max_ep_len)):  # max_ep_len
                # Take deterministic actions at test time
                o, r, d, _ = test_env.step(get_action(o, True))
                ep_ret += r
                ep_len += 1
            logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)

    start_time = time.time()
    # o = env.reset()                                 #####################
    # o, r, d, ep_ret, ep_len = env.step(1)[0], 0, False, 0, 0   #########
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
    total_steps = steps_per_epoch * epochs

    ep_index = 0
    test_ep_ret = 0.0

    # Main loop: collect experience in env and update/log each epoch
    for t in range(total_steps):
        """
        Until start_steps have elapsed, randomly sample actions
        from a uniform distribution for better exploration. Afterwards,
        use the learned policy.
        """
        # if t > start_steps and 100*t/total_steps > np.random.random():
        # greedy, avoid falling into sub-optimum
        if t > start_steps:
            a = get_action(o)
        else:
            a = env.action_space.sample()
            # Extra draw to keep the RNG stream consistent with the branch
            # above — presumably intentional; confirm before removing.
            np.random.random()

        # Step the env
        o2, r, d, _ = env.step(a)
        #print(a,o2)
        # o2, r, _, d = env.step(a)
        # d = d['ale.lives'] < 5                       #####################
        ep_ret += r
        ep_len += 1

        # Ignore the "done" signal if it comes from hitting the time
        # horizon (that is, when it's an artificial terminal signal
        # that isn't based on the agent's state)
        d = False if ep_len == max_ep_len else d

        # # scheme 1
        # # logp_pi
        # logp_pi2 = get_logp_pi(o2)
        # r_pi = r + gamma * (1 - d) * (- alpha * logp_pi2)
        # # Store experience to replay buffer
        # replay_buffer.store(o, a, r_pi, o2, d)

        # scheme 2: also record the behavior policy's probability of the
        # chosen action, needed for the importance ratio at update time.
        prob_pi_a = np.exp(get_pi_log(o))[a]
        replay_buffer.store(o, a, prob_pi_a, r, o2, d)

        # Super critical, easy to overlook step: make sure to update
        # most recent observation!
        o = o2

        # End of episode. Training (ep_len times).
        if d or (ep_len == max_ep_len):
            # make sure: max_ep_len < steps_per_epoch
            ep_index += 1
            print('episode: {}, ep_len: {}, reward: {}'.format(
                ep_index, ep_len, ep_ret))
            """
            Perform all SAC updates at the end of the trajectory.
            This is a slight difference from the SAC specified in the
            original paper.
            """
            for j in range(int(ep_len)):
                batch = replay_buffer.sample_batch(batch_size)
                feed_dict = {
                    x_ph: batch['obs1'],
                    x2_ph: batch['obs2'],
                    a_ph: batch['acts'],
                    r_ph: batch['rews'],
                    prob_pi_a_ph: batch['prob_pi_a'],
                    d_ph: batch['done'],
                }
                # step_ops = [q1_loss, q2_loss, q1, q2, logp_pi, alpha, train_pi_op, train_value_op, target_update]
                outs = sess.run(step_ops, feed_dict)
                logger.store(LossQ1=outs[0],
                             LossQ2=outs[1],
                             Q1Vals=outs[2],
                             Q2Vals=outs[3],
                             LogPi=outs[4],
                             Alpha=outs[5])

            #if d:
            logger.store(EpRet=ep_ret, EpLen=ep_len)

            # o = env.reset()                            #####################
            # o, r, d, ep_ret, ep_len = env.step(1)[0], 0, False, 0, 0  #####
            o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

        # End of epoch wrap-up
        if t > 0 and t % steps_per_epoch == 0:
            epoch = t // steps_per_epoch

            # Save model
            if (epoch % save_freq == 0) or (epoch == epochs - 1):
                logger.save_state({'env': env}, None)

            # Test the performance of the deterministic version of the agent.
            test_agent(10)
            # if logger.get_stats('TestEpRet')[0] >= 190:
            #     print('Recalculating TestEpRet...')
            #     test_agent(100)
            #     test_ep_ret = logger.get_stats('TestEpRet')[0]

            # logger.store(): store the data; logger.log_tabular(): log the data;
            # logger.dump_tabular(): write the data
            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('EpRet', with_min_and_max=True)
            logger.log_tabular('TestEpRet', with_min_and_max=True)
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('TestEpLen', average_only=True)
            logger.log_tabular('TotalEnvInteracts', t)
            logger.log_tabular('Alpha', average_only=True)
            logger.log_tabular('Q1Vals', with_min_and_max=True)
            logger.log_tabular('Q2Vals', with_min_and_max=True)
            # logger.log_tabular('VVals', with_min_and_max=True)
            logger.log_tabular('LogPi', with_min_and_max=True)
            # logger.log_tabular('LossPi', average_only=True)
            logger.log_tabular('LossQ1', average_only=True)
            logger.log_tabular('LossQ2', average_only=True)
            # logger.log_tabular('LossV', average_only=True)
            logger.log_tabular('Time', time.time() - start_time)
            logger.dump_tabular()
def __init__(self):
    """Build a TD3 agent's TF1 graph from the module-level ``env_set`` config.

    Constructs main/target twin-Q actor-critic networks, the clipped
    double-Q Bellman backup, chained value -> policy -> target-update train
    ops, and initializes the session and target network.
    """
    self.sess = tf.Session()
    # Hyperparameters pulled from the env_set config dict / constants.
    self.state_size = env_set['state']
    self.output_size = env_set['action']
    self.worker_size = env_set['worker']
    self.tau = 0.995              # polyak coefficient for target updates
    self.gamma = env_set['gamma']
    self.hidden = env_set['hidden']
    self.batch_size = 64
    self.pi_lr = env_set['pi_lr']
    self.q_lr = env_set['q_lr']
    self.action_limit = 1.0
    self.memory = replay_buffer(env_set['mem_size'])
    self.target_noise = 0.2       # stddev of target policy smoothing noise
    self.noise_clip = 0.5         # clip bound for that noise

    self.x_ph, self.a_ph, self.x2_ph, self.r_ph, self.d_ph = \
        cr.placeholders(self.state_size, self.output_size, self.state_size,
                        None, None)

    # Main networks: policy, twin critics, and q1 of the policy's action.
    with tf.variable_scope('main'):
        self.pi, self.q1, self.q2, self.q1_pi, _ = cr.td3_mlp_actor_critic(
            x=self.x_ph,
            a=self.a_ph,
            hidden=self.hidden,
            activation=tf.nn.relu,
            output_activation=tf.tanh,
            output_size=self.output_size,
            action_limit=self.action_limit,
        )

    # Target networks evaluated at x2, with target policy smoothing noise.
    with tf.variable_scope('target'):
        self.pi_targ, self.q1_double_targ, self.q2_double_targ, \
            self.q1_pi_targ, self.q2_pi_targ = cr.td3_mlp_actor_critic(
                x=self.x2_ph,
                a=self.a_ph,
                hidden=self.hidden,
                activation=tf.nn.relu,
                output_activation=tf.tanh,
                output_size=self.output_size,
                action_limit=self.action_limit,
                pi_q_noise=self.target_noise,
                noise_clip=self.noise_clip)

    self.pi_params = cr.get_vars('main/pi')
    self.q_params = cr.get_vars('main/q')

    # Clipped double-Q target: min of the two target critics at the
    # (noisy) target-policy action.
    self.min_q_targ = tf.minimum(self.q1_pi_targ, self.q2_pi_targ)
    #self.min_q_targ = tf.minimum(self.q1_double_targ,self.q2_double_targ)
    self.backup = tf.stop_gradient(
        self.r_ph + self.gamma * (1 - self.d_ph) * self.min_q_targ)

    # TD3 losses
    self.pi_loss = -tf.reduce_mean(self.q1_pi)
    self.q1_loss = tf.reduce_mean((self.q1 - self.backup)**2)
    self.q2_loss = tf.reduce_mean((self.q2 - self.backup)**2)
    self.v_loss = self.q1_loss + self.q2_loss

    # Chained ops so a single sess.run executes value update -> policy
    # update -> target polyak update in a deterministic order.
    self.value_optimizer = tf.train.AdamOptimizer(self.q_lr)
    self.train_value_op = self.value_optimizer.minimize(
        self.v_loss, var_list=self.q_params)
    self.pi_optimizer = tf.train.AdamOptimizer(self.pi_lr)
    with tf.control_dependencies([self.train_value_op]):
        self.train_pi_op = self.pi_optimizer.minimize(
            self.pi_loss, var_list=self.pi_params)
    with tf.control_dependencies([self.train_pi_op]):
        self.target_update = tf.group([
            tf.assign(v_targ, self.tau * v_targ + (1 - self.tau) * v_main)
            for v_main, v_targ in zip(cr.get_vars('main'),
                                      cr.get_vars('target'))
        ])

    # step_ops: full update; value_ops: critic-only update (policy delay).
    self.step_ops = [
        self.pi_loss, self.v_loss, self.train_pi_op, self.train_value_op,
        self.target_update
    ]
    self.value_ops = [self.v_loss, self.train_value_op]

    # Hard-copy main -> target at startup.
    self.target_init = tf.group([
        tf.assign(v_targ, v_main) for v_main, v_targ in zip(
            cr.get_vars('main'), cr.get_vars('target'))
    ])

    self.sess.run(tf.global_variables_initializer())
    self.sess.run(self.target_init)
    self.saver = tf.train.Saver()
def td3(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0,
        steps_per_epoch=5000, epochs=100, replay_size=int(1e6), gamma=0.99,
        polyak=0.995, pi_lr=1e-3, q_lr=1e-3, batch_size=100,
        start_steps=10000, act_noise=0.1, target_noise=0.2, noise_clip=0.5,
        policy_delay=2, max_ep_len=1000, logger_kwargs=dict(), save_freq=1):
    """Twin Delayed Deep Deterministic policy gradient (TD3).

    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: A function which takes in placeholder symbols
            for state, ``x_ph``, and action, ``a_ph``, and returns the main
            outputs from the agent's Tensorflow computation graph:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``pi``       (batch, act_dim)  | Deterministically computes actions
                                           | from policy given states.
            ``q1``       (batch,)          | Gives one estimate of Q* for
                                           | states in ``x_ph`` and actions in
                                           | ``a_ph``.
            ``q2``       (batch,)          | Gives another estimate of Q* for
                                           | states in ``x_ph`` and actions in
                                           | ``a_ph``.
            ``q1_pi``    (batch,)          | Gives the composition of ``q1`` and
                                           | ``pi`` for states in ``x_ph``:
                                           | q1(x, pi(x)).
            ===========  ================  ======================================

        ac_kwargs (dict): Any kwargs appropriate for the actor_critic
            function you provided to TD3.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action
            pairs) for the agent and the environment in each epoch.

        epochs (int): Number of epochs to run and train agent.

        replay_size (int): Maximum length of replay buffer.

        gamma (float): Discount factor. (Always between 0 and 1.)

        polyak (float): Interpolation factor in polyak averaging for target
            networks. (Always between 0 and 1, usually close to 1.)

        pi_lr (float): Learning rate for policy.

        q_lr (float): Learning rate for Q-networks.

        batch_size (int): Minibatch size for SGD.

        start_steps (int): Number of steps for uniform-random action
            selection, before running real policy. Helps exploration.

        act_noise (float): Stddev for Gaussian exploration noise added to
            policy at training time. (At test time, no noise is added.)

        target_noise (float): Stddev for smoothing noise added to target
            policy.

        noise_clip (float): Limit for absolute value of target policy
            smoothing noise.

        policy_delay (int): Policy will only be updated once every
            policy_delay times for each update of the Q-networks.

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.
    """
    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    tf.set_random_seed(seed)
    np.random.seed(seed)

    env, test_env = env_fn(), env_fn()
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]

    # Action limit for clamping: critically, assumes all dimensions share
    # the same bound!
    act_limit = env.action_space.high[0]

    # Share information about action space with policy architecture
    ac_kwargs['action_space'] = env.action_space

    # Inputs to computation graph
    x_ph, a_ph, x2_ph, r_ph, d_ph = core.placeholders(
        obs_dim, act_dim, obs_dim, None, None)

    # Main outputs from computation graph
    with tf.variable_scope('main'):
        pi, q1, q2, q1_pi = actor_critic(x_ph, a_ph, **ac_kwargs)

    # Target policy network
    with tf.variable_scope('target'):
        pi_targ, _, _, _ = actor_critic(x2_ph, a_ph, **ac_kwargs)

    # Target Q networks (reuse=True shares the target variables above).
    with tf.variable_scope('target', reuse=True):
        # Target policy smoothing, by adding clipped noise to target actions
        epsilon = tf.random_normal(tf.shape(pi_targ), stddev=target_noise)
        epsilon = tf.clip_by_value(epsilon, -noise_clip, noise_clip)
        a2 = pi_targ + epsilon
        a2 = tf.clip_by_value(a2, -act_limit, act_limit)

        # Target Q-values, using action from target policy
        _, q1_targ, q2_targ, _ = actor_critic(x2_ph, a2, **ac_kwargs)

    # Experience buffer
    replay_buffer = ReplayBuffer(obs_dim=obs_dim,
                                 act_dim=act_dim,
                                 size=replay_size)

    # Count variables
    var_counts = tuple(
        core.count_vars(scope)
        for scope in ['main/pi', 'main/q1', 'main/q2', 'main'])
    print(
        '\nNumber of parameters: \t pi: %d, \t q1: %d, \t q2: %d, \t total: %d\n'
        % var_counts)

    # Bellman backup for Q functions, using Clipped Double-Q targets
    min_q_targ = tf.minimum(q1_targ, q2_targ)
    backup = tf.stop_gradient(r_ph + gamma * (1 - d_ph) * min_q_targ)

    # TD3 losses
    pi_loss = -tf.reduce_mean(q1_pi)
    q1_loss = tf.reduce_mean((q1 - backup)**2)
    q2_loss = tf.reduce_mean((q2 - backup)**2)
    q_loss = q1_loss + q2_loss

    # Separate train ops for pi, q
    pi_optimizer = tf.train.AdamOptimizer(learning_rate=pi_lr)
    q_optimizer = tf.train.AdamOptimizer(learning_rate=q_lr)
    train_pi_op = pi_optimizer.minimize(pi_loss,
                                        var_list=get_vars('main/pi'))
    train_q_op = q_optimizer.minimize(q_loss, var_list=get_vars('main/q'))

    # Polyak averaging for target variables
    target_update = tf.group([
        tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main)
        for v_main, v_targ in zip(get_vars('main'), get_vars('target'))
    ])

    # Initializing targets to match main variables
    target_init = tf.group([
        tf.assign(v_targ, v_main)
        for v_main, v_targ in zip(get_vars('main'), get_vars('target'))
    ])

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    sess.run(target_init)

    # Setup model saving
    logger.setup_tf_saver(sess,
                          inputs={
                              'x': x_ph,
                              'a': a_ph
                          },
                          outputs={
                              'pi': pi,
                              'q1': q1,
                              'q2': q2
                          })

    def get_action(o, noise_scale):
        # Deterministic policy output plus Gaussian exploration noise,
        # clipped to the action bounds.
        a = sess.run(pi, feed_dict={x_ph: o.reshape(1, -1)})[0]
        a += noise_scale * np.random.randn(act_dim)
        return np.clip(a, -act_limit, act_limit)

    def test_agent(n=10):
        for j in range(n):
            o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0
            while not (d or (ep_len == max_ep_len)):
                # Take deterministic actions at test time (noise_scale=0)
                o, r, d, _ = test_env.step(get_action(o, 0))
                ep_ret += r
                ep_len += 1
            logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)

    start_time = time.time()
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
    total_steps = steps_per_epoch * epochs

    # Main loop: collect experience in env and update/log each epoch
    for t in range(total_steps):
        """
        Until start_steps have elapsed, randomly sample actions
        from a uniform distribution for better exploration. Afterwards,
        use the learned policy (with some noise, via act_noise).
        """
        if t > start_steps:
            a = get_action(o, act_noise)
        else:
            a = env.action_space.sample()

        # Step the env
        o2, r, d, _ = env.step(a)
        ep_ret += r
        ep_len += 1

        # Ignore the "done" signal if it comes from hitting the time
        # horizon (that is, when it's an artificial terminal signal
        # that isn't based on the agent's state)
        d = False if ep_len == max_ep_len else d

        # Store experience to replay buffer
        replay_buffer.store(o, a, r, o2, d)

        # Super critical, easy to overlook step: make sure to update
        # most recent observation!
        o = o2

        if d or (ep_len == max_ep_len):
            """
            Perform all TD3 updates at the end of the trajectory
            (in accordance with source code of TD3 published by
            original authors).
            """
            for j in range(ep_len):
                batch = replay_buffer.sample_batch(batch_size)
                feed_dict = {
                    x_ph: batch['obs1'],
                    x2_ph: batch['obs2'],
                    a_ph: batch['acts'],
                    r_ph: batch['rews'],
                    d_ph: batch['done']
                }
                q_step_ops = [q_loss, q1, q2, train_q_op]
                outs = sess.run(q_step_ops, feed_dict)
                logger.store(LossQ=outs[0], Q1Vals=outs[1], Q2Vals=outs[2])

                if j % policy_delay == 0:
                    # Delayed policy update
                    outs = sess.run([pi_loss, train_pi_op, target_update],
                                    feed_dict)
                    logger.store(LossPi=outs[0])

            logger.store(EpRet=ep_ret, EpLen=ep_len)
            o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

        # End of epoch wrap-up
        if t > 0 and t % steps_per_epoch == 0:
            epoch = t // steps_per_epoch

            # Save model
            if (epoch % save_freq == 0) or (epoch == epochs - 1):
                logger.save_state({'env': env}, None)

            # Test the performance of the deterministic version of the agent.
            test_agent()

            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('EpRet', with_min_and_max=True)
            logger.log_tabular('TestEpRet', with_min_and_max=True)
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('TestEpLen', average_only=True)
            logger.log_tabular('TotalEnvInteracts', t)
            logger.log_tabular('Q1Vals', with_min_and_max=True)
            logger.log_tabular('Q2Vals', with_min_and_max=True)
            logger.log_tabular('LossPi', average_only=True)
            logger.log_tabular('LossQ', average_only=True)
            logger.log_tabular('Time', time.time() - start_time)
            logger.dump_tabular()
def sac(env_name='Ant-v2',
        actor_critic=core.mlp_actor_critic,
        ac_kwargs=dict(),
        seed=0,
        steps_per_epoch=5000,
        epochs=100,
        replay_size=int(1e6),
        gamma=0.99,
        polyak=0.995,
        lr=1e-3,
        alpha=0.2,
        batch_size=100,
        start_steps=10000,
        max_ep_len=1000,
        save_freq=1):
    """
    Soft Actor-Critic (value-function variant of the original SAC paper).

    Builds the main/target TF1 computation graphs, then runs the interaction
    loop, performing all SAC updates at the end of each trajectory.

    Args:
        env_name (str): Name of a registered Gym environment to train on.

        actor_critic: A function which takes in placeholder symbols
            for state, ``x_ph``, and action, ``a_ph``, and returns the main
            outputs from the agent's Tensorflow computation graph:

            - ``mu``      (batch, act_dim): mean actions from the policy.
            - ``pi``      (batch, act_dim): actions sampled from the policy.
            - ``logp_pi`` (batch,): log-prob of the sampled action; must be
              differentiable w.r.t. policy params through the sampling.
            - ``q1``, ``q2`` (batch,): two Q* estimates for (x_ph, a_ph).
            - ``q1_pi``, ``q2_pi`` (batch,): q1(x, pi(x)) and q2(x, pi(x)).
            - ``v``       (batch,): value estimates for states in ``x_ph``.

        ac_kwargs (dict): Any kwargs appropriate for the actor_critic
            function you provided to SAC.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of env-interaction steps per epoch.

        epochs (int): Number of epochs to run and train agent.

        replay_size (int): Maximum length of replay buffer.

        gamma (float): Discount factor. (Always between 0 and 1.)

        polyak (float): Interpolation factor in polyak averaging for target
            networks: theta_targ <- rho*theta_targ + (1-rho)*theta,
            with rho = polyak. (Always between 0 and 1, usually close to 1.)

        lr (float): Learning rate (used for both policy and value learning).

        alpha (float): Entropy regularization coefficient. (Equivalent to
            inverse of reward scale in the original SAC paper.)

        batch_size (int): Minibatch size for SGD.

        start_steps (int): Number of steps of uniform-random action
            selection before running the real policy. Helps exploration.

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        save_freq (int): How often (in epochs) to save the policy.
            NOTE(review): currently unused in this body — no model saving
            is performed; kept for interface compatibility.
    """
    tf.set_random_seed(seed)
    np.random.seed(seed)

    env = gym.make(env_name)
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]

    # Action limit for clamping: critically, assumes all dimensions share
    # the same bound!
    act_limit = env.action_space.high[0]

    # Share information about action space with policy architecture
    ac_kwargs['action_space'] = env.action_space

    # Inputs to computation graph
    x_ph, a_ph, x2_ph, r_ph, d_ph = core.placeholders(obs_dim, act_dim,
                                                      obs_dim, None, None)

    # Main outputs from computation graph
    with tf.variable_scope('main'):
        mu, pi, logp_pi, q1, q2, q1_pi, q2_pi, v = actor_critic(
            x_ph, a_ph, **ac_kwargs)

    # Target value network
    with tf.variable_scope('target'):
        _, _, _, _, _, _, _, v_targ = actor_critic(x2_ph, a_ph, **ac_kwargs)

    # Experience buffer. (Fix: previously a second, identical ReplayBuffer
    # was allocated again right before the main loop, silently discarding
    # this one and doubling memory use. Allocate exactly once.)
    replay_buffer = ReplayBuffer(obs_dim=obs_dim,
                                 act_dim=act_dim,
                                 size=replay_size)

    # Count variables
    var_counts = tuple(
        core.count_vars(scope)
        for scope in ['main/pi', 'main/q1', 'main/q2', 'main/v', 'main'])
    print(('\nNumber of parameters: \t pi: %d, \t' + \
           'q1: %d, \t q2: %d, \t v: %d, \t total: %d\n') % var_counts)

    # Min Double-Q: pessimistic value of the sampled action.
    min_q_pi = tf.minimum(q1_pi, q2_pi)

    # Targets for Q and V regression
    q_backup = tf.stop_gradient(r_ph + gamma * (1 - d_ph) * v_targ)
    v_backup = tf.stop_gradient(min_q_pi - alpha * logp_pi)

    # Soft actor-critic losses
    pi_loss = tf.reduce_mean(alpha * logp_pi - q1_pi)
    q1_loss = 0.5 * tf.reduce_mean((q_backup - q1)**2)
    q2_loss = 0.5 * tf.reduce_mean((q_backup - q2)**2)
    v_loss = 0.5 * tf.reduce_mean((v_backup - v)**2)
    value_loss = q1_loss + q2_loss + v_loss

    # Policy train op
    # (has to be separate from value train op, because q1_pi appears in pi_loss)
    pi_optimizer = tf.train.AdamOptimizer(learning_rate=lr)
    train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi'))

    # Value train op
    # (control dep of train_pi_op because sess.run otherwise evaluates in
    # nondeterministic order)
    value_optimizer = tf.train.AdamOptimizer(learning_rate=lr)
    value_params = get_vars('main/q') + get_vars('main/v')
    with tf.control_dependencies([train_pi_op]):
        train_value_op = value_optimizer.minimize(value_loss,
                                                  var_list=value_params)

    # Polyak averaging for target variables
    # (control flow because sess.run otherwise evaluates in nondeterministic order)
    with tf.control_dependencies([train_value_op]):
        target_update = tf.group([
            tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main)
            for v_main, v_targ in zip(get_vars('main'), get_vars('target'))
        ])

    # All ops to call during one training step
    step_ops = [
        pi_loss, q1_loss, q2_loss, v_loss, q1, q2, v, logp_pi, train_pi_op,
        train_value_op, target_update
    ]

    # Initializing targets to match main variables
    target_init = tf.group([
        tf.assign(v_targ, v_main)
        for v_main, v_targ in zip(get_vars('main'), get_vars('target'))
    ])

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    sess.run(target_init)

    def get_action(o, deterministic=False):
        # Mean action for evaluation, sampled action during training.
        act_op = mu if deterministic else pi
        return sess.run(act_op, feed_dict={x_ph: o.reshape(1, -1)})

    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
    total_steps = steps_per_epoch * epochs

    # Dump the graph so it can be inspected with TensorBoard.
    tf.summary.FileWriter('./log/', graph=tf.get_default_graph())

    episode = 0
    for t in range(total_steps):
        # Until start_steps have elapsed, randomly sample actions from a
        # uniform distribution for better exploration. Afterwards, use the
        # learned policy.
        if t > start_steps:
            a = get_action(o)[0]
        else:
            # Fix: clip against the true action bound rather than the
            # hard-coded [-1, 1], which silently clamped environments whose
            # action limit exceeds 1.
            a = np.clip(env.action_space.sample(), -act_limit, act_limit)

        # Step the env
        env.render()
        o2, r, d, _ = env.step(a)
        ep_ret += r
        ep_len += 1

        # Ignore the "done" signal if it comes from hitting the time
        # horizon (that is, when it's an artificial terminal signal
        # that isn't based on the agent's state)
        d = False if ep_len == max_ep_len else d

        # Store experience to replay buffer
        replay_buffer.store(o, a, r, o2, d)

        # Super critical, easy to overlook step: make sure to update
        # most recent observation!
        o = o2

        if d or (ep_len == max_ep_len):
            # Perform all SAC updates at the end of the trajectory.
            # This is a slight difference from the SAC specified in the
            # original paper.
            episode += 1
            for j in range(ep_len):
                batch = replay_buffer.sample_batch(batch_size)
                feed_dict = {
                    x_ph: batch['obs1'],
                    x2_ph: batch['obs2'],
                    a_ph: batch['acts'],
                    r_ph: batch['rews'],
                    d_ph: batch['done'],
                }
                outs = sess.run(step_ops, feed_dict)
            print("episode %d, reward %d" % (episode, ep_ret))
            o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
    sess.close()
def asac_v2(actor_critic=core.mlp_actor_critic,
            seed=0,
            ac_kwargs=dict(),
            steps_per_epoch=5000,
            epochs=200,
            replay_size=int(1e6),
            gamma=0.99,
            polyak=0.995,
            lr=0.001,
            alpha_start=0.2,
            batch_size=100,
            start_steps=10000,
            max_ep_len=1000,
            logger_kwargs=dict(),
            save_freq=1,
            loss_threshold=0.0001,
            delta=0.02,
            sample_step=2000):
    """
    SAC variant with an adaptive entropy coefficient ("ASAC"), trained on a
    `baxter()` environment.

    In addition to the usual SAC heads (q1/q2/v), the actor_critic here also
    returns Q, Q_pi and R heads; the adaptive alpha is driven by an advantage
    estimate `adv = Q_pi - R` and the sign of a dQ statistic accumulated over
    episode buffers (see the `ptr_epi >= max_ep_len` branch below).

    Note: `logger_kwargs`, `save_freq`, `loss_threshold` and `sample_step`
    are accepted but unused in this body.
    """
    # Adaptive entropy coefficient; alpha() yields the current scalar value,
    # which is fed through alpha_ph at every training step.
    alpha = Alpha(alpha_start=alpha_start, delta=delta)
    alpha_t = alpha()
    tf.set_random_seed(seed)
    np.random.seed(seed)
    env = baxter()
    obs_dim = env.obs_dim
    act_dim = env.act_dim
    # Action limit for clamping: critically, assumes all dimensions share
    # the same bound! (Hard-coded here; the random-exploration branch below
    # samples in [-0.1, 0.1] to match.)
    act_limit = 0.1
    # Share information about action space with policy architecture
    # Inputs to computation graph
    #x_ph, a_ph, x2_ph, r_ph, d_ph, ret_ph = core.placeholders(obs_dim, act_dim, obs_dim, None, None, None)
    x_ph, a_ph, x2_ph, r_ph, d_ph = core.placeholders(obs_dim, act_dim,
                                                      obs_dim, None, None)
    # Scalar placeholder for the current (adaptive) alpha value.
    alpha_ph = core.scale_holder()

    # Main outputs from computation graph
    #R, R_next = return_estimate(x_ph, x2_ph, **ac_kwargs)
    with tf.variable_scope('main'):
        mu, pi, logp_pi, q1, q2, q1_pi, q2_pi, v, Q, Q_pi, R = actor_critic(
            x_ph, a_ph, **ac_kwargs)

    # Target value network (only the v and R target heads are used below).
    with tf.variable_scope('target'):
        _, _, _, _, _, _, _, v_targ, _, _, R_targ = actor_critic(
            x2_ph, a_ph, **ac_kwargs)

    # Experience buffer
    replay_buffer = ReplayBuffer(obs_dim=obs_dim,
                                 act_dim=act_dim,
                                 size=replay_size)

    # Count variables
    var_counts = tuple(
        core.count_vars(scope) for scope in [
            'main/pi', 'main/q1', 'main/q2', 'main/v', 'main/Q', 'main/R',
            'main'
        ])
    print(('\nNumber of parameters: \t pi: %d, \t' + \
        'q1: %d, \t q2: %d, \t v: %d, \t Q: %d, \t R: %d, \t total: %d\n')%var_counts)

    # Min Double-Q:
    min_q_pi = tf.minimum(q1_pi, q2_pi)

    # Targets for Q and V regression (alpha enters via the placeholder).
    q_backup = tf.stop_gradient(r_ph + gamma * (1 - d_ph) * v_targ)
    v_backup = tf.stop_gradient(min_q_pi - alpha_ph * logp_pi)
    Q_backup = tf.stop_gradient(r_ph + gamma * (1 - d_ph) * R_targ)
    R_backup = tf.stop_gradient(Q_pi)
    # Advantage of the sampled action and the dQ diagnostic used by the
    # alpha update (dQ itself is only computed from numpy below; this graph
    # node is unused except via the commented-out branch).
    adv = Q_pi - R
    dQ = Q_backup * (R - Q)

    # Soft actor-critic losses (plus Q/R regression terms).
    pi_loss = tf.reduce_mean(alpha_ph * logp_pi - q1_pi)
    q1_loss = 0.5 * tf.reduce_mean((q_backup - q1)**2)
    q2_loss = 0.5 * tf.reduce_mean((q_backup - q2)**2)
    v_loss = 0.5 * tf.reduce_mean((v_backup - v)**2)
    Q_loss = 0.5 * tf.reduce_mean((Q_backup - Q)**2)
    R_loss = 0.5 * tf.reduce_mean((R_backup - R)**2)
    value_loss = q1_loss + q2_loss + v_loss + Q_loss + R_loss

    # Policy train op
    # (has to be separate from value train op, because q1_pi appears in pi_loss)
    pi_optimizer = tf.train.AdamOptimizer(learning_rate=lr)
    train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi'))

    # Value train op
    # (control dep of train_pi_op because sess.run otherwise evaluates in
    # nondeterministic order)
    value_optimizer = tf.train.AdamOptimizer(learning_rate=lr)
    value_params = get_vars('main/q') + get_vars('main/v') + get_vars(
        'main/Q') + get_vars('main/R')
    with tf.control_dependencies([train_pi_op]):
        train_value_op = value_optimizer.minimize(value_loss,
                                                  var_list=value_params)
    """
    R_optimizer = tf.train.AdamOptimizer(learning_rate=lr)
    train_R_op = R_optimizer.minimize(R_loss, var_list=get_vars('R'))
    """
    # Polyak averaging for target variables
    # (control flow because sess.run otherwise evaluates in nondeterministic order)
    with tf.control_dependencies([train_value_op]):
        target_update = tf.group([
            tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main)
            for v_main, v_targ in zip(get_vars('main'), get_vars('target'))
        ])

    # All ops to call during one training step
    step_ops = [
        pi_loss, q1_loss, q2_loss, v_loss, q1, q2, v, logp_pi, train_pi_op,
        train_value_op, target_update, R_loss, Q_loss, v_targ
    ]

    # Initializing targets to match main variables
    target_init = tf.group([
        tf.assign(v_targ, v_main)
        for v_main, v_targ in zip(get_vars('main'), get_vars('target'))
    ])

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    sess.run(tf.global_variables_initializer())
    sess.run(target_init)

    # Setup model saving
    def get_action(o, deterministic=False):
        # Mean action for evaluation, sampled action during training.
        # NOTE(review): returns shape (1, act_dim) — unlike the other agents
        # in this file it is NOT indexed with [0]; presumably env.step
        # tolerates that shape. TODO confirm.
        act_op = mu if deterministic else pi
        return sess.run(act_op, feed_dict={x_ph: o.reshape(1, -1)})

    start_time = time.time()
    # NOTE(review): observations here are dicts; the network consumes
    # o["feature"] throughout.
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
    total_steps = steps_per_epoch * epochs

    # Main loop: collect experience in env and update/log each epoch.
    # Episode-local arrays (2x max_ep_len so a trailing partial episode can
    # accumulate past one episode boundary before the alpha update flushes).
    obs1_epi = np.zeros([2 * max_ep_len, obs_dim], dtype=np.float32)
    obs2_epi = np.zeros([2 * max_ep_len, obs_dim], dtype=np.float32)
    act_epi = np.zeros([2 * max_ep_len, act_dim], dtype=np.float32)
    rew_epi = np.zeros([2 * max_ep_len], dtype=np.float32)
    done_epi = np.zeros([2 * max_ep_len], dtype=np.float32)
    ptr_epi = 0
    alpha_update = False  # NOTE(review): set but never read.
    epi_num = 0
    for t in range(total_steps):
        """
        Until start_steps have elapsed, randomly sample actions
        from a uniform distribution for better exploration. Afterwards,
        use the learned policy.
        """
        if t > start_steps:
            a = get_action(o["feature"])
        else:
            # Uniform in [-0.1, 0.1] per dimension, matching act_limit above.
            a = 0.1 - np.random.sample(act_dim) * 0.2

        # Step the env.
        # NOTE(review): this env's step returns only (obs, reward) — there is
        # no done flag from the environment, so `d` below only ever flips via
        # the time-horizon check and stays False. TODO confirm intended.
        o2, r = env.step(a)
        ep_ret += r
        ep_len += 1

        # Ignore the "done" signal if it comes from hitting the time
        # horizon (that is, when it's an artificial terminal signal
        # that isn't based on the agent's state)
        d = False if ep_len == max_ep_len else d

        # Store experience to replay buffer
        replay_buffer.store(o["feature"], a, r, o2["feature"], d)
        # Also accumulate into the episode-local arrays used for the
        # adaptive-alpha statistics.
        obs1_epi[ptr_epi] = o["feature"]
        obs2_epi[ptr_epi] = o2["feature"]
        act_epi[ptr_epi] = a
        rew_epi[ptr_epi] = r
        done_epi[ptr_epi] = d
        ptr_epi += 1

        # Super critical, easy to overlook step: make sure to update
        # most recent observation!
        o = o2
        if d or (ep_len == max_ep_len):
            epi_num += 1
            print("epi : {}, alpha : {}, return : {}".format(
                epi_num, alpha_t, ep_ret))
            """
            Perform all SAC updates at the end of the trajectory.
            This is a slight difference from the SAC specified in the
            original paper.
            """
            """
            rew_epi[ptr_epi] = sess.run(R, feed_dict={x_ph: [o]})[0]
            rets_epi = scipy.signal.lfilter([1], [1, float(-gamma)], rew_epi[::-1], axis=0)[::-1]
            rets_epi = rets_epi[:-1]
            """
            """
            v_epi = sess.run(R, feed_dict={x_ph: obs_epi})
            q_epi, adv_epi = sess.run([Q, adv], feed_dict={x_ph: obs_epi[:-1], a_ph: act_epi})
            rets_epi = rew_epi + gamma*v_epi[1:]
            if t > start_steps:
                alpha.update_alpha(adv_epi, np.mean(rets_epi*(v_epi[:-1]-q_epi)) > 0)
                alpha_t = alpha()
                print("{} {}".format(np.mean(rets_epi*(v_epi[:-1]-q_epi)), alpha_t))
            """
            # Once at least one full episode's worth of samples is buffered,
            # compute the dQ statistic and (after start_steps) adapt alpha.
            if ptr_epi >= max_ep_len:
                feed_dict = {
                    x_ph: obs1_epi[:ptr_epi],
                    x2_ph: obs2_epi[:ptr_epi],
                    a_ph: act_epi[:ptr_epi],
                    r_ph: rew_epi[:ptr_epi],
                    d_ph: done_epi[:ptr_epi]
                }
                adv_epi, Q_epi, R_epi = sess.run([adv, Q, R], feed_dict)
                R_next_epi = sess.run(R, feed_dict={x_ph: obs2_epi[:ptr_epi]})
                # One-step bootstrapped return times the (R - Q) residual.
                dQ_epi = (rew_epi[:ptr_epi] + gamma *
                          (1 - done_epi[:ptr_epi]) * R_next_epi) * (R_epi -
                                                                    Q_epi)
                """
                ret_epi = np.zeros([ptr_epi], dtype=np.float32)
                for i in np.arange(ptr_epi)[::-1]:
                    if i == ptr_epi - 1:
                        R_next_epi = sess.run(R, feed_dict={x_ph: [obs2_epi[i]]})[0]
                        ret_epi[i] = rew_epi[i] + gamma*(1 - done_epi[i])*R_next_epi
                    else:
                        ret_epi[i] = rew_epi[i] + gamma*(1 - done_epi[i])*ret_epi[i+1]
                dQ_epi = ret_epi * (R_epi - Q_epi)
                """
                if t > start_steps:
                    # Raise/lower alpha depending on advantage sign stats.
                    alpha.update_alpha(adv_epi, np.mean(dQ_epi) > 0)
                    alpha_t = alpha()
                    print("{} {}".format(np.mean(dQ_epi), alpha_t))
                # Reset the episode-local arrays for the next window.
                obs1_epi = np.zeros([max_ep_len * 2, obs_dim],
                                    dtype=np.float32)
                obs2_epi = np.zeros([max_ep_len * 2, obs_dim],
                                    dtype=np.float32)
                act_epi = np.zeros([max_ep_len * 2, act_dim],
                                   dtype=np.float32)
                rew_epi = np.zeros([max_ep_len * 2], dtype=np.float32)
                done_epi = np.zeros([max_ep_len * 2], dtype=np.float32)
                ptr_epi = 0
            """
            batch = replay_buffer.sample_batch(1000)
            feed_dict = {x_ph: batch['obs1'], x2_ph: batch['obs2'], a_ph: batch['acts'], r_ph: batch['rews'], d_ph: batch['done'], alpha_ph: alpha_t}
            dQ_epi = sess.run(dQ, feed_dict)
            """
            # One gradient step per environment step taken this episode.
            for j in range(ep_len):
                batch = replay_buffer.sample_batch(batch_size)
                feed_dict = {
                    x_ph: batch['obs1'],
                    x2_ph: batch['obs2'],
                    a_ph: batch['acts'],
                    r_ph: batch['rews'],
                    d_ph: batch['done'],
                    alpha_ph: alpha_t
                }
                outs = sess.run(step_ops, feed_dict)
            o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
def s2vg(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0,
         steps_per_epoch=1000, epochs=100, replay_size=int(1e6), gamma=0.99,
         polyak=0.995, model_lr=3e-4, value_lr=1e-3, pi_lr=3e-4, alpha=0.4,
         batch_size=100, start_steps=1000, max_ep_len=1000, save_freq=1,
         train_model_epoch=1, test_freq=10, exp_name='', env_name='',
         save_epoch=100):
    """
    Model-based SAC variant (SVG-style): in addition to the SAC actor-critic,
    a learned reward/dynamics model (core.reward_dynamic_model) provides a
    differentiable one-step objective for the policy.

    The model is pretrained alone for the first `train_model_epoch` epochs;
    afterwards each env step runs 5 actor/critic updates plus one model
    update. The agent is evaluated every `test_freq` epochs and checkpointed
    every `save_epoch` epochs.
    """
    tf.set_random_seed(seed)
    np.random.seed(seed)
    env, test_env = env_fn(), env_fn()
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]
    # Assumes all action dimensions share the same bound.
    act_limit = env.action_space.high[0]
    ac_kwargs['action_space'] = env.action_space

    # Inputs to computation graph
    x_ph, a_ph, x2_ph, r_ph, d_ph = core.placeholders(obs_dim, act_dim,
                                                      obs_dim, None, None)
    with tf.variable_scope('main'):
        mu, pi, logp_pi, q1, q2, q1_pi, q2_pi, v = actor_critic(x_ph, a_ph,
                                                                **ac_kwargs)
        # Learned reward/transition model: predictions for the replayed
        # action (transition, r_rm) and for the current policy's action
        # (transition_pi, r_rm_pi, v_prime = value of predicted next state).
        transition, r_rm, transition_pi, r_rm_pi, v_prime = \
            core.reward_dynamic_model(x_ph, a_ph, pi, **ac_kwargs)

    # Target value network for updates
    with tf.variable_scope('target'):
        _, _, _, _, _, _, _, v_targ = actor_critic(x2_ph, a_ph, **ac_kwargs)

    replay_buffer = ReplayBuffer(obs_dim=obs_dim,
                                 act_dim=act_dim,
                                 size=replay_size)

    # TD3 style Q function updates
    min_q_pi = tf.minimum(q1_pi, q2_pi)
    q_backup = tf.stop_gradient(r_ph + gamma*(1-d_ph)*v_targ)
    v_backup = tf.stop_gradient(min_q_pi - alpha * logp_pi)
    # Model regression targets: observed reward and next observation.
    r_backup = r_ph
    transition_backup = x2_ph

    r_loss = 0.5 * tf.reduce_mean((r_backup-r_rm)**2)
    transition_loss = 0.5 * tf.reduce_mean((transition_backup - transition)**2)
    model_loss = r_loss+transition_loss
    q1_loss = 0.5 * tf.reduce_mean((q_backup - q1)**2)
    q2_loss = 0.5 * tf.reduce_mean((q_backup - q2)**2)
    v_loss = 0.5 * tf.reduce_mean((v_backup - v)**2)
    value_loss = q1_loss + q2_loss + v_loss
    # Policy objective through the learned model (maximized below by
    # minimizing its negation).
    # NOTE(review): this is a per-sample tensor, not a scalar; minimize()
    # will implicitly sum gradients over the batch. TODO confirm intended.
    pi_loss = r_rm_pi - alpha*logp_pi + gamma*(1-d_ph)*v_prime

    # model train op
    model_optimizer = tf.train.AdamOptimizer(learning_rate=model_lr)
    model_params = get_vars('main/dm') + get_vars('main/rm')
    train_model_op = model_optimizer.minimize(model_loss,
                                              var_list=model_params)

    # policy train op (ordered after the model update within one sess.run)
    pi_optimizer = tf.train.AdamOptimizer(learning_rate=pi_lr)
    with tf.control_dependencies([train_model_op]):
        train_pi_op = pi_optimizer.minimize(-pi_loss,
                                            var_list=get_vars('main/pi'))

    # Value train op
    value_optimizer = tf.train.AdamOptimizer(learning_rate=value_lr)
    value_params = get_vars('main/q') + get_vars('main/v')
    with tf.control_dependencies([train_pi_op]):
        train_value_op = value_optimizer.minimize(value_loss,
                                                  var_list=value_params)

    # Polyak averaging of target network parameters.
    with tf.control_dependencies([train_value_op]):
        target_update = tf.group([
            tf.assign(v_targ, polyak*v_targ + (1-polyak)*v_main)
            for v_main, v_targ in zip(get_vars('main'), get_vars('target'))
        ])

    step_ops = [pi_loss, q1_loss, q2_loss, v_loss, q1, q2, v, logp_pi,
                train_pi_op, train_value_op, target_update]
    target_init = tf.group([
        tf.assign(v_targ, v_main)
        for v_main, v_targ in zip(get_vars('main'), get_vars('target'))
    ])
    saver = tf.compat.v1.train.Saver()
    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    sess.run(target_init)

    def get_action(o, deterministic=False):
        # Mean action for evaluation, sampled action during training.
        act_op = mu if deterministic else pi
        return sess.run(act_op, feed_dict={x_ph: o.reshape(1,-1)})[0]

    def test_agent(epoch, n=1):
        # Run n deterministic evaluation episodes; return the mean return.
        global sess, mu, pi, q1, q2, q1_pi, q2_pi
        total_reward = 0
        for j in range(n):  # repeat n times
            o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0
            while not(d or (ep_len == max_ep_len)):
                o, r, d, _ = test_env.step(get_action(o, True))
                ep_ret += r
                ep_len += 1
            total_reward += ep_ret
        print('The '+str(epoch)+' epoch is finished!')
        print('The test reward is '+str(total_reward/n))
        return total_reward/n

    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
    total_steps = steps_per_epoch * epochs
    reward_recorder = []
    for t in range(total_steps):
        """
        The algorithm would take total_steps totally in the training
        """
        # Random exploration until start_steps, then the learned policy.
        if t > start_steps:
            a = get_action(o)
        else:
            a = env.action_space.sample()
        o2, r, d, _ = env.step(a)
        ep_ret += r
        ep_len += 1
        # Ignore the artificial time-horizon "done" signal.
        d = False if ep_len==max_ep_len else d
        replay_buffer.store(o, a, r, o2, d)
        # Advance the observation before any update.
        o = o2
        if t // steps_per_epoch > train_model_epoch:
            # train 5 steps of Q, V, and pi.
            # train 1 step of model
            for j in range(5):
                batch = replay_buffer.sample_batch(batch_size)
                feed_dict = {x_ph: batch['obs1'],
                             x2_ph: batch['obs2'],
                             a_ph: batch['acts'],
                             r_ph: batch['rews'],
                             d_ph: batch['done']}
                _ = sess.run(step_ops, feed_dict)
            # NOTE(review): the model step reuses the feed_dict from the
            # LAST of the 5 minibatches above — TODO confirm intended.
            outs = sess.run(train_model_op, feed_dict)
        else:
            # pretrain the model
            batch = replay_buffer.sample_batch(batch_size)
            feed_dict = {x_ph: batch['obs1'],
                         x2_ph: batch['obs2'],
                         a_ph: batch['acts'],
                         r_ph: batch['rews'],
                         d_ph: batch['done'], }
            outs = sess.run(train_model_op, feed_dict)
        if d or (ep_len == max_ep_len):
            o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
        if t > 0 and t % steps_per_epoch == 0:
            epoch = t // steps_per_epoch
            if epoch > train_model_epoch and epoch % test_freq == 0:
                # test the agent when we reach the test_freq,
                # save the experiment result
                reward_test = test_agent(epoch)
                reward_recorder.append(reward_test)
                reward_nparray = np.asarray(reward_recorder)
                np.save(str(exp_name)+'_'+str(env_name)+'_'+str(save_freq)+'.npy',
                        reward_nparray)
            if epoch % save_epoch == 0:
                # save the model
                saver.save(sess, str(exp_name)+'_'+str(env_name),
                           global_step=epoch)
def __init__(self):
    """
    Build a distributional (quantile) TD3 agent.

    Constructs twin quantile critics (support_size quantile atoms each), a
    deterministic policy, their target copies with target-policy smoothing
    noise, the logcosh quantile-regression losses, and risk-weighted policy
    loss, then initializes the session and syncs targets to the main nets.

    Hyperparameters come from the module-level `env_set` dict.
    """
    self.sess = tf.Session()
    self.state_size = env_set['state']
    self.output_size = env_set['action']
    self.worker_size = env_set['worker']
    self.support_size = 64          # number of quantile atoms per critic
    self.tau = 0.995                # polyak coefficient for target updates
    self.gamma = env_set['gamma']
    self.hidden = env_set['hidden']
    self.batch_size = env_set['batch_size']
    self.pi_lr = env_set['pi_lr']
    self.q_lr = env_set['q_lr']
    self.action_limit = 1.0
    self.memory = replay_buffer(env_set['mem_size'])
    self.kappa = 1.0                # Huber delta (only used by the
                                    # commented-out Huber variant below)
    self.risk_factor = -1.0         # risk distortion; negative = risk-seeking
                                    # weighting of upper quantiles
    self.random_risk = False
    self.target_noise = 0.2         # target-policy smoothing noise std
    self.noise_clip = 0.5           # clip bound for that noise
    tf.set_random_seed(10)
    self.x_ph, self.a_ph, self.x2_ph, self.r_ph, self.d_ph = \
        cr.placeholders(self.state_size, self.output_size, self.state_size,
                        None, None)
    # Scalar risk factor fed at run time (see quantile_weight below).
    self.risk_factor_ph = tf.placeholder(tf.float32)
    with tf.variable_scope('main'):
        self.pi, self.q1, self.q2, self.q1_pi, self.q2_pi = \
            cr.dqpg_td3_actor_critic(
                x=self.x_ph,
                a=self.a_ph,
                hidden=self.hidden,
                activation=tf.nn.relu,
                output_activation=tf.tanh,
                output_size=self.output_size,
                action_limit=self.action_limit,
                support_size=self.support_size)
    # Target networks evaluate the smoothed target policy on next states.
    with tf.variable_scope('target'):
        _, _, _, self.q1_pi_targ, self.q2_pi_targ = \
            cr.dqpg_td3_actor_critic(
                x=self.x2_ph,
                a=self.a_ph,
                hidden=self.hidden,
                activation=tf.nn.relu,
                output_activation=tf.tanh,
                output_size=self.output_size,
                action_limit=self.action_limit,
                support_size=self.support_size,
                pi_q_noise=self.target_noise,
                noise_clip=self.noise_clip)
    self.pi_params = cr.get_vars('main/pi')
    self.q_params = cr.get_vars('main/q')
    # TD3 clipped double-Q, applied per quantile atom.
    self.min_q_targ = tf.minimum(self.q1_pi_targ, self.q2_pi_targ)
    # Distributional Bellman backup, broadcast over the support dimension.
    self.backup = tf.stop_gradient(tf.expand_dims(self.r_ph,axis=1)\
        + self.gamma*tf.expand_dims(1-self.d_ph,axis=1)*self.min_q_targ)
    # Risk-distorted weights over quantile midpoints (0.5/N, 1.5/N, ...):
    # risk_factor < 0 up-weights high quantiles, > 0 up-weights low ones.
    self.quantile_weight = 1.0 - self.risk_factor_ph*\
        (2.0*tf.reshape(tf.range(0.5/self.support_size, 1,
                                 1 / self.support_size),
                        [1, self.support_size]) - 1.0)
    # Policy maximizes the risk-weighted mean of q1's quantiles.
    # NOTE(review): the nested reduce_mean is redundant — the inner call
    # already yields a scalar.
    self.pi_loss = -tf.reduce_mean(
        tf.reduce_mean(self.q1_pi * self.quantile_weight))
    # Tile backup to (batch, support, support) for pairwise quantile loss.
    logit_valid_tile = tf.tile(tf.expand_dims(self.backup, axis=1),
                               [1, self.support_size, 1])
    tau = tf.reshape(
        tf.range(0.5 / self.support_size, 1, 1 / self.support_size),
        [1, self.support_size])
    tau = tf.tile(tf.expand_dims(tau, axis=2), [1, 1, self.support_size])
    theta_loss_tile = tf.tile(tf.expand_dims(self.q1, axis=2),
                              [1, 1, self.support_size])
    #Huber_loss = tf.compat.v1.losses.huber_loss(logit_valid_tile, theta_loss_tile, reduction=tf.losses.Reduction.NONE,delta=self.kappa)/self.kappa
    bellman_errors = logit_valid_tile - theta_loss_tile
    # Numerically stable log(cosh(x)) = x + softplus(-2x) - log(2),
    # used in place of the Huber loss above.
    Logcosh = bellman_errors + tf.math.softplus(
        -2. * bellman_errors) - tf.log(2.)
    # Quantile-regression weighting |tau - 1{error < 0}|.
    Loss = tf.abs(tau - tf.stop_gradient(tf.to_float(
        bellman_errors < 0))) * Logcosh
    self.v1_loss = tf.reduce_mean(
        tf.reduce_sum(tf.reduce_mean(Loss, axis=1), axis=1))
    # Same quantile-regression loss for the second critic.
    theta_loss_tile = tf.tile(tf.expand_dims(self.q2, axis=2),
                              [1, 1, self.support_size])
    #Huber_loss = tf.compat.v1.losses.huber_loss(logit_valid_tile, theta_loss_tile, reduction=tf.losses.Reduction.NONE,delta=self.kappa)/self.kappa
    bellman_errors = logit_valid_tile - theta_loss_tile
    Logcosh = bellman_errors + tf.math.softplus(
        -2. * bellman_errors) - tf.log(2.)
    Loss = tf.abs(tau - tf.stop_gradient(tf.to_float(
        bellman_errors < 0))) * Logcosh
    self.v2_loss = tf.reduce_mean(
        tf.reduce_sum(tf.reduce_mean(Loss, axis=1), axis=1))
    self.v_loss = self.v1_loss + self.v2_loss
    self.value_optimizer = tf.train.AdamOptimizer(self.q_lr)
    self.train_value_op = self.value_optimizer.minimize(
        self.v_loss, var_list=self.q_params)
    self.pi_optimizer = tf.train.AdamOptimizer(self.pi_lr)
    # Chain value -> policy -> target update so one sess.run executes them
    # in a deterministic order.
    with tf.control_dependencies([self.train_value_op]):
        self.train_pi_op = self.pi_optimizer.minimize(
            self.pi_loss, var_list=self.pi_params)
    with tf.control_dependencies([self.train_pi_op]):
        self.target_update = tf.group([
            tf.assign(v_targ, self.tau * v_targ + (1 - self.tau) * v_main)
            for v_main, v_targ in zip(cr.get_vars('main'),
                                      cr.get_vars('target'))
        ])
    self.step_ops = [
        self.pi_loss, self.v_loss, self.train_pi_op, self.train_value_op,
        self.target_update
    ]
    self.value_ops = [self.v_loss, self.train_value_op]
    # Hard-copy main -> target once at startup.
    self.target_init = tf.group([
        tf.assign(v_targ, v_main) for v_main, v_targ in zip(
            cr.get_vars('main'), cr.get_vars('target'))
    ])
    self.sess.run(tf.global_variables_initializer())
    self.sess.run(self.target_init)
    # Sanity print of the risk weighting actually applied.
    print(
        self.sess.run(self.quantile_weight,
                      feed_dict={self.risk_factor_ph: self.risk_factor}))
    self.saver = tf.train.Saver()
def __init__(self,
             env_fn,
             reward_fn=None,
             actor_critic=core.mlp_actor_critic,
             xid=0,
             seed=0,
             max_ep_len=1000,
             gamma=.99,
             alpha=0.2,
             lr=1e-3,
             polyak=0.995,
             replay_size=int(1e6),
             ac_kwargs=dict(),
             logger_kwargs=dict(),
             normalization_factors=None,
             learn_reduced=False):
    """
    Build one SAC agent with its own graph/session, variable scopes suffixed
    by `xid` so multiple agents can coexist in one process.

    Args:
        env_fn: zero-arg callable returning a fresh Gym environment.
        reward_fn: custom reward specification (stored; defaults to []).
        actor_critic: graph-builder returning
            (mu, pi, logp_pi, q1, q2, q1_pi, q2_pi, v, std).
        xid (int): agent index; scopes become 'main{xid}' / 'target{xid}'.
        seed (int): RNG seed for tf and numpy.
        max_ep_len (int): episode length cap (stored for use elsewhere).
        gamma (float): discount factor in (0, 1).
        alpha (float): entropy regularization coefficient.
        lr (float): learning rate for both policy and value optimizers.
        polyak (float): target-network averaging coefficient in (0, 1).
        replay_size (int): replay buffer capacity.
        ac_kwargs (dict): extra kwargs for actor_critic.
        logger_kwargs (dict): kwargs for EpochLogger.
        normalization_factors: observation normalization values
            (stored; defaults to []).
        learn_reduced (bool): if True, use ant_utils.expected_state_dim as
            the observation dimension instead of the full state vector.
    """
    # Fix: avoid mutable default arguments ([]), which are shared across
    # calls; default behavior is unchanged.
    reward_fn = [] if reward_fn is None else reward_fn
    normalization_factors = ([] if normalization_factors is None
                             else normalization_factors)

    tf.set_random_seed(seed)
    np.random.seed(seed)

    self.xid = xid
    self.main_scope = 'main' + str(xid)
    self.target_scope = 'target' + str(xid)

    self.logger = EpochLogger(**logger_kwargs)
    self.logger.save_config(logger_kwargs)

    self.max_ep_len = max_ep_len
    self.reward_fn = reward_fn
    self.normalization_factors = normalization_factors
    self.learn_reduced = learn_reduced

    self.env, self.test_env = env_fn(), env_fn()
    # Full MuJoCo state vector, not just the Gym observation.
    self.obs_dim = len(self.env.env.state_vector())
    if self.learn_reduced:
        self.obs_dim = ant_utils.expected_state_dim
    self.act_dim = self.env.action_space.shape[0]

    # Action limit for clamping: critically, assumes all dimensions share
    # the same bound!
    self.act_limit = self.env.action_space.high[0]

    # Share information about action space with policy architecture
    ac_kwargs['action_space'] = self.env.action_space

    # Each agent gets its own graph so several can live side by side.
    self.graph = tf.Graph()
    with self.graph.as_default():
        # Inputs to computation graph
        self.x_ph, self.a_ph, self.x2_ph, self.r_ph, self.d_ph = \
            core.placeholders(self.obs_dim, self.act_dim, self.obs_dim,
                              None, None)

        # Main outputs from computation graph
        with tf.variable_scope(self.main_scope):
            self.mu, self.pi, self.logp_pi, self.q1, self.q2, self.q1_pi, \
                self.q2_pi, self.v, self.std = actor_critic(
                    self.x_ph, self.a_ph, **ac_kwargs)

        # Target value network
        with tf.variable_scope(self.target_scope):
            _, _, _, _, _, _, _, self.v_targ, _ = actor_critic(
                self.x2_ph, self.a_ph, **ac_kwargs)

        # Experience buffer
        self.replay_buffer = ReplayBuffer(obs_dim=self.obs_dim,
                                          act_dim=self.act_dim,
                                          size=replay_size)

        # Min Double-Q:
        min_q_pi = tf.minimum(self.q1_pi, self.q2_pi)

        # Targets for Q and V regression
        q_backup = tf.stop_gradient(self.r_ph + gamma *
                                    (1 - self.d_ph) * self.v_targ)
        v_backup = tf.stop_gradient(min_q_pi - alpha * self.logp_pi)

        # Soft actor-critic losses
        pi_loss = tf.reduce_mean(alpha * self.logp_pi - self.q1_pi)
        q1_loss = 0.5 * tf.reduce_mean((q_backup - self.q1)**2)
        q2_loss = 0.5 * tf.reduce_mean((q_backup - self.q2)**2)
        v_loss = 0.5 * tf.reduce_mean((v_backup - self.v)**2)
        value_loss = q1_loss + q2_loss + v_loss

        # Policy train op
        # (has to be separate from value train op, because q1_pi appears
        # in pi_loss)
        pi_optimizer = tf.train.AdamOptimizer(learning_rate=lr)
        train_pi_op = pi_optimizer.minimize(pi_loss,
                                            var_list=get_vars(
                                                self.var_scope('pi')))

        # Value train op
        # (control dep of train_pi_op because sess.run otherwise evaluates
        # in nondeterministic order)
        value_optimizer = tf.train.AdamOptimizer(learning_rate=lr)
        value_params = get_vars(self.var_scope('q')) + get_vars(
            self.var_scope('v'))
        with tf.control_dependencies([train_pi_op]):
            train_value_op = value_optimizer.minimize(
                value_loss, var_list=value_params)

        # Polyak averaging for target variables
        # (control flow because sess.run otherwise evaluates in
        # nondeterministic order)
        with tf.control_dependencies([train_value_op]):
            target_update = tf.group([
                tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main)
                for v_main, v_targ in zip(get_vars(self.main_scope),
                                          get_vars(self.target_scope))
            ])

        # All ops to call during one training step
        self.step_ops = [
            pi_loss, q1_loss, q2_loss, v_loss, self.q1, self.q2, self.v,
            self.logp_pi, train_pi_op, train_value_op, target_update
        ]

        # Initializing targets to match main variables.
        # Fix: the comprehension previously used `self.v_targ` as its loop
        # target, which clobbered the value-target tensor attribute with
        # the last target *variable* — use local names instead.
        target_init = tf.group([
            tf.assign(v_targ, v_main)
            for v_main, v_targ in zip(get_vars(self.main_scope),
                                      get_vars(self.target_scope))
        ])

        self.sess = tf.Session(config=tf.ConfigProto(
            log_device_placement=False))
        self.sess.run(tf.global_variables_initializer())
        self.sess.run(target_init)