def setup_model(self):
    with SetVerbosity(self.verbose):
        self.graph = tf.Graph()

        with self.graph.as_default():
            self.set_random_seed(self.seed)
            self.sess = tf_util.make_session(num_cpu=self.n_cpu_tf_sess, graph=self.graph)

            self.replay_buffer = ReplayBuffer(self.buffer_size)

            with tf.variable_scope("input", reuse=False):
                # Create policy and target TF objects
                self.policy_tf = self.policy(self.sess, self.observation_space, self.action_space,
                                             **self.policy_kwargs)
                self.target_policy_tf = self.policy(self.sess, self.observation_space, self.action_space,
                                                    **self.policy_kwargs)

                # Initialize Placeholders
                self.observations_ph = self.policy_tf.obs_ph
                # Normalized observation for pixels
                self.processed_obs_ph = self.policy_tf.processed_obs
                self.next_observations_ph = self.target_policy_tf.obs_ph
                self.processed_next_obs_ph = self.target_policy_tf.processed_obs
                self.action_target = self.target_policy_tf.action_ph
                self.terminals_ph = tf.placeholder(tf.float32, shape=(None, 1), name='terminals')
                self.rewards_ph = tf.placeholder(tf.float32, shape=(None, 1), name='rewards')
                self.actions_ph = tf.placeholder(tf.float32, shape=(None,) + self.action_space.shape,
                                                 name='actions')
                self.learning_rate_ph = tf.placeholder(tf.float32, [], name="learning_rate_ph")

            with tf.variable_scope("model", reuse=False):
                # Create the policy
                self.policy_out = policy_out = self.policy_tf.make_actor(self.processed_obs_ph)
                # Use two Q-functions to improve performance by reducing overestimation bias
                qf1, qf2 = self.policy_tf.make_critics(self.processed_obs_ph, self.actions_ph)
                # Q value when following the current policy
                qf1_pi, _ = self.policy_tf.make_critics(self.processed_obs_ph, policy_out, reuse=True)

            with tf.variable_scope("target", reuse=False):
                # Create target networks
                target_policy_out = self.target_policy_tf.make_actor(self.processed_next_obs_ph)
                # Target policy smoothing, by adding clipped noise to target actions
                target_noise = tf.random_normal(tf.shape(target_policy_out), stddev=self.target_policy_noise)
                target_noise = tf.clip_by_value(target_noise, -self.target_noise_clip, self.target_noise_clip)
                # Clip the noisy action to remain in the bounds [-1, 1] (output of a tanh)
                noisy_target_action = tf.clip_by_value(target_policy_out + target_noise, -1, 1)
                # Q values when following the target policy
                qf1_target, qf2_target = self.target_policy_tf.make_critics(self.processed_next_obs_ph,
                                                                            noisy_target_action)

            with tf.variable_scope("loss", reuse=False):
                # Take the min of the two target Q-Values (clipped Double-Q Learning)
                min_qf_target = tf.minimum(qf1_target, qf2_target)

                # Targets for Q value regression
                q_backup = tf.stop_gradient(
                    self.rewards_ph + (1 - self.terminals_ph) * self.gamma * min_qf_target
                )

                # Compute Q-Function loss
                qf1_loss = tf.reduce_mean((q_backup - qf1) ** 2)
                qf2_loss = tf.reduce_mean((q_backup - qf2) ** 2)

                qvalues_losses = qf1_loss + qf2_loss

                # Policy loss: maximise q value
                self.policy_loss = policy_loss = -tf.reduce_mean(qf1_pi)

                # Policy train op
                # will be called only every n training steps,
                # where n is the policy delay
                policy_optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate_ph)
                policy_train_op = policy_optimizer.minimize(policy_loss, var_list=get_vars('model/pi'))
                self.policy_train_op = policy_train_op

                # Q Values optimizer
                qvalues_optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate_ph)
                qvalues_params = get_vars('model/values_fn/')

                # Q Values and policy target params
                source_params = get_vars("model/")
                target_params = get_vars("target/")

                # Polyak averaging for target variables
                self.target_ops = [
                    tf.assign(target, (1 - self.tau) * target + self.tau * source)
                    for target, source in zip(target_params, source_params)
                ]

                # Initializing target to match source variables
                target_init_op = [
                    tf.assign(target, source)
                    for target, source in zip(target_params, source_params)
                ]

                train_values_op = qvalues_optimizer.minimize(qvalues_losses, var_list=qvalues_params)

                self.infos_names = ['qf1_loss', 'qf2_loss']
                # All ops to call during one training step
                self.step_ops = [qf1_loss, qf2_loss, qf1, qf2, train_values_op]

                # Monitor losses and entropy in tensorboard
                tf.summary.scalar('policy_loss', policy_loss)
                tf.summary.scalar('qf1_loss', qf1_loss)
                tf.summary.scalar('qf2_loss', qf2_loss)
                tf.summary.scalar('learning_rate', tf.reduce_mean(self.learning_rate_ph))

            # Retrieve parameters that must be saved
            self.params = get_vars("model")
            self.target_params = get_vars("target/")

            # Initialize Variables and target network
            with self.sess.as_default():
                self.sess.run(tf.global_variables_initializer())
                self.sess.run(target_init_op)

            self.summary = tf.summary.merge_all()
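# ---------------------------------------------------------------------------
# Illustrative sketch, not part of the model above: the clipped double-Q
# backup and the Polyak averaging update that setup_model() builds as TF ops,
# written out in plain NumPy so the arithmetic is easy to check by hand.
# The gamma/tau defaults and the toy batch below are assumptions made for the
# example, not values taken from the class.
# ---------------------------------------------------------------------------
import numpy as np


def clipped_double_q_backup(rewards, terminals, qf1_target, qf2_target, gamma=0.99):
    """TD3 target: r + (1 - done) * gamma * min(Q1'(s', a'), Q2'(s', a'))."""
    min_qf_target = np.minimum(qf1_target, qf2_target)
    return rewards + (1.0 - terminals) * gamma * min_qf_target


def polyak_update(target_param, source_param, tau=0.005):
    """Soft target update: target <- (1 - tau) * target + tau * source."""
    return (1.0 - tau) * target_param + tau * source_param


if __name__ == "__main__":
    rewards = np.array([[1.0], [0.0]])
    terminals = np.array([[0.0], [1.0]])   # second transition ends the episode
    q1_t = np.array([[10.0], [5.0]])
    q2_t = np.array([[9.0], [6.0]])
    print(clipped_double_q_backup(rewards, terminals, q1_t, q2_t))  # [[9.91], [0.]]
    print(polyak_update(np.array([1.0]), np.array([0.0])))          # [0.995]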
def setup_model(self):
    with SetVerbosity(self.verbose):
        self.graph = tf.Graph()

        with self.graph.as_default():
            n_cpu = multiprocessing.cpu_count()
            if sys.platform == 'darwin':
                n_cpu //= 2
            self.sess = tf_util.make_session(num_cpu=n_cpu, graph=self.graph)

            self.buffer_is_prioritized = self.buffer_type.__name__ in ["PrioritizedReplayBuffer",
                                                                       "RankPrioritizedReplayBuffer"]

            if self.replay_buffer is None:
                if self.buffer_is_prioritized:
                    if (self.num_timesteps is not None and self.prioritization_starts > self.num_timesteps) \
                            or self.prioritization_starts > 0:
                        self.replay_buffer = ReplayBuffer(self.buffer_size)
                    else:
                        buffer_kw = {"size": self.buffer_size, "alpha": 0.7}
                        if self.buffer_type.__name__ == "RankPrioritizedReplayBuffer":
                            buffer_kw.update({"learning_starts": self.prioritization_starts,
                                              "batch_size": self.batch_size})
                        self.replay_buffer = self.buffer_type(**buffer_kw)
                else:
                    self.replay_buffer = self.buffer_type(self.buffer_size)
                # self.replay_buffer = DiscrepancyReplayBuffer(self.buffer_size, scorer=self.policy_tf.get_q_discrepancy)

            with tf.variable_scope("input", reuse=False):
                # Create policy and target TF objects
                self.policy_tf = self.policy(self.sess, self.observation_space, self.action_space,
                                             **self.policy_kwargs)
                self.target_policy_tf = self.policy(self.sess, self.observation_space, self.action_space,
                                                    **self.policy_kwargs)

                # Initialize Placeholders
                self.observations_ph = self.policy_tf.obs_ph
                # Normalized observation for pixels
                self.processed_obs_ph = self.policy_tf.processed_obs
                self.next_observations_ph = self.target_policy_tf.obs_ph
                self.processed_next_obs_ph = self.target_policy_tf.processed_obs
                self.action_target = self.target_policy_tf.action_ph
                self.terminals_ph = tf.placeholder(tf.float32, shape=(None, 1), name='terminals')
                self.rewards_ph = tf.placeholder(tf.float32, shape=(None, 1), name='rewards')
                self.actions_ph = tf.placeholder(tf.float32, shape=(None,) + self.action_space.shape,
                                                 name='actions')
                self.learning_rate_ph = tf.placeholder(tf.float32, [], name="learning_rate_ph")

            with tf.variable_scope("model", reuse=False):
                # Create the policy
                self.policy_out = policy_out = self.policy_tf.make_actor(self.processed_obs_ph)
                self.policy_test = policy_test = self.policy_tf.make_actor(self.processed_obs_ph, scope="pi_t")
                # Use two Q-functions to improve performance by reducing overestimation bias
                qf1, qf2 = self.policy_tf.make_critics(self.processed_obs_ph, self.actions_ph)
                # Q value when following the current policy
                qf1_pi, qf2_pi = self.policy_tf.make_critics(self.processed_obs_ph, policy_out, reuse=True)
                qf1_pi_t, _ = self.policy_tf.make_critics(self.processed_obs_ph, policy_test, reuse=True)

            with tf.variable_scope("target", reuse=False):
                # Create target networks
                target_policy_out = self.target_policy_tf.make_actor(self.processed_next_obs_ph)
                # Target policy smoothing, by adding clipped noise to target actions
                target_noise = tf.random_normal(tf.shape(target_policy_out), stddev=self.target_policy_noise)
                target_noise = tf.clip_by_value(target_noise, -self.target_noise_clip, self.target_noise_clip)
                # Clip the noisy action to remain in the bounds [-1, 1] (output of a tanh)
                noisy_target_action = tf.clip_by_value(target_policy_out + target_noise, -1, 1)
                # Q values when following the target policy
                qf1_target, qf2_target = self.target_policy_tf.make_critics(self.processed_next_obs_ph,
                                                                            noisy_target_action)

            with tf.variable_scope("loss", reuse=False):
                # Take the min of the two target Q-Values (clipped Double-Q Learning)
                min_qf_target = tf.minimum(qf1_target, qf2_target)

                # Targets for Q value regression
                q_backup = tf.stop_gradient(
                    self.rewards_ph + (1 - self.terminals_ph) * self.gamma * min_qf_target
                )

                # Compute Q-Function loss
                if self.buffer_is_prioritized:
                    self.is_weights_ph = tf.placeholder(tf.float32, shape=(None, 1), name="is_weights")
                    qf1_loss = tf.reduce_mean(self.is_weights_ph * (q_backup - qf1) ** 2)
                    qf2_loss = tf.reduce_mean(self.is_weights_ph * (q_backup - qf2) ** 2)
                else:
                    qf1_loss = tf.reduce_mean((q_backup - qf1) ** 2)
                    qf2_loss = tf.reduce_mean((q_backup - qf2) ** 2)

                q_discrepancy = tf.abs(qf1_pi - qf2_pi)
                self.q_disc_strength_ph = tf.placeholder(tf.float32, [], name="q_disc_strength_ph")
                self.q_disc_strength_schedule = ExponentialSchedule(int(1e5), 30, 0, rate=10)

                qvalues_losses = qf1_loss + qf2_loss

                rew_loss = tf.reduce_mean(qf1_pi)
                q_disc_loss = tf.reduce_mean(q_discrepancy)  # self.q_disc_strength_ph * tf.reduce_mean(q_discrepancy)
                action_loss = self.action_l2_scale * tf.nn.l2_loss(self.policy_out)

                # Policy loss: maximise q value
                self.policy_loss = policy_loss = -(rew_loss + q_disc_loss) + action_loss
                self.policy_loss_t = policy_loss_t = -tf.reduce_mean(qf1_pi_t)

                # Policy train op
                # will be called only every n training steps,
                # where n is the policy delay
                policy_optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate_ph)
                policy_train_op = policy_optimizer.minimize(policy_loss, var_list=get_vars('model/pi'))
                self.policy_train_op = policy_train_op

                policy_optimizer_t = tf.train.AdamOptimizer(learning_rate=self.learning_rate_ph)
                policy_train_op_t = policy_optimizer_t.minimize(policy_loss_t, var_list=get_vars('model/pi_t'))
                self.policy_train_op_t = policy_train_op_t

                # Q Values optimizer
                qvalues_optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate_ph)
                qvalues_params = get_vars('model/values_fn/')

                # Q Values and policy target params
                source_params = get_vars("model/")
                target_params = get_vars("target/")
                source_params = [param for param in source_params if "pi_t" not in param.name]

                # Polyak averaging for target variables
                self.target_ops = [
                    tf.assign(target, (1 - self.tau) * target + self.tau * source)
                    for target, source in zip(target_params, source_params)
                ]

                # Initializing target to match source variables
                target_init_op = [
                    tf.assign(target, source)
                    for target, source in zip(target_params, source_params)
                ]

                train_values_op = qvalues_optimizer.minimize(qvalues_losses, var_list=qvalues_params)

                self.infos_names = ['qf1_loss', 'qf2_loss']
                # All ops to call during one training step
                self.step_ops = [qf1_loss, qf2_loss, qf1, qf2, train_values_op, q_discrepancy]

                # Monitor losses and entropy in tensorboard
                tf.summary.scalar("rew_loss", rew_loss)
                tf.summary.scalar("q_disc_loss", q_disc_loss)
                tf.summary.scalar("action_loss", action_loss)
                tf.summary.scalar('policy_loss', policy_loss)
                tf.summary.scalar("policy_loss_t", policy_loss_t)
                tf.summary.scalar('qf1_loss', qf1_loss)
                tf.summary.scalar('qf2_loss', qf2_loss)
                tf.summary.scalar('learning_rate', tf.reduce_mean(self.learning_rate_ph))

            # Retrieve parameters that must be saved
            self.params = get_vars("model")
            self.target_params = get_vars("target/")

            # Initialize Variables and target network
            with self.sess.as_default():
                self.sess.run(tf.global_variables_initializer())
                self.sess.run(target_init_op)

            self.summary = tf.summary.merge_all()
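# ---------------------------------------------------------------------------
# Illustrative sketch, not the class's own code: the two pieces this variant
# adds on top of the plain critic update, written in plain NumPy.
# (1) the importance-sampling weighted critic loss used when the replay
#     buffer is prioritized, and
# (2) the clipped Gaussian noise of target policy smoothing.
# Function names, the noise parameters, and the toy arrays are assumptions
# made for the example.
# ---------------------------------------------------------------------------
import numpy as np


def weighted_critic_loss(q_backup, q_pred, is_weights):
    """Importance-sampling weighted MSE: mean(w * (target - Q)^2)."""
    return np.mean(is_weights * (q_backup - q_pred) ** 2)


def smoothed_target_action(target_action, noise_std=0.2, noise_clip=0.5, rng=None):
    """Add clipped Gaussian noise to the target action, then clip to [-1, 1]."""
    rng = np.random.default_rng(0) if rng is None else rng
    noise = np.clip(rng.normal(scale=noise_std, size=target_action.shape),
                    -noise_clip, noise_clip)
    return np.clip(target_action + noise, -1.0, 1.0)


if __name__ == "__main__":
    q_backup = np.array([[9.91], [0.0]])
    q_pred = np.array([[9.0], [0.5]])
    is_weights = np.array([[1.0], [0.5]])  # down-weight frequently sampled transitions
    print(weighted_critic_loss(q_backup, q_pred, is_weights))  # mean([1.0 * 0.91**2, 0.5 * 0.5**2]) ~= 0.4766
    print(smoothed_target_action(np.array([0.95, -0.2])))      # stays inside [-1, 1]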