class MLPModel(Model):
    def __init__(self, ob_space, ac_space, ob_filter=True, gaussian_fixed_var=True):
        self.ob_filter = ob_filter
        self.gaussian_fixed_var = gaussian_fixed_var
        super(MLPModel, self).__init__(ob_space, ac_space)

    def _create_network(self):
        x = self.ob
        # create ob filter
        if self.ob_filter:
            self.ob_rms = RunningMeanStd(shape=self.ob_space.shape)
            x = tf.clip_by_value(
                (self.ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)

        # actor
        l = x
        l = tf.nn.tanh(
            U.dense(l, 32, "a_1", weight_init=U.normc_initializer(1.0)))
        l = tf.nn.tanh(
            U.dense(l, 32, "a_2", weight_init=U.normc_initializer(1.0)))
        action_layer = l

        # critic
        l = x
        l = tf.nn.tanh(
            U.dense(l, 32, "c_1", weight_init=U.normc_initializer(1.0)))
        l = tf.nn.tanh(
            U.dense(l, 32, "c_2", weight_init=U.normc_initializer(1.0)))
        value_layer = l

        self._create_logit_value(action_layer, value_layer, self.gaussian_fixed_var)

    def update_ob_norm(self, ob):
        if not hasattr(self, 'ob_rms'):
            return
        self.ob_rms.update(ob)
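# A minimal NumPy sketch of the observation filtering performed in `_create_network`
# above: keep a running mean/std over observations, then normalize and clip to
# [-5, 5]. `SimpleRunningMeanStd` and `filter_obs` are hypothetical stand-ins for
# illustration only, not the baselines `RunningMeanStd` helper used by the class.
import numpy as np

class SimpleRunningMeanStd:
    def __init__(self, shape, epsilon=1e-4):
        self.mean = np.zeros(shape, dtype=np.float64)
        self.var = np.ones(shape, dtype=np.float64)
        self.count = epsilon

    def update(self, batch):
        # Parallel (Chan et al.) update of mean/variance with a new batch.
        batch_mean = batch.mean(axis=0)
        batch_var = batch.var(axis=0)
        batch_count = batch.shape[0]
        delta = batch_mean - self.mean
        total = self.count + batch_count
        new_mean = self.mean + delta * batch_count / total
        m_a = self.var * self.count
        m_b = batch_var * batch_count
        m2 = m_a + m_b + np.square(delta) * self.count * batch_count / total
        self.mean, self.var, self.count = new_mean, m2 / total, total

    @property
    def std(self):
        return np.sqrt(self.var)

def filter_obs(ob, rms, clip=5.0):
    # Same operation as tf.clip_by_value((ob - mean) / std, -5, 5) above;
    # the small epsilon is added here only for numerical safety.
    return np.clip((ob - rms.mean) / (rms.std + 1e-8), -clip, clip)

# Usage sketch:
# rms = SimpleRunningMeanStd(shape=(11,))
# rms.update(np.random.randn(256, 11))
# x = filter_obs(np.random.randn(11), rms)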
class CnnPolicy(object):
    recurrent = False

    def __init__(self, name, ob_space, ac_space, hid_size, num_hid_layers, kind='large'):
        with tf.variable_scope(name):
            self._init(ob_space, ac_space, hid_size, num_hid_layers, kind)
            self.scope = tf.get_variable_scope().name
            self.recurrent = False

    def _init(self, ob_space, ac_space, hid_size, num_hid_layers, kind):
        assert isinstance(ob_space, tuple)
        self.pdtype = pdtype = make_pdtype(ac_space)
        sequence_length = None

        ob_p = U.get_placeholder(name="ob_physics", dtype=tf.float32,
                                 shape=[sequence_length] + list(ob_space[0].shape))
        ob_f = U.get_placeholder(name="ob_frames", dtype=tf.float32,
                                 shape=[sequence_length] + list(ob_space[1].shape))
        self.ob = [ob_p, ob_f]

        # process ob_p: normalize the physics observation with a running mean/std filter
        with tf.variable_scope("obfilter"):
            self.ob_rms = RunningMeanStd(shape=ob_space[0].shape)
        obpz = tf.clip_by_value((ob_p - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)

        # process ob_f: scale image frames to [0, 1] and encode them with a CNN
        x = ob_f / 255.0
        x = self.img_encoder(x, kind)

        ob_last = tf.concat((obpz, x), axis=-1)

        with tf.variable_scope("vf"):
            last_out = ob_last
            for i in range(num_hid_layers):
                last_out = tf.nn.relu(tf.layers.dense(last_out, hid_size, name="fc%i" % (i + 1),
                                                      kernel_initializer=U.normc_initializer(1.0)))
            self.vpred_ext = tf.layers.dense(last_out, 1, name='vf_ext',
                                             kernel_initializer=U.normc_initializer(1.0))[:, 0]
            self.vpred_int = tf.layers.dense(last_out, 1, name='vf_int',
                                             kernel_initializer=U.normc_initializer(1.0))[:, 0]

        with tf.variable_scope("pol"):
            last_out = ob_last
            for i in range(num_hid_layers):
                last_out = tf.nn.tanh(tf.layers.dense(last_out, hid_size, name='fc%i' % (i + 1),
                                                      kernel_initializer=U.normc_initializer(1.0)))
            logits = tf.layers.dense(last_out, pdtype.param_shape()[0], name='logits',
                                     kernel_initializer=U.normc_initializer(0.01))
            self.pd = pdtype.pdfromflat(logits)

        self.state_in = []
        self.state_out = []

        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
        self._act = U.function([stochastic, ob_p, ob_f], [ac, self.vpred_ext, self.vpred_int])

    def img_encoder(self, x, kind):
        if kind == 'small':  # from A3C paper
            x = tf.nn.relu(U.conv2d(x, 16, "l1", [8, 8], [4, 4], pad="VALID"))
            x = tf.nn.relu(U.conv2d(x, 32, "l2", [4, 4], [2, 2], pad="VALID"))
            x = U.flattenallbut0(x)
            x = tf.nn.relu(tf.layers.dense(x, 256, name='lin',
                                           kernel_initializer=U.normc_initializer(1.0)))
        elif kind == 'large':  # Nature DQN
            x = tf.nn.relu(U.conv2d(x, 32, "l1", [8, 8], [4, 4], pad="VALID"))
            x = tf.nn.relu(U.conv2d(x, 64, "l2", [4, 4], [2, 2], pad="VALID"))
            x = tf.nn.relu(U.conv2d(x, 64, "l3", [3, 3], [1, 1], pad="VALID"))
            x = U.flattenallbut0(x)
            x = tf.nn.relu(tf.layers.dense(x, 512, name='lin',
                                           kernel_initializer=U.normc_initializer(1.0)))
        else:
            raise NotImplementedError
        return x

    def act(self, stochastic, ob):
        ob1, ob2 = ob
        ob2 = np.array(ob2)
        ac1, vpred_ext, vpred_int = self._act(stochastic, ob1, ob2)
        norm_ac1 = np.tanh(ac1)
        return norm_ac1[0], ac1[0], vpred_ext[0], vpred_int[0]

    def get_variables(self):
        return tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, self.scope)

    def get_trainable_variables(self):
        return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, self.scope)

    def get_initial_state(self):
        return []

    def update_obs_rms(self, ob):
        # zip() returns an iterator in Python 3, so materialize it before indexing.
        obp = np.array(list(zip(*ob.tolist()))[0])
        self.ob_rms.update(obp)
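# A hedged sketch of the feature sizes produced by `img_encoder`, assuming the
# common 84x84 frame resolution (the actual ob_space[1] shape is whatever the
# environment provides). With "VALID" padding the spatial size after a conv is
# floor((in - kernel) / stride) + 1. `conv_out` and `encoder_feature_size` are
# illustrative helpers, not part of the policy code.
def conv_out(size, kernel, stride):
    return (size - kernel) // stride + 1

def encoder_feature_size(size=84, kind='large'):
    if kind == 'small':    # A3C-style encoder: 16 and 32 filters
        s = conv_out(conv_out(size, 8, 4), 4, 2)
        return s * s * 32  # flattened features before the 256-unit dense layer
    elif kind == 'large':  # Nature-DQN encoder: 32, 64, 64 filters
        s = conv_out(conv_out(conv_out(size, 8, 4), 4, 2), 3, 1)
        return s * s * 64  # flattened features before the 512-unit dense layer
    raise NotImplementedError(kind)

# encoder_feature_size(84, 'large') -> 7 * 7 * 64 = 3136
# encoder_feature_size(84, 'small') -> 9 * 9 * 32 = 2592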
class EnsembleDDPG(object): def __init__(self, actor, critics, memory, observation_shape, action_shape, param_noise=None, action_noise=None, gamma=0.99, tau=0.005, normalize_returns=False, enable_popart=False, normalize_observations=False, batch_size=100, observation_range=(-np.inf, np.inf), action_range=(-1., 1.), return_range=(-np.inf, np.inf), adaptive_param_noise=True, adaptive_param_noise_policy_threshold=.1, action_noise_scale=0.2, action_noise_clip=0.5, critic_l2_reg=0., actor_lr=1e-4, critic_lr=1e-3, clip_norm=None, reward_scale=1., use_mpi_adam=False): # Inputs. self.obs0 = tf.placeholder(tf.float32, shape=(None,) + observation_shape, name='obs0') self.obs1 = tf.placeholder(tf.float32, shape=(None,) + observation_shape, name='obs1') self.terminals1 = tf.placeholder(tf.float32, shape=(None, 1), name='terminals1') self.rewards = tf.placeholder(tf.float32, shape=(None, 1), name='rewards') self.actions = tf.placeholder(tf.float32, shape=(None,) + action_shape, name='actions') self.param_noise_stddev = tf.placeholder(tf.float32, shape=(), name='param_noise_stddev') # Parameters. self.gamma = gamma self.tau = tau self.memory = memory self.normalize_observations = normalize_observations self.normalize_returns = normalize_returns self.action_noise = action_noise self.param_noise = param_noise self.action_range = action_range self.return_range = return_range self.observation_range = observation_range # settings self.use_mpi_adam = use_mpi_adam # set the list of critic self.critics = critics self.actor = actor self.actor_lr = actor_lr self.critic_lr = critic_lr self.clip_norm = clip_norm self.enable_popart = enable_popart self.reward_scale = reward_scale self.batch_size = batch_size self.stats_sample = None self.critic_l2_reg = critic_l2_reg # remember the noise scale and clip self.action_noise_scale = action_noise_scale self.action_noise_clip = action_noise_clip # Observation normalization. if self.normalize_observations: with tf.variable_scope('obs_rms'): self.obs_rms = RunningMeanStd(shape=observation_shape) else: self.obs_rms = None normalized_obs0 = tf.clip_by_value(normalize(self.obs0, self.obs_rms), self.observation_range[0], self.observation_range[1]) normalized_obs1 = tf.clip_by_value(normalize(self.obs1, self.obs_rms), self.observation_range[0], self.observation_range[1]) # Return normalization. if self.normalize_returns: with tf.variable_scope('ret_rms'): self.ret_rms = RunningMeanStd() else: self.ret_rms = None # Create target networks. target_actor = copy(actor) target_actor.name = 'target_actor' self.target_actor = target_actor # set up different target critics, primary and supplementary target_critics = [] for critic in critics: target_critic = copy(critic) target_critic.name = 'target_' + critic.name target_critics.append(target_critic) self.target_critics = target_critics # Create networks and core TF parts that are shared across setup parts. 
# actor_tf pi(s) is built from the actor and normalized_obs0 self.actor_tf = actor(normalized_obs0) # normalized_critic_tf normalized Q(s,a) is built from the observation and action self.normalized_critic_tfs = [critic(normalized_obs0, self.actions) for critic in critics] self.normalized_critic_tf_main = self.normalized_critic_tfs[0] # critic_tf Q(s,a) is built from de-normalization and clipping from normalized Q(s,a) self.critic_tfs = [ denormalize(tf.clip_by_value(normalized_critic_tf, self.return_range[0], self.return_range[1]), self.ret_rms) for normalized_critic_tf in self.normalized_critic_tfs] self.critic_tf_mean = 0 for critic_tf in self.critic_tfs: self.critic_tf_mean += critic_tf self.critic_tf_mean *= 1.0 / len(self.critic_tfs) # normalized_critic_with_actor_tf normalized Q(s,pi(s)) is built from the observation, # and action provided by actor self.normalized_critic_with_actor_tfs = [ critic(normalized_obs0, self.actor_tf, reuse=True) for critic in critics ] # critic_with_actor_tf is built from de-normalization and clipping from normalized Q(s,pi(s)) self.critic_with_actor_tfs = [denormalize( tf.clip_by_value(normalized_critic_with_actor_tf, self.return_range[0], self.return_range[1]), self.ret_rms) for normalized_critic_with_actor_tf in self.normalized_critic_with_actor_tfs ] self.critic_with_actor_tf_main = self.critic_with_actor_tfs[0] self.critic_with_actor_tf_mean = 0 for critic_with_actor_tf in self.critic_with_actor_tfs: self.critic_with_actor_tf_mean += critic_with_actor_tf self.critic_with_actor_tf_mean *= 1.0 / len(self.critics) # Q_obs1 Q(s',pi'(s)) is built from next state s'(observation), target actor pi', # and de-normalization target_action = target_actor(normalized_obs1) self.target_Q_vals = [] self.target_Q_val_mean = 0 # add noise in target critic functions for target_critic in target_critics: target_action_noise = tf.clip_by_value(tf.random_normal( tf.shape(target_action), mean=0.0, stddev=action_noise_scale, dtype=tf.float32), clip_value_min=-action_noise_clip, clip_value_max=action_noise_clip) noisy_target_action = tf.clip_by_value(target_action + target_action_noise, clip_value_min=action_range[0], clip_value_max=action_range[1]) target_Q_obs = denormalize(target_critic(normalized_obs1, noisy_target_action), self.ret_rms) target_Q_val = self.rewards + (1. - self.terminals1) * gamma * target_Q_obs self.target_Q_vals.append(target_Q_val) self.target_Q_val_mean += target_Q_val self.target_Q_val_mean *= 1.0 / (len(critics)) # merge trainable variables into one set self.target_critic_vars = [] self.critic_vars = [] self.critic_trainable_vars = [] for critic in critics: self.critic_vars += critic.vars self.critic_trainable_vars += critic.trainable_vars for target_critic in target_critics: self.target_critic_vars += target_critic.vars # Set up parts. 
if self.param_noise is not None: self.setup_param_noise(normalized_obs0) # setup optimizer self.setup_actor_optimizer() self.setup_critic_optimizer() if self.normalize_returns and self.enable_popart: self.setup_popart() self.setup_stats() self.setup_target_network_updates() def setup_target_network_updates(self): if self.use_mpi_adam: actor_init_updates, actor_soft_updates = get_target_updates(self.actor.vars, self.target_actor.vars, self.tau) critic_init_updates, critic_soft_updates = get_target_updates(self.critic_vars, self.target_critic_vars, self.tau) self.target_init_updates = [actor_init_updates, critic_init_updates] self.target_soft_updates = [actor_soft_updates, critic_soft_updates] self.target_soft_update_actor = actor_soft_updates self.target_soft_update_critic = critic_soft_updates else: actor_init_updates, actor_soft_updates = get_target_updates(self.actor.trainable_vars, self.target_actor.vars, self.tau) critic_init_updates, critic_soft_updates = get_target_updates(self.critic_trainable_vars, self.target_critic_vars, self.tau) self.target_init_updates = [actor_init_updates, critic_init_updates] self.target_soft_updates = [actor_soft_updates, critic_soft_updates] self.target_soft_update_actor = actor_soft_updates self.target_soft_update_critic = critic_soft_updates def setup_param_noise(self, normalized_obs0): assert self.param_noise is not None # Configure perturbed actor. param_noise_actor = copy(self.actor) param_noise_actor.name = 'param_noise_actor' self.perturbed_actor_tf = param_noise_actor(normalized_obs0) logger.info('setting up param noise') self.perturb_policy_ops = get_perturbed_actor_updates(self.actor, param_noise_actor, self.param_noise_stddev) # Configure separate copy for stddev adoption. adaptive_param_noise_actor = copy(self.actor) adaptive_param_noise_actor.name = 'adaptive_param_noise_actor' adaptive_actor_tf = adaptive_param_noise_actor(normalized_obs0) self.perturb_adaptive_policy_ops = get_perturbed_actor_updates(self.actor, adaptive_param_noise_actor, self.param_noise_stddev) self.adaptive_policy_distance = tf.sqrt(tf.reduce_mean(tf.square(self.actor_tf - adaptive_actor_tf))) def setup_actor_optimizer(self): logger.info('setting up actor optimizer') # Here use the Q(s,pi(s)) as the loss function # use primary critic function to generate policy updates self.actor_loss = -tf.reduce_mean(self.critic_with_actor_tf_mean) self.actor_loss_array = [] for critic_with_actor_tf in self.critic_with_actor_tfs: self.actor_loss_array.append(-tf.reduce_mean(critic_with_actor_tf)) actor_shapes = [var.get_shape().as_list() for var in self.actor.trainable_vars] actor_nb_params = sum([reduce(lambda x, y: x * y, shape) for shape in actor_shapes]) logger.info(' actor shapes: {}'.format(actor_shapes)) logger.info(' actor params: {}'.format(actor_nb_params)) self.actor_grads = [] for actor_loss in self.actor_loss_array: self.actor_grads.append(tf.reshape( U.flatgrad(actor_loss, self.actor.trainable_vars, clip_norm=self.clip_norm), shape=[-1,1])) self.actor_grad_array = tf.concat(self.actor_grads,axis=1) self.actor_grad_array = tf.reshape(self.actor_grad_array, shape=[-1, len(self.critics)]) self.actor_grad_mean = tf.reduce_mean(self.actor_grad_array, axis=1) self.actor_grad_var = reduce_var(self.actor_grad_array, axis=1) # sum up the gradients self.actor_grad_var_std = tf.sqrt(tf.reduce_sum(self.actor_grad_var)) # print the shape of gradients print("[Tiancheng Shape] Actor Grad Array", self.actor_grad_array.shape) print("[Tiancheng Shape] Actor Grad Mean", 
self.actor_grad_mean.shape) print("[Tiancheng Shape] Actor Grad Variance", self.actor_grad_var.shape) print("[Tiancheng Shape] Actor Grad VarStd", self.actor_grad_var_std.shape) # add support to single-threaded adam self.actor_grads = U.flatgrad(self.actor_loss, self.actor.trainable_vars, clip_norm=self.clip_norm) if self.use_mpi_adam: self.actor_optimizer = MpiAdam(var_list=self.actor.trainable_vars, beta1=0.9, beta2=0.999, epsilon=1e-08) else: self.actor_grads = list( zip(tf.gradients(self.actor_loss, self.actor.trainable_vars), self.actor.trainable_vars)) self.actor_optimizer = tf.train.AdamOptimizer(learning_rate=self.actor_lr,beta1=0.9, beta2=0.999, epsilon=1e-08) self.actor_train = self.actor_optimizer.apply_gradients(self.actor_grads) def setup_critic_optimizer(self): logger.info('setting up critic optimizer') # normalize critic target, normalized y ( not sure we need to use different target values here. ) # TODO: abandon static evaluation of critic target values, use dynamic computing method # Use square error between normalized_critic_tf normalized Q(s,a) and normalized critic_target y # ( not use denormalized version ) as loss function, for two different critic, we need to train them both self.critic_loss = 0 # merge the critic loss for all the Q value functions for normalized_critic_tf, critic_target_tf in zip(self.normalized_critic_tfs,self.target_Q_vals): normalized_critic_target_tf = tf.clip_by_value(normalize(tf.stop_gradient(critic_target_tf), self.ret_rms), self.return_range[0], self.return_range[1]) self.critic_loss += tf.reduce_mean(tf.square(normalized_critic_tf - normalized_critic_target_tf)) # apply l2_regularization on some trainable variables and add them into loss function if self.critic_l2_reg > 0.: critic_reg_vars = [var for var in self.critic_trainable_vars if 'kernel' in var.name and 'output' not in var.name] for var in critic_reg_vars: logger.info(' regularizing: {}'.format(var.name)) logger.info(' applying l2 regularization with {}'.format(self.critic_l2_reg)) critic_reg = tc.layers.apply_regularization( tc.layers.l2_regularizer(self.critic_l2_reg), weights_list=critic_reg_vars ) self.critic_loss += critic_reg critic_shapes = [var.get_shape().as_list() for var in self.critic_trainable_vars] critic_nb_params = sum([reduce(lambda x, y: x * y, shape) for shape in critic_shapes]) logger.info(' critic shapes: {}'.format(critic_shapes)) logger.info(' critic params: {}'.format(critic_nb_params)) self.critic_grads = U.flatgrad(self.critic_loss, self.critic_trainable_vars, clip_norm=self.clip_norm) # un-flatten the gradients for several critics, and compute moment self.critic_grad_array = tf.reshape(self.critic_grads,shape=[-1,len(self.critics)]) self.critic_grad_mean = tf.reduce_mean(self.critic_grad_array,axis=1) self.critic_grad_var = reduce_var(self.critic_grad_array,axis=1) # sum up the gradients self.critic_grad_var_std = tf.sqrt(tf.reduce_sum(self.critic_grad_var)) # print the shape of gradients print("[Tiancheng Shape] Critic Grad Array", self.critic_grad_array.shape) print("[Tiancheng Shape] Critic Grad Mean", self.critic_grad_mean.shape) print("[Tiancheng Shape] Critic Grad Variance", self.critic_grad_var.shape) print("[Tiancheng Shape] Critic Grad VarStd", self.critic_grad_var_std.shape) # add support to single-thread adam if self.use_mpi_adam: self.critic_optimizer = MpiAdam(var_list=self.critic_trainable_vars, beta1=0.9, beta2=0.999, epsilon=1e-08) else: self.critic_grads = list( zip(tf.gradients(self.critic_loss, self.critic_trainable_vars), 
self.critic_trainable_vars)) self.critic_optimizer = tf.train.AdamOptimizer(learning_rate=self.critic_lr, beta1=0.9, beta2=0.999, epsilon=1e-08) self.critic_train = self.critic_optimizer.apply_gradients(self.critic_grads) def setup_popart(self): # See https://arxiv.org/pdf/1602.07714.pdf for details. self.old_std = tf.placeholder(tf.float32, shape=[1], name='old_std') new_std = self.ret_rms.std self.old_mean = tf.placeholder(tf.float32, shape=[1], name='old_mean') new_mean = self.ret_rms.mean self.renormalize_Q_outputs_op = [] self.critic_output_vars = [] self.target_output_vars = [] for critic, target_critic in zip(self.critics,self.target_critics): self.critic_output_vars += critic.output_vars self.target_output_vars += target_critic.output_vars for vs in [self.critic_output_vars,self.target_output_vars]: assert len(vs) == 2 M, b = vs assert 'kernel' in M.name assert 'bias' in b.name assert M.get_shape()[-1] == 1 assert b.get_shape()[-1] == 1 self.renormalize_Q_outputs_op += [M.assign(M * self.old_std / new_std)] self.renormalize_Q_outputs_op += [b.assign((b * self.old_std + self.old_mean - new_mean) / new_std)] def setup_stats(self): ops = [] names = [] if self.normalize_returns: ops += [self.ret_rms.mean, self.ret_rms.std] names += ['ret_rms_mean', 'ret_rms_std'] if self.normalize_observations: ops += [tf.reduce_mean(self.obs_rms.mean), tf.reduce_mean(self.obs_rms.std)] names += ['obs_rms_mean', 'obs_rms_std'] # TODO: compute the variance of values and gradient, for both J and Q ops += [tf.reduce_mean(self.critic_tf_mean)] names += ['MeanQ_mean_over_states'] ops += [reduce_std(self.critic_tf_mean)] names += ['MeanQ_std_over_states'] # print the shape of gradients ops += [self.actor_grad_var_std] names += ['Actor Grad Variance Std'] ops += [tf.norm(self.actor_grad_mean)] names += ['Actor Grad Mean Norm'] # print the shape of gradients ops += [self.critic_grad_var_std] names += ['Critic Grad Variance Std'] ops += [tf.norm(self.critic_grad_mean)] names += ['Critic Grad Mean Norm'] # TODO: outdated stats need to be re-arranged # ops += [tf.reduce_mean(self.critic_tf0)] # names += ['reference_Q0_mean'] # ops += [reduce_std(self.critic_tf0)] # names += ['reference_Q0_std'] # # ops += [tf.reduce_mean(self.critic_tf1)] # names += ['reference_Q1_mean'] # ops += [reduce_std(self.critic_tf1)] # names += ['reference_Q1_std'] # # ops += [tf.reduce_mean(self.critic_with_actor_tf0)] # names += ['reference_actor_Q0_mean'] # ops += [reduce_std(self.critic_with_actor_tf0)] # names += ['reference_actor_Q0_std'] # # ops += [tf.reduce_mean(self.critic_with_actor_tf1)] # names += ['reference_actor_Q1_mean'] # ops += [reduce_std(self.critic_with_actor_tf1)] # names += ['reference_actor_Q1_std'] # # ops += [tf.reduce_mean(self.actor_tf)] # names += ['reference_action_mean'] # ops += [reduce_std(self.actor_tf)] # names += ['reference_action_std'] if self.param_noise: ops += [tf.reduce_mean(self.perturbed_actor_tf)] names += ['reference_perturbed_action_mean'] ops += [reduce_std(self.perturbed_actor_tf)] names += ['reference_perturbed_action_std'] self.stats_ops = ops self.stats_names = names # compute the action from the observation pi(s) # has an option to compute the q function at the same time def pi(self, obs, apply_noise=True, compute_Q=True): if self.param_noise is not None and apply_noise: actor_tf = self.perturbed_actor_tf else: actor_tf = self.actor_tf feed_dict = {self.obs0: [obs]} if compute_Q: # TODO: not sure what to do for this critic_with_actor_tf, set to critic_with_actor_tf0 action, q = 
self.sess.run([actor_tf, self.critic_with_actor_tf_main], feed_dict=feed_dict) else: action = self.sess.run(actor_tf, feed_dict=feed_dict) q = None action = action.flatten() if self.action_noise is not None and apply_noise: noise = self.action_noise() assert noise.shape == action.shape action += noise action = np.clip(action, self.action_range[0], self.action_range[1]) return action, q def store_transition(self, obs0, action, reward, obs1, terminal1): reward *= self.reward_scale self.memory.append(obs0, action, reward, obs1, terminal1) if self.normalize_observations: self.obs_rms.update(np.array([obs0])) def train(self, take_update=True,stop_critic_training=False,stop_actor_training=False): # Get a batch. batch = self.memory.sample(batch_size=self.batch_size) # if self.normalize_returns and self.enable_popart: # # compute old mean, old std and target Q values # # old mean and std is used for normalization # # and target Q values for # old_mean, old_std, target_Q = self.sess.run([self.ret_rms.mean, self.ret_rms.std, self.target_Q], feed_dict={ # self.obs1: batch['obs1'], # self.rewards: batch['rewards'], # self.terminals1: batch['terminals1'].astype('float32'), # }) # # # compute something # self.ret_rms.update(target_Q.flatten()) # self.sess.run(self.renormalize_Q_outputs_op, feed_dict={ # self.old_std : np.array([old_std]), # self.old_mean : np.array([old_mean]), # }) # # # Run sanity check. Disabled by default since it slows down things considerably. # # print('running sanity check') # # target_Q_new, new_mean, new_std = self.sess.run([self.target_Q, self.ret_rms.mean, self.ret_rms.std], feed_dict={ # # self.obs1: batch['obs1'], # # self.rewards: batch['rewards'], # # self.terminals1: batch['terminals1'].astype('float32'), # # }) # # print(target_Q_new, target_Q, new_mean, new_std) # # assert (np.abs(target_Q - target_Q_new) < 1e-3).all() # else: # # compute target Q value functions ( ( 1 - terminal ) * gamma * Q(s,pi(s)) + r ) # target_Q = self.sess.run([self.target_Q], feed_dict={ # self.obs1: batch['obs1'], # self.rewards: batch['rewards'], # self.terminals1: batch['terminals1'].astype('float32'), # }) # Get all gradients and perform a "synced update". 
# compute the gradients of actor and critic if self.use_mpi_adam: ops = [self.critic_grads, self.critic_loss] critic_grads, critic_loss = self.sess.run(ops, feed_dict={ self.obs0: batch['obs0'], self.actions: batch['actions'], self.obs1: batch['obs1'], self.rewards: batch['rewards'], self.terminals1: batch['terminals1'].astype('float32'), }) if not stop_critic_training: self.critic_optimizer.update(critic_grads, stepsize=self.critic_lr) if take_update: ops = [self.actor_grads, self.actor_loss] actor_grads, actor_loss = self.sess.run(ops, feed_dict={ self.obs0: batch['obs0'], }) if not stop_actor_training: self.actor_optimizer.update(actor_grads, stepsize=self.actor_lr) return critic_loss, actor_loss else: if stop_critic_training: ops = [self.critic_grads, self.critic_grads, self.critic_loss] else: ops = [self.critic_train, self.critic_grads, self.critic_loss] _, critic_grads, critic_loss = self.sess.run(ops, feed_dict={ self.obs0: batch['obs0'], self.actions: batch['actions'], self.obs1: batch['obs1'], self.rewards: batch['rewards'], self.terminals1: batch['terminals1'].astype('float32'), }) if take_update: if stop_actor_training: ops = [self.actor_grads, self.actor_grads, self.actor_loss] else: ops = [self.actor_train, self.actor_grads, self.actor_loss] _, actor_grads, actor_loss = self.sess.run(ops, feed_dict={ self.obs0: batch['obs0'], }) return critic_loss, actor_loss return critic_loss, 0 def initialize(self, sess): self.sess = sess self.sess.run(tf.global_variables_initializer()) if self.use_mpi_adam: self.actor_optimizer.sync() self.critic_optimizer.sync() self.sess.run(self.target_init_updates) def update_target_net(self): self.sess.run(self.target_soft_updates) def get_stats(self): if self.stats_sample is None: # Get a sample and keep that fixed for all further computations. # This allows us to estimate the change in value for the same set of inputs. self.stats_sample = self.memory.sample(batch_size=self.batch_size) values = self.sess.run(self.stats_ops, feed_dict={ self.obs0: self.stats_sample['obs0'], self.actions: self.stats_sample['actions'], self.obs1: self.stats_sample['obs1'], self.rewards: self.stats_sample['rewards'], self.terminals1: self.stats_sample['terminals1'].astype('float32'), }) names = self.stats_names[:] assert len(names) == len(values) stats = dict(zip(names, values)) if self.param_noise is not None: stats = {**stats, **self.param_noise.get_stats()} return stats def adapt_param_noise(self): if self.param_noise is None: return 0. # Perturb a separate copy of the policy to adjust the scale for the next "real" perturbation. batch = self.memory.sample(batch_size=self.batch_size) self.sess.run(self.perturb_adaptive_policy_ops, feed_dict={ self.param_noise_stddev: self.param_noise.current_stddev, }) distance = self.sess.run(self.adaptive_policy_distance, feed_dict={ self.obs0: batch['obs0'], self.param_noise_stddev: self.param_noise.current_stddev, }) mean_distance = MPI.COMM_WORLD.allreduce(distance, op=MPI.SUM) / MPI.COMM_WORLD.Get_size() self.param_noise.adapt(mean_distance) return mean_distance def reset(self): # Reset internal state after an episode is complete. if self.action_noise is not None: self.action_noise.reset() if self.param_noise is not None: self.sess.run(self.perturb_policy_ops, feed_dict={ self.param_noise_stddev: self.param_noise.current_stddev, })
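# A minimal NumPy sketch of how EnsembleDDPG forms its critic targets (see the
# target_critics loop in __init__): each target critic evaluates the target action
# perturbed by clipped Gaussian noise, and the resulting targets are averaged over
# the ensemble, matching target_Q_val_mean above. `ensemble_target_q` and
# `target_q_fns` are hypothetical names used only for this illustration.
import numpy as np

def ensemble_target_q(reward, terminal, next_action, target_q_fns, next_obs,
                      gamma=0.99, noise_scale=0.2, noise_clip=0.5,
                      action_low=-1.0, action_high=1.0):
    targets = []
    for q_fn in target_q_fns:
        # Clipped Gaussian noise on the target action (target policy smoothing).
        noise = np.clip(np.random.normal(0.0, noise_scale, size=next_action.shape),
                        -noise_clip, noise_clip)
        noisy_action = np.clip(next_action + noise, action_low, action_high)
        q_next = q_fn(next_obs, noisy_action)
        targets.append(reward + (1.0 - terminal) * gamma * q_next)
    # Average over the ensemble, as in target_Q_val_mean.
    return np.mean(targets, axis=0)

# Usage sketch with two dummy critics:
# q_fns = [lambda s, a: (s * a).sum(axis=-1, keepdims=True) for _ in range(2)]
# y = ensemble_target_q(np.ones((4, 1)), np.zeros((4, 1)),
#                       np.zeros((4, 3)), q_fns, np.ones((4, 3)))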
class DDPG(tf.Module): def __init__(self, actor, critic, memory, observation_shape, action_shape, param_noise=None, action_noise=None, gamma=0.99, tau=0.001, normalize_returns=False, enable_popart=False, normalize_observations=True, batch_size=128, observation_range=(-5., 5.), action_range=(-1., 1.), return_range=(-np.inf, np.inf), critic_l2_reg=0., actor_lr=1e-4, critic_lr=1e-3, clip_norm=None, reward_scale=1.): # Parameters. self.gamma = gamma self.tau = tau self.memory = memory self.normalize_observations = normalize_observations self.normalize_returns = normalize_returns self.action_noise = action_noise self.param_noise = param_noise self.action_range = action_range self.return_range = return_range self.observation_range = observation_range self.observation_shape = observation_shape self.critic = critic self.actor = actor self.clip_norm = clip_norm self.enable_popart = enable_popart self.reward_scale = reward_scale self.batch_size = batch_size self.stats_sample = None self.critic_l2_reg = critic_l2_reg self.actor_lr = tf.constant(actor_lr) self.critic_lr = tf.constant(critic_lr) # Observation normalization. if self.normalize_observations: with tf.name_scope('obs_rms'): self.obs_rms = RunningMeanStd(shape=observation_shape) else: self.obs_rms = None # Return normalization. if self.normalize_returns: with tf.name_scope('ret_rms'): self.ret_rms = RunningMeanStd() else: self.ret_rms = None # Create target networks. self.target_critic = Critic(actor.nb_actions, observation_shape, name='target_critic', network=critic.network, **critic.network_kwargs) self.target_actor = Actor(actor.nb_actions, observation_shape, name='target_actor', network=actor.network, **actor.network_kwargs) # Set up parts. if self.param_noise is not None: self.setup_param_noise() if MPI is not None: comm = MPI.COMM_WORLD self.actor_optimizer = MpiAdamOptimizer(comm, self.actor.trainable_variables) self.critic_optimizer = MpiAdamOptimizer(comm, self.critic.trainable_variables) else: self.actor_optimizer = tf.keras.optimizers.Adam(learning_rate=actor_lr) self.critic_optimizer = tf.keras.optimizers.Adam(learning_rate=critic_lr) logger.info('setting up actor optimizer') actor_shapes = [var.get_shape().as_list() for var in self.actor.trainable_variables] actor_nb_params = sum([reduce(lambda x, y: x * y, shape) for shape in actor_shapes]) logger.info(' actor shapes: {}'.format(actor_shapes)) logger.info(' actor params: {}'.format(actor_nb_params)) logger.info('setting up critic optimizer') critic_shapes = [var.get_shape().as_list() for var in self.critic.trainable_variables] critic_nb_params = sum([reduce(lambda x, y: x * y, shape) for shape in critic_shapes]) logger.info(' critic shapes: {}'.format(critic_shapes)) logger.info(' critic params: {}'.format(critic_nb_params)) if self.critic_l2_reg > 0.: critic_reg_vars = [] for layer in self.critic.network_builder.layers[1:]: critic_reg_vars.append(layer.kernel) for var in critic_reg_vars: logger.info(' regularizing: {}'.format(var.name)) logger.info(' applying l2 regularization with {}'.format(self.critic_l2_reg)) logger.info('setting up critic target updates ...') for var, target_var in zip(self.critic.variables, self.target_critic.variables): logger.info(' {} <- {}'.format(target_var.name, var.name)) logger.info('setting up actor target updates ...') for var, target_var in zip(self.actor.variables, self.target_actor.variables): logger.info(' {} <- {}'.format(target_var.name, var.name)) if self.param_noise: logger.info('setting up param noise') for var, perturbed_var in 
zip(self.actor.variables, self.perturbed_actor.variables): if var in actor.perturbable_vars: logger.info(' {} <- {} + noise'.format(perturbed_var.name, var.name)) else: logger.info(' {} <- {}'.format(perturbed_var.name, var.name)) for var, perturbed_var in zip(self.actor.variables, self.perturbed_adaptive_actor.variables): if var in actor.perturbable_vars: logger.info(' {} <- {} + noise'.format(perturbed_var.name, var.name)) else: logger.info(' {} <- {}'.format(perturbed_var.name, var.name)) if self.normalize_returns and self.enable_popart: self.setup_popart() self.initial_state = None # recurrent architectures not supported yet def setup_param_noise(self): assert self.param_noise is not None # Configure perturbed actor. self.perturbed_actor = Actor(self.actor.nb_actions, self.observation_shape, name='param_noise_actor', network=self.actor.network, **self.actor.network_kwargs) # Configure separate copy for stddev adoption. self.perturbed_adaptive_actor = Actor(self.actor.nb_actions, self.observation_shape, name='adaptive_param_noise_actor', network=self.actor.network, **self.actor.network_kwargs) def setup_popart(self): # See https://arxiv.org/pdf/1602.07714.pdf for details. for vs in [self.critic.output_vars, self.target_critic.output_vars]: assert len(vs) == 2 M, b = vs assert 'kernel' in M.name assert 'bias' in b.name assert M.get_shape()[-1] == 1 assert b.get_shape()[-1] == 1 @tf.function def step(self, obs, apply_noise=True, compute_Q=True): normalized_obs = tf.clip_by_value(normalize(obs, self.obs_rms), self.observation_range[0], self.observation_range[1]) actor_tf = self.actor(normalized_obs) if self.param_noise is not None and apply_noise: action = self.perturbed_actor(normalized_obs) else: action = actor_tf if compute_Q: normalized_critic_with_actor_tf = self.critic(normalized_obs, actor_tf) q = denormalize(tf.clip_by_value(normalized_critic_with_actor_tf, self.return_range[0], self.return_range[1]), self.ret_rms) else: q = None if self.action_noise is not None and apply_noise: noise = self.action_noise() action += noise action = tf.clip_by_value(action, self.action_range[0], self.action_range[1]) return action, q, None, None def store_transition(self, obs0, action, reward, obs1, terminal1): reward *= self.reward_scale B = obs0.shape[0] for b in range(B): self.memory.append(obs0[b], action[b], reward[b], obs1[b], terminal1[b]) if self.normalize_observations: self.obs_rms.update(np.array([obs0[b]])) def train(self): batch = self.memory.sample(batch_size=self.batch_size) obs0, obs1 = tf.constant(batch['obs0']), tf.constant(batch['obs1']) actions, rewards, terminals1 = tf.constant(batch['actions']), tf.constant(batch['rewards']), tf.constant(batch['terminals1'], dtype=tf.float32) normalized_obs0, target_Q = self.compute_normalized_obs0_and_target_Q(obs0, obs1, rewards, terminals1) if self.normalize_returns and self.enable_popart: old_mean = self.ret_rms.mean old_std = self.ret_rms.std self.ret_rms.update(target_Q.flatten()) # renormalize Q outputs new_mean = self.ret_rms.mean new_std = self.ret_rms.std for vs in [self.critic.output_vars, self.target_critic.output_vars]: kernel, bias = vs kernel.assign(kernel * old_std / new_std) bias.assign((bias * old_std + old_mean - new_mean) / new_std) actor_grads, actor_loss = self.get_actor_grads(normalized_obs0) critic_grads, critic_loss = self.get_critic_grads(normalized_obs0, actions, target_Q) if MPI is not None: self.actor_optimizer.apply_gradients(actor_grads, self.actor_lr) self.critic_optimizer.apply_gradients(critic_grads, 
self.critic_lr) else: self.actor_optimizer.apply_gradients(zip(actor_grads, self.actor.trainable_variables)) self.critic_optimizer.apply_gradients(zip(critic_grads, self.critic.trainable_variables)) return critic_loss, actor_loss @tf.function def compute_normalized_obs0_and_target_Q(self, obs0, obs1, rewards, terminals1): normalized_obs0 = tf.clip_by_value(normalize(obs0, self.obs_rms), self.observation_range[0], self.observation_range[1]) normalized_obs1 = tf.clip_by_value(normalize(obs1, self.obs_rms), self.observation_range[0], self.observation_range[1]) Q_obs1 = denormalize(self.target_critic(normalized_obs1, self.target_actor(normalized_obs1)), self.ret_rms) target_Q = rewards + (1. - terminals1) * self.gamma * Q_obs1 return normalized_obs0, target_Q @tf.function def get_actor_grads(self, normalized_obs0): with tf.GradientTape() as tape: actor_tf = self.actor(normalized_obs0) normalized_critic_with_actor_tf = self.critic(normalized_obs0, actor_tf) critic_with_actor_tf = denormalize(tf.clip_by_value(normalized_critic_with_actor_tf, self.return_range[0], self.return_range[1]), self.ret_rms) actor_loss = -tf.reduce_mean(critic_with_actor_tf) actor_grads = tape.gradient(actor_loss, self.actor.trainable_variables) if self.clip_norm: actor_grads = [tf.clip_by_norm(grad, clip_norm=self.clip_norm) for grad in actor_grads] if MPI is not None: actor_grads = tf.concat([tf.reshape(g, (-1,)) for g in actor_grads], axis=0) return actor_grads, actor_loss @tf.function def get_critic_grads(self, normalized_obs0, actions, target_Q): with tf.GradientTape() as tape: normalized_critic_tf = self.critic(normalized_obs0, actions) normalized_critic_target_tf = tf.clip_by_value(normalize(target_Q, self.ret_rms), self.return_range[0], self.return_range[1]) critic_loss = tf.reduce_mean(tf.square(normalized_critic_tf - normalized_critic_target_tf)) # The first is input layer, which is ignored here. if self.critic_l2_reg > 0.: # Ignore the first input layer. for layer in self.critic.network_builder.layers[1:]: # The original l2_regularizer takes half of sum square. critic_loss += (self.critic_l2_reg / 2.)* tf.reduce_sum(tf.square(layer.kernel)) critic_grads = tape.gradient(critic_loss, self.critic.trainable_variables) if self.clip_norm: critic_grads = [tf.clip_by_norm(grad, clip_norm=self.clip_norm) for grad in critic_grads] if MPI is not None: critic_grads = tf.concat([tf.reshape(g, (-1,)) for g in critic_grads], axis=0) return critic_grads, critic_loss def initialize(self): if MPI is not None: sync_from_root(self.actor.trainable_variables + self.critic.trainable_variables) self.target_actor.set_weights(self.actor.get_weights()) self.target_critic.set_weights(self.critic.get_weights()) @tf.function def update_target_net(self): for var, target_var in zip(self.actor.variables, self.target_actor.variables): target_var.assign((1. - self.tau) * target_var + self.tau * var) for var, target_var in zip(self.critic.variables, self.target_critic.variables): target_var.assign((1. - self.tau) * target_var + self.tau * var) def get_stats(self): if self.stats_sample is None: # Get a sample and keep that fixed for all further computations. # This allows us to estimate the change in value for the same set of inputs. 
self.stats_sample = self.memory.sample(batch_size=self.batch_size) obs0 = self.stats_sample['obs0'] actions = self.stats_sample['actions'] normalized_obs0 = tf.clip_by_value(normalize(obs0, self.obs_rms), self.observation_range[0], self.observation_range[1]) normalized_critic_tf = self.critic(normalized_obs0, actions) critic_tf = denormalize(tf.clip_by_value(normalized_critic_tf, self.return_range[0], self.return_range[1]), self.ret_rms) actor_tf = self.actor(normalized_obs0) normalized_critic_with_actor_tf = self.critic(normalized_obs0, actor_tf) critic_with_actor_tf = denormalize(tf.clip_by_value(normalized_critic_with_actor_tf, self.return_range[0], self.return_range[1]), self.ret_rms) stats = {} if self.normalize_returns: stats['ret_rms_mean'] = self.ret_rms.mean stats['ret_rms_std'] = self.ret_rms.std if self.normalize_observations: stats['obs_rms_mean'] = tf.reduce_mean(self.obs_rms.mean) stats['obs_rms_std'] = tf.reduce_mean(self.obs_rms.std) stats['reference_Q_mean'] = tf.reduce_mean(critic_tf) stats['reference_Q_std'] = reduce_std(critic_tf) stats['reference_actor_Q_mean'] = tf.reduce_mean(critic_with_actor_tf) stats['reference_actor_Q_std'] = reduce_std(critic_with_actor_tf) stats['reference_action_mean'] = tf.reduce_mean(actor_tf) stats['reference_action_std'] = reduce_std(actor_tf) if self.param_noise: perturbed_actor_tf = self.perturbed_actor(normalized_obs0) stats['reference_perturbed_action_mean'] = tf.reduce_mean(perturbed_actor_tf) stats['reference_perturbed_action_std'] = reduce_std(perturbed_actor_tf) stats.update(self.param_noise.get_stats()) return stats def adapt_param_noise(self, obs0): try: from mpi4py import MPI except ImportError: MPI = None if self.param_noise is None: return 0. mean_distance = self.get_mean_distance(obs0).numpy() if MPI is not None: mean_distance = MPI.COMM_WORLD.allreduce(mean_distance, op=MPI.SUM) / MPI.COMM_WORLD.Get_size() self.param_noise.adapt(mean_distance) return mean_distance @tf.function def get_mean_distance(self, obs0): # Perturb a separate copy of the policy to adjust the scale for the next "real" perturbation. update_perturbed_actor(self.actor, self.perturbed_adaptive_actor, self.param_noise.current_stddev) normalized_obs0 = tf.clip_by_value(normalize(obs0, self.obs_rms), self.observation_range[0], self.observation_range[1]) actor_tf = self.actor(normalized_obs0) adaptive_actor_tf = self.perturbed_adaptive_actor(normalized_obs0) mean_distance = tf.sqrt(tf.reduce_mean(tf.square(actor_tf - adaptive_actor_tf))) return mean_distance def reset(self): # Reset internal state after an episode is complete. if self.action_noise is not None: self.action_noise.reset() if self.param_noise is not None: update_perturbed_actor(self.actor, self.perturbed_actor, self.param_noise.current_stddev)
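# A minimal NumPy sketch of the Polyak soft update used in `update_target_net`
# above: each call moves every target weight a fraction tau toward the online
# weight, so the target networks trail the online networks smoothly. The arrays
# below are hypothetical weight tensors, not the actual Keras variables.
import numpy as np

def soft_update(online_weights, target_weights, tau=0.001):
    return [(1.0 - tau) * t + tau * o for o, t in zip(online_weights, target_weights)]

# Usage sketch:
# online = [np.ones((4, 4)), np.zeros(4)]
# target = [np.zeros((4, 4)), np.ones(4)]
# target = soft_update(online, target, tau=0.001)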
class Model(object): def __init__(self, network, env, gamma=1, tau=0.01, total_timesteps=1e6, normalize_observations=True, normalize_returns=False, enable_popart=False, noise_type='adaptive-param_0.2', clip_norm=None, reward_scale=1., batch_size=128, l2_reg_coef=0.2, actor_lr=1e-4, critic_lr=1e-3, observation_range=(-5., 5.), action_range=(-1., 1.), return_range=(-np.inf, np.inf), **network_kwargs): # logger.info('Using agent with the following configuration:') # logger.info(str(self.__dict__.items())) observation_shape = env.observation_space.shape action_shape = env.action_space.shape # Inputs. self.obs0 = tf.placeholder(tf.float32, shape=(None,) + observation_shape, name='obs0') self.obs1 = tf.placeholder(tf.float32, shape=(None,) + observation_shape, name='obs1') self.terminals1 = tf.placeholder(tf.float32, shape=(None, 1), name='terminals1') self.rewards = tf.placeholder(tf.float32, shape=(None, 1), name='rewards') self.actions = tf.placeholder(tf.float32, shape=(None,) + action_shape, name='actions') self.critic_target = tf.placeholder(tf.float32, shape=(None, 1), name='critic_target') self.param_noise_stddev = tf.placeholder(tf.float32, shape=(), name='param_noise_stddev') # Parameters. self.env = env self.gamma = gamma self.tau = tau self.total_timesteps = total_timesteps self.normalize_observations = normalize_observations self.normalize_returns = normalize_returns self.enable_popart = enable_popart self.clip_norm = clip_norm self.reward_scale = reward_scale self.action_range = action_range self.return_range = return_range self.observation_range = observation_range self.batch_size = batch_size self.actor_lr = actor_lr self.critic_lr = critic_lr self.l2_reg_coef = l2_reg_coef self.stats_sample = None self.action_noise = None self.param_noise = None nb_actions = self.env.action_space.shape[-1] if noise_type is not None: for current_noise_type in noise_type.split(','): current_noise_type = current_noise_type.strip() if current_noise_type == 'none': pass elif 'adaptive-param' in current_noise_type: _, stddev = current_noise_type.split('_') self.param_noise = AdaptiveParamNoiseSpec(initial_stddev=float(stddev), desired_action_stddev=float(stddev)) elif 'normal' in current_noise_type: _, stddev = current_noise_type.split('_') self.action_noise = NormalActionNoise(mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions)) elif 'ou' in current_noise_type: _, stddev = current_noise_type.split('_') self.action_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions)) else: raise RuntimeError('unknown noise type "{}"'.format(current_noise_type)) assert (np.abs(env.action_space.low) == env.action_space.high).all() # we assume symmetric actions. self.memory = Memory(limit=int(1e6), action_shape=env.action_space.shape, observation_shape=env.observation_space.shape) self.critic = Critic(network=network, **network_kwargs) self.actor = Actor(nb_actions, network=network, **network_kwargs) # Observation normalization. if self.normalize_observations: with tf.variable_scope('obs_rms'): self.obs_rms = RunningMeanStd(shape=observation_shape) else: self.obs_rms = None normalized_obs0 = tf.clip_by_value(normalize(self.obs0, self.obs_rms), self.observation_range[0], self.observation_range[1]) normalized_obs1 = tf.clip_by_value(normalize(self.obs1, self.obs_rms), self.observation_range[0], self.observation_range[1]) # Return normalization. 
if self.normalize_returns: with tf.variable_scope('ret_rms'): self.ret_rms = RunningMeanStd() else: self.ret_rms = None # Create target networks. target_actor = copy(self.actor) target_actor.name = 'target_actor' self.target_actor = target_actor target_critic = copy(self.critic) target_critic.name = 'target_critic' self.target_critic = target_critic # Create networks and core TF parts that are shared across setup parts. self.actor_tf = self.actor(normalized_obs0) self.normalized_critic_tf = self.critic(normalized_obs0, self.actions) self.critic_tf = denormalize( tf.clip_by_value(self.normalized_critic_tf, self.return_range[0], self.return_range[1]), self.ret_rms) self.normalized_critic_with_actor_tf = self.critic(normalized_obs0, self.actor_tf, reuse=True) self.critic_with_actor_tf = denormalize( tf.clip_by_value(self.normalized_critic_with_actor_tf, self.return_range[0], self.return_range[1]), self.ret_rms) Q_obs1 = denormalize(target_critic(normalized_obs1, target_actor(normalized_obs1)), self.ret_rms) self.target_Q = self.rewards + (1. - self.terminals1) * gamma * Q_obs1 # Set up parts. if self.param_noise is not None: self.setup_param_noise(normalized_obs0) self.setup_actor_optimizer() self.setup_critic_optimizer() if self.normalize_returns and self.enable_popart: self.setup_popart() self.setup_stats() self.setup_target_network_updates() self.initial_state = None # recurrent architectures not supported yet self.def_path_pre = os.path.dirname(os.path.abspath(__file__)) + '/tmp/' def setup_target_network_updates(self): actor_init_updates, actor_soft_updates = get_target_updates(self.actor.vars, self.target_actor.vars, self.tau) critic_init_updates, critic_soft_updates = get_target_updates(self.critic.vars, self.target_critic.vars, self.tau) self.target_init_updates = [actor_init_updates, critic_init_updates] self.target_soft_updates = [actor_soft_updates, critic_soft_updates] def setup_param_noise(self, normalized_obs0): assert self.param_noise is not None # Configure perturbed actor. param_noise_actor = copy(self.actor) param_noise_actor.name = 'param_noise_actor' self.perturbed_actor_tf = param_noise_actor(normalized_obs0) logger.info('setting up param noise') self.perturb_policy_ops = get_perturbed_actor_updates(self.actor, param_noise_actor, self.param_noise_stddev) # Configure separate copy for stddev adoption. 
adaptive_param_noise_actor = copy(self.actor) adaptive_param_noise_actor.name = 'adaptive_param_noise_actor' adaptive_actor_tf = adaptive_param_noise_actor(normalized_obs0) self.perturb_adaptive_policy_ops = get_perturbed_actor_updates(self.actor, adaptive_param_noise_actor, self.param_noise_stddev) self.adaptive_policy_distance = tf.sqrt(tf.reduce_mean(tf.square(self.actor_tf - adaptive_actor_tf))) def setup_actor_optimizer(self): logger.info('setting up actor optimizer') self.actor_loss = -tf.reduce_mean(self.critic_with_actor_tf) actor_shapes = [var.get_shape().as_list() for var in self.actor.trainable_vars] actor_nb_params = sum([reduce(lambda x, y: x * y, shape) for shape in actor_shapes]) logger.info(' actor shapes: {}'.format(actor_shapes)) logger.info(' actor params: {}'.format(actor_nb_params)) self.actor_grads = U.flatgrad(self.actor_loss, self.actor.trainable_vars, clip_norm=self.clip_norm) self.actor_optimizer = MpiAdam(var_list=self.actor.trainable_vars, beta1=0.9, beta2=0.999, epsilon=1e-08) def setup_critic_optimizer(self): logger.info('setting up critic optimizer') normalized_critic_target_tf = tf.clip_by_value(normalize(self.critic_target, self.ret_rms), self.return_range[0], self.return_range[1]) self.critic_loss = tf.reduce_mean(tf.square(self.normalized_critic_tf - normalized_critic_target_tf)) if self.l2_reg_coef > 0.: critic_reg_vars = [var for var in self.critic.trainable_vars if var.name.endswith('/w:0') and 'output' not in var.name] for var in critic_reg_vars: logger.info(' regularizing: {}'.format(var.name)) logger.info(' applying l2 regularization with {}'.format(self.l2_reg_coef)) critic_reg = tc.layers.apply_regularization( tc.layers.l2_regularizer(self.l2_reg_coef), weights_list=critic_reg_vars ) self.critic_loss += critic_reg critic_shapes = [var.get_shape().as_list() for var in self.critic.trainable_vars] critic_nb_params = sum([reduce(lambda x, y: x * y, shape) for shape in critic_shapes]) logger.info(' critic shapes: {}'.format(critic_shapes)) logger.info(' critic params: {}'.format(critic_nb_params)) self.critic_grads = U.flatgrad(self.critic_loss, self.critic.trainable_vars, clip_norm=self.clip_norm) self.critic_optimizer = MpiAdam(var_list=self.critic.trainable_vars, beta1=0.9, beta2=0.999, epsilon=1e-08) def setup_popart(self): # See https://arxiv.org/pdf/1602.07714.pdf for details. 
self.old_std = tf.placeholder(tf.float32, shape=[1], name='old_std') new_std = self.ret_rms.std self.old_mean = tf.placeholder(tf.float32, shape=[1], name='old_mean') new_mean = self.ret_rms.mean self.renormalize_Q_outputs_op = [] for vs in [self.critic.output_vars, self.target_critic.output_vars]: assert len(vs) == 2 M, b = vs assert 'kernel' in M.name assert 'bias' in b.name assert M.get_shape()[-1] == 1 assert b.get_shape()[-1] == 1 self.renormalize_Q_outputs_op += [M.assign(M * self.old_std / new_std)] self.renormalize_Q_outputs_op += [b.assign((b * self.old_std + self.old_mean - new_mean) / new_std)] def setup_stats(self): ops = [] names = [] if self.normalize_returns: ops += [self.ret_rms.mean, self.ret_rms.std] names += ['ret_rms_mean', 'ret_rms_std'] if self.normalize_observations: ops += [tf.reduce_mean(self.obs_rms.mean), tf.reduce_mean(self.obs_rms.std)] names += ['obs_rms_mean', 'obs_rms_std'] ops += [tf.reduce_mean(self.critic_tf)] names += ['reference_Q_mean'] ops += [reduce_std(self.critic_tf)] names += ['reference_Q_std'] ops += [tf.reduce_mean(self.critic_with_actor_tf)] names += ['reference_actor_Q_mean'] ops += [reduce_std(self.critic_with_actor_tf)] names += ['reference_actor_Q_std'] ops += [tf.reduce_mean(self.actor_tf)] names += ['reference_action_mean'] ops += [reduce_std(self.actor_tf)] names += ['reference_action_std'] if self.param_noise: ops += [tf.reduce_mean(self.perturbed_actor_tf)] names += ['reference_perturbed_action_mean'] ops += [reduce_std(self.perturbed_actor_tf)] names += ['reference_perturbed_action_std'] self.stats_ops = ops self.stats_names = names def train_step(self, obs, apply_noise=True, compute_Q=True): if self.param_noise is not None and apply_noise: actor_tf = self.perturbed_actor_tf else: actor_tf = self.actor_tf feed_dict = {self.obs0: U.adjust_shape(self.obs0, [obs])} if compute_Q: action, q = self.sess.run([actor_tf, self.critic_with_actor_tf], feed_dict=feed_dict) else: action = self.sess.run(actor_tf, feed_dict=feed_dict) q = None if self.action_noise is not None and apply_noise: noise = self.action_noise() assert noise.shape == action[0].shape action += noise action = np.clip(action, self.action_range[0], self.action_range[1]) return action, q, None, None def step(self, obs, compute_Q=True): feed_dict = {self.obs0: U.adjust_shape(self.obs0, [obs])} if compute_Q: action, q = self.sess.run([self.actor_tf, self.critic_with_actor_tf], feed_dict=feed_dict) else: action = self.sess.run(self.actor_tf, feed_dict=feed_dict) q = None action = np.clip(action, self.action_range[0], self.action_range[1]) return action, q, None, None def store_transition(self, obs0, action, reward, obs1, terminal1): reward *= self.reward_scale B = obs0.shape[0] for b in range(B): self.memory.append(obs0[b], action[b], reward[b], obs1[b], terminal1[b]) if self.normalize_observations: self.obs_rms.update(np.array([obs0[b]])) def train(self): # Get a batch. batch = self.memory.sample(batch_size=self.batch_size) if self.normalize_returns and self.enable_popart: old_mean, old_std, target_Q = self.sess.run([self.ret_rms.mean, self.ret_rms.std, self.target_Q], feed_dict={ self.obs1: batch['obs1'], self.rewards: batch['rewards'], self.terminals1: batch['terminals1'].astype('float32'), }) self.ret_rms.update(target_Q.flatten()) self.sess.run(self.renormalize_Q_outputs_op, feed_dict={ self.old_std: np.array([old_std]), self.old_mean: np.array([old_mean]), }) # Run sanity check. Disabled by default since it slows down things considerably. 
# print('running sanity check') # target_Q_new, new_mean, new_std = self.sess.run([self.target_Q, self.ret_rms.mean, self.ret_rms.std], feed_dict={ # self.obs1: batch['obs1'], # self.rewards: batch['rewards'], # self.terminals1: batch['terminals1'].astype('float32'), # }) # print(target_Q_new, target_Q, new_mean, new_std) # assert (np.abs(target_Q - target_Q_new) < 1e-3).all() else: target_Q = self.sess.run(self.target_Q, feed_dict={ self.obs1: batch['obs1'], self.rewards: batch['rewards'], self.terminals1: batch['terminals1'].astype('float32'), }) # Get all gradients and perform a synced update. ops = [self.actor_grads, self.actor_loss, self.critic_grads, self.critic_loss] actor_grads, actor_loss, critic_grads, critic_loss = self.sess.run(ops, feed_dict={ self.obs0: batch['obs0'], self.actions: batch['actions'], self.critic_target: target_Q, }) self.actor_optimizer.update(actor_grads, stepsize=self.actor_lr) self.critic_optimizer.update(critic_grads, stepsize=self.critic_lr) return critic_loss, actor_loss def initialize(self, sess): self.sess = sess self.sess.run(tf.global_variables_initializer()) self.actor_optimizer.sync() self.critic_optimizer.sync() self.sess.run(self.target_init_updates) def update_target_net(self): self.sess.run(self.target_soft_updates) def get_stats(self): if self.stats_sample is None: # Get a sample and keep that fixed for all further computations. # This allows us to estimate the change in value for the same set of inputs. self.stats_sample = self.memory.sample(batch_size=self.batch_size) values = self.sess.run(self.stats_ops, feed_dict={ self.obs0: self.stats_sample['obs0'], self.actions: self.stats_sample['actions'], }) names = self.stats_names[:] assert len(names) == len(values) stats = dict(zip(names, values)) if self.param_noise is not None: stats = {**stats, **self.param_noise.get_stats()} return stats def adapt_param_noise(self): try: from mpi4py import MPI except ImportError: MPI = None if self.param_noise is None: return 0. # Perturb a separate copy of the policy to adjust the scale for the next "real" perturbation. batch = self.memory.sample(batch_size=self.batch_size) self.sess.run(self.perturb_adaptive_policy_ops, feed_dict={ self.param_noise_stddev: self.param_noise.current_stddev, }) distance = self.sess.run(self.adaptive_policy_distance, feed_dict={ self.obs0: batch['obs0'], self.param_noise_stddev: self.param_noise.current_stddev, }) if MPI is not None: mean_distance = MPI.COMM_WORLD.allreduce(distance, op=MPI.SUM) / MPI.COMM_WORLD.Get_size() else: mean_distance = distance self.param_noise.adapt(mean_distance) return mean_distance def reset(self): # Reset internal state after an episode is complete. 
if self.action_noise is not None: self.action_noise.reset() if self.param_noise is not None: self.sess.run(self.perturb_policy_ops, feed_dict={ self.param_noise_stddev: self.param_noise.current_stddev, }) def learn(self, total_timesteps=None, seed=None, nb_epochs=None, # with default settings, perform 1M steps total nb_epoch_cycles=20, nb_rollout_steps=100, render=False, nb_train_steps=50, # per epoch cycle and MPI worker, batch_size=64, # per MPI worker param_noise_adaption_interval=50,): set_global_seeds(seed) if total_timesteps is not None: assert nb_epochs is None nb_epochs = int(total_timesteps) // (nb_epoch_cycles * nb_rollout_steps) else: nb_epochs = 500 if MPI is not None: rank = MPI.COMM_WORLD.Get_rank() else: rank = 0 # eval_episode_rewards_history = deque(maxlen=100) episode_rewards_history = deque(maxlen=100) sess = U.get_session() # Prepare everything. self.initialize(sess) sess.graph.finalize() self.reset() obs = self.env.reset() # if eval_env is not None: # eval_obs = eval_env.reset() nenvs = obs.shape[0] episode_reward = np.zeros(nenvs, dtype=np.float32) # vector episode_step = np.zeros(nenvs, dtype=int) # vector episodes = 0 # scalar t = 0 # scalar start_time = time.time() epoch_episode_rewards = [] epoch_episode_steps = [] epoch_actions = [] epoch_qs = [] epoch_episodes = 0 for epoch in range(nb_epochs): for cycle in range(nb_epoch_cycles): # Perform rollouts. if nenvs > 1: # if simulating multiple envs in parallel, impossible to reset agent at the end of the episode in each # of the environments, so resetting here instead self.reset() for t_rollout in range(nb_rollout_steps): # Predict next action. action, q, _, _ = self.train_step(obs, apply_noise=True, compute_Q=True) # Execute next action. if rank == 0 and render: self.env.render() # max_action is of dimension A, whereas action is dimension (nenvs, A) - the multiplication gets broadcasted to the batch # new_obs, r, done, info = self.env.step(max_action * action) # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1]) new_obs, r, done, info = self.env.step(action) # note these outputs are batched from vecenv t += 1 if rank == 0 and render: self.env.render() episode_reward += r episode_step += 1 # Book-keeping. epoch_actions.append(action) epoch_qs.append(q) # the batched data will be unrolled in memory.py's append. self.store_transition(obs, action, r, new_obs, done) obs = new_obs for d in range(len(done)): if done[d]: # Episode done. epoch_episode_rewards.append(episode_reward[d]) episode_rewards_history.append(episode_reward[d]) epoch_episode_steps.append(episode_step[d]) episode_reward[d] = 0. episode_step[d] = 0 epoch_episodes += 1 episodes += 1 if nenvs == 1: self.reset() # Train. epoch_actor_losses = [] epoch_critic_losses = [] epoch_adaptive_distances = [] for t_train in range(nb_train_steps): # Adapt param noise, if necessary. if self.memory.nb_entries >= batch_size and t_train % param_noise_adaption_interval == 0: distance = self.adapt_param_noise() epoch_adaptive_distances.append(distance) cl, al = self.train() epoch_critic_losses.append(cl) epoch_actor_losses.append(al) self.update_target_net() # # # Evaluate. 
# eval_episode_rewards = [] # eval_qs = [] # if eval_env is not None: # eval_obs = eval_env.reset() # nenvs_eval = eval_obs.shape[0] # eval_episode_reward = np.zeros(nenvs_eval, dtype=np.float32) # for t_rollout in range(nb_eval_steps): # eval_action, eval_q, _, _ = self.train_step(eval_obs, apply_noise=False, compute_Q=True) # # eval_obs, eval_r, eval_done, eval_info = eval_env.step( # # max_action * eval_action) # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1]) # eval_obs, eval_r, eval_done, eval_info = eval_env.step(eval_action) # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1]) # # if render_eval: # eval_env.render() # eval_episode_reward += eval_r # # eval_qs.append(eval_q) # for d in range(len(eval_done)): # if eval_done[d]: # eval_episode_rewards.append(eval_episode_reward[d]) # eval_episode_rewards_history.append(eval_episode_reward[d]) # eval_episode_reward[d] = 0.0 if MPI is not None: mpi_size = MPI.COMM_WORLD.Get_size() else: mpi_size = 1 # save trainable variables file_name = time.strftime('Y%YM%mD%d_h%Hm%Ms%S', time.localtime(time.time())) model_save_path = self.def_path_pre + file_name self.save(model_save_path) # Log stats. # XXX shouldn't call np.mean on variable length lists duration = time.time() - start_time stats = self.get_stats() combined_stats = stats.copy() combined_stats['rollout/return'] = np.mean(epoch_episode_rewards) combined_stats['rollout/return_std'] = np.std(epoch_episode_rewards) combined_stats['rollout/return_history'] = np.mean(episode_rewards_history) combined_stats['rollout/return_history_std'] = np.std(episode_rewards_history) combined_stats['rollout/episode_steps'] = np.mean(epoch_episode_steps) combined_stats['rollout/actions_mean'] = np.mean(epoch_actions) combined_stats['rollout/Q_mean'] = np.mean(epoch_qs) # combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses) # combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses) # combined_stats['train/param_noise_distance'] = np.mean(epoch_adaptive_distances) combined_stats['total/duration'] = duration combined_stats['total/steps_per_second'] = float(t) / float(duration) combined_stats['total/episodes'] = episodes combined_stats['rollout/episodes'] = epoch_episodes combined_stats['rollout/actions_std'] = np.std(epoch_actions) # Evaluation statistics. # if eval_env is not None: # combined_stats['eval/return'] = eval_episode_rewards # combined_stats['eval/return_history'] = np.mean(eval_episode_rewards_history) # combined_stats['eval/Q'] = eval_qs # combined_stats['eval/episodes'] = len(eval_episode_rewards) combined_stats_sums = np.array([np.array(x).flatten()[0] for x in combined_stats.values()]) if MPI is not None: combined_stats_sums = MPI.COMM_WORLD.allreduce(combined_stats_sums) combined_stats = {k: v / mpi_size for (k, v) in zip(combined_stats.keys(), combined_stats_sums)} # Total statistics. 
combined_stats['total/epochs'] = epoch + 1 combined_stats['total/steps'] = t for key in sorted(combined_stats.keys()): logger.record_tabular(key, combined_stats[key]) if rank == 0: logger.dump_tabular() logger.info('') logdir = logger.get_dir() if rank == 0 and logdir: if hasattr(self.env, 'get_state'): with open(os.path.join(logdir, 'env_state.pkl'), 'wb') as f: pickle.dump(self.env.get_state(), f) # if eval_env and hasattr(eval_env, 'get_state'): # with open(os.path.join(logdir, 'eval_env_state.pkl'), 'wb') as f: # pickle.dump(eval_env.get_state(), f) self.sess.graph._unsafe_unfinalize() return self def save(self, save_path=None): save_variables(save_path=save_path, sess=self.sess) print('save model variables to', save_path) def load_newest(self, load_path=None): file_list = os.listdir(self.def_path_pre) file_list.sort(key=lambda x: os.path.getmtime(os.path.join(self.def_path_pre, x))) if load_path is None: load_path = os.path.join(self.def_path_pre, file_list[-1]) load_variables(load_path=load_path, sess=self.sess) print('load_path: ', load_path) def load_index(self, index, load_path=None): file_list = os.listdir(self.def_path_pre) file_list.sort(key=lambda x: os.path.getmtime(os.path.join(self.def_path_pre, x)), reverse=True) if load_path is None: load_path = os.path.join(self.def_path_pre, file_list[index]) load_variables(load_path=load_path, sess=self.sess) print('load_path: ', load_path)
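# A minimal, self-contained sketch of the checkpoint naming and "load newest"
# convention used by save()/load_newest()/load_index() above: checkpoints are
# written under a fixed prefix with a timestamped file name, and the newest one
# is picked by modification time. The def_path_pre directory is illustrative;
# this is not part of the class, just a standalone illustration of the scheme.
import os
import time

def make_checkpoint_name():
    # e.g. 'Y2024M05D17_h13m42s07', matching the strftime pattern used in learn()
    return time.strftime('Y%YM%mD%d_h%Hm%Ms%S', time.localtime(time.time()))

def newest_checkpoint(def_path_pre):
    # Mirror load_newest(): sort by mtime and take the last entry.
    files = os.listdir(def_path_pre)
    files.sort(key=lambda x: os.path.getmtime(os.path.join(def_path_pre, x)))
    return os.path.join(def_path_pre, files[-1]) if files else None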
class DDPG(object): def __init__(self, actor, critic, memory, observation_shape, action_shape, param_noise=None, action_noise=None, gamma=0.99, tau=0.001, normalize_returns=False, enable_popart=False, normalize_observations=True, batch_size=128, observation_range=(-5., 5.), action_range=(-1., 1.), return_range=(-np.inf, np.inf), adaptive_param_noise=True, adaptive_param_noise_policy_threshold=.1, critic_l2_reg=0., actor_lr=1e-4, critic_lr=1e-3, clip_norm=None, reward_scale=1., td3_variant=False, td3_policy_freq=1, td3_policy_noise=0.0, td3_noise_clip=0.5): # Inputs. self.obs0 = tf.placeholder(tf.float32, shape=(None,) + observation_shape, name='obs0') self.obs1 = tf.placeholder(tf.float32, shape=(None,) + observation_shape, name='obs1') self.terminals1 = tf.placeholder(tf.float32, shape=(None, 1), name='terminals1') self.rewards = tf.placeholder(tf.float32, shape=(None, 1), name='rewards') self.actions = tf.placeholder(tf.float32, shape=(None,) + action_shape, name='actions') self.critic_target = tf.placeholder(tf.float32, shape=(None, 1), name='critic_target') self.param_noise_stddev = tf.placeholder(tf.float32, shape=(), name='param_noise_stddev') # Parameters. self.gamma = gamma self.tau = tau self.memory = memory self.normalize_observations = normalize_observations self.normalize_returns = normalize_returns self.action_noise = action_noise self.param_noise = param_noise self.action_range = action_range self.return_range = return_range self.observation_range = observation_range self.critic = critic self.actor = actor self.actor_lr = actor_lr self.critic_lr = critic_lr self.clip_norm = clip_norm self.enable_popart = enable_popart self.reward_scale = reward_scale self.batch_size = batch_size self.stats_sample = None self.critic_l2_reg = critic_l2_reg # Added: parameters for using the TD3 variant of DDPG # https://arxiv.org/abs/1802.09477 self.td3_variant = td3_variant self.td3_policy_freq = td3_policy_freq self.td3_policy_noise = td3_policy_noise self.td3_noise_clip = td3_noise_clip # TODO: take the normalization code from the her implementation. # Observation normalization. if self.normalize_observations: with tf.variable_scope('obs_rms'): self.obs_rms = RunningMeanStd(shape=observation_shape) else: self.obs_rms = None normalized_obs0 = tf.clip_by_value(normalize(self.obs0, self.obs_rms), self.observation_range[0], self.observation_range[1]) normalized_obs1 = tf.clip_by_value(normalize(self.obs1, self.obs_rms), self.observation_range[0], self.observation_range[1]) # Return normalization. if self.normalize_returns: with tf.variable_scope('ret_rms'): self.ret_rms = RunningMeanStd() else: self.ret_rms = None # Create target networks. target_actor = copy(actor) target_actor.name = 'target_actor' self.target_actor = target_actor target_critic = copy(critic) target_critic.name = 'target_critic' self.target_critic = target_critic # Create networks and core TF parts that are shared across setup parts. 
self.actor_tf = actor(normalized_obs0) if self.td3_variant: logger.info('using TD3 variant model') self.normalized_critic_tf, self.normalized_critic_tf2 = critic(normalized_obs0, self.actions) self.critic_tf = denormalize( tf.clip_by_value(self.normalized_critic_tf, self.return_range[0], self.return_range[1]), self.ret_rms) self.normalized_critic_with_actor_tf, _ = critic(normalized_obs0, self.actor_tf, reuse=True) self.critic_with_actor_tf = denormalize( tf.clip_by_value(self.normalized_critic_with_actor_tf, self.return_range[0], self.return_range[1]), self.ret_rms) out_q1, out_q2 = target_critic(normalized_obs1, target_actor(normalized_obs1)) min_q1 = tf.minimum(out_q1,out_q2) Q_obs1 = denormalize(min_q1, self.ret_rms) else: self.normalized_critic_tf = critic(normalized_obs0, self.actions) self.critic_tf = denormalize( tf.clip_by_value(self.normalized_critic_tf, self.return_range[0], self.return_range[1]), self.ret_rms) self.normalized_critic_with_actor_tf = critic(normalized_obs0, self.actor_tf, reuse=True) self.critic_with_actor_tf = denormalize( tf.clip_by_value(self.normalized_critic_with_actor_tf, self.return_range[0], self.return_range[1]), self.ret_rms) Q_obs1 = denormalize(target_critic(normalized_obs1, target_actor(normalized_obs1)), self.ret_rms) self.target_Q = self.rewards + (1. - self.terminals1) * gamma * Q_obs1 # Set up parts. if self.param_noise is not None: self.setup_param_noise(normalized_obs0) self.setup_actor_optimizer() self.setup_critic_optimizer() if self.normalize_returns and self.enable_popart: self.setup_popart() self.setup_stats() self.setup_target_network_updates() self.initial_state = None # recurrent architectures not supported yet def setup_target_network_updates(self): actor_init_updates, actor_soft_updates = get_target_updates(self.actor.vars, self.target_actor.vars, self.tau) critic_init_updates, critic_soft_updates = get_target_updates(self.critic.vars, self.target_critic.vars, self.tau) self.target_init_updates = [actor_init_updates, critic_init_updates] self.actor_target_soft_updates = actor_soft_updates self.critic_target_soft_updates = critic_soft_updates def setup_param_noise(self, normalized_obs0): assert self.param_noise is not None # Configure perturbed actor. param_noise_actor = copy(self.actor) param_noise_actor.name = 'param_noise_actor' self.perturbed_actor_tf = param_noise_actor(normalized_obs0) logger.info('setting up param noise') self.perturb_policy_ops = get_perturbed_actor_updates(self.actor, param_noise_actor, self.param_noise_stddev) # Configure separate copy for stddev adoption. 
adaptive_param_noise_actor = copy(self.actor) adaptive_param_noise_actor.name = 'adaptive_param_noise_actor' adaptive_actor_tf = adaptive_param_noise_actor(normalized_obs0) self.perturb_adaptive_policy_ops = get_perturbed_actor_updates(self.actor, adaptive_param_noise_actor, self.param_noise_stddev) self.adaptive_policy_distance = tf.sqrt(tf.reduce_mean(tf.square(self.actor_tf - adaptive_actor_tf))) def setup_actor_optimizer(self): logger.info('setting up actor optimizer') self.actor_loss = -tf.reduce_mean(self.critic_with_actor_tf) actor_shapes = [var.get_shape().as_list() for var in self.actor.trainable_vars] actor_nb_params = sum([reduce(lambda x, y: x * y, shape) for shape in actor_shapes]) logger.info(' actor shapes: {}'.format(actor_shapes)) logger.info(' actor params: {}'.format(actor_nb_params)) self.actor_grads = U.flatgrad(self.actor_loss, self.actor.trainable_vars, clip_norm=self.clip_norm) self.actor_optimizer = MpiAdam(var_list=self.actor.trainable_vars, beta1=0.9, beta2=0.999, epsilon=1e-08) def setup_critic_optimizer(self): logger.info('setting up critic optimizer') normalized_critic_target_tf = tf.clip_by_value(normalize(self.critic_target, self.ret_rms), self.return_range[0], self.return_range[1]) if self.td3_variant: logger.info('using TD3 variant loss') self.critic_loss = tf.losses.mean_squared_error(normalized_critic_target_tf,self.normalized_critic_tf) \ + tf.losses.mean_squared_error(normalized_critic_target_tf,self.normalized_critic_tf2) else: self.critic_loss = tf.reduce_mean(tf.square(self.normalized_critic_tf - normalized_critic_target_tf)) if self.critic_l2_reg > 0.: critic_reg_vars = [var for var in self.critic.trainable_vars if 'kernel' in var.name and 'output' not in var.name] for var in critic_reg_vars: logger.info(' regularizing: {}'.format(var.name)) logger.info(' applying l2 regularization with {}'.format(self.critic_l2_reg)) critic_reg = tc.layers.apply_regularization( tc.layers.l2_regularizer(self.critic_l2_reg), weights_list=critic_reg_vars ) self.critic_loss += critic_reg critic_shapes = [var.get_shape().as_list() for var in self.critic.trainable_vars] critic_nb_params = sum([reduce(lambda x, y: x * y, shape) for shape in critic_shapes]) logger.info(' critic shapes: {}'.format(critic_shapes)) logger.info(' critic params: {}'.format(critic_nb_params)) self.critic_grads = U.flatgrad(self.critic_loss, self.critic.trainable_vars, clip_norm=self.clip_norm) self.critic_optimizer = MpiAdam(var_list=self.critic.trainable_vars, beta1=0.9, beta2=0.999, epsilon=1e-08) def setup_popart(self): # See https://arxiv.org/pdf/1602.07714.pdf for details. 
self.old_std = tf.placeholder(tf.float32, shape=[1], name='old_std') new_std = self.ret_rms.std self.old_mean = tf.placeholder(tf.float32, shape=[1], name='old_mean') new_mean = self.ret_rms.mean self.renormalize_Q_outputs_op = [] for vs in [self.critic.output_vars, self.target_critic.output_vars]: assert len(vs) == 2 M, b = vs assert 'kernel' in M.name assert 'bias' in b.name assert M.get_shape()[-1] == 1 assert b.get_shape()[-1] == 1 self.renormalize_Q_outputs_op += [M.assign(M * self.old_std / new_std)] self.renormalize_Q_outputs_op += [b.assign((b * self.old_std + self.old_mean - new_mean) / new_std)] def setup_stats(self): ops = [] names = [] if self.normalize_returns: ops += [self.ret_rms.mean, self.ret_rms.std] names += ['ret_rms_mean', 'ret_rms_std'] if self.normalize_observations: ops += [tf.reduce_mean(self.obs_rms.mean), tf.reduce_mean(self.obs_rms.std)] names += ['obs_rms_mean', 'obs_rms_std'] ops += [tf.reduce_mean(self.critic_tf)] names += ['reference_Q_mean'] ops += [reduce_std(self.critic_tf)] names += ['reference_Q_std'] ops += [tf.reduce_mean(self.critic_with_actor_tf)] names += ['reference_actor_Q_mean'] ops += [reduce_std(self.critic_with_actor_tf)] names += ['reference_actor_Q_std'] ops += [tf.reduce_mean(self.actor_tf)] names += ['reference_action_mean'] ops += [reduce_std(self.actor_tf)] names += ['reference_action_std'] if self.param_noise: ops += [tf.reduce_mean(self.perturbed_actor_tf)] names += ['reference_perturbed_action_mean'] ops += [reduce_std(self.perturbed_actor_tf)] names += ['reference_perturbed_action_std'] self.stats_ops = ops self.stats_names = names def step(self, obs, apply_noise=True, compute_Q=True): if self.param_noise is not None and apply_noise: actor_tf = self.perturbed_actor_tf else: actor_tf = self.actor_tf feed_dict = {self.obs0: U.adjust_shape(self.obs0, [obs])} if compute_Q: action, q = self.sess.run([actor_tf, self.critic_with_actor_tf], feed_dict=feed_dict) else: action = self.sess.run(actor_tf, feed_dict=feed_dict) q = None if self.action_noise is not None and apply_noise: noise = self.action_noise() assert noise.shape == action[0].shape action += noise action = np.clip(action, self.action_range[0], self.action_range[1]) return action, q, None, None def store_transition(self, obs0, action, reward, obs1, terminal1): reward *= self.reward_scale B = obs0.shape[0] for b in range(B): self.memory.append(obs0[b], action[b], reward[b], obs1[b], terminal1[b]) if self.normalize_observations: self.obs_rms.update(np.array([obs0[b]])) def train(self,train_iter): # Get a batch. batch = self.memory.sample(batch_size=self.batch_size) if self.normalize_returns and self.enable_popart: old_mean, old_std, target_Q = self.sess.run([self.ret_rms.mean, self.ret_rms.std, self.target_Q], feed_dict={ self.obs1: batch['obs1'], self.rewards: batch['rewards'], self.terminals1: batch['terminals1'].astype('float32'), }) self.ret_rms.update(target_Q.flatten()) self.sess.run(self.renormalize_Q_outputs_op, feed_dict={ self.old_std : np.array([old_std]), self.old_mean : np.array([old_mean]), }) # Run sanity check. Disabled by default since it slows down things considerably. 
# print('running sanity check') # target_Q_new, new_mean, new_std = self.sess.run([self.target_Q, self.ret_rms.mean, self.ret_rms.std], feed_dict={ # self.obs1: batch['obs1'], # self.rewards: batch['rewards'], # self.terminals1: batch['terminals1'].astype('float32'), # }) # print(target_Q_new, target_Q, new_mean, new_std) # assert (np.abs(target_Q - target_Q_new) < 1e-3).all() else: target_Q = self.sess.run(self.target_Q, feed_dict={ self.obs1: batch['obs1'], self.rewards: batch['rewards'], self.terminals1: batch['terminals1'].astype('float32'), }) if self.td3_policy_noise > 0: noise = np.random.normal(loc=0.0,scale=self.td3_policy_noise,size=np.shape(batch['actions'])) noise = np.clip(noise,-self.td3_noise_clip,self.td3_noise_clip) # Get all gradients and perform a synced update. ops = [self.actor_grads, self.actor_loss, self.critic_grads, self.critic_loss] actor_grads, actor_loss, critic_grads, critic_loss = self.sess.run(ops, feed_dict={ self.obs0: batch['obs0'], self.actions: np.clip(batch['actions'] + noise,self.action_range[0],self.action_range[1]), self.critic_target: target_Q, }) else: # Get all gradients and perform a synced update. ops = [self.actor_grads, self.actor_loss, self.critic_grads, self.critic_loss] actor_grads, actor_loss, critic_grads, critic_loss = self.sess.run(ops, feed_dict={ self.obs0: batch['obs0'], self.actions: batch['actions'], self.critic_target: target_Q, }) #TD3 has hyperparameter for how frequently to update actor policy and target networks if train_iter % self.td3_policy_freq == 0: self.actor_optimizer.update(actor_grads, stepsize=self.actor_lr) self.critic_optimizer.update(critic_grads, stepsize=self.critic_lr) return critic_loss, actor_loss def initialize(self, sess): self.sess = sess self.sess.run(tf.global_variables_initializer()) self.actor_optimizer.sync() self.critic_optimizer.sync() self.sess.run(self.target_init_updates) def update_target_net(self, train_iter): # TD3 has hyperparameter for how frequently to update actor policy and target networks if train_iter % self.td3_policy_freq == 0: self.sess.run(self.actor_target_soft_updates) self.sess.run(self.critic_target_soft_updates) def get_stats(self): if self.stats_sample is None: # Get a sample and keep that fixed for all further computations. # This allows us to estimate the change in value for the same set of inputs. self.stats_sample = self.memory.sample(batch_size=self.batch_size) values = self.sess.run(self.stats_ops, feed_dict={ self.obs0: self.stats_sample['obs0'], self.actions: self.stats_sample['actions'], }) names = self.stats_names[:] assert len(names) == len(values) stats = dict(zip(names, values)) if self.param_noise is not None: stats = {**stats, **self.param_noise.get_stats()} return stats def adapt_param_noise(self): if self.param_noise is None: return 0. # Perturb a separate copy of the policy to adjust the scale for the next "real" perturbation. batch = self.memory.sample(batch_size=self.batch_size) self.sess.run(self.perturb_adaptive_policy_ops, feed_dict={ self.param_noise_stddev: self.param_noise.current_stddev, }) distance = self.sess.run(self.adaptive_policy_distance, feed_dict={ self.obs0: batch['obs0'], self.param_noise_stddev: self.param_noise.current_stddev, }) mean_distance = MPI.COMM_WORLD.allreduce(distance, op=MPI.SUM) / MPI.COMM_WORLD.Get_size() self.param_noise.adapt(mean_distance) return mean_distance def reset(self): # Reset internal state after an episode is complete. 
if self.action_noise is not None: self.action_noise.reset() if self.param_noise is not None: self.sess.run(self.perturb_policy_ops, feed_dict={ self.param_noise_stddev: self.param_noise.current_stddev, })
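# Numeric illustration (not part of the class above) of the clipped double-Q
# target that the td3_variant branch builds in the graph: the bootstrap value is
# the minimum of the two target critics, which counteracts Q overestimation
# (arXiv:1802.09477). Values below are made up purely for the arithmetic.
import numpy as np

def td3_target(rewards, terminals, q1_next, q2_next, gamma=0.99):
    # rewards, terminals, q1_next, q2_next: arrays of shape (batch, 1)
    min_q_next = np.minimum(q1_next, q2_next)
    return rewards + (1.0 - terminals) * gamma * min_q_next

r = np.array([[1.0], [0.5]])
d = np.array([[0.0], [1.0]])
q1 = np.array([[10.0], [10.0]])
q2 = np.array([[8.0], [12.0]])
# first row: 1.0 + 0.99 * min(10, 8) = 8.92; second row is terminal, so just 0.5
print(td3_target(r, d, q1, q2))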
class SAC(RLAlgorithm): """Soft Actor-Critic (SAC) References ---------- [1] Tuomas Haarnoja*, Aurick Zhou*, Kristian Hartikainen*, George Tucker, Sehoon Ha, Jie Tan, Vikash Kumar, Henry Zhu, Abhishek Gupta, Pieter Abbeel, and Sergey Levine. Soft Actor-Critic Algorithms and Applications. arXiv preprint arXiv:1812.05905. 2018. """ def __init__( self, training_environment, evaluation_environment, policy, Qs, pool, plotter=None, lr=3e-4, reward_scale=1.0, target_entropy='auto', discount=0.99, tau=5e-3, target_update_interval=1, action_prior='uniform', reparameterize=False, store_extra_policy_info=False, save_full_state=False, **kwargs, ): """ Args: env (`SoftlearningEnv`): Environment used for training. policy: A policy function approximator. initial_exploration_policy: ('Policy'): A policy that we use for initial exploration which is not trained by the algorithm. Qs: Q-function approximators. The min of these approximators will be used. Usage of at least two Q-functions improves performance by reducing overestimation bias. pool (`PoolBase`): Replay pool to add gathered samples to. plotter (`QFPolicyPlotter`): Plotter instance to be used for visualizing Q-function during training. lr (`float`): Learning rate used for the function approximators. discount (`float`): Discount factor for Q-function updates. tau (`float`): Soft value function target update weight. target_update_interval ('int'): Frequency at which target network updates occur in iterations. reparameterize ('bool'): If True, we use a gradient estimator for the policy derived using the reparameterization trick. We use a likelihood ratio based estimator otherwise. """ super(SAC, self).__init__(**kwargs) self._training_environment = training_environment self._evaluation_environment = evaluation_environment self._policy = policy self._Qs = Qs self._Q_targets = tuple(tf.keras.models.clone_model(Q) for Q in Qs) self._pool = pool self._plotter = plotter self._policy_lr = lr self._Q_lr = lr self.value_rms = RunningMeanStd(shape=(1,)) self._reward_scale = reward_scale self._target_entropy = ( -np.prod(self._training_environment.action_space.shape) if target_entropy == 'auto' else target_entropy) self._discount = discount self._tau = tau self._target_update_interval = target_update_interval self._action_prior = action_prior self._reparameterize = reparameterize self._store_extra_policy_info = store_extra_policy_info self._save_full_state = save_full_state observation_shape = self._training_environment.active_observation_shape action_shape = self._training_environment.action_space.shape assert len(observation_shape) == 1, observation_shape self._observation_shape = observation_shape assert len(action_shape) == 1, action_shape self._action_shape = action_shape self._build() def _build(self): self._training_ops = {} self._init_global_step() self._init_placeholders() self._init_actor_update() self._init_critic_update() self._init_diagnostics_ops() def _init_placeholders(self): """Create input placeholders for the SAC algorithm. 
Creates `tf.placeholder`s for: - observation - next observation - action - reward - terminals """ self._iteration_ph = tf.placeholder( tf.int64, shape=None, name='iteration') self._observations_ph = tf.placeholder( tf.float32, shape=(None, *self._observation_shape), name='observation', ) self._next_observations_ph = tf.placeholder( tf.float32, shape=(None, *self._observation_shape), name='next_observation', ) self._actions_ph = tf.placeholder( tf.float32, shape=(None, *self._action_shape), name='actions', ) self._rewards_ph = tf.placeholder( tf.float32, shape=(None, 1), name='rewards', ) self._terminals_ph = tf.placeholder( tf.float32, shape=(None, 1), name='terminals', ) if self._store_extra_policy_info: self._log_pis_ph = tf.placeholder( tf.float32, shape=(None, 1), name='log_pis', ) self._raw_actions_ph = tf.placeholder( tf.float32, shape=(None, *self._action_shape), name='raw_actions', ) def _get_Q_target(self): next_actions = self._policy.actions([self._next_observations_ph]) next_log_pis = self._policy.log_pis( [self._next_observations_ph], next_actions) next_Qs_values = tuple( Q([self._next_observations_ph, next_actions]) * self.value_rms.std + self.value_rms.mean for Q in self._Q_targets) min_next_Q = tf.reduce_min(next_Qs_values, axis=0) next_value = min_next_Q - self._alpha * next_log_pis Q_target = td_target( reward=self._reward_scale * self._rewards_ph, discount=self._discount, next_value=(1 - self._terminals_ph) * next_value) return (Q_target - self.value_rms.mean)/self.value_rms.std, Q_target def _init_critic_update(self): """Create minimization operation for critic Q-function. Creates a `tf.optimizer.minimize` operation for updating critic Q-function with gradient descent, and appends it to `self._training_ops` attribute. See Equations (5, 6) in [1], for further information of the Q-function update rule. """ Q_target, self.raw_q_target = [q[0] for q in tf.split(tf.stop_gradient(self._get_Q_target()), axis=0, num_or_size_splits=2)] assert Q_target.shape.as_list() == [None, 1] Q_values = self._Q_values = tuple( Q([self._observations_ph, self._actions_ph]) for Q in self._Qs) Q_losses = self._Q_losses = tuple( tf.losses.mean_squared_error( labels=Q_target, predictions=Q_value, weights=0.5) for Q_value in Q_values) self._Q_optimizers = tuple( tf.train.AdamOptimizer( learning_rate=self._Q_lr, name='{}_{}_optimizer'.format(Q._name, i) ) for i, Q in enumerate(self._Qs)) Q_training_ops = tuple( Q_optimizer.minimize(loss=Q_loss, var_list=Q.trainable_variables) for i, (Q, Q_loss, Q_optimizer) in enumerate(zip(self._Qs, Q_losses, self._Q_optimizers))) self._training_ops.update({'Q': tf.group(Q_training_ops)}) def _init_actor_update(self): """Create minimization operations for policy and entropy. Creates a `tf.optimizer.minimize` operations for updating policy and entropy with gradient descent, and adds them to `self._training_ops` attribute. See Section 4.2 in [1], for further information of the policy update, and Section 5 in [1] for further information of the entropy update. 
""" actions = self._policy.actions([self._observations_ph]) log_pis = self._policy.log_pis([self._observations_ph], actions) assert log_pis.shape.as_list() == [None, 1] log_alpha = self._log_alpha = tf.get_variable( 'log_alpha', dtype=tf.float32, initializer=0.0) alpha = tf.exp(log_alpha) if isinstance(self._target_entropy, Number): alpha_loss = -tf.reduce_mean( log_alpha * tf.stop_gradient(log_pis + self._target_entropy)) self._alpha_optimizer = tf.train.AdamOptimizer( self._policy_lr, name='alpha_optimizer') self._alpha_train_op = self._alpha_optimizer.minimize( loss=alpha_loss, var_list=[log_alpha]) self._training_ops.update({ 'temperature_alpha': self._alpha_train_op }) self._alpha = alpha if self._action_prior == 'normal': policy_prior = tfp.distributions.MultivariateNormalDiag( loc=tf.zeros(self._action_shape), scale_diag=tf.ones(self._action_shape)) policy_prior_log_probs = policy_prior.log_prob(actions) elif self._action_prior == 'uniform': policy_prior_log_probs = 0.0 Q_log_targets = tuple( Q([self._observations_ph, actions]) for Q in self._Qs) min_Q_log_target = tf.reduce_min(Q_log_targets, axis=0) if self._reparameterize: policy_kl_losses = ( alpha * log_pis - min_Q_log_target - policy_prior_log_probs) else: raise NotImplementedError assert policy_kl_losses.shape.as_list() == [None, 1] self._policy_losses = policy_kl_losses policy_loss = tf.reduce_mean(policy_kl_losses) self._policy_optimizer = tf.train.AdamOptimizer( learning_rate=self._policy_lr, name="policy_optimizer") policy_train_op = self._policy_optimizer.minimize( loss=policy_loss, var_list=self._policy.trainable_variables) self._training_ops.update({'policy_train_op': policy_train_op}) def _init_diagnostics_ops(self): diagnosables = OrderedDict(( ('Q_value', self._Q_values), ('Q_loss', self._Q_losses), ('policy_loss', self._policy_losses), ('alpha', self._alpha) )) diagnostic_metrics = OrderedDict(( ('mean', tf.reduce_mean), ('std', lambda x: tfp.stats.stddev(x, sample_axis=None)), )) self._diagnostics_ops = OrderedDict([ (f'{key}-{metric_name}', metric_fn(values)) for key, values in diagnosables.items() for metric_name, metric_fn in diagnostic_metrics.items() ]) def _init_training(self): self._update_target(tau=1.0) def _update_target(self, tau=None): tau = tau or self._tau for Q, Q_target in zip(self._Qs, self._Q_targets): source_params = Q.get_weights() target_params = Q_target.get_weights() Q_target.set_weights([ tau * source + (1.0 - tau) * target for source, target in zip(source_params, target_params) ]) def _do_training(self, iteration, batch): """Runs the operations for updating training and target ops.""" feed_dict = self._get_feed_dict(iteration, batch) val = self._session.run([self.raw_q_target, self._training_ops], feed_dict) self.value_rms.update(val[0]) if iteration % self._target_update_interval == 0: # Run target ops here. 
self._update_target() def _get_feed_dict(self, iteration, batch): """Construct TensorFlow feed_dict from sample batch.""" feed_dict = { self._observations_ph: batch['observations'], self._actions_ph: batch['actions'], self._next_observations_ph: batch['next_observations'], self._rewards_ph: batch['rewards'], self._terminals_ph: batch['terminals'], } if self._store_extra_policy_info: feed_dict[self._log_pis_ph] = batch['log_pis'] feed_dict[self._raw_actions_ph] = batch['raw_actions'] if iteration is not None: feed_dict[self._iteration_ph] = iteration return feed_dict def get_diagnostics(self, iteration, batch, training_paths, evaluation_paths): """Return diagnostic information as ordered dictionary. Records mean and standard deviation of Q-function and state value function, and TD-loss (mean squared Bellman error) for the sample batch. Also calls the `draw` method of the plotter, if plotter defined. """ feed_dict = self._get_feed_dict(iteration, batch) diagnostics = self._session.run(self._diagnostics_ops, feed_dict) diagnostics.update(OrderedDict([ (f'policy/{key}', value) for key, value in self._policy.get_diagnostics(batch['observations']).items() ])) if self._plotter: self._plotter.draw() return diagnostics @property def tf_saveables(self): saveables = { '_policy_optimizer': self._policy_optimizer, **{ f'Q_optimizer_{i}': optimizer for i, optimizer in enumerate(self._Q_optimizers) }, '_log_alpha': self._log_alpha, } if hasattr(self, '_alpha_optimizer'): saveables['_alpha_optimizer'] = self._alpha_optimizer return saveables
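# Numeric sketch (outside the class above) of the entropy-regularized target
# computed in _get_Q_target(): the bootstrap value is the minimum over the target
# Q-networks minus alpha * log pi; the class additionally standardizes the target
# with a running mean/std (value_rms), which is omitted here. Numbers are made up.
import numpy as np

def sac_target(reward, terminal, q_next_list, log_pi_next, alpha,
               discount=0.99, reward_scale=1.0):
    min_q_next = np.min(np.stack(q_next_list, axis=0), axis=0)
    next_value = min_q_next - alpha * log_pi_next
    return reward_scale * reward + discount * (1.0 - terminal) * next_value

q_targets = [np.array([[5.0]]), np.array([[4.0]])]
y = sac_target(np.array([[1.0]]), np.array([[0.0]]), q_targets,
               log_pi_next=np.array([[-1.2]]), alpha=0.2)
print(y)  # 1.0 + 0.99 * (4.0 - 0.2 * (-1.2)) = 5.1976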
class DDPG(object): def __init__(self, actor, critic, memory, observation_shape, action_shape, param_noise=None, action_noise=None, gamma=0.99, tau=0.001, normalize_returns=False, enable_popart=False, normalize_observations=True, batch_size=128, observation_range=(-5., 5.), action_range=(-1., 1.), return_range=(-np.inf, np.inf), adaptive_param_noise=True, adaptive_param_noise_policy_threshold=.1, critic_l2_reg=0., actor_lr=1e-4, critic_lr=1e-3, clip_norm=None, reward_scale=1., aux_apply='both', aux_tasks=[], aux_lambdas={}): # Inputs. self.obs0 = tf.placeholder(tf.float32, shape=(None, ) + observation_shape, name='obs0') self.obs1 = tf.placeholder(tf.float32, shape=(None, ) + observation_shape, name='obs1') self.terminals1 = tf.placeholder(tf.float32, shape=(None, 1), name='terminals1') self.rewards = tf.placeholder(tf.float32, shape=(None, 1), name='rewards') self.actions = tf.placeholder(tf.float32, shape=(None, ) + action_shape, name='actions') self.critic_target = tf.placeholder(tf.float32, shape=(None, 1), name='critic_target') self.param_noise_stddev = tf.placeholder(tf.float32, shape=(), name='param_noise_stddev') # Parameters. self.gamma = gamma self.tau = tau self.memory = memory self.normalize_observations = normalize_observations self.normalize_returns = normalize_returns self.action_noise = action_noise self.param_noise = param_noise self.action_range = action_range self.return_range = return_range self.observation_range = observation_range self.critic = critic self.actor = actor self.actor_lr = actor_lr self.critic_lr = critic_lr self.clip_norm = clip_norm self.enable_popart = enable_popart self.reward_scale = reward_scale self.batch_size = batch_size self.stats_sample = None self.critic_l2_reg = critic_l2_reg # Observation normalization. if self.normalize_observations: with tf.variable_scope('obs_rms'): self.obs_rms = RunningMeanStd(shape=observation_shape) else: self.obs_rms = None self.norm_obs0 = tf.clip_by_value(normalize(self.obs0, self.obs_rms), self.observation_range[0], self.observation_range[1]) self.norm_obs1 = tf.clip_by_value(normalize(self.obs1, self.obs_rms), self.observation_range[0], self.observation_range[1]) # Return normalization. if self.normalize_returns: with tf.variable_scope('ret_rms'): self.ret_rms = RunningMeanStd() else: self.ret_rms = None # Aux Inputs. self.aux_apply = aux_apply self.aux_tasks = aux_tasks self.aux_lambdas = aux_lambdas if 'prop' in self.aux_tasks or 'caus' in self.aux_tasks or 'repeat' in self.aux_tasks: self.obs100 = tf.placeholder(tf.float32, shape=(None, ) + observation_shape, name='obs100') self.obs101 = tf.placeholder(tf.float32, shape=(None, ) + observation_shape, name='obs101') self.actions100 = tf.placeholder(tf.float32, shape=(None, ) + action_shape, name='actions100') self.norm_obs100 = tf.clip_by_value( normalize(self.obs100, self.obs_rms), self.observation_range[0], self.observation_range[1]) self.norm_obs101 = tf.clip_by_value( normalize(self.obs101, self.obs_rms), self.observation_range[0], self.observation_range[1]) if 'caus' in self.aux_tasks: self.rewards100 = tf.placeholder(tf.float32, shape=(None, 1), name='rewards100') # Create target networks. target_actor = deepcopy(actor) target_actor.name = 'target_actor' target_actor.repr.name = 'target_actor_repr' self.target_actor = target_actor target_critic = deepcopy(critic) target_critic.name = 'target_critic' target_critic.repr.name = 'target_critic_repr' self.target_critic = target_critic # Create networks and core TF parts that are shared across setup parts. 
self.actor_tf = actor(self.norm_obs0) self.normalized_critic_tf = critic(self.norm_obs0, self.actions) self.critic_tf = denormalize( tf.clip_by_value(self.normalized_critic_tf, self.return_range[0], self.return_range[1]), self.ret_rms) self.normalized_critic_with_actor_tf = critic(self.norm_obs0, self.actor_tf, reuse=True) self.critic_with_actor_tf = denormalize( tf.clip_by_value(self.normalized_critic_with_actor_tf, self.return_range[0], self.return_range[1]), self.ret_rms) Q_obs1 = denormalize( target_critic(self.norm_obs1, target_actor(self.norm_obs1)), self.ret_rms) self.target_Q = self.rewards + (1. - self.terminals1) * gamma * Q_obs1 # Set up parts. if self.param_noise is not None: self.setup_param_noise(self.norm_obs0) self.setup_actor_optimizer() self.setup_critic_optimizer() if self.normalize_returns and self.enable_popart: self.setup_popart() self.setup_stats() self.setup_target_network_updates() if self.aux_tasks: logger.info("aux_tasks:{}".format(self.aux_tasks)) self.setup_aux_optimizer() def setup_aux_optimizer(self): logger.info('setting up aux optimizer for actor...') # check if unknown or duplicate aux tasks have been given for task in self.aux_tasks: if task not in ("tc", "prop", "caus", "repeat", "predict"): raise ValueError("!! task {} not implemented !!".format(task)) if self.aux_tasks.count(task) > 1: raise ValueError( "!! multiple tasks {} given, not valid !!".format(task)) self.aux_ops = [] self.aux_losses = tf.Variable(tf.zeros([], dtype=np.float32), name="loss") self.aux_vars = set([]) reprowners = [] if self.aux_apply in ('actor', 'both'): reprowners.append(self.actor) if self.aux_apply in ('critic', 'both'): reprowners.append(self.critic) for owner in reprowners: if any(task in self.aux_tasks for task in ("tc", "prop", "caus", "repeat")): representation = Representation(name=owner.repr.name, layer_norm=owner.layer_norm) self.aux_vars.update(set(representation.trainable_vars)) s0 = representation(self.norm_obs0, reuse=True) if any(task in self.aux_tasks for task in ("tc", "prop", "repeat")): s1 = representation(self.norm_obs1, reuse=True) if any(task in self.aux_tasks for task in ("prop", "caus", "repeat")): s100 = representation(self.norm_obs100, reuse=True) if any(task in self.aux_tasks for task in ("prop", "repeat")): s101 = representation(self.norm_obs101, reuse=True) if 'tc' in self.aux_tasks: # temporal coherence loss is the sum of two terms: # a - loss is present for small state changes brought by big actions # b - loss is present for big state changes brought by small actions # (similarity here is used as inversion mechanism) tc_loss_a = similarity(magnitude(s1 - s0)) * magnitude( self.actions) tc_loss_b = similarity(magnitude( self.actions)) * magnitude(s1 - s0) self.tc_loss = tf.reduce_mean(tc_loss_a + tc_loss_b) self.aux_losses += normalize_loss(self.tc_loss) if 'prop' in self.aux_tasks: # proportionality loss: # punish the difference in magnitude of state change, given action similarity # for two unrelated steps dsmag0 = magnitude(s1 - s0) dsmag100 = magnitude(s101 - s100) dsmagdiff = tf.square(dsmag100 - dsmag0) actmagsim = similarity( magnitude(self.actions100 - self.actions)) self.prop_loss = tf.reduce_mean(dsmagdiff * actmagsim) self.aux_losses += normalize_loss(self.prop_loss) if 'caus' in self.aux_tasks: # causality loss: # punish similarity in state, given action similarity and reward difference # for two unrelated steps s_sim = similarity(magnitude(s100 - s0)) a_sim = similarity(magnitude(self.actions100 - self.actions)) r_diff = 
magnitude(self.rewards100 - self.rewards) self.caus_loss = tf.reduce_mean(s_sim * a_sim * r_diff) self.aux_losses += normalize_loss(self.caus_loss) if 'repeat' in self.aux_tasks: # repeatability loss: # punish difference in state change, given state and action similarity # for two unrelated steps ds0 = s1 - s0 ds100 = s101 - s100 dsdiff = magnitude(ds100 - ds0) s_sim = similarity(magnitude(s100 - s0)) a_sim = similarity(magnitude(self.actions100 - self.actions)) self.repeat_loss = tf.reduce_mean(dsdiff * s_sim * a_sim) self.aux_losses += normalize_loss(self.repeat_loss) if 'predict' in self.aux_tasks: # prediction loss: # punish the difference between the actual and predicted next step predictor = Predictor(name=owner.name, layer_norm=owner.layer_norm) reconstr = predictor(self.norm_obs0, self.actions, reuse=True) self.pred_loss = tf.nn.l2_loss(self.norm_obs1 - reconstr) self.aux_losses += normalize_loss(self.pred_loss) self.aux_vars.update(set(predictor.trainable_vars)) self.aux_losses = self.aux_losses / (2 * len(self.aux_tasks)) self.aux_vars = list(self.aux_vars) self.aux_grads = U.flatgrad(self.aux_losses, self.aux_vars, clip_norm=self.clip_norm) self.aux_optimizer = MpiAdam(var_list=self.aux_vars, beta1=0.9, beta2=0.999, epsilon=1e-08) def setup_target_network_updates(self): actor_init_updates, actor_soft_updates = get_target_updates( self.actor.vars, self.target_actor.vars, self.tau) critic_init_updates, critic_soft_updates = get_target_updates( self.critic.vars, self.target_critic.vars, self.tau) self.target_init_updates = [actor_init_updates, critic_init_updates] self.target_soft_updates = [actor_soft_updates, critic_soft_updates] def setup_param_noise(self, normalized_obs0): assert self.param_noise is not None # Configure perturbed actor. param_noise_actor = copy(self.actor) param_noise_actor.name = 'param_noise_actor' param_noise_actor.repr.name = 'param_noise_actor_repr' self.perturbed_actor_tf = param_noise_actor(normalized_obs0) logger.info('setting up param noise') self.perturb_policy_ops = get_perturbed_actor_updates( self.actor, param_noise_actor, self.param_noise_stddev) # Configure separate copy for stddev adoption. 
adaptive_param_noise_actor = copy(self.actor) adaptive_param_noise_actor.name = 'adaptive_param_noise_actor' adaptive_param_noise_actor.repr.name = 'adaptive_param_noise_actor_repr' adaptive_actor_tf = adaptive_param_noise_actor(normalized_obs0) self.perturb_adaptive_policy_ops = get_perturbed_actor_updates( self.actor, adaptive_param_noise_actor, self.param_noise_stddev) self.adaptive_policy_distance = tf.sqrt( tf.reduce_mean(tf.square(self.actor_tf - adaptive_actor_tf))) def setup_actor_optimizer(self): logger.info('setting up actor optimizer') self.actor_loss = -tf.reduce_mean(self.critic_with_actor_tf) actor_shapes = [ var.get_shape().as_list() for var in self.actor.trainable_vars ] actor_nb_params = sum( [reduce(lambda x, y: x * y, shape) for shape in actor_shapes]) logger.info(' actor shapes: {}'.format(actor_shapes)) logger.info(' actor params: {}'.format(actor_nb_params)) self.actor_grads = U.flatgrad(normalize_loss(self.actor_loss), self.actor.trainable_vars, clip_norm=self.clip_norm) self.actor_optimizer = MpiAdam(var_list=self.actor.trainable_vars, beta1=0.9, beta2=0.999, epsilon=1e-08) def setup_critic_optimizer(self): logger.info('setting up critic optimizer') normalized_critic_target_tf = tf.clip_by_value( normalize(self.critic_target, self.ret_rms), self.return_range[0], self.return_range[1]) self.critic_loss = tf.reduce_mean( tf.square(self.normalized_critic_tf - normalized_critic_target_tf)) if self.critic_l2_reg > 0.: critic_reg_vars = [ var for var in self.critic.trainable_vars if 'kernel' in var.name and 'output' not in var.name ] for var in critic_reg_vars: logger.info(' regularizing: {}'.format(var.name)) logger.info(' applying l2 regularization with {}'.format( self.critic_l2_reg)) critic_reg = tc.layers.apply_regularization( tc.layers.l2_regularizer(self.critic_l2_reg), weights_list=critic_reg_vars) self.critic_loss += critic_reg critic_shapes = [ var.get_shape().as_list() for var in self.critic.trainable_vars ] critic_nb_params = sum( [reduce(lambda x, y: x * y, shape) for shape in critic_shapes]) logger.info(' critic shapes: {}'.format(critic_shapes)) logger.info(' critic params: {}'.format(critic_nb_params)) self.critic_grads = U.flatgrad(normalize_loss(self.critic_loss), self.critic.trainable_vars, clip_norm=self.clip_norm) self.critic_optimizer = MpiAdam(var_list=self.critic.trainable_vars, beta1=0.9, beta2=0.999, epsilon=1e-08) def setup_popart(self): # See https://arxiv.org/pdf/1602.07714.pdf for details. 
self.old_std = tf.placeholder(tf.float32, shape=[1], name='old_std') new_std = self.ret_rms.std self.old_mean = tf.placeholder(tf.float32, shape=[1], name='old_mean') new_mean = self.ret_rms.mean self.renormalize_Q_outputs_op = [] for vs in [self.critic.output_vars, self.target_critic.output_vars]: assert len(vs) == 2 M, b = vs assert 'kernel' in M.name assert 'bias' in b.name assert M.get_shape()[-1] == 1 assert b.get_shape()[-1] == 1 self.renormalize_Q_outputs_op += [ M.assign(M * self.old_std / new_std) ] self.renormalize_Q_outputs_op += [ b.assign( (b * self.old_std + self.old_mean - new_mean) / new_std) ] def setup_stats(self): ops = [] names = [] if self.normalize_returns: ops += [self.ret_rms.mean, self.ret_rms.std] names += ['ret_rms_mean', 'ret_rms_std'] if self.normalize_observations: ops += [ tf.reduce_mean(self.obs_rms.mean), tf.reduce_mean(self.obs_rms.std) ] names += ['obs_rms_mean', 'obs_rms_std'] ops += [tf.reduce_mean(self.critic_tf)] names += ['reference_Q_mean'] ops += [reduce_std(self.critic_tf)] names += ['reference_Q_std'] ops += [tf.reduce_mean(self.critic_with_actor_tf)] names += ['reference_actor_Q_mean'] ops += [reduce_std(self.critic_with_actor_tf)] names += ['reference_actor_Q_std'] ops += [tf.reduce_mean(self.actor_tf)] names += ['reference_action_mean'] ops += [reduce_std(self.actor_tf)] names += ['reference_action_std'] if self.param_noise: ops += [tf.reduce_mean(self.perturbed_actor_tf)] names += ['reference_perturbed_action_mean'] ops += [reduce_std(self.perturbed_actor_tf)] names += ['reference_perturbed_action_std'] self.stats_ops = ops self.stats_names = names def pi(self, obs, apply_noise=True, compute_Q=True): if self.param_noise is not None and apply_noise: actor_tf = self.perturbed_actor_tf else: actor_tf = self.actor_tf feed_dict = {self.obs0: [obs]} if compute_Q: action, q = self.sess.run([actor_tf, self.critic_with_actor_tf], feed_dict=feed_dict) else: action = self.sess.run(actor_tf, feed_dict=feed_dict) q = None action = action.flatten() if self.action_noise is not None and apply_noise: noise = self.action_noise() assert noise.shape == action.shape action += noise action = np.clip(action, self.action_range[0], self.action_range[1]) return action, q def store_transition(self, obs0, action, reward, obs1, terminal1): reward *= self.reward_scale self.memory.append(obs0, action, reward, obs1, terminal1) if self.normalize_observations: self.obs_rms.update(np.array([obs0])) def train(self): # Get a batch. if self.aux_tasks is not None: batch = self.memory.sampletwice(batch_size=self.batch_size) else: batch = self.memory.sample(batch_size=self.batch_size) if self.normalize_returns and self.enable_popart: old_mean, old_std, target_Q = self.sess.run( [self.ret_rms.mean, self.ret_rms.std, self.target_Q], feed_dict={ self.obs1: batch['obs1'], self.rewards: batch['rewards'], self.terminals1: batch['terminals1'].astype('float32'), }) self.ret_rms.update(target_Q.flatten()) self.sess.run(self.renormalize_Q_outputs_op, feed_dict={ self.old_std: np.array([old_std]), self.old_mean: np.array([old_mean]), }) # Run sanity check. Disabled by default since it slows down things considerably. 
# print('running sanity check') # target_Q_new, new_mean, new_std = self.sess.run([self.target_Q, self.ret_rms.mean, self.ret_rms.std], feed_dict={ # self.obs1: batch['obs1'], # self.rewards: batch['rewards'], # self.terminals1: batch['terminals1'].astype('float32'), # }) # print(target_Q_new, target_Q, new_mean, new_std) # assert (np.abs(target_Q - target_Q_new) < 1e-3).all() else: target_Q = self.sess.run(self.target_Q, feed_dict={ self.obs1: batch['obs1'], self.rewards: batch['rewards'], self.terminals1: batch['terminals1'].astype('float32'), }) # Get gradients DDPG ops = [ self.actor_grads, self.actor_loss, self.critic_grads, self.critic_loss ] feed_dict = { self.obs0: batch['obs0'], self.actions: batch['actions'], self.critic_target: target_Q } actor_grads, actor_loss, critic_grads, critic_loss = self.sess.run( ops, feed_dict=feed_dict) #print("actor grads norm: {}".format(np.linalg.norm(actor_grads))) #print("critic grads norm: {}".format(np.linalg.norm(critic_grads))) # Perform a synced update. self.actor_optimizer.update(actor_grads, stepsize=self.actor_lr) self.critic_optimizer.update(critic_grads, stepsize=self.critic_lr) auxoutputs = [] # Get gradients AUX if self.aux_tasks: aux_dict = {} aux_ops = {'aux_grads': self.aux_grads} for index, auxtask in enumerate(self.aux_tasks): if auxtask == 'tc': aux_dict.update({ self.obs0: batch['obs0'], self.obs1: batch['obs1'], self.actions: batch['actions'] }) aux_ops.update({'tc': self.tc_loss}) if auxtask == 'prop': aux_dict.update({ self.obs0: batch['obs0'], self.obs1: batch['obs1'], self.obs100: batch['obs100'], self.obs101: batch['obs101'], self.actions: batch['actions'], self.actions100: batch['actions100'] }) aux_ops.update({'prop': self.prop_loss}) if auxtask == 'caus': aux_dict.update({ self.obs0: batch['obs0'], self.obs100: batch['obs100'], self.actions: batch['actions'], self.actions100: batch['actions100'], self.rewards: batch['rewards'], self.rewards100: batch['rewards100'] }) aux_ops.update({'caus': self.caus_loss}) if auxtask == 'repeat': aux_dict.update({ self.obs0: batch['obs0'], self.obs1: batch['obs1'], self.obs100: batch['obs100'], self.obs101: batch['obs101'], self.actions: batch['actions'], self.actions100: batch['actions100'] }) aux_ops.update({'repeat': self.repeat_loss}) if auxtask == 'predict': aux_dict.update({ self.obs0: batch['obs0'], self.obs1: batch['obs1'], self.actions: batch['actions'] }) aux_ops.update({'predict': self.pred_loss}) auxoutputs = self.sess.run(aux_ops, feed_dict=aux_dict) auxgrads = auxoutputs['aux_grads'] # add act and crit grads to auxoutputs auxoutputs['actor_grads'] = actor_grads auxoutputs['critic_grads'] = critic_grads #print("aux grads norm: {}".format(np.linalg.norm(auxgrads))) self.aux_optimizer.update(auxgrads, stepsize=self.actor_lr) return critic_loss, actor_loss, auxoutputs def initialize(self, sess): self.sess = sess self.sess.run(tf.global_variables_initializer()) self.actor_optimizer.sync() self.critic_optimizer.sync() self.sess.run(self.target_init_updates) def update_target_net(self): self.sess.run(self.target_soft_updates) def get_stats(self): if self.stats_sample is None: # Get a sample and keep that fixed for all further computations. # This allows us to estimate the change in value for the same set of inputs. 
self.stats_sample = self.memory.sample(batch_size=self.batch_size) values = self.sess.run(self.stats_ops, feed_dict={ self.obs0: self.stats_sample['obs0'], self.actions: self.stats_sample['actions'], }) names = self.stats_names[:] assert len(names) == len(values) stats = dict(zip(names, values)) if self.param_noise is not None: stats = {**stats, **self.param_noise.get_stats()} return stats def adapt_param_noise(self): if self.param_noise is None: return 0. # Perturb a separate copy of the policy to adjust the scale for the next "real" perturbation. batch = self.memory.sample(batch_size=self.batch_size) self.sess.run(self.perturb_adaptive_policy_ops, feed_dict={ self.param_noise_stddev: self.param_noise.current_stddev, }) distance = self.sess.run(self.adaptive_policy_distance, feed_dict={ self.obs0: batch['obs0'], self.param_noise_stddev: self.param_noise.current_stddev, }) mean_distance = MPI.COMM_WORLD.allreduce( distance, op=MPI.SUM) / MPI.COMM_WORLD.Get_size() self.param_noise.adapt(mean_distance) return mean_distance def reset(self): # Reset internal state after an episode is complete. if self.action_noise is not None: self.action_noise.reset() if self.param_noise is not None: self.sess.run(self.perturb_policy_ops, feed_dict={ self.param_noise_stddev: self.param_noise.current_stddev, })
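# Standalone sketch of the temporal-coherence term assembled in
# setup_aux_optimizer() above. The magnitude()/similarity() helpers are not shown
# in this file; the definitions below (squared norm and exp(-x)) are assumptions
# in the spirit of robotic-priors-style losses, used only to make the arithmetic
# concrete, not the project's actual helpers.
import numpy as np

def magnitude(x):
    return np.sum(np.square(x), axis=-1)  # per-sample squared norm

def similarity(x):
    return np.exp(-x)                      # large when the magnitude is small

def tc_loss(s0, s1, actions):
    # penalize large state changes under small actions and vice versa
    a = similarity(magnitude(s1 - s0)) * magnitude(actions)
    b = similarity(magnitude(actions)) * magnitude(s1 - s0)
    return np.mean(a + b)

s0 = np.zeros((4, 3)); s1 = s0 + 0.1; acts = np.full((4, 2), 0.5)
print(tc_loss(s0, s1, acts))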
class ActorLearner(object): def __init__(self, name, actor, memory, observation_shape, action_shape, gamma=0.95, tau=0.001, normalize_observations=True, batch_size=128, observation_range=(-5., 5.), return_range=(-np.inf, np.inf), actor_l2_reg=0., actor_lr=5e-5, clip_norm=None, ): # Inputs. self.obs0 = tf.placeholder(tf.float32, shape=(None,) + observation_shape, name='expert_actor_obs0') self.action_target = tf.placeholder(tf.float32, shape=(None,) + action_shape, name=name+'action_target') # Parameters. self.gamma = gamma self.tau = tau self.memory = memory self.normalize_observations = normalize_observations self.return_range = return_range self.observation_range = observation_range self.clip_norm = clip_norm self.batch_size = batch_size self.stats_sample = None self.actor_l2_reg = actor_l2_reg self.actor = actor self.actor_lr = actor_lr # Observation normalization. if self.normalize_observations: with tf.variable_scope(name + 'obs_rms'): self.obs_rms = RunningMeanStd(shape=observation_shape) else: self.obs_rms = None normalized_obs0 = tf.clip_by_value(normalize(self.obs0, self.obs_rms), self.observation_range[0], self.observation_range[1]) # Create networks and core TF parts that are shared across setup parts. self.actor_tf = actor(normalized_obs0) # Set up parts. self.setup_actor_optimizer() self.setup_stats() self.initial_state = None # recurrent architectures not supported yet def setup_actor_optimizer(self): logger.info('setting up actor optimizer') self.actor_loss = tf.reduce_mean(tf.square(self.actor_tf - self.action_target)) actor_shapes = [var.get_shape().as_list() for var in self.actor.trainable_vars] actor_nb_params = sum([reduce(lambda x, y: x * y, shape) for shape in actor_shapes]) logger.info(' actor shapes: {}'.format(actor_shapes)) logger.info(' actor params: {}'.format(actor_nb_params)) self.actor_grads = U.flatgrad(self.actor_loss, self.actor.trainable_vars, clip_norm=self.clip_norm) self.actor_optimizer = tf.train.AdamOptimizer(learning_rate=self.actor_lr) self.optimize_expr = self.actor_optimizer.minimize(self.actor_loss, var_list=self.actor.trainable_vars) def setup_stats(self): ops = [] names = [] if self.normalize_observations: ops += [tf.reduce_mean(self.obs_rms.mean), tf.reduce_mean(self.obs_rms.std)] names += ['obs_rms_mean', 'obs_rms_std'] self.stats_ops = ops self.stats_names = names def train(self): # Get a batch. batch = self.memory.sample(batch_size=self.batch_size) # Get all gradients and perform a synced update. ops = [self.actor_grads, self.actor_loss] actor_grads, actor_loss = self.sess.run(ops, feed_dict={ self.obs0: batch['obs0'], self.action_target: batch['actions'], }) # with self.graph.as_default(): self.optimize_expr.run(session=self.sess, feed_dict={ self.obs0: batch['obs0'], self.action_target: batch['actions'], } ) return actor_loss def initialize(self, sess): self.sess = sess def save(self, path): save_variables(path) def load(self, path): load_variables(path) def store_transition(self, obs0, action): # B = obs0.shape[0] # for b in range(B): self.memory.append(obs0, action) if self.normalize_observations: self.obs_rms.update(obs0) print("Stored ", obs0.shape) def __call__(self, obs): # with self.graph.as_default(): print("Expert Actor call") feed_dict = {self.obs0: U.adjust_shape(self.obs0, obs)} # import IPython; IPython.embed() action = self.sess.run([self.actor_tf], feed_dict=feed_dict) print("Expert Actor return") return action
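# Minimal numeric sketch of the objective ActorLearner optimizes above: plain
# behavior cloning, i.e. mean squared error between the actor's output and the
# expert action targets drawn from the replay memory. The arrays below are
# placeholders for illustration only.
import numpy as np

def bc_loss(predicted_actions, expert_actions):
    return np.mean(np.square(predicted_actions - expert_actions))

pred = np.array([[0.2, -0.1], [0.0, 0.3]])
expert = np.array([[0.25, -0.05], [0.1, 0.3]])
print(bc_loss(pred, expert))  # small value; minimizing it pulls the actor toward the expert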
class MADDPG(object): def __init__(self, name, actor, critic, memory, obs_space_n, act_space_n, agent_index, obs_rms, param_noise=None, action_noise=None, gamma=0.99, tau=0.001, normalize_returns=False, enable_popart=False, normalize_observations=True, batch_size=128, observation_range=(-5., 5.), action_range=(-1., 1.), return_range=(-np.inf, np.inf), critic_l2_reg=0., actor_lr=1e-4, critic_lr=1e-3, clip_norm=None, reward_scale=1.): self.name = name self.num_agents = len(obs_space_n) self.agent_index = agent_index from gym import spaces continuous_ctrl = not isinstance(act_space_n[0], spaces.Discrete) # TODO: remove after testing assert continuous_ctrl # Multi-agent inputs # self.obs0 = [] # self.obs1 = [] self.actions = [] # self.norm_obs0_ph = [] # self.norm_obs1_ph = [] self.obs0 = tf.placeholder(tf.float32, shape=( self.num_agents, None, ) + obs_space_n[self.agent_index].shape, name="obs0") self.obs1 = tf.placeholder(tf.float32, shape=( self.num_agents, None, ) + obs_space_n[self.agent_index].shape, name="obs1") # if continuous_ctrl: # self.actions = tf.placeholder(tf.float32, shape=(self.num_agents, None,) + act_space_n[self.agent_index].shape, name="action") # else: # act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n] # self.actions = [act_pdtype_n[i].sample_placeholder([None], name="action"+str(i)) for i in range(len(act_space_n))] # this is required to reshape obs and actions for concatenation obs_shape_list = [self.num_agents] + list( obs_space_n[self.agent_index].shape) act_shape_list = [self.num_agents] + list( act_space_n[self.agent_index].shape) self.obs_shape_prod = np.prod(obs_shape_list) self.act_shape_prod = np.prod(act_shape_list) for i in range(self.num_agents): # each obs in obs0,obs1 contains info about ego agent and relative pos/vel of other agents # self.obs0.append(tf.placeholder(tf.float32, shape=[None] + list(obs_space_n[i].shape), name="obs0_"+str(i))) # self.obs1.append(tf.placeholder(tf.float32, shape=[None] + list(obs_space_n[i].shape), name="obs1_"+str(i))) if continuous_ctrl: self.actions.append( tf.placeholder(tf.float32, shape=[None] + list(act_space_n[i].shape), name="action" + str(i))) else: self.actions.append( make_pdtype(act_space_n[i]).sample_placeholder( [None], name="action" + str(i))) # self.norm_obs0_ph.append(tf.placeholder(tf.float32, shape=[None] + list(obs_space_n[i].shape), name="norm_obs0_"+str(i))) # self.norm_obs1_ph.append(tf.placeholder(tf.float32, shape=[None] + list(obs_space_n[i].shape), name="norm_obs1_"+str(i))) # self.norm_obs0_ph = tf.placeholder(tf.float32, shape=[self.num_agents, None] + list(obs_space_n[self.agent_index].shape), name="norm_obs0") # self.norm_obs1_ph = tf.placeholder(tf.float32, shape=[self.num_agents, None] + list(obs_space_n[self.agent_index].shape), name="norm_obs1") # we only provide single agent inputs for these placeholders self.terminals1 = tf.placeholder(tf.float32, shape=(None, 1), name='terminals1') self.rewards = tf.placeholder(tf.float32, shape=(None, 1), name='rewards') self.critic_target = tf.placeholder(tf.float32, shape=(None, 1), name='critic_target') self.param_noise_stddev = tf.placeholder(tf.float32, shape=(), name='param_noise_stddev') # Parameters. 
self.gamma = gamma self.tau = tau self.memory = memory self.normalize_observations = normalize_observations self.normalize_returns = normalize_returns self.action_noise = action_noise self.param_noise = param_noise self.action_range = action_range self.return_range = return_range self.observation_range = observation_range self.critic = critic self.actor = actor self.actor_lr = actor_lr self.critic_lr = critic_lr self.clip_norm = clip_norm self.enable_popart = enable_popart self.reward_scale = reward_scale self.batch_size = batch_size self.stats_sample = None self.critic_l2_reg = critic_l2_reg # Observation normalization. # TODO: need to update the replay buffer storage function to account for multiple agents if self.normalize_observations: self.obs_rms = obs_rms else: self.obs_rms = None # Need to transpose observations so we can normalize them # converts tensor to shape (batch_size, num_agents, space_size) # transose on dim 0 and 1, leave dim 2 unchanged obs0_t = tf.transpose(self.obs0, perm=[1, 0, 2]) obs1_t = tf.transpose(self.obs1, perm=[1, 0, 2]) actions_t = tf.transpose(self.actions, perm=[1, 0, 2]) # each entry in obs_t is normalized wrt the agent normalized_obs0 = tf.clip_by_value(normalize(obs0_t, self.obs_rms), self.observation_range[0], self.observation_range[1]) normalized_obs1 = tf.clip_by_value(normalize(obs1_t, self.obs_rms), self.observation_range[0], self.observation_range[1]) # convert the obs to original shape after normalization for convenience normalized_act_obs0 = tf.transpose(normalized_obs0, perm=[1, 0, 2]) normalized_act_obs1 = tf.transpose(normalized_obs1, perm=[1, 0, 2]) # need to specify exact shape, since we dont always pass batch size number of obs/act normalized_obs0_flat = tf.reshape(normalized_obs0, [-1, self.obs_shape_prod]) normalized_obs1_flat = tf.reshape(normalized_obs1, [-1, self.obs_shape_prod]) actions_t_flat = tf.reshape(actions_t, [-1, self.act_shape_prod]) # Return normalization. # TODO: update this to handle multiple agents if required if self.normalize_returns: with tf.variable_scope('ret_rms'): self.ret_rms = RunningMeanStd() else: self.ret_rms = None # Create target networks. target_actor = copy(actor) target_actor.name = 'target_actor' self.target_actor = target_actor target_critic = copy(critic) target_critic.name = 'target_critic' self.target_critic = target_critic # Create networks and core TF parts that are shared across setup parts. # Each agents gets its own observation self.actor_tf = actor(normalized_act_obs0[self.agent_index]) self.target_actor_tf = target_actor( normalized_act_obs1[self.agent_index]) # Critic gets all observations self.normalized_critic_tf = critic(normalized_obs0_flat, actions_t_flat) self.critic_tf = denormalize( tf.clip_by_value(self.normalized_critic_tf, self.return_range[0], self.return_range[1]), self.ret_rms) # need to provide critic() with all actions act_input_n = self.actions + [] # copy actions act_input_n[ self. agent_index] = self.actor_tf # update current agent action using its actor act_input_n_t = tf.transpose(act_input_n, perm=[1, 0, 2]) act_input_n_t_flat = tf.reshape(act_input_n_t, [-1, self.act_shape_prod]) self.normalized_critic_with_actor_tf = critic(normalized_obs0_flat, act_input_n_t_flat, reuse=True) self.critic_with_actor_tf = denormalize( tf.clip_by_value(self.normalized_critic_with_actor_tf, self.return_range[0], self.return_range[1]), self.ret_rms) # we need to use actions for all agents target_act_input_n = self.actions + [] # copy actions target_act_input_n[ self. 
agent_index] = self.target_actor_tf # update current agent action using its target actor target_act_input_n_t = tf.transpose(target_act_input_n, perm=[1, 0, 2]) target_act_input_n_t_flat = tf.reshape(target_act_input_n_t, [-1, self.act_shape_prod]) Q_obs1 = denormalize( target_critic(normalized_obs1_flat, target_act_input_n_t_flat), self.ret_rms) self.target_Q = self.rewards + (1. - self.terminals1) * gamma * Q_obs1 # Set up parts. if self.param_noise is not None: # param noise is added to actor; hence obs for current agent is required self.setup_param_noise(normalized_act_obs0[self.agent_index]) self.setup_actor_optimizer() self.setup_critic_optimizer() if self.normalize_returns and self.enable_popart: self.setup_popart() self.setup_stats() self.setup_target_network_updates() self.initial_state = None # recurrent architectures not supported yet def setup_target_network_updates(self): actor_init_updates, actor_soft_updates = get_target_updates( self.actor.vars, self.target_actor.vars, self.tau) critic_init_updates, critic_soft_updates = get_target_updates( self.critic.vars, self.target_critic.vars, self.tau) self.target_init_updates = [actor_init_updates, critic_init_updates] self.target_soft_updates = [actor_soft_updates, critic_soft_updates] def setup_param_noise(self, normalized_obs0): assert self.param_noise is not None # Configure perturbed actor. param_noise_actor = copy(self.actor) param_noise_actor.name = 'param_noise_actor' self.perturbed_actor_tf = param_noise_actor(normalized_obs0) logger.info('setting up param noise') self.perturb_policy_ops = get_perturbed_actor_updates( self.actor, param_noise_actor, self.param_noise_stddev) # Configure separate copy for stddev adoption. adaptive_param_noise_actor = copy(self.actor) adaptive_param_noise_actor.name = 'adaptive_param_noise_actor' adaptive_actor_tf = adaptive_param_noise_actor(normalized_obs0) self.perturb_adaptive_policy_ops = get_perturbed_actor_updates( self.actor, adaptive_param_noise_actor, self.param_noise_stddev) self.adaptive_policy_distance = tf.sqrt( tf.reduce_mean(tf.square(self.actor_tf - adaptive_actor_tf))) def setup_actor_optimizer(self): logger.info('setting up actor optimizer') self.actor_loss = -tf.reduce_mean(self.critic_with_actor_tf) actor_shapes = [ var.get_shape().as_list() for var in self.actor.trainable_vars ] actor_nb_params = sum( [reduce(lambda x, y: x * y, shape) for shape in actor_shapes]) logger.info(' actor shapes: {}'.format(actor_shapes)) logger.info(' actor params: {}'.format(actor_nb_params)) self.actor_grads = U.flatgrad(self.actor_loss, self.actor.trainable_vars, clip_norm=self.clip_norm) self.actor_optimizer = MpiAdam(var_list=self.actor.trainable_vars, beta1=0.9, beta2=0.999, epsilon=1e-08) def setup_critic_optimizer(self): logger.info('setting up critic optimizer') normalized_critic_target_tf = tf.clip_by_value( normalize(self.critic_target, self.ret_rms), self.return_range[0], self.return_range[1]) self.critic_loss = tf.reduce_mean( tf.square(self.normalized_critic_tf - normalized_critic_target_tf)) if self.critic_l2_reg > 0.: critic_reg_vars = [ var for var in self.critic.trainable_vars if var.name.endswith('/w:0') and 'output' not in var.name ] for var in critic_reg_vars: logger.info(' regularizing: {}'.format(var.name)) logger.info(' applying l2 regularization with {}'.format( self.critic_l2_reg)) critic_reg = tc.layers.apply_regularization( tc.layers.l2_regularizer(self.critic_l2_reg), weights_list=critic_reg_vars) self.critic_loss += critic_reg critic_shapes = [ 
var.get_shape().as_list() for var in self.critic.trainable_vars ] critic_nb_params = sum( [reduce(lambda x, y: x * y, shape) for shape in critic_shapes]) logger.info(' critic shapes: {}'.format(critic_shapes)) logger.info(' critic params: {}'.format(critic_nb_params)) self.critic_grads = U.flatgrad(self.critic_loss, self.critic.trainable_vars, clip_norm=self.clip_norm) self.critic_optimizer = MpiAdam(var_list=self.critic.trainable_vars, beta1=0.9, beta2=0.999, epsilon=1e-08) def setup_popart(self): # See https://arxiv.org/pdf/1602.07714.pdf for details. self.old_std = tf.placeholder(tf.float32, shape=[1], name='old_std') new_std = self.ret_rms.std self.old_mean = tf.placeholder(tf.float32, shape=[1], name='old_mean') new_mean = self.ret_rms.mean self.renormalize_Q_outputs_op = [] for vs in [self.critic.output_vars, self.target_critic.output_vars]: assert len(vs) == 2 M, b = vs assert 'kernel' in M.name assert 'bias' in b.name assert M.get_shape()[-1] == 1 assert b.get_shape()[-1] == 1 self.renormalize_Q_outputs_op += [ M.assign(M * self.old_std / new_std) ] self.renormalize_Q_outputs_op += [ b.assign( (b * self.old_std + self.old_mean - new_mean) / new_std) ] def setup_stats(self): ops = [] names = [] if self.normalize_returns: ops += [self.ret_rms.mean, self.ret_rms.std] names += ['ret_rms_mean', 'ret_rms_std'] if self.normalize_observations: ops += [ tf.reduce_mean(self.obs_rms.mean), tf.reduce_mean(self.obs_rms.std) ] names += ['obs_rms_mean', 'obs_rms_std'] ops += [tf.reduce_mean(self.critic_tf)] names += ['reference_Q_mean'] ops += [reduce_std(self.critic_tf)] names += ['reference_Q_std'] ops += [tf.reduce_mean(self.critic_with_actor_tf)] names += ['reference_actor_Q_mean'] ops += [reduce_std(self.critic_with_actor_tf)] names += ['reference_actor_Q_std'] ops += [tf.reduce_mean(self.actor_tf)] names += ['reference_action_mean'] ops += [reduce_std(self.actor_tf)] names += ['reference_action_std'] if self.param_noise: ops += [tf.reduce_mean(self.perturbed_actor_tf)] names += ['reference_perturbed_action_mean'] ops += [reduce_std(self.perturbed_actor_tf)] names += ['reference_perturbed_action_std'] self.stats_ops = ops self.stats_names = names # TODO: need to provide all observations to compute q def step(self, obs, apply_noise=True, compute_Q=True): if self.param_noise is not None and apply_noise: actor_tf = self.perturbed_actor_tf else: actor_tf = self.actor_tf feed_dict = {self.obs0: U.adjust_shape(self.obs0, [obs])} # feed_dict={ph: [data] for ph, data in zip(self.obs0, obs)} # feed_dict = {self.obs0: [obs]} # Get the normalized obs first # norm_obs0 = self.sess.run(self.norm_obs0, feed_dict=feed_dict) # use the normalized obs for training # feed_dict = {ph: data for ph, data in zip(self.norm_obs0_ph, norm_obs0)} if compute_Q: action, q = self.sess.run([actor_tf, self.critic_with_actor_tf], feed_dict=feed_dict) else: action = self.sess.run(actor_tf, feed_dict=feed_dict) q = None if self.action_noise is not None and apply_noise: noise = self.action_noise() assert noise.shape == action[0].shape action += noise action = np.clip(action, self.action_range[0], self.action_range[1]) return action[0], q, None, None # TODO: test this # Computing this every time step may slow things def get_q_value(self, obs_n, act_n): # assuming computing q value for one state; hence need [] around data feed_dict = {ph: [data] for ph, data in zip(self.obs0, obs_n)} act_dict = {ph: [data] for ph, data in zip(self.actions, act_n)} feed_dict.update(act_dict) q = self.sess.run(self.critic_with_actor_tf, 
feed_dict=feed_dict) return q def store_transition(self, obs0, action, reward, obs1, terminal1): reward *= self.reward_scale # print(action) B = obs0.shape[0] a_idx = self.agent_index for b in range(B): self.memory.append(obs0[b][a_idx], action[b][a_idx], reward[b][a_idx], obs1[b][a_idx], terminal1[b][a_idx]) # NOTE: calling update for each agent is ok, since the mean and std are uneffected # this is because the same obs are repeated num_agent times, which dont affect value if self.normalize_observations: # provide full obs for obs_rms update obs0_shape = (len(obs0[b]), ) + obs0[b][a_idx].shape assert obs0_shape == (self.num_agents, ) + obs0[b][a_idx].shape self.obs_rms.update(np.array([obs0[b]])) # TODO: not using this right now def update_obs_rms(self, obs0): if not self.normalize_observations: return B = obs0.shape[0] for b in range(B): # provide full obs for obs_rms update self.obs_rms.update(np.array([obs0[b]])) return def train(self, agents): # generate indices to access batches from all agents replay_sample_index = self.memory.generate_index(self.batch_size) # collect replay sample from all agents obs0_n = [] obs1_n = [] rewards_n = [] act_n = [] terminals1_n = [] for i in range(self.num_agents): # Get a batch. batch = agents[i].memory.sample(batch_size=self.batch_size, index=replay_sample_index) obs0_n.append(batch['obs0']) obs1_n.append(batch['obs1']) act_n.append(batch['actions']) # rewards_n.append(batch['rewards']) # terminals1_n.append(batch['terminals1']) batch = self.memory.sample(batch_size=self.batch_size, index=replay_sample_index) # fill placeholders in obs1 with corresponding obs from each agent's replay buffer # self.obs1 and obs1_n are lists of size num_agents # feed_dict={ph: data for ph, data in zip(self.obs1, obs1_n)} feed_dict = {self.obs1: obs1_n} # TODO: find a better way to do this # Get the normalized obs first # norm_obs1 = self.sess.run(self.norm_obs1, feed_dict=feed_dict) # use the normalized obs for training # feed_dict = {self.norm_obs1_ph: norm_obs1} # feed_dict = {ph: data for ph, data in zip(self.norm_obs1_ph, norm_obs1)} # actions required for critic act_dict = {ph: data for ph, data in zip(self.actions, act_n)} feed_dict.update(act_dict) feed_dict.update({self.rewards: batch['rewards']}) feed_dict.update( {self.terminals1: batch['terminals1'].astype('float32')}) if self.normalize_returns and self.enable_popart: old_mean, old_std, target_Q = self.sess.run( [self.ret_rms.mean, self.ret_rms.std, self.target_Q], feed_dict=feed_dict) # old_mean, old_std, target_Q = self.sess.run([self.ret_rms.mean, self.ret_rms.std, self.target_Q], feed_dict={ # self.obs1: batch['obs1'], # self.rewards: batch['rewards'], # self.terminals1: batch['terminals1'].astype('float32'), # }) self.ret_rms.update(target_Q.flatten()) self.sess.run(self.renormalize_Q_outputs_op, feed_dict={ self.old_std: np.array([old_std]), self.old_mean: np.array([old_mean]), }) # Run sanity check. Disabled by default since it slows down things considerably. 
# print('running sanity check') # target_Q_new, new_mean, new_std = self.sess.run([self.target_Q, self.ret_rms.mean, self.ret_rms.std], feed_dict={ # self.obs1: batch['obs1'], # self.rewards: batch['rewards'], # self.terminals1: batch['terminals1'].astype('float32'), # }) # print(target_Q_new, target_Q, new_mean, new_std) # assert (np.abs(target_Q - target_Q_new) < 1e-3).all() else: target_Q = self.sess.run(self.target_Q, feed_dict=feed_dict) # target_Q = self.sess.run(self.target_Q, feed_dict={ # self.obs1: batch['obs1'], # self.rewards: batch['rewards'], # self.terminals1: batch['terminals1'].astype('float32'), # }) # Get all gradients and perform a synced update. ops = [ self.actor_grads, self.actor_loss, self.critic_grads, self.critic_loss ] # generate feed_dict for multiple observations and actions # feed_dict={ph: data for ph, data in zip(self.obs0, obs0_n)} feed_dict = {self.obs0: obs0_n} # Get the normalized obs first # norm_obs0 = self.sess.run(self.norm_obs0, feed_dict=feed_dict) # use the normalized obs for training # feed_dict = {self.norm_obs0_ph: norm_obs0} # feed_dict = {ph: data for ph, data in zip(self.norm_obs0_ph, norm_obs0)} # act_dict={ph: data for ph, data in zip(self.actions, act_n)} feed_dict.update(act_dict) feed_dict.update({self.critic_target: target_Q}) actor_grads, actor_loss, critic_grads, critic_loss = self.sess.run( ops, feed_dict=feed_dict) # actor_grads, actor_loss, critic_grads, critic_loss = self.sess.run(ops, feed_dict={ # self.obs0: batch['obs0'], # self.actions: batch['actions'], # self.critic_target: target_Q, # }) self.actor_optimizer.update(actor_grads, stepsize=self.actor_lr) self.critic_optimizer.update(critic_grads, stepsize=self.critic_lr) return critic_loss, actor_loss def initialize(self, sess): self.sess = sess def agent_initialize(self, sess): self.actor_optimizer.sync() self.critic_optimizer.sync() self.sess.run(self.target_init_updates) # setup saving and loading functions self.save = functools.partial(save_variables, sess=sess) self.load = functools.partial(load_variables, sess=sess) def update_target_net(self): self.sess.run(self.target_soft_updates) def get_stats(self, agents): if self.stats_sample is None: replay_sample_index = self.memory.generate_index(self.batch_size) # collect replay sample from all agents obs0_n, act_n = [], [] for i in range(self.num_agents): batch = agents[i].memory.sample(batch_size=self.batch_size, index=replay_sample_index) obs0_n.append(batch['obs0']) act_n.append(batch['actions']) # generate feed_dict for multiple observations and actions # feed_dict={ph: data for ph, data in zip(self.obs0, obs0_n)} feed_dict = {self.obs0: obs0_n} # Get the normalized obs first # norm_obs0 = self.sess.run(self.norm_obs0, feed_dict=feed_dict) # use the normalized obs for training # feed_dict = {self.norm_obs0_ph: norm_obs0} # feed_dict = {ph: data for ph, data in zip(self.norm_obs0_ph, norm_obs0)} actions_dict = {ph: data for ph, data in zip(self.actions, act_n)} feed_dict.update(actions_dict) # Get a sample and keep that fixed for all further computations. # This allows us to estimate the change in value for the same set of inputs. 
self.stats_sample = feed_dict values = self.sess.run(self.stats_ops, feed_dict=self.stats_sample) names = self.stats_names[:] assert len(names) == len(values) stats = dict(zip(names, values)) if self.param_noise is not None: stats = {**stats, **self.param_noise.get_stats()} return stats def adapt_param_noise(self, agents): try: from mpi4py import MPI except ImportError: MPI = None if self.param_noise is None: return 0. # Perturb a separate copy of the policy to adjust the scale for the next "real" perturbation. replay_sample_index = self.memory.generate_index(self.batch_size) obs0_n = [] for i in range(self.num_agents): batch = agents[i].memory.sample(batch_size=self.batch_size, index=replay_sample_index) obs0_n.append(batch['obs0']) # feed_dict={ph: data for ph, data in zip(self.obs0, obs0_n)} feed_dict = {self.obs0: obs0_n} # Get the normalized obs first # norm_obs0 = self.sess.run(self.norm_obs0, feed_dict=feed_dict) # use the normalized obs for training # feed_dict = {self.norm_obs0_ph: norm_obs0} # feed_dict = {ph: data for ph, data in zip(self.norm_obs0_ph, norm_obs0)} feed_dict.update( {self.param_noise_stddev: self.param_noise.current_stddev}) self.sess.run(self.perturb_adaptive_policy_ops, feed_dict={ self.param_noise_stddev: self.param_noise.current_stddev, }) distance = self.sess.run(self.adaptive_policy_distance, feed_dict=feed_dict) # distance = self.sess.run(self.adaptive_policy_distance, feed_dict={ # self.obs0: batch['obs0'], # self.param_noise_stddev: self.param_noise.current_stddev, # }) if MPI is not None: mean_distance = MPI.COMM_WORLD.allreduce( distance, op=MPI.SUM) / MPI.COMM_WORLD.Get_size() else: mean_distance = distance self.param_noise.adapt(mean_distance) return mean_distance def reset(self): # Reset internal state after an episode is complete. if self.action_noise is not None: self.action_noise.reset() if self.param_noise is not None: self.sess.run(self.perturb_policy_ops, feed_dict={ self.param_noise_stddev: self.param_noise.current_stddev, })
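# The agent above feeds a centralized critic: per-agent observations and actions arrive
# stacked as (num_agents, batch, dim), get transposed to (batch, num_agents, dim) for
# per-agent normalization, and are then flattened to (batch, num_agents * dim).
# A minimal NumPy sketch of that reshaping (illustration only; the shapes below are
# made-up, not taken from the class above):
import numpy as np

num_agents, batch_size, obs_dim = 3, 4, 5
obs_n = np.random.randn(num_agents, batch_size, obs_dim)    # one stacked entry per agent
obs_t = np.transpose(obs_n, (1, 0, 2))                      # (batch, num_agents, obs_dim)
obs_flat = obs_t.reshape(batch_size, num_agents * obs_dim)  # what the centralized critic consumes
assert obs_flat.shape == (batch_size, num_agents * obs_dim)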
class DDPG(object): def __init__(self, actor, critic, memory, observation_shape, action_shape, param_noise=None, action_noise=None, gamma=0.99, tau=0.001, normalize_returns=False, enable_popart=False, normalize_observations=True, batch_size=128, observation_range=(-1., 1.), action_range=(-1., 1.), return_range=(-np.inf, np.inf), adaptive_param_noise=True, adaptive_param_noise_policy_threshold=.1, critic_l2_reg=0., actor_lr=1e-4, critic_lr=1e-3, clip_norm=None, reward_scale=1.): # Inputs. self.obs0 = tf.placeholder(tf.float32, shape=(None, ) + observation_shape, name='obs0') self.obs1 = tf.placeholder(tf.float32, shape=(None, ) + observation_shape, name='obs1') self.terminals1 = tf.placeholder(tf.float32, shape=(None, 1), name='terminals1') self.rewards = tf.placeholder(tf.float32, shape=(None, 1), name='rewards') self.actions = tf.placeholder(tf.float32, shape=(None, ) + action_shape, name='actions') self.critic_target = tf.placeholder(tf.float32, shape=(None, 1), name='critic_target') self.param_noise_stddev = tf.placeholder(tf.float32, shape=(), name='param_noise_stddev') # Parameters. self.gamma = gamma self.tau = tau self.memory = memory self.normalize_observations = normalize_observations self.normalize_returns = normalize_returns self.action_noise = action_noise self.param_noise = param_noise self.action_range = action_range self.return_range = return_range self.observation_range = observation_range self.critic = critic self.actor = actor self.actor_lr = actor_lr self.critic_lr = critic_lr self.clip_norm = clip_norm self.enable_popart = enable_popart self.reward_scale = reward_scale self.batch_size = batch_size self.stats_sample = None self.critic_l2_reg = critic_l2_reg # Observation normalization. if self.normalize_observations: with tf.variable_scope('obs_rms'): self.obs_rms = RunningMeanStd(shape=observation_shape) else: self.obs_rms = None normalized_obs0 = tf.clip_by_value(normalize(self.obs0, self.obs_rms), self.observation_range[0], self.observation_range[1]) normalized_obs1 = tf.clip_by_value(normalize(self.obs1, self.obs_rms), self.observation_range[0], self.observation_range[1]) # Return normalization. if self.normalize_returns: with tf.variable_scope('ret_rms'): self.ret_rms = RunningMeanStd() else: self.ret_rms = None # Create target networks. target_actor = copy(actor) target_actor.name = 'target_actor' self.target_actor = target_actor target_critic = copy(critic) target_critic.name = 'target_critic' self.target_critic = target_critic # Create networks and core TF parts that are shared across setup parts. self.actor_tf = actor(normalized_obs0) self.normalized_critic_tf = critic(normalized_obs0, self.actions) self.critic_tf = denormalize( tf.clip_by_value(self.normalized_critic_tf, self.return_range[0], self.return_range[1]), self.ret_rms) self.normalized_critic_with_actor_tf = critic(normalized_obs0, self.actor_tf, reuse=True) self.critic_with_actor_tf = denormalize( tf.clip_by_value(self.normalized_critic_with_actor_tf, self.return_range[0], self.return_range[1]), self.ret_rms) Q_obs1 = denormalize( target_critic(normalized_obs1, target_actor(normalized_obs1)), self.ret_rms) self.target_Q = self.rewards + (1. - self.terminals1) * gamma * Q_obs1 # Set up parts. 
if self.param_noise is not None: self.setup_param_noise(normalized_obs0) self.setup_actor_optimizer() self.setup_critic_optimizer() if self.normalize_returns and self.enable_popart: self.setup_popart() self.setup_stats() self.setup_target_network_updates() self.saver = self.get_saver() def setup_target_network_updates(self): actor_init_updates, actor_soft_updates = get_target_updates( self.actor.vars, self.target_actor.vars, self.tau) critic_init_updates, critic_soft_updates = get_target_updates( self.critic.vars, self.target_critic.vars, self.tau) self.target_init_updates = [actor_init_updates, critic_init_updates] self.target_soft_updates = [actor_soft_updates, critic_soft_updates] def get_saver(self): exclude = [ 'Variable/ExponentialMovingAverage:0', 'Variable/Adam:0', 'Variable/Adam_1:0', 'Variable_8/ExponentialMovingAverage:0', 'Variable_8/Adam:0' 'Variable/Adam_1:0', ] nodes = tf.trainable_variables() mapping = { var.name.split(':')[0]: var for var in nodes if var.name not in exclude } return tf.train.Saver() def setup_param_noise(self, normalized_obs0): assert self.param_noise is not None # Configure perturbed actor. param_noise_actor = copy(self.actor) param_noise_actor.name = 'param_noise_actor' self.perturbed_actor_tf = param_noise_actor(normalized_obs0) logger.info('setting up param noise') self.perturb_policy_ops = get_perturbed_actor_updates( self.actor, param_noise_actor, self.param_noise_stddev) # Configure separate copy for stddev adoption. adaptive_param_noise_actor = copy(self.actor) adaptive_param_noise_actor.name = 'adaptive_param_noise_actor' adaptive_actor_tf = adaptive_param_noise_actor(normalized_obs0) self.perturb_adaptive_policy_ops = get_perturbed_actor_updates( self.actor, adaptive_param_noise_actor, self.param_noise_stddev) self.adaptive_policy_distance = tf.sqrt( tf.reduce_mean(tf.square(self.actor_tf - adaptive_actor_tf))) def setup_actor_optimizer(self): logger.info('setting up actor optimizer') self.actor_loss = -tf.reduce_mean(self.critic_with_actor_tf) actor_shapes = [ var.get_shape().as_list() for var in self.actor.trainable_vars ] actor_nb_params = sum( [reduce(lambda x, y: x * y, shape) for shape in actor_shapes]) logger.info(' actor shapes: {}'.format(actor_shapes)) logger.info(' actor params: {}'.format(actor_nb_params)) self.actor_grads = U.flatgrad(self.actor_loss, self.actor.trainable_vars, clip_norm=self.clip_norm) self.actor_optimizer = MpiAdam(var_list=self.actor.trainable_vars, beta1=0.9, beta2=0.999, epsilon=1e-08) def save_model(self, path, episode): filename = os.path.join(path, "model.ckpt") self.saver.save(self.sess, filename, episode) print("Saved model to ", filename) def restore_model(self, path): try: checkpoint = tf.train.latest_checkpoint(path) self.saver.restore(self.sess, checkpoint) print("Restored model from ", checkpoint) except Exception as e: print(e) def setup_critic_optimizer(self): logger.info('setting up critic optimizer') normalized_critic_target_tf = tf.clip_by_value( normalize(self.critic_target, self.ret_rms), self.return_range[0], self.return_range[1]) self.critic_loss = tf.reduce_mean( tf.square(self.normalized_critic_tf - normalized_critic_target_tf)) if self.critic_l2_reg > 0.: critic_reg_vars = [ var for var in self.critic.trainable_vars if 'kernel' in var.name and 'output' not in var.name ] for var in critic_reg_vars: logger.info(' regularizing: {}'.format(var.name)) logger.info(' applying l2 regularization with {}'.format( self.critic_l2_reg)) critic_reg = tc.layers.apply_regularization( 
tc.layers.l2_regularizer(self.critic_l2_reg), weights_list=critic_reg_vars) self.critic_loss += critic_reg critic_shapes = [ var.get_shape().as_list() for var in self.critic.trainable_vars ] critic_nb_params = sum( [reduce(lambda x, y: x * y, shape) for shape in critic_shapes]) logger.info(' critic shapes: {}'.format(critic_shapes)) logger.info(' critic params: {}'.format(critic_nb_params)) self.critic_grads = U.flatgrad(self.critic_loss, self.critic.trainable_vars, clip_norm=self.clip_norm) self.critic_optimizer = MpiAdam(var_list=self.critic.trainable_vars, beta1=0.9, beta2=0.999, epsilon=1e-08) def setup_popart(self): # See https://arxiv.org/pdf/1602.07714.pdf for details. self.old_std = tf.placeholder(tf.float32, shape=[1], name='old_std') new_std = self.ret_rms.std self.old_mean = tf.placeholder(tf.float32, shape=[1], name='old_mean') new_mean = self.ret_rms.mean self.renormalize_Q_outputs_op = [] for vs in [self.critic.output_vars, self.target_critic.output_vars]: assert len(vs) == 2 M, b = vs assert 'kernel' in M.name assert 'bias' in b.name assert M.get_shape()[-1] == 1 assert b.get_shape()[-1] == 1 self.renormalize_Q_outputs_op += [ M.assign(M * self.old_std / new_std) ] self.renormalize_Q_outputs_op += [ b.assign( (b * self.old_std + self.old_mean - new_mean) / new_std) ] def setup_stats(self): ops = [] names = [] if self.normalize_returns: ops += [self.ret_rms.mean, self.ret_rms.std] names += ['ret_rms_mean', 'ret_rms_std'] if self.normalize_observations: ops += [ tf.reduce_mean(self.obs_rms.mean), tf.reduce_mean(self.obs_rms.std) ] names += ['obs_rms_mean', 'obs_rms_std'] ops += [tf.reduce_mean(self.critic_tf)] names += ['reference_Q_mean'] ops += [reduce_std(self.critic_tf)] names += ['reference_Q_std'] ops += [tf.reduce_mean(self.critic_with_actor_tf)] names += ['reference_actor_Q_mean'] ops += [reduce_std(self.critic_with_actor_tf)] names += ['reference_actor_Q_std'] ops += [tf.reduce_mean(self.actor_tf)] names += ['reference_action_mean'] ops += [reduce_std(self.actor_tf)] names += ['reference_action_std'] if self.param_noise: ops += [tf.reduce_mean(self.perturbed_actor_tf)] names += ['reference_perturbed_action_mean'] ops += [reduce_std(self.perturbed_actor_tf)] names += ['reference_perturbed_action_std'] self.stats_ops = ops self.stats_names = names def q(self, obs): """Compute the q value for some observation""" return self.sess.run(self.critic_with_actor_tf, feed_dict={self.obs0: obs}) def pi(self, obs, apply_noise=True, compute_Q=True): if self.param_noise is not None and apply_noise: actor_tf = self.perturbed_actor_tf else: actor_tf = self.actor_tf feed_dict = {self.obs0: [obs]} if compute_Q: action, q = self.sess.run([actor_tf, self.critic_with_actor_tf], feed_dict=feed_dict) else: action = self.sess.run(actor_tf, feed_dict=feed_dict) q = None action = action.flatten() if self.action_noise is not None and apply_noise: noise = self.action_noise() assert noise.shape == action.shape action += noise action = np.clip(action, self.action_range[0], self.action_range[1]) return action, q def store_transition(self, obs0, action, reward, obs1, terminal1): reward *= self.reward_scale self.memory.append(obs0, action, reward, obs1, terminal1) if self.normalize_observations: self.obs_rms.update(np.array([obs0])) def train(self): # Get a batch. 
batch = self.memory.sample(batch_size=self.batch_size) if self.normalize_returns and self.enable_popart: old_mean, old_std, target_Q = self.sess.run( [self.ret_rms.mean, self.ret_rms.std, self.target_Q], feed_dict={ self.obs1: batch['obs1'], self.rewards: batch['rewards'], self.terminals1: batch['terminals1'].astype('float32'), }) self.ret_rms.update(target_Q.flatten()) self.sess.run(self.renormalize_Q_outputs_op, feed_dict={ self.old_std: np.array([old_std]), self.old_mean: np.array([old_mean]), }) # Run sanity check. Disabled by default since it slows down things considerably. # print('running sanity check') # target_Q_new, new_mean, new_std = self.sess.run([self.target_Q, self.ret_rms.mean, self.ret_rms.std], feed_dict={ # self.obs1: batch['obs1'], # self.rewards: batch['rewards'], # self.terminals1: batch['terminals1'].astype('float32'), # }) # print(target_Q_new, target_Q, new_mean, new_std) # assert (np.abs(target_Q - target_Q_new) < 1e-3).all() else: target_Q = self.sess.run(self.target_Q, feed_dict={ self.obs1: batch['obs1'], self.rewards: batch['rewards'], self.terminals1: batch['terminals1'].astype('float32'), }) # Get all gradients and perform a synced update. ops = [ self.actor_grads, self.actor_loss, self.critic_grads, self.critic_loss ] actor_grads, actor_loss, critic_grads, critic_loss = self.sess.run( ops, feed_dict={ self.obs0: batch['obs0'], self.actions: batch['actions'], self.critic_target: target_Q, }) self.actor_optimizer.update(actor_grads, stepsize=self.actor_lr) self.critic_optimizer.update(critic_grads, stepsize=self.critic_lr) return critic_loss, actor_loss def initialize(self, sess): self.sess = sess self.sess.run(tf.global_variables_initializer()) self.actor_optimizer.sync() self.critic_optimizer.sync() self.sess.run(self.target_init_updates) def update_target_net(self): self.sess.run(self.target_soft_updates) def get_stats(self): if self.stats_sample is None: # Get a sample and keep that fixed for all further computations. # This allows us to estimate the change in value for the same set of inputs. self.stats_sample = self.memory.sample(batch_size=self.batch_size) values = self.sess.run(self.stats_ops, feed_dict={ self.obs0: self.stats_sample['obs0'], self.actions: self.stats_sample['actions'], }) names = self.stats_names[:] assert len(names) == len(values) stats = dict(zip(names, values)) if self.param_noise is not None: stats = {**stats, **self.param_noise.get_stats()} return stats def adapt_param_noise(self): if self.param_noise is None: return 0. # Perturb a separate copy of the policy to adjust the scale for the next "real" perturbation. batch = self.memory.sample(batch_size=self.batch_size) self.sess.run(self.perturb_adaptive_policy_ops, feed_dict={ self.param_noise_stddev: self.param_noise.current_stddev, }) distance = self.sess.run(self.adaptive_policy_distance, feed_dict={ self.obs0: batch['obs0'], self.param_noise_stddev: self.param_noise.current_stddev, }) mean_distance = mpi_mean(distance) self.param_noise.adapt(mean_distance) return mean_distance def reset(self): # Reset internal state after an episode is complete. if self.action_noise is not None: self.action_noise.reset() if self.param_noise is not None: self.sess.run(self.perturb_policy_ops, feed_dict={ self.param_noise_stddev: self.param_noise.current_stddev, })
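# The DDPG variants above rely on get_target_updates to build target_init_updates and
# target_soft_updates. The soft update is the usual Polyak rule
# target <- (1 - tau) * target + tau * source; a minimal TF1-style sketch of what such a
# helper is assumed to do (illustration only, not the project's actual helper):
import tensorflow as tf

def soft_and_init_updates(source_vars, target_vars, tau):
    init_ops, soft_ops = [], []
    for src, tgt in zip(source_vars, target_vars):
        init_ops.append(tf.assign(tgt, src))                           # hard copy at initialization
        soft_ops.append(tf.assign(tgt, (1. - tau) * tgt + tau * src))  # slow tracking update
    return tf.group(*init_ops), tf.group(*soft_ops)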
class DDPG(object): def __init__(self, actor, critic, memory, observation_shape, action_shape, param_noise=None, action_noise=None, gamma=0.99, tau=0.001, normalize_returns=False, enable_popart=False, normalize_observations=True, batch_size=128, observation_range=(-5., 5.), action_range=(-1., 1.), return_range=(-np.inf, np.inf), adaptive_param_noise=True, adaptive_param_noise_policy_threshold=.1, critic_l2_reg=0., actor_lr=1e-4, critic_lr=1e-3, clip_norm=None, reward_scale=1.): self.obs0 = tf.placeholder(tf.float32, shape=(None, ) + observation_shape, name="obs0") self.obs1 = tf.placeholder(tf.float32, shape=(None, ) + observation_shape, name="obs1") self.terminals1 = tf.placeholder(tf.float32, shape=(None, 1), name="terminals1") self.rewards = tf.placeholder(tf.float32, shape=(None, 1), name="rewards") self.actions = tf.placeholder(tf.float32, shape=(None, ) + action_shape, name="actions") self.critic_target = tf.placeholder(tf.float32, shape=(None, 1), name="critic_target") self.param_noise_stddev = tf.placeholder(tf.float32, shape=(), name="param_noise_stddev") self.gamma = gamma self.tau = tau self.memory = memory self.normalize_observations = normalize_observations self.normalize_returns = normalize_returns self.action_noise = action_noise self.param_noise = param_noise self.action_range = action_range self.return_range = return_range self.observation_range = observation_range self.critic = critic self.actor = actor self.actor_lr = actor_lr self.critic_lr = critic_lr self.clip_norm = clip_norm self.enable_popart = enable_popart self.reward_scale = reward_scale self.batch_size = batch_size self.stats_sample = None self.critic_l2_reg = critic_l2_reg if self.normalize_observations: with tf.variable_scope("obs_rms"): self.obs_rms = RunningMeanStd(shape=observation_shape) else: self.obs_rms = None normalized_obs0 = tf.clip_by_value(normalize(self.obs0, self.obs_rms), self.observation_range[0], self.observation_range[1]) normalized_obs1 = tf.clip_by_value(normalize(self.obs1, self.obs_rms), self.observation_range[0], self.observation_range[1]) if self.normalize_returns: with tf.variable_scope("ret_rms"): self.ret_rms = RunningMeanStd() else: self.ret_rms = None target_actor = copy(actor) target_actor.name = "target_actor" self.target_actor = target_actor target_critic = copy(critic) target_critic.name = "target_critic" self.target_critic = target_critic self.actor_tf = actor(normalized_obs0) self.normalized_critic_tf = critic(normalized_obs0, self.actions) self.critic_tf = denormalize(tf.clip_by_value(self.normalized_critic_tf, self.return_range[0], self.return_range[1]), self.ret_rms) self.normalized_critic_with_actor_tf = critic(normalized_obs0, self.actor_tf, reuse=True) self.critic_with_actor_tf = denormalize(tf.clip_by_value(self.normalized_critic_with_actor_tf, self.return_range[0], self.return_range[1]), self.ret_rms) Q_obs1 = denormalize(target_critic(normalized_obs1, target_actor(normalized_obs1)), self.ret_rms) self.target_Q = self.rewards + (1. 
- self.terminals1) * gamma * Q_obs1 if self.param_noise is not None: self.setup_param_noise(normalized_obs0) self.setup_actor_optimizer() self.setup_critic_optimizer() if self.normalize_returns and self.enable_popart: self.setup_popart() self.setup_stats() self.setup_target_network_updates() def setup_stats(self): ops = [] names = [] if self.normalize_returns: ops += [self.ret_rms.mean, self.ret_rms.std] names += ["ret_rms_mean", "ret_rms_std"] if self.normalize_observations: ops += [tf.reduce_mean(self.obs_rms.mean), tf.reduce_mean(self.obs_rms.std)] names += ["obs_rms_mean", "obs_rms_std"] ops += [tf.reduce_mean(self.critic_tf)] names += ["reference_Q_mean"] ops += [reduce_std(self.critic_tf)] names += ["reference_Q_std"] ops += [tf.reduce_mean(self.critic_with_actor_tf)] names += ["reference_actor_Q_mean"] ops += [reduce_std(self.critic_with_actor_tf)] names += ["reference_actor_Q_std"] ops += [tf.reduce_mean(self.actor_tf)] names += ["reference_action_mean"] ops += [reduce_std(self.actor_tf)] names += ["reference_action_std"] if self.param_noise: ops += [tf.reduce_mean(self.pertubed_actor_tf)] names += ["reference_perturbed_action_mean"] ops += [reduce_std(self.pertubed_actor_tf)] names += ["reference_perturbed_action_std"] self.stats_ops = ops self.stats_names = names def setup_target_network_updates(self): actor_init_updates, actor_soft_updates = get_target_updates(self.actor.vars, self.target_actor.vars, self.tau) critic_init_updates, critic_soft_updates = get_target_updates(self.critic.vars, self.target_critic.vars, self.tau) self.target_init_updates = [actor_init_updates, critic_init_updates] self.target_soft_updates = [actor_soft_updates, critic_soft_updates] def setup_popart(self): self.old_std = tf.placeholder(tf.float32, shape=[1], name="old_std") new_std = self.ret_rms.std self.old_mean = tf.placeholder(tf.float32, shape=[1], name="old_mean") new_mean = self.ret_rms.mean self.renormalize_Q_outputs_op = [] for vs in [self.critic.output_vars, self.target_critic.output_vars]: assert len(vs) == 2 M, b = vs assert "kernel" in M.name assert "bias" in b.name assert M.get_shape()[-1] == 1 assert b.get_shape()[-1] == 1 self.renormalize_Q_outputs_op += [M.assign(M * self.old_std / new_std)] self.renormalize_Q_outputs_op += [b.assign((b * self.old_std + self.old_mean - new_mean)/ new_std)] def setup_critic_optimizer(self): logger.info("setting up critic optimizer") normalized_critic_target_tf = tf.clip_by_value(normalize(self.critic_target, self.ret_rms), self.return_range[0], self.return_range[1]) self.critic_loss = tf.reduce_mean(tf.square(self.normalized_critic_tf - normalized_critic_target_tf)) if self.critic_l2_reg > 0: critic_reg_vars = [var for var in self.critic.trainable_vars if "kernel" in var.name and "output" not in var.name] for var in critic_reg_vars: logger.info(" regularizing: {}".format(var.name)) logger.info(" applying l2 regularization with {}".format(self.critic_l2_reg)) critic_reg = tc.layers.apply_regularization( tc.layers.l2_regularizer(self.critic_l2_reg), weights_list=critic_reg_vars ) self.critic_loss += critic_reg critic_shapes = [var.get_shape().as_list() for var in self.critic.trainable_vars] critic_nb_params = sum([reduce(lambda x, y: x *y, shape) for shape in critic_shapes]) logger.info(" critic shapes: {}".format(critic_shapes)) logger.info(" critic params: {}".format(critic_nb_params)) self.critic_grads = U.flatgrad(self.critic_loss, self.critic.trainable_vars, clip_norm=self.clip_norm) self.critic_optimizer = MpiAdam(var_list=self.critic.trainable_vars, 
beta1=0.9, beta2=0.999, epsilon=1e-08) def setup_actor_optimizer(self): logger.info("setting up actor optimizer") self.actor_loss = -tf.reduce_mean(self.critic_with_actor_tf) actor_shapes = [var.get_shape().as_list() for var in self.actor.trainable_vars] actor_nb_params = sum([reduce(lambda x, y: x * y, shape) for shape in actor_shapes]) logger.info(" actor shapes: {}".format(actor_shapes)) logger.info(" actor params: {}".format(actor_nb_params)) self.actor_grads = U.flatgrad(self.actor_loss, self.actor.trainable_vars, clip_norm=self.clip_norm) self.actor_optimizer = MpiAdam(var_list=self.actor.trainable_vars, beta1=0.9, beta2=0.999, epsilon=1e-08) def setup_param_noise(self, normalized_obs0): assert self.param_noise is not None param_noise_actor = copy(self.actor) param_noise_actor.name = "param_noise_actor" self.pertubed_actor_tf = param_noise_actor(normalized_obs0) logger.info("setting up param noise") self.pertub_policy_ops = get_perturbed_actor_updates(self.actor, param_noise_actor, self.param_noise_stddev) adaptive_param_noise_actor = copy(self.actor) adaptive_param_noise_actor.name = "adaptive_param_noise_actor" adaptive_actor_tf = adaptive_param_noise_actor(normalized_obs0) self.perturb_adaptive_policy_ops = get_perturbed_actor_updates(self.actor, adaptive_param_noise_actor, self.param_noise_stddev) self.adaptive_policy_distance = tf.sqrt(tf.reduce_mean(tf.square(self.actor_tf - adaptive_actor_tf))) def initialize(self, sess): self.sess = sess self.sess.run(tf.global_variables_initializer()) self.actor_optimizer.sync() self.critic_optimizer.sync() self.sess.run(self.target_init_updates) def reset(self): if self.action_noise is not None: self.action_noise.reset() if self.param_noise is not None: self.sess.run(self.pertub_policy_ops, feed_dict={ self.param_noise_stddev: self.param_noise.current_stddev }) def pi(self, obs, apply_noise=True, compute_Q=True): if self.param_noise is not None and apply_noise: actor_tf = self.pertubed_actor_tf else: actor_tf = self.actor_tf feed_dict = {self.obs0: [obs]} if compute_Q: action, q = self.sess.run([actor_tf, self.critic_with_actor_tf], feed_dict=feed_dict) else: action = self.sess.run(actor_tf, feed_dict=feed_dict) q = None action = action.flatten() if self.action_noise is not None and apply_noise: noise = self.action_noise() assert noise.shape == action.shape action += noise action = np.clip(action, self.action_range[0], self.action_range[1]) return action, q def store_transition(self, obs0, action, reward, obs1, terminal1): reward *= self.reward_scale self.memory.append(obs0, action, reward, obs1, terminal1) if self.normalize_observations: self.obs_rms.update(np.array([obs0])) def adapt_param_noise(self): if self.param_noise is None: return 0.
batch = self.memory.sample(batch_size=self.batch_size) self.sess.run(self.perturb_adaptive_policy_ops, feed_dict={self.param_noise_stddev: self.param_noise.current_stddev}) distance = self.sess.run(self.adaptive_policy_distance, feed_dict={ self.obs0 : batch["obs0"], self.param_noise_stddev: self.param_noise.current_stddev }) mean_distance = MPI.COMM_WORLD.allreduce(distance, op=MPI.SUM) / MPI.COMM_WORLD.Get_size() self.param_noise.adapt(mean_distance) return mean_distance def train(self): batch = self.memory.sample(batch_size=self.batch_size) if self.normalize_returns and self.enable_popart: old_mean, old_std, target_Q = self.sess.run([self.ret_rms.mean, self.ret_rms.std, self.target_Q], feed_dict = { self.obs1: batch["obs1"], self.rewards: batch["rewards"], self.terminals1: batch["terminals1"].astype("float32") }) self.ret_rms.update(target_Q.flatten()) self.sess.run(self.renormalize_Q_outputs_op, feed_dict={ self.old_std: np.array([old_std]), self.old_mean: np.array([old_mean]), }) else: target_Q = self.sess.run(self.target_Q, feed_dict={ self.obs1: batch["obs1"], self.rewards: batch["rewards"], self.terminals1: batch["terminals1"].astype("float32") }) ops = [self.actor_grads, self.actor_loss, self.critic_grads, self.critic_loss] actor_grads, actor_loss, critic_grads, critic_loss = self.sess.run(ops, feed_dict={ self.obs0: batch["obs0"], self.actions: batch["actions"], self.critic_target: target_Q }) self.actor_optimizer.update(actor_grads, stepsize=self.actor_lr) self.critic_optimizer.update(critic_grads, stepsize=self.critic_lr) return critic_loss, actor_loss def update_target_net(self): self.sess.run(self.target_soft_updates) def get_stats(self): if self.stats_sample is None: self.stats_sample = self.memory.sample(batch_size=self.batch_size) values = self.sess.run(self.stats_ops, feed_dict={ self.obs0: self.stats_sample["obs0"], self.actions: self.stats_sample["actions"], }) names = self.stats_names[:] assert len(names) == len(values) stats = dict(zip(names, values)) if self.param_noise is not None: stats = {**stats, **self.param_noise.get_stats()} return stats
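# train() above bootstraps the critic with the one-step target
# target_Q = r + (1 - done) * gamma * Q_target(s', mu_target(s')).
# A tiny NumPy sketch with made-up numbers (illustration only):
import numpy as np

gamma = 0.99
rewards    = np.array([[1.0], [0.5]])
terminals1 = np.array([[0.0], [1.0]])   # 1.0 marks the end of an episode
q_obs1     = np.array([[10.0], [8.0]])  # target critic's value for the next state-action
target_q = rewards + (1. - terminals1) * gamma * q_obs1
# -> [[10.9], [0.5]]: terminal transitions contribute only their immediate reward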
class DDPG(object): def __init__(self, **params): for k in params: setattr(self, k, params[k]) self.init_args = copy(params) if self.her: # self.obs_to_goal = None # self.goal_idx = None # self.reward_fn = None self.memory = HERBuffer(limit=int(self.buffer_size), action_shape=self.action_shape, observation_shape=self.observation_shape, obs_to_goal=self.obs_to_goal, goal_slice=self.goal_idx, reward_fn=self.reward_fn) else: self.memory = Memory(limit=int(self.buffer_size), action_shape=self.action_shape, observation_shape=self.observation_shape) self.critic = Critic(layer_norm=self.layer_norm) self.actor = Actor(self.action_shape[-1], layer_norm=self.layer_norm) self.action_noise = NormalActionNoise(mu=np.zeros(self.action_shape), sigma=float(self.noise_sigma) * np.ones(self.action_shape)) self.param_noise = None # Inputs. self.obs0 = tf.placeholder(tf.float32, shape=(None, ) + self.observation_shape, name='obs0') self.obs1 = tf.placeholder(tf.float32, shape=(None, ) + self.observation_shape, name='obs1') # self.terminals1 = tf.placeholder(tf.float32, shape=(None, 1), name='terminals1') self.rewards = tf.placeholder(tf.float32, shape=(None, 1), name='rewards') self.actions = tf.placeholder(tf.float32, shape=(None, ) + self.action_shape, name='actions') self.critic_target = tf.placeholder(tf.float32, shape=(None, 1), name='critic_target') self.param_noise_stddev = tf.placeholder(tf.float32, shape=(), name='param_noise_stddev') # Observation normalization. if self.normalize_observations: with tf.variable_scope('obs_rms'): self.obs_rms = RunningMeanStd(shape=self.observation_shape) else: self.obs_rms = None normalized_obs0 = tf.clip_by_value(normalize(self.obs0, self.obs_rms), self.observation_range[0], self.observation_range[1]) normalized_obs1 = tf.clip_by_value(normalize(self.obs1, self.obs_rms), self.observation_range[0], self.observation_range[1]) # Return normalization. if self.normalize_returns: with tf.variable_scope('ret_rms'): self.ret_rms = RunningMeanStd() else: self.ret_rms = None # Create target networks. target_actor = copy(self.actor) target_actor.name = 'target_actor' self.target_actor = target_actor target_critic = copy(self.critic) target_critic.name = 'target_critic' self.target_critic = target_critic # Create networks and core TF parts that are shared across setup parts. self.actor_tf = self.actor(normalized_obs0) self.normalized_critic_tf = self.critic(normalized_obs0, self.actions) self.critic_tf = denormalize( tf.clip_by_value(self.normalized_critic_tf, self.return_range[0], self.return_range[1]), self.ret_rms) self.normalized_critic_with_actor_tf = self.critic(normalized_obs0, self.actor_tf, reuse=True) self.critic_with_actor_tf = denormalize( tf.clip_by_value(self.normalized_critic_with_actor_tf, self.return_range[0], self.return_range[1]), self.ret_rms) Q_obs1 = denormalize( target_critic(normalized_obs1, target_actor(normalized_obs1)), self.ret_rms) # self.target_Q = self.rewards + (1. - self.terminals1) * self.gamma * Q_obs1 self.target_Q = self.rewards + self.gamma * Q_obs1 # Set up parts. 
if self.param_noise is not None: self.setup_param_noise(normalized_obs0) self.setup_actor_optimizer() self.setup_critic_optimizer() if self.normalize_returns and self.enable_popart: self.setup_popart() self.setup_stats() self.setup_target_network_updates() def setup_target_network_updates(self): actor_init_updates, actor_soft_updates = get_target_updates( self.actor.vars, self.target_actor.vars, self.tau) critic_init_updates, critic_soft_updates = get_target_updates( self.critic.vars, self.target_critic.vars, self.tau) self.target_init_updates = [actor_init_updates, critic_init_updates] self.target_soft_updates = [actor_soft_updates, critic_soft_updates] def setup_actor_optimizer(self): logger.info('setting up actor optimizer') self.actor_loss = -tf.reduce_mean(self.critic_with_actor_tf) actor_shapes = [ var.get_shape().as_list() for var in self.actor.trainable_vars ] actor_nb_params = sum( [reduce(lambda x, y: x * y, shape) for shape in actor_shapes]) logger.info(' actor shapes: {}'.format(actor_shapes)) logger.info(' actor params: {}'.format(actor_nb_params)) self.actor_grads = U.flatgrad(self.actor_loss, self.actor.trainable_vars, clip_norm=self.clip_norm) self.actor_optimizer = MpiAdam(var_list=self.actor.trainable_vars, beta1=0.9, beta2=0.999, epsilon=1e-08) def setup_critic_optimizer(self): logger.info('setting up critic optimizer') normalized_critic_target_tf = tf.clip_by_value( normalize(self.critic_target, self.ret_rms), self.return_range[0], self.return_range[1]) self.critic_loss = tf.reduce_mean( tf.square(self.normalized_critic_tf - normalized_critic_target_tf)) if self.critic_l2_reg > 0.: critic_reg_vars = [ var for var in self.critic.trainable_vars if 'kernel' in var.name and 'output' not in var.name ] for var in critic_reg_vars: logger.info(' regularizing: {}'.format(var.name)) logger.info(' applying l2 regularization with {}'.format( self.critic_l2_reg)) critic_reg = tc.layers.apply_regularization( tc.layers.l2_regularizer(self.critic_l2_reg), weights_list=critic_reg_vars) self.critic_loss += critic_reg critic_shapes = [ var.get_shape().as_list() for var in self.critic.trainable_vars ] critic_nb_params = sum( [reduce(lambda x, y: x * y, shape) for shape in critic_shapes]) logger.info(' critic shapes: {}'.format(critic_shapes)) logger.info(' critic params: {}'.format(critic_nb_params)) self.critic_grads = U.flatgrad(self.critic_loss, self.critic.trainable_vars, clip_norm=self.clip_norm) self.critic_optimizer = MpiAdam(var_list=self.critic.trainable_vars, beta1=0.9, beta2=0.999, epsilon=1e-08) def setup_popart(self): # See https://arxiv.org/pdf/1602.07714.pdf for details. 
self.old_std = tf.placeholder(tf.float32, shape=[1], name='old_std') new_std = self.ret_rms.std self.old_mean = tf.placeholder(tf.float32, shape=[1], name='old_mean') new_mean = self.ret_rms.mean self.renormalize_Q_outputs_op = [] for vs in [self.critic.output_vars, self.target_critic.output_vars]: assert len(vs) == 2 M, b = vs assert 'kernel' in M.name assert 'bias' in b.name assert M.get_shape()[-1] == 1 assert b.get_shape()[-1] == 1 self.renormalize_Q_outputs_op += [ M.assign(M * self.old_std / new_std) ] self.renormalize_Q_outputs_op += [ b.assign( (b * self.old_std + self.old_mean - new_mean) / new_std) ] def setup_stats(self): ops = [] names = [] if self.normalize_returns: ops += [self.ret_rms.mean, self.ret_rms.std] names += ['ret_rms_mean', 'ret_rms_std'] if self.normalize_observations: ops += [ tf.reduce_mean(self.obs_rms.mean), tf.reduce_mean(self.obs_rms.std) ] names += ['obs_rms_mean', 'obs_rms_std'] ops += [tf.reduce_mean(self.critic_tf)] names += ['reference_Q_mean'] ops += [reduce_std(self.critic_tf)] names += ['reference_Q_std'] ops += [tf.reduce_mean(self.critic_with_actor_tf)] names += ['reference_actor_Q_mean'] ops += [reduce_std(self.critic_with_actor_tf)] names += ['reference_actor_Q_std'] ops += [tf.reduce_mean(self.actor_tf)] names += ['reference_action_mean'] ops += [reduce_std(self.actor_tf)] names += ['reference_action_std'] if self.param_noise: ops += [tf.reduce_mean(self.perturbed_actor_tf)] names += ['reference_perturbed_action_mean'] ops += [reduce_std(self.perturbed_actor_tf)] names += ['reference_perturbed_action_std'] self.stats_ops = ops self.stats_names = names def pi(self, obs, apply_noise=True, compute_Q=True): if self.param_noise is not None and apply_noise: actor_tf = self.perturbed_actor_tf else: actor_tf = self.actor_tf feed_dict = {self.obs0: [obs]} if compute_Q: action, q = self.sess.run([actor_tf, self.critic_with_actor_tf], feed_dict=feed_dict) else: action = self.sess.run(actor_tf, feed_dict=feed_dict) q = None action = action.flatten() if self.action_noise is not None and apply_noise: noise = self.action_noise() assert noise.shape == action.shape action += noise action = np.clip(action, self.action_range[0], self.action_range[1]) return action, q def store_transition(self, obs0, action, reward, obs1, terminal1): reward *= self.reward_scale self.memory.append(obs0, action, reward, obs1, terminal1) if self.normalize_observations: self.obs_rms.update(np.array([obs0])) def train(self): # Get a batch. batch = self.memory.sample(batch_size=self.batch_size) if self.normalize_returns and self.enable_popart: old_mean, old_std, target_Q = self.sess.run( [self.ret_rms.mean, self.ret_rms.std, self.target_Q], feed_dict={ self.obs1: batch['obs1'], self.rewards: batch['rewards'], # self.terminals1: batch['terminals1'].astype('float32'), }) self.ret_rms.update(target_Q.flatten()) self.sess.run(self.renormalize_Q_outputs_op, feed_dict={ self.old_std: np.array([old_std]), self.old_mean: np.array([old_mean]), }) # Run sanity check. Disabled by default since it slows down things considerably. 
# print('running sanity check') # target_Q_new, new_mean, new_std = self.sess.run([self.target_Q, self.ret_rms.mean, self.ret_rms.std], feed_dict={ # self.obs1: batch['obs1'], # self.rewards: batch['rewards'], # self.terminals1: batch['terminals1'].astype('float32'), # }) # print(target_Q_new, target_Q, new_mean, new_std) # assert (np.abs(target_Q - target_Q_new) < 1e-3).all() else: target_Q = self.sess.run( self.target_Q, feed_dict={ self.obs1: batch['obs1'], self.rewards: batch['rewards'], # self.terminals1: batch['terminals1'].astype('float32'), }) # Get all gradients and perform a synced update. ops = [ self.actor_grads, self.actor_loss, self.critic_grads, self.critic_loss ] actor_grads, actor_loss, critic_grads, critic_loss = self.sess.run( ops, feed_dict={ self.obs0: batch['obs0'], self.actions: batch['actions'], self.critic_target: target_Q, }) self.actor_optimizer.update(actor_grads, stepsize=self.actor_lr) self.critic_optimizer.update(critic_grads, stepsize=self.critic_lr) return critic_loss, actor_loss def initialize(self, sess): self.sess = sess self.sess.run(tf.global_variables_initializer()) self.actor_optimizer.sync() self.critic_optimizer.sync() self.sess.run(self.target_init_updates) def update_target_net(self): self.sess.run(self.target_soft_updates) def get_stats(self): if self.stats_sample is None: # Get a sample and keep that fixed for all further computations. # This allows us to estimate the change in value for the same set of inputs. self.stats_sample = self.memory.sample(batch_size=self.batch_size) values = self.sess.run(self.stats_ops, feed_dict={ self.obs0: self.stats_sample['obs0'], self.actions: self.stats_sample['actions'], }) names = self.stats_names[:] assert len(names) == len(values) stats = dict(zip(names, values)) if self.param_noise is not None: stats = {**stats, **self.param_noise.get_stats()} return stats def adapt_param_noise(self): if self.param_noise is None: return 0. # Perturb a separate copy of the policy to adjust the scale for the next "real" perturbation. batch = self.memory.sample(batch_size=self.batch_size) self.sess.run(self.perturb_adaptive_policy_ops, feed_dict={ self.param_noise_stddev: self.param_noise.current_stddev, }) distance = self.sess.run(self.adaptive_policy_distance, feed_dict={ self.obs0: batch['obs0'], self.param_noise_stddev: self.param_noise.current_stddev, }) mean_distance = mpi_mean(distance) self.param_noise.adapt(mean_distance) return mean_distance def reset(self): # Reset internal state after an episode is complete. 
if self.action_noise is not None: self.action_noise.reset() if self.param_noise is not None: self.sess.run(self.perturb_policy_ops, feed_dict={ self.param_noise_stddev: self.param_noise.current_stddev, }) self.flush() def flush(self): if self.her: self.memory.flush() def get_save_tf(self): all_variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES) return self.sess.run(all_variables) def restore_tf(self, save): all_variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES) restore_ops = [] for x, y in zip(all_variables, save): restore_ops.append(tf.assign(x, y)) self.sess.run(restore_ops) def __getstate__(self): exclude_vars = set(["env"]) args = {} for k in self.init_args: if k not in exclude_vars: args[k] = self.init_args[k] return {'tf': self.get_save_tf(), 'init': args} def __setstate__(self, state): self.__init__(**state['init']) self.sess = tf.InteractiveSession( ) # for now just make ourself a session self.sess.run(tf.global_variables_initializer()) self.restore_tf(state['tf']) self.actor_optimizer.sync() self.critic_optimizer.sync()
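# adapt_param_noise in the classes above measures how far the perturbed policy drifts in
# action space and hands that distance to self.param_noise.adapt(...). A minimal sketch of
# the usual adaptation rule; the class name and attributes here are assumptions for
# illustration, not the project's actual noise object:
class SimpleAdaptiveParamNoise(object):
    def __init__(self, initial_stddev=0.1, desired_action_stddev=0.1,
                 adaptation_coefficient=1.01):
        self.current_stddev = initial_stddev
        self.desired_action_stddev = desired_action_stddev
        self.adaptation_coefficient = adaptation_coefficient

    def adapt(self, distance):
        # Shrink the perturbation when it pushes actions too far, grow it otherwise.
        if distance > self.desired_action_stddev:
            self.current_stddev /= self.adaptation_coefficient
        else:
            self.current_stddev *= self.adaptation_coefficient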
class DDPG(object): def __init__(self, actor, critic, memory, observation_shape, action_shape, dropout_on_v, dropout_tau_V, override_reg, param_noise=None, action_noise=None, gamma=0.99, tau=0.001, normalize_returns=False, enable_popart=False, normalize_observations=True, batch_size=64, observation_range=(-5., 5.), action_range=(-1., 1.), return_range=(-np.inf, np.inf), adaptive_param_noise=True, adaptive_param_noise_policy_threshold=.1, critic_l2_reg=0., actor_lr=1e-4, critic_lr=1e-3, clip_norm=None, reward_scale=1.): # Inputs. self.obs0 = tf.placeholder(tf.float32, shape=(None, ) + observation_shape, name='obs0') self.obs1 = tf.placeholder(tf.float32, shape=(None, ) + observation_shape, name='obs1') self.terminals1 = tf.placeholder(tf.float32, shape=(None, 1), name='terminals1') self.rewards = tf.placeholder(tf.float32, shape=(None, 1), name='rewards') self.actions = tf.placeholder(tf.float32, shape=(None, ) + action_shape, name='actions') self.critic_target = tf.placeholder(tf.float32, shape=(None, 1), name='critic_target') self.param_noise_stddev = tf.placeholder(tf.float32, shape=(), name='param_noise_stddev') # Parameters. self.gamma = gamma self.tau = tau self.memory = memory self.normalize_observations = normalize_observations self.normalize_returns = normalize_returns self.action_noise = action_noise self.param_noise = param_noise self.action_range = action_range self.return_range = return_range self.observation_range = observation_range self.critic = critic self.actor = actor self.actor_lr = actor_lr self.critic_lr = critic_lr self.clip_norm = clip_norm self.enable_popart = enable_popart self.reward_scale = reward_scale self.batch_size = batch_size self.stats_sample = None self.critic_l2_reg = critic_l2_reg ### MAIN CHANGES self.override_reg = override_reg self.dropout_on_v = dropout_on_v self.dropout_tau_V = dropout_tau_V self.observation_shape = observation_shape self.b = tf.placeholder(tf.float32) ### END # Observation normalization. if self.normalize_observations: with tf.variable_scope('obs_rms'): self.obs_rms = RunningMeanStd(shape=observation_shape) else: self.obs_rms = None normalized_obs0 = tf.clip_by_value(normalize(self.obs0, self.obs_rms), self.observation_range[0], self.observation_range[1]) normalized_obs1 = tf.clip_by_value(normalize(self.obs1, self.obs_rms), self.observation_range[0], self.observation_range[1]) # Return normalization. if self.normalize_returns: with tf.variable_scope('ret_rms'): self.ret_rms = RunningMeanStd() else: self.ret_rms = None # Create target networks. target_actor = copy(actor) target_actor.name = 'target_actor' self.target_actor = target_actor target_critic = copy(critic) target_critic.name = 'target_critic' self.target_critic = target_critic # Create networks and core TF parts that are shared across setup parts. 
self.actor_tf = actor(normalized_obs0) ### MAIN CHANGES ## Q(s,a) if self.dropout_on_v is not None: self.normalized_critic_tf, _, self.normalized_critic_tf_mc = critic( normalized_obs0, self.actions) else: self.normalized_critic_tf = critic(normalized_obs0, self.actions) self.critic_tf = denormalize( tf.clip_by_value(self.normalized_critic_tf, self.return_range[0], self.return_range[1]), self.ret_rms) ## Q(s, mu(s)) if self.dropout_on_v is not None: self.normalized_critic_with_actor_tf, self.normalized_critic_with_actor_tf_avg, _ = critic( normalized_obs0, self.actor_tf, reuse=True) self.critic_with_actor_tf = denormalize( tf.clip_by_value(self.normalized_critic_with_actor_tf_avg, self.return_range[0], self.return_range[1]), self.ret_rms) else: self.normalized_critic_with_actor_tf = critic(normalized_obs0, self.actor_tf, reuse=True) self.critic_with_actor_tf = denormalize( tf.clip_by_value(self.normalized_critic_with_actor_tf, self.return_range[0], self.return_range[1]), self.ret_rms) self.Q = denormalize( tf.clip_by_value(self.normalized_critic_with_actor_tf, self.return_range[0], self.return_range[1]), self.ret_rms) if dropout_on_v is not None: # dropout critic returns a tuple; unpack it before denormalizing normalized_Q_obs1, _, _ = target_critic( normalized_obs1, target_actor(normalized_obs1)) Q_obs1 = denormalize(normalized_Q_obs1, self.ret_rms) else: Q_obs1 = denormalize( target_critic(normalized_obs1, target_actor(normalized_obs1)), self.ret_rms) self.target_Q = self.rewards + (1. - self.terminals1) * gamma * Q_obs1 ### END OF CHANGES # Set up parts. if self.param_noise is not None: self.setup_param_noise(normalized_obs0) self.setup_actor_optimizer() self.setup_critic_optimizer() if self.normalize_returns and self.enable_popart: self.setup_popart() self.setup_stats() self.setup_target_network_updates() if replace_masks: update_dropout_masks([ x for x in self.critic.vars if 'dropout' in x.name and 'mask' in x.name ], self.critic.keep_prob, execute=False) update_dropout_masks([ x for x in self.target_critic.vars if 'dropout' in x.name and 'mask' in x.name ], self.critic.keep_prob, execute=False) def setup_target_network_updates(self): actor_init_updates, actor_soft_updates = get_target_updates( self.actor.vars, self.target_actor.vars, self.tau) critic_init_updates, critic_soft_updates = get_target_updates( self.critic.trainable_vars, self.target_critic.trainable_vars, self.tau) self.target_init_updates = [actor_init_updates, critic_init_updates] self.target_soft_updates = [actor_soft_updates, critic_soft_updates] def setup_param_noise(self, normalized_obs0): assert self.param_noise is not None # Configure perturbed actor. param_noise_actor = copy(self.actor) param_noise_actor.name = 'param_noise_actor' self.perturbed_actor_tf = param_noise_actor(normalized_obs0) logger.info('setting up param noise') self.perturb_policy_ops = get_perturbed_actor_updates( self.actor, param_noise_actor, self.param_noise_stddev) # Configure separate copy for stddev adoption.
adaptive_param_noise_actor = copy(self.actor) adaptive_param_noise_actor.name = 'adaptive_param_noise_actor' adaptive_actor_tf = adaptive_param_noise_actor(normalized_obs0) self.perturb_adaptive_policy_ops = get_perturbed_actor_updates( self.actor, adaptive_param_noise_actor, self.param_noise_stddev) self.adaptive_policy_distance = tf.sqrt( tf.reduce_mean(tf.square(self.actor_tf - adaptive_actor_tf))) def setup_actor_optimizer(self): logger.info('setting up actor optimizer') self.actor_loss = -tf.reduce_mean(self.critic_with_actor_tf) actor_shapes = [ var.get_shape().as_list() for var in self.actor.trainable_vars ] actor_nb_params = sum( [reduce(lambda x, y: x * y, shape) for shape in actor_shapes]) logger.info(' actor shapes: {}'.format(actor_shapes)) logger.info(' actor params: {}'.format(actor_nb_params)) self.actor_grads = U.flatgrad(self.actor_loss, self.actor.trainable_vars, clip_norm=self.clip_norm) self.actor_optimizer = MpiAdam(var_list=self.actor.trainable_vars, beta1=0.9, beta2=0.999, epsilon=1e-08) def setup_critic_optimizer(self): logger.info('setting up critic optimizer') normalized_critic_target_tf = tf.clip_by_value( normalize(self.critic_target, self.ret_rms), self.return_range[0], self.return_range[1]) ### MAIN CHANGES ### eq 10 of the alpha black box dropout if self.dropout_on_v is not None: self.alpha = 0.5 x = normalized_critic_target_tf self.flat = self.normalized_critic_tf_mc flat_stacked = tf.stack(self.flat) # K x M x outsize # M x B X outsize sumsq = U.sum(tf.square(x - flat_stacked), -1) sumsq *= (-.5 * self.alpha * self.dropout_tau_V) self.critic_loss = (-1.0 * self.alpha**-1.0) * logsumexp(sumsq, 0) self.l2_value = self.critic.keep_prob * float( self.batch_size) / (float(self.memory.nb_entries) + 1) self.critic_l2_reg = tf.Variable(self.l2_value, trainable=False) else: self.critic_loss = tf.reduce_mean( tf.square(self.normalized_critic_tf - normalized_critic_target_tf)) if self.override_reg is not None: self.critic_l2_reg = self.override_reg ### END OF CHANGES if self.override_reg is not None: critic_reg_vars = [ var for var in self.critic.trainable_vars if 'kernel' in var.name and 'output' not in var.name ] for var in critic_reg_vars: logger.info(' regularizing: {}'.format(var.name)) logger.info(' applying l2 regularization with {}'.format( self.critic_l2_reg)) critic_reg = tc.layers.apply_regularization( tc.layers.l2_regularizer(self.critic_l2_reg), weights_list=critic_reg_vars) self.critic_loss += critic_reg critic_shapes = [ var.get_shape().as_list() for var in self.critic.trainable_vars ] critic_nb_params = sum( [reduce(lambda x, y: x * y, shape) for shape in critic_shapes]) logger.info(' critic shapes: {}'.format(critic_shapes)) logger.info(' critic params: {}'.format(critic_nb_params)) self.critic_grads = U.flatgrad(self.critic_loss, self.critic.trainable_vars, clip_norm=self.clip_norm) self.critic_optimizer = MpiAdam(var_list=self.critic.trainable_vars, beta1=0.9, beta2=0.999, epsilon=1e-08) def setup_popart(self): # See https://arxiv.org/pdf/1602.07714.pdf for details. 
self.old_std = tf.placeholder(tf.float32, shape=[1], name='old_std') new_std = self.ret_rms.std self.old_mean = tf.placeholder(tf.float32, shape=[1], name='old_mean') new_mean = self.ret_rms.mean self.renormalize_Q_outputs_op = [] for vs in [self.critic.output_vars, self.target_critic.output_vars]: assert len(vs) == 2 M, b = vs assert 'kernel' in M.name assert 'bias' in b.name assert M.get_shape()[-1] == 1 assert b.get_shape()[-1] == 1 self.renormalize_Q_outputs_op += [ M.assign(M * self.old_std / new_std) ] self.renormalize_Q_outputs_op += [ b.assign( (b * self.old_std + self.old_mean - new_mean) / new_std) ] def setup_stats(self): ops = [] names = [] if self.normalize_returns: ops += [self.ret_rms.mean, self.ret_rms.std] names += ['ret_rms_mean', 'ret_rms_std'] if self.normalize_observations: ops += [ tf.reduce_mean(self.obs_rms.mean), tf.reduce_mean(self.obs_rms.std) ] names += ['obs_rms_mean', 'obs_rms_std'] ops += [tf.reduce_mean(self.critic_tf)] names += ['reference_Q_mean'] ops += [reduce_std(self.critic_tf)] names += ['reference_Q_std'] ops += [tf.reduce_mean(self.critic_with_actor_tf)] names += ['reference_actor_Q_mean'] ops += [reduce_std(self.critic_with_actor_tf)] names += ['reference_actor_Q_std'] ops += [tf.reduce_mean(self.actor_tf)] names += ['reference_action_mean'] ops += [reduce_std(self.actor_tf)] names += ['reference_action_std'] if self.param_noise: ops += [tf.reduce_mean(self.perturbed_actor_tf)] names += ['reference_perturbed_action_mean'] ops += [reduce_std(self.perturbed_actor_tf)] names += ['reference_perturbed_action_std'] self.stats_ops = ops self.stats_names = names def pi(self, obs, apply_noise=True, compute_Q=True): if self.param_noise is not None and apply_noise: actor_tf = self.perturbed_actor_tf else: actor_tf = self.actor_tf feed_dict = {self.obs0: [obs]} if compute_Q: ### MAIN CHANGES action, q = self.sess.run([actor_tf, self.Q], feed_dict=feed_dict) # action, q = self.sess.run([actor_tf, self.critic_with_actor_tf], feed_dict=feed_dict) ### END OF CHANGES else: action = self.sess.run(actor_tf, feed_dict=feed_dict) q = None action = action.flatten() if self.action_noise is not None and apply_noise: noise = self.action_noise() assert noise.shape == action.shape action += noise action = np.clip(action, self.action_range[0], self.action_range[1]) return action, q def store_transition(self, obs0, action, reward, obs1, terminal1): reward *= self.reward_scale self.memory.append(obs0, action, reward, obs1, terminal1) if self.normalize_observations: self.obs_rms.update(np.array([obs0])) def train(self): # Get a batch. batch = self.memory.sample(batch_size=self.batch_size) if self.normalize_returns and self.enable_popart: old_mean, old_std, target_Q = self.sess.run( [self.ret_rms.mean, self.ret_rms.std, self.target_Q], feed_dict={ self.obs1: batch['obs1'], self.rewards: batch['rewards'], self.terminals1: batch['terminals1'].astype('float32'), }) self.ret_rms.update(target_Q.flatten()) self.sess.run(self.renormalize_Q_outputs_op, feed_dict={ self.old_std: np.array([old_std]), self.old_mean: np.array([old_mean]), }) # Run sanity check. Disabled by default since it slows down things considerably. 
# print('running sanity check') # target_Q_new, new_mean, new_std = self.sess.run([self.target_Q, self.ret_rms.mean, self.ret_rms.std], feed_dict={ # self.obs1: batch['obs1'], # self.rewards: batch['rewards'], # self.terminals1: batch['terminals1'].astype('float32'), # }) # print(target_Q_new, target_Q, new_mean, new_std) # assert (np.abs(target_Q - target_Q_new) < 1e-3).all() else: target_Q = self.sess.run(self.target_Q, feed_dict={ self.obs1: batch['obs1'], self.rewards: batch['rewards'], self.terminals1: batch['terminals1'].astype('float32'), }) # Get all gradients and perform a synced update. ops = [ self.actor_grads, self.actor_loss, self.critic_grads, self.critic_loss ] actor_grads, actor_loss, critic_grads, critic_loss = self.sess.run( ops, feed_dict={ self.obs0: batch['obs0'], self.actions: batch['actions'], self.critic_target: target_Q, }) self.actor_optimizer.update(actor_grads, stepsize=self.actor_lr) self.critic_optimizer.update(critic_grads, stepsize=self.critic_lr) if replace_masks: update_dropout_masks([ x for x in self.critic.vars if 'dropout' in x.name and 'mask' in x.name ], self.critic.keep_prob) update_dropout_masks([ x for x in self.target_critic.vars if 'dropout' in x.name and 'mask' in x.name ], self.critic.keep_prob) return critic_loss, actor_loss def initialize(self, sess): self.sess = sess self.sess.run(tf.global_variables_initializer()) self.actor_optimizer.sync() self.critic_optimizer.sync() self.sess.run(self.target_init_updates) def update_target_net(self): self.sess.run(self.target_soft_updates) def get_stats(self): if self.stats_sample is None: # Get a sample and keep that fixed for all further computations. # This allows us to estimate the change in value for the same set of inputs. self.stats_sample = self.memory.sample(batch_size=self.batch_size) values = self.sess.run(self.stats_ops, feed_dict={ self.obs0: self.stats_sample['obs0'], self.actions: self.stats_sample['actions'], }) names = self.stats_names[:] assert len(names) == len(values) stats = dict(zip(names, values)) if self.param_noise is not None: stats = {**stats, **self.param_noise.get_stats()} return stats def adapt_param_noise(self): if self.param_noise is None: return 0. # Perturb a separate copy of the policy to adjust the scale for the next "real" perturbation. batch = self.memory.sample(batch_size=self.batch_size) self.sess.run(self.perturb_adaptive_policy_ops, feed_dict={ self.param_noise_stddev: self.param_noise.current_stddev, }) distance = self.sess.run(self.adaptive_policy_distance, feed_dict={ self.obs0: batch['obs0'], self.param_noise_stddev: self.param_noise.current_stddev, }) mean_distance = mpi_mean(distance) self.param_noise.adapt(mean_distance) return mean_distance def reset(self): # Reset internal state after an episode is complete. if self.action_noise is not None: self.action_noise.reset() if self.param_noise is not None: self.sess.run(self.perturb_policy_ops, feed_dict={ self.param_noise_stddev: self.param_noise.current_stddev, })
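# The dropout-enabled critic above trains with the log-sum-exp objective referenced as
# "eq 10 of the alpha black box dropout": instead of averaging the K Monte-Carlo dropout
# passes, their scaled squared Bellman errors are combined with a log-sum-exp over the
# sample axis. A minimal NumPy sketch of that reduction (illustrative only; `y` stands in
# for the normalized critic target, `q_mc` stacks K stochastic critic outputs, and
# `alpha` / `tau` mirror self.alpha / self.dropout_tau_V):
import numpy as np


def alpha_dropout_critic_loss(y, q_mc, alpha=0.5, tau=1.0):
    """Combine K Monte-Carlo dropout Bellman errors with a log-sum-exp.

    y    : (B,)   normalized critic targets
    q_mc : (K, B) K stochastic forward passes of the critic
    """
    sumsq = -0.5 * alpha * tau * (y[None, :] - q_mc) ** 2  # (K, B) scaled squared errors
    m = sumsq.max(axis=0)                                  # numerically stable log-sum-exp
    lse = m + np.log(np.exp(sumsq - m).sum(axis=0))        # (B,)
    return (-1.0 / alpha) * lse                            # per-example critic loss


# toy usage
print(alpha_dropout_critic_loss(np.zeros(4), np.random.randn(5, 4)).shape)  # (4,)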
class TransitionClassifier(object): def __init__(self, ob_size, ac_size, hidden_size=100, log_reward=False, entcoeff=0.001, scope="adversary", dyn_norm=True): self.scope = scope self.ob_size = ob_size self.ac_size = ac_size # self.input_size = ob_size + ac_size self.hidden_size = hidden_size self.log_reward = log_reward self.dyn_norm = dyn_norm self.build_ph() # Build grpah generator_logits = self.build_graph(self.generator_obs_ph, self.generator_acs_ph) expert_logits = self.build_graph(self.expert_obs_ph, self.expert_acs_ph) # Build accuracy generator_acc = tf.reduce_mean( tf.cast(tf.nn.sigmoid(generator_logits) < 0.5, tf.float32)) expert_acc = tf.reduce_mean( tf.cast(tf.nn.sigmoid(expert_logits) > 0.5, tf.float32)) # Build regression loss # let x = logits, z = targets. # z * -log(sigmoid(x)) + (1 - z) * -log(1 - sigmoid(x)) generator_loss = tf.nn.sigmoid_cross_entropy_with_logits( logits=generator_logits, labels=tf.zeros_like(generator_logits)) generator_loss = tf.reduce_mean(generator_loss) expert_loss = tf.nn.sigmoid_cross_entropy_with_logits( logits=expert_logits, labels=tf.ones_like(expert_logits)) expert_loss = tf.reduce_mean(expert_loss) # Build entropy loss logits = tf.concat([generator_logits, expert_logits], 0) entropy = tf.reduce_mean(logit_bernoulli_entropy(logits)) entropy_loss = -entcoeff * entropy # Loss + Accuracy terms self.losses = [ generator_loss, expert_loss, entropy, entropy_loss, generator_acc, expert_acc ] self.loss_name = [ "generator_loss", "expert_loss", "entropy", "entropy_loss", "generator_acc", "expert_acc" ] self.total_loss = generator_loss + expert_loss + entropy_loss # Build Reward for policy if log_reward: reward_op = -tf.log(1 - tf.nn.sigmoid(generator_logits) + 1e-8) else: reward_op = tf.nn.sigmoid(generator_logits) self.reward = U.function( [self.generator_obs_ph, self.generator_acs_ph], reward_op) lr = tf.placeholder(tf.float32, None) self.trainer = tf.train.AdamOptimizer(learning_rate=lr) gvs = self.trainer.compute_gradients(self.total_loss, self.get_trainable_variables()) self._train = U.function([ self.generator_obs_ph, self.generator_acs_ph, self.expert_obs_ph, self.expert_acs_ph, lr ], self.losses, updates=[self.trainer.apply_gradients(gvs)]) def build_ph(self): self.generator_obs_ph = tf.placeholder(tf.float32, (None, self.ob_size), name="observations_ph") self.generator_acs_ph = tf.placeholder(tf.float32, (None, self.ac_size), name="actions_ph") self.expert_obs_ph = tf.placeholder(tf.float32, (None, self.ob_size), name="expert_observations_ph") self.expert_acs_ph = tf.placeholder(tf.float32, (None, self.ac_size), name="expert_actions_ph") def build_graph(self, obs_ph, acs_ph): with tf.variable_scope(self.scope, reuse=tf.AUTO_REUSE): with tf.variable_scope("obfilter"): self.obs_rms = RunningMeanStd(shape=[self.ob_size]) obs = normalize(obs_ph, self.obs_rms) _input = tf.concat( [obs, acs_ph], axis=1) # concatenate the two input -> form a transition p_h1 = tf.contrib.layers.fully_connected(_input, self.hidden_size, activation_fn=tf.nn.tanh) p_h2 = tf.contrib.layers.fully_connected(p_h1, self.hidden_size, activation_fn=tf.nn.tanh) logits = tf.contrib.layers.fully_connected(p_h2, 1, activation_fn=None) return logits def get_trainable_variables(self): return tf.trainable_variables(self.scope) def get_reward(self, obs, acs): return np.squeeze(self.reward(obs, acs)) def build_reward_op(self, obs_ph, acs_ph): logits = self.build_graph(obs_ph, acs_ph) if self.log_reward: return -tf.log(1 - tf.nn.sigmoid(logits) + 1e-8) return tf.nn.sigmoid(logits) def 
set_expert_data(self, data): self.data = Dataset(data, deterministic=False) def train(self, rl_ob, rl_ac, steps=1, lr=3e-4): n = rl_ob.shape[0] loss_buf = [] batch_size = rl_ob.shape[0] // steps for batch in iterbatches([rl_ob, rl_ac], include_final_partial_batch=False, batch_size=batch_size): exp_ob, exp_ac = self.data.next_batch(batch_size) if self.obs_rms and self.dyn_norm: self.obs_rms.update(np.concatenate([exp_ob, rl_ob], axis=0)) loss_buf.append(self._train(*batch, exp_ob, exp_ac, lr)) logger.info(fmt_row(13, self.loss_name)) logger.info(fmt_row(13, np.mean(loss_buf, axis=0)))
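# TransitionClassifier hands the policy a reward that is a simple function of the
# discriminator logits: the GAIL-style -log(1 - D(s, a)) when log_reward is set, or the
# raw probability D(s, a) otherwise. A self-contained NumPy sketch of the same mapping as
# reward_op above (illustrative, not the in-graph implementation):
import numpy as np


def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))


def discriminator_reward(logits, log_reward=False, eps=1e-8):
    """Map discriminator logits to a per-transition reward for the generator."""
    d = sigmoid(logits)                   # probability the transition looks like expert data
    if log_reward:
        return -np.log(1.0 - d + eps)     # unbounded, grows as D -> 1
    return d                              # bounded in (0, 1)


logits = np.array([-2.0, 0.0, 2.0])
print(discriminator_reward(logits))                   # approx [0.12, 0.50, 0.88]
print(discriminator_reward(logits, log_reward=True))  # approx [0.13, 0.69, 2.13]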
class DDPG(object): def __init__(self, actor, critic, memory, observation_shape, action_shape, param_noise=None, action_noise=None, gamma=0.99, tau=0.001, normalize_returns=False, enable_popart=False, normalize_observations=True, batch_size=128, observation_range=(-5., 5.), action_range=(-1., 1.), return_range=(-np.inf, np.inf), critic_l2_reg=0., actor_lr=1e-4, critic_lr=1e-3, clip_norm=None, reward_scale=1.): """ Deep Deterministic Policy Gradien (DDPG) model DDPG: https://arxiv.org/pdf/1509.02971.pdf :param actor: (TensorFlow Tensor) the actor model :param critic: (TensorFlow Tensor) the critic model :param memory: (Memory) the replay buffer :param observation_shape: (tuple) the observation space :param action_shape: (tuple) the action space :param param_noise: (AdaptiveParamNoiseSpec) the parameter noise type (can be None) :param action_noise: (ActionNoise) the action noise type (can be None) :param gamma: (float) the discount rate :param tau: (float) the soft update coefficient (keep old values, between 0 and 1) :param normalize_returns: (bool) should the critic output be normalized :param enable_popart: (bool) enable pop-art normalization of the critic output (https://arxiv.org/pdf/1602.07714.pdf) :param normalize_observations: (bool) should the observation be normalized :param batch_size: (int) the size of the batch for learning the policy :param observation_range: (tuple) the bounding values for the observation :param action_range: (tuple) the bounding values for the actions :param return_range: (tuple) the bounding values for the critic output :param critic_l2_reg: (float) l2 regularizer coefficient :param actor_lr: (float) the actor learning rate :param critic_lr: (float) the critic learning rate :param clip_norm: (float) clip the gradients (disabled if None) :param reward_scale: (float) the value the reward should be scaled by """ # Inputs. self.obs0 = tf.placeholder(tf.float32, shape=(None, ) + observation_shape, name='obs0') self.obs1 = tf.placeholder(tf.float32, shape=(None, ) + observation_shape, name='obs1') self.terminals1 = tf.placeholder(tf.float32, shape=(None, 1), name='terminals1') self.rewards = tf.placeholder(tf.float32, shape=(None, 1), name='rewards') self.actions = tf.placeholder(tf.float32, shape=(None, ) + action_shape, name='actions') self.critic_target = tf.placeholder(tf.float32, shape=(None, 1), name='critic_target') self.param_noise_stddev = tf.placeholder(tf.float32, shape=(), name='param_noise_stddev') # Parameters. self.gamma = gamma self.tau = tau self.memory = memory self.normalize_observations = normalize_observations self.normalize_returns = normalize_returns self.action_noise = action_noise self.param_noise = param_noise self.action_range = action_range self.return_range = return_range self.observation_range = observation_range self.critic = critic self.actor = actor self.actor_lr = actor_lr self.critic_lr = critic_lr self.clip_norm = clip_norm self.enable_popart = enable_popart self.reward_scale = reward_scale self.batch_size = batch_size self.stats_sample = None self.critic_l2_reg = critic_l2_reg self.target_init_updates = None self.target_soft_updates = None self.critic_loss = None self.critic_grads = None self.critic_optimizer = None self.sess = None # Observation normalization. 
if self.normalize_observations: with tf.variable_scope('obs_rms'): self.obs_rms = RunningMeanStd(shape=observation_shape) else: self.obs_rms = None normalized_obs0 = tf.clip_by_value(normalize(self.obs0, self.obs_rms), self.observation_range[0], self.observation_range[1]) normalized_obs1 = tf.clip_by_value(normalize(self.obs1, self.obs_rms), self.observation_range[0], self.observation_range[1]) # Return normalization. if self.normalize_returns: with tf.variable_scope('ret_rms'): self.ret_rms = RunningMeanStd() else: self.ret_rms = None # Create target networks. target_actor = copy(actor) target_actor.name = 'target_actor' self.target_actor = target_actor target_critic = copy(critic) target_critic.name = 'target_critic' self.target_critic = target_critic # Create networks and core TF parts that are shared across setup parts. self.actor_tf = actor(normalized_obs0) self.normalized_critic_tf = critic(normalized_obs0, self.actions) self.critic_tf = denormalize( tf.clip_by_value(self.normalized_critic_tf, self.return_range[0], self.return_range[1]), self.ret_rms) self.normalized_critic_with_actor_tf = critic(normalized_obs0, self.actor_tf, reuse=True) self.critic_with_actor_tf = denormalize( tf.clip_by_value(self.normalized_critic_with_actor_tf, self.return_range[0], self.return_range[1]), self.ret_rms) q_obs1 = denormalize( target_critic(normalized_obs1, target_actor(normalized_obs1)), self.ret_rms) self.target_q = self.rewards + (1. - self.terminals1) * gamma * q_obs1 # Set up parts. if self.param_noise is not None: self.setup_param_noise(normalized_obs0) self.setup_actor_optimizer() self.setup_critic_optimizer() if self.normalize_returns and self.enable_popart: self.setup_popart() self.setup_stats() self.setup_target_network_updates() def setup_target_network_updates(self): """ set the target update operations """ actor_init_updates, actor_soft_updates = get_target_updates( self.actor.vars, self.target_actor.vars, self.tau) critic_init_updates, critic_soft_updates = get_target_updates( self.critic.vars, self.target_critic.vars, self.tau) self.target_init_updates = [actor_init_updates, critic_init_updates] self.target_soft_updates = [actor_soft_updates, critic_soft_updates] def setup_param_noise(self, normalized_obs0): """ set the parameter noise operations :param normalized_obs0: (TensorFlow Tensor) the normalized observation """ assert self.param_noise is not None # Configure perturbed actor. param_noise_actor = copy(self.actor) param_noise_actor.name = 'param_noise_actor' self.perturbed_actor_tf = param_noise_actor(normalized_obs0) logger.info('setting up param noise') self.perturb_policy_ops = get_perturbed_actor_updates( self.actor, param_noise_actor, self.param_noise_stddev) # Configure separate copy for stddev adoption. 
adaptive_param_noise_actor = copy(self.actor) adaptive_param_noise_actor.name = 'adaptive_param_noise_actor' adaptive_actor_tf = adaptive_param_noise_actor(normalized_obs0) self.perturb_adaptive_policy_ops = get_perturbed_actor_updates( self.actor, adaptive_param_noise_actor, self.param_noise_stddev) self.adaptive_policy_distance = tf.sqrt( tf.reduce_mean(tf.square(self.actor_tf - adaptive_actor_tf))) def setup_actor_optimizer(self): """ setup the optimizer for the actor """ logger.info('setting up actor optimizer') self.actor_loss = -tf.reduce_mean(self.critic_with_actor_tf) actor_shapes = [ var.get_shape().as_list() for var in self.actor.trainable_vars ] actor_nb_params = sum( [reduce(lambda x, y: x * y, shape) for shape in actor_shapes]) logger.info(' actor shapes: {}'.format(actor_shapes)) logger.info(' actor params: {}'.format(actor_nb_params)) self.actor_grads = tf_util.flatgrad(self.actor_loss, self.actor.trainable_vars, clip_norm=self.clip_norm) self.actor_optimizer = MpiAdam(var_list=self.actor.trainable_vars, beta1=0.9, beta2=0.999, epsilon=1e-08) def setup_critic_optimizer(self): """ setup the optimizer for the critic """ logger.info('setting up critic optimizer') normalized_critic_target_tf = tf.clip_by_value( normalize(self.critic_target, self.ret_rms), self.return_range[0], self.return_range[1]) self.critic_loss = tf.reduce_mean( tf.square(self.normalized_critic_tf - normalized_critic_target_tf)) if self.critic_l2_reg > 0.: critic_reg_vars = [ var for var in self.critic.trainable_vars if 'kernel' in var.name and 'output' not in var.name ] for var in critic_reg_vars: logger.info(' regularizing: {}'.format(var.name)) logger.info(' applying l2 regularization with {}'.format( self.critic_l2_reg)) critic_reg = tc.layers.apply_regularization( tc.layers.l2_regularizer(self.critic_l2_reg), weights_list=critic_reg_vars) self.critic_loss += critic_reg critic_shapes = [ var.get_shape().as_list() for var in self.critic.trainable_vars ] critic_nb_params = sum( [reduce(lambda x, y: x * y, shape) for shape in critic_shapes]) logger.info(' critic shapes: {}'.format(critic_shapes)) logger.info(' critic params: {}'.format(critic_nb_params)) self.critic_grads = tf_util.flatgrad(self.critic_loss, self.critic.trainable_vars, clip_norm=self.clip_norm) self.critic_optimizer = MpiAdam(var_list=self.critic.trainable_vars, beta1=0.9, beta2=0.999, epsilon=1e-08) def setup_popart(self): """ setup pop-art normalization of the critic output See https://arxiv.org/pdf/1602.07714.pdf for details. Preserving Outputs Precisely, while Adaptively Rescaling Targets”. 
""" self.old_std = tf.placeholder(tf.float32, shape=[1], name='old_std') new_std = self.ret_rms.std self.old_mean = tf.placeholder(tf.float32, shape=[1], name='old_mean') new_mean = self.ret_rms.mean self.renormalize_q_outputs_op = [] for out_vars in [ self.critic.output_vars, self.target_critic.output_vars ]: assert len(out_vars) == 2 # wieght and bias of the last layer weight, bias = out_vars assert 'kernel' in weight.name assert 'bias' in bias.name assert weight.get_shape()[-1] == 1 assert bias.get_shape()[-1] == 1 self.renormalize_q_outputs_op += [ weight.assign(weight * self.old_std / new_std) ] self.renormalize_q_outputs_op += [ bias.assign( (bias * self.old_std + self.old_mean - new_mean) / new_std) ] def setup_stats(self): """ setup the running means and std of the inputs and outputs of the model """ ops = [] names = [] if self.normalize_returns: ops += [self.ret_rms.mean, self.ret_rms.std] names += ['ret_rms_mean', 'ret_rms_std'] if self.normalize_observations: ops += [ tf.reduce_mean(self.obs_rms.mean), tf.reduce_mean(self.obs_rms.std) ] names += ['obs_rms_mean', 'obs_rms_std'] ops += [tf.reduce_mean(self.critic_tf)] names += ['reference_Q_mean'] ops += [reduce_std(self.critic_tf)] names += ['reference_Q_std'] ops += [tf.reduce_mean(self.critic_with_actor_tf)] names += ['reference_actor_Q_mean'] ops += [reduce_std(self.critic_with_actor_tf)] names += ['reference_actor_Q_std'] ops += [tf.reduce_mean(self.actor_tf)] names += ['reference_action_mean'] ops += [reduce_std(self.actor_tf)] names += ['reference_action_std'] if self.param_noise: ops += [tf.reduce_mean(self.perturbed_actor_tf)] names += ['reference_perturbed_action_mean'] ops += [reduce_std(self.perturbed_actor_tf)] names += ['reference_perturbed_action_std'] self.stats_ops = ops self.stats_names = names def policy(self, obs, apply_noise=True, compute_q=True): """ Get the actions and critic output, from a given observation :param obs: ([float] or [int]) the observation :param apply_noise: (bool) enable the noise :param compute_q: (bool) compute the critic output :return: ([float], float) the action and critic value """ if self.param_noise is not None and apply_noise: actor_tf = self.perturbed_actor_tf else: actor_tf = self.actor_tf feed_dict = {self.obs0: [obs]} if compute_q: action, q_value = self.sess.run( [actor_tf, self.critic_with_actor_tf], feed_dict=feed_dict) else: action = self.sess.run(actor_tf, feed_dict=feed_dict) q_value = None action = action.flatten() if self.action_noise is not None and apply_noise: noise = self.action_noise() assert noise.shape == action.shape action += noise action = np.clip(action, self.action_range[0], self.action_range[1]) return action, q_value def store_transition(self, obs0, action, reward, obs1, terminal1): """ Store a transition in the replay buffer :param obs0: ([float] or [int]) the last observation :param action: ([float]) the action :param reward: (float] the reward :param obs1: ([float] or [int]) the current observation :param terminal1: (bool) is the episode done """ reward *= self.reward_scale self.memory.append(obs0, action, reward, obs1, terminal1) if self.normalize_observations: self.obs_rms.update(np.array([obs0])) def train(self): """ run a step of training from batch :return: (float, float) critic loss, actor loss """ # Get a batch. 
batch = self.memory.sample(batch_size=self.batch_size) if self.normalize_returns and self.enable_popart: old_mean, old_std, target_q = self.sess.run( [self.ret_rms.mean, self.ret_rms.std, self.target_q], feed_dict={ self.obs1: batch['obs1'], self.rewards: batch['rewards'], self.terminals1: batch['terminals1'].astype('float32'), }) self.ret_rms.update(target_q.flatten()) self.sess.run(self.renormalize_q_outputs_op, feed_dict={ self.old_std: np.array([old_std]), self.old_mean: np.array([old_mean]), }) else: target_q = self.sess.run(self.target_q, feed_dict={ self.obs1: batch['obs1'], self.rewards: batch['rewards'], self.terminals1: batch['terminals1'].astype('float32'), }) # Get all gradients and perform a synced update. ops = [ self.actor_grads, self.actor_loss, self.critic_grads, self.critic_loss ] actor_grads, actor_loss, critic_grads, critic_loss = self.sess.run( ops, feed_dict={ self.obs0: batch['obs0'], self.actions: batch['actions'], self.critic_target: target_q, }) self.actor_optimizer.update(actor_grads, learning_rate=self.actor_lr) self.critic_optimizer.update(critic_grads, learning_rate=self.critic_lr) return critic_loss, actor_loss def initialize(self, sess): """ initialize the model parameters and optimizers :param sess: (TensorFlow Session) the current TensorFlow session """ self.sess = sess self.sess.run(tf.global_variables_initializer()) self.actor_optimizer.sync() self.critic_optimizer.sync() self.sess.run(self.target_init_updates) def update_target_net(self): """ run target soft update operation """ self.sess.run(self.target_soft_updates) def get_stats(self): """ Get the mean and standard deviation of the model's inputs and outputs :return: (dict) the means and stds """ if self.stats_sample is None: # Get a sample and keep that fixed for all further computations. # This allows us to estimate the change in value for the same set of inputs. self.stats_sample = self.memory.sample(batch_size=self.batch_size) values = self.sess.run(self.stats_ops, feed_dict={ self.obs0: self.stats_sample['obs0'], self.actions: self.stats_sample['actions'], }) names = self.stats_names[:] assert len(names) == len(values) stats = dict(zip(names, values)) if self.param_noise is not None: stats = {**stats, **self.param_noise.get_stats()} return stats def adapt_param_noise(self): """ calculate the adaptation for the parameter noise :return: (float) the mean distance for the parameter noise """ if self.param_noise is None: return 0. # Perturb a separate copy of the policy to adjust the scale for the next "real" perturbation. batch = self.memory.sample(batch_size=self.batch_size) self.sess.run(self.perturb_adaptive_policy_ops, feed_dict={ self.param_noise_stddev: self.param_noise.current_stddev, }) distance = self.sess.run(self.adaptive_policy_distance, feed_dict={ self.obs0: batch['obs0'], self.param_noise_stddev: self.param_noise.current_stddev, }) mean_distance = MPI.COMM_WORLD.allreduce( distance, op=MPI.SUM) / MPI.COMM_WORLD.Get_size() self.param_noise.adapt(mean_distance) return mean_distance def reset(self): """ Reset internal state after an episode is complete. """ if self.action_noise is not None: self.action_noise.reset() if self.param_noise is not None: self.sess.run(self.perturb_policy_ops, feed_dict={ self.param_noise_stddev: self.param_noise.current_stddev, })
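# The DDPG variants here keep their target networks in sync through the soft updates built
# in setup_target_network_updates: every target variable tracks its source with coefficient
# tau. A minimal NumPy sketch of that polyak rule (illustrative; get_target_updates builds
# the equivalent TensorFlow assign ops):
import numpy as np


def hard_update(source_params):
    """Initialization step: copy the source weights verbatim (target_init_updates)."""
    return [p.copy() for p in source_params]


def soft_update(target_params, source_params, tau=0.001):
    """theta_target <- (1 - tau) * theta_target + tau * theta_source (target_soft_updates)."""
    return [(1.0 - tau) * t + tau * s for t, s in zip(target_params, source_params)]


source = [np.ones((2, 2)), np.zeros(3)]
target = hard_update(source)
target = soft_update(target, [2.0 * p for p in source], tau=0.001)
print(target[0][0, 0])  # 1.001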
class DDPG(object): def __init__(self, actor, critic, memory, observation_shape, action_shape, param_noise=None, action_noise=None, gamma=0.99, tau=0.001, normalize_returns=False, enable_popart=False, normalize_observations=True, batch_size=128, observation_range=(-1., 1.), action_range= [0.2, 0.2, 0.2, 0.2, 0.2, 0.2], return_range=(-np.inf, np.inf), adaptive_param_noise=True, critic_l2_reg=0., adaptive_param_noise_policy_threshold=.1, actor_lr=1e-4, critic_lr=1e-3, clip_norm=None, reward_scale=1., restore=False): # Inputs. # self.obs0 = tf.placeholder(tf.float32, shape=(None,) + observation_shape, name='obs0') self.obs0 = tf.placeholder(tf.float32, shape=(None, observation_shape), name='obs0') # self.obs1 = tf.placeholder(tf.float32, shape=(None,) + observation_shape, name='obs1') self.obs1 = tf.placeholder(tf.float32, shape=(None, observation_shape), name='obs1') self.terminals1 = tf.placeholder(tf.float32, shape=(None, 1), name='terminals1') self.rewards = tf.placeholder(tf.float32, shape=(None, 1), name='rewards') self.actions = tf.placeholder(tf.float32, shape=(None, action_shape), name='actions') self.critic_target = tf.placeholder(tf.float32, shape=(None, 1), name='critic_target') self.param_noise_stddev = tf.placeholder(tf.float32, shape=(), name='param_noise_stddev') # Parameters. self.gamma = gamma self.tau = tau self.memory = memory self.normalize_observations = normalize_observations self.normalize_returns = normalize_returns self.action_noise = action_noise self.param_noise = param_noise self.action_range = action_range self.return_range = return_range self.observation_range = observation_range self.critic = critic self.actor = actor self.actor_lr = actor_lr self.critic_lr = critic_lr self.clip_norm = clip_norm self.enable_popart = enable_popart self.reward_scale = reward_scale self.batch_size = batch_size self.stats_sample = None self.critic_l2_reg = critic_l2_reg # Observation normalization. if self.normalize_observations: with tf.variable_scope('obs_rms'): self.obs_rms = RunningMeanStd(shape=observation_shape) else: self.obs_rms = None normalized_obs0 = tf.clip_by_value(normalize(self.obs0, self.obs_rms), self.observation_range[0], self.observation_range[1]) normalized_obs1 = tf.clip_by_value(normalize(self.obs1, self.obs_rms), self.observation_range[0], self.observation_range[1]) # Return normalization. if self.normalize_returns: with tf.variable_scope('ret_rms'): self.ret_rms = RunningMeanStd() else: self.ret_rms = None # Create target networks. target_actor = copy(actor) target_actor.name = 'target_actor' self.target_actor = target_actor target_critic = copy(critic) target_critic.name = 'target_critic' self.target_critic = target_critic # Create networks and core TF parts that are shared across setup parts. self.actor_tf = actor(normalized_obs0) self.normalized_critic_tf = critic(normalized_obs0, self.actions) self.critic_tf = denormalize(tf.clip_by_value(self.normalized_critic_tf, self.return_range[0], self.return_range[1]), self.ret_rms) self.normalized_critic_with_actor_tf = critic(normalized_obs0, self.actor_tf, reuse=True) self.critic_with_actor_tf = denormalize(tf.clip_by_value(self.normalized_critic_with_actor_tf, self.return_range[0], self.return_range[1]), self.ret_rms) Q_obs1 = denormalize(target_critic(normalized_obs1, target_actor(normalized_obs1)), self.ret_rms) self.target_Q = self.rewards + (1. - self.terminals1) * gamma * Q_obs1 # Set up parts. 
if self.param_noise is not None: self.setup_param_noise(normalized_obs0) self.setup_actor_optimizer() self.setup_critic_optimizer() if self.normalize_returns and self.enable_popart: self.setup_popart() self.setup_stats() self.setup_target_network_updates() """Filewriter summary""" monitor_directory = os.path.join("Experiment_data") self.summary_dir = os.path.join(monitor_directory, "summary") # if restore: # dirname = 'run20' # The last name # self.summary_dir = os.path.join(self.summary_dir, dirname) # else: self.summary_dir = utils.new_summary_dir(self.summary_dir) # record the detailed parameters utils.log_params(self.summary_dir, { "actor learning rate": self.actor_lr, "critic learning rate": self.critic_lr, "batch size": self.batch_size, "actor update rate": self.tau, "critic update rate": self.tau, "action noise": self.action_noise, "param noise": self.param_noise, "reward function": 'General reward function', "result_function": 'The second 100' }) self.merged = tf.summary.merge_all() def setup_target_network_updates(self): actor_init_updates, actor_soft_updates = get_target_updates(self.actor.vars, self.target_actor.vars, self.tau) critic_init_updates, critic_soft_updates = get_target_updates(self.critic.vars, self.target_critic.vars, self.tau) self.target_init_updates = [actor_init_updates, critic_init_updates] self.target_soft_updates = [actor_soft_updates, critic_soft_updates] def setup_param_noise(self, normalized_obs0): assert self.param_noise is not None ## make sure this assumption is fully satisfied # Configure perturbed actor. param_noise_actor = copy(self.actor) param_noise_actor.name = 'param_noise_actor' self.perturbed_actor_tf = param_noise_actor(normalized_obs0) logger.info('setting up param noise') self.perturb_policy_ops = get_perturbed_actor_updates(self.actor, param_noise_actor, self.param_noise_stddev) # Configure separate copy for stddev adaptation. adaptive_param_noise_actor = copy(self.actor) adaptive_param_noise_actor.name = 'adaptive_param_noise_actor' adaptive_actor_tf = adaptive_param_noise_actor(normalized_obs0) self.perturb_adaptive_policy_ops = get_perturbed_actor_updates(self.actor, adaptive_param_noise_actor, self.param_noise_stddev) self.adaptive_policy_distance = tf.sqrt(tf.reduce_mean(tf.square(self.actor_tf - adaptive_actor_tf))) def adapt_param_noise(self): if self.param_noise is None: return 0. # Perturb a separate copy of the policy to adjust the scale for the next "real" perturbation. 
batch = self.memory.sample(batch_size=self.batch_size) self.sess.run(self.perturb_adaptive_policy_ops, feed_dict={ self.param_noise_stddev: self.param_noise.current_stddev, }) distance = self.sess.run(self.adaptive_policy_distance, feed_dict={ self.obs0: batch['obs0'], self.param_noise_stddev: self.param_noise.current_stddev, }) mean_distance = mpi_mean(distance) self.param_noise.adapt(mean_distance) return mean_distance def setup_actor_optimizer(self): logger.info('setting up actor optimizer') self.actor_loss = -tf.reduce_mean(self.critic_with_actor_tf) actor_shapes = [var.get_shape().as_list() for var in self.actor.trainable_vars] actor_nb_params = sum([reduce(lambda x, y: x * y, shape) for shape in actor_shapes]) logger.info(' actor shapes: {}'.format(actor_shapes)) logger.info(' actor params: {}'.format(actor_nb_params)) self.actor_grads = U.flatgrad(self.actor_loss, self.actor.trainable_vars, clip_norm=self.clip_norm) self.actor_optimizer = MpiAdam(var_list=self.actor.trainable_vars, beta1=0.9, beta2=0.999, epsilon=1e-08) def setup_critic_optimizer(self): logger.info('setting up critic optimizer') normalized_critic_target_tf = tf.clip_by_value(normalize(self.critic_target, self.ret_rms), self.return_range[0], self.return_range[1]) self.critic_loss = tf.reduce_mean(tf.square(self.normalized_critic_tf - normalized_critic_target_tf)) if self.critic_l2_reg > 0.: critic_reg_vars = [var for var in self.critic.trainable_vars if 'kernel' in var.name and 'output' not in var.name] for var in critic_reg_vars: logger.info(' regularizing: {}'.format(var.name)) logger.info(' applying l2 regularization with {}'.format(self.critic_l2_reg)) critic_reg = tc.layers.apply_regularization( tc.layers.l2_regularizer(self.critic_l2_reg), weights_list=critic_reg_vars ) self.critic_loss += critic_reg critic_shapes = [var.get_shape().as_list() for var in self.critic.trainable_vars] critic_nb_params = sum([reduce(lambda x, y: x * y, shape) for shape in critic_shapes]) logger.info(' critic shapes: {}'.format(critic_shapes)) logger.info(' critic params: {}'.format(critic_nb_params)) self.critic_grads = U.flatgrad(self.critic_loss, self.critic.trainable_vars, clip_norm=self.clip_norm) self.critic_optimizer = MpiAdam(var_list=self.critic.trainable_vars, beta1=0.9, beta2=0.999, epsilon=1e-08) def setup_popart(self): # See https://arxiv.org/pdf/1602.07714.pdf for details. 
self.old_std = tf.placeholder(tf.float32, shape=[1], name='old_std') new_std = self.ret_rms.std self.old_mean = tf.placeholder(tf.float32, shape=[1], name='old_mean') new_mean = self.ret_rms.mean self.renormalize_Q_outputs_op = [] for vs in [self.critic.output_vars, self.target_critic.output_vars]: assert len(vs) == 2 M, b = vs assert 'kernel' in M.name assert 'bias' in b.name assert M.get_shape()[-1] == 1 assert b.get_shape()[-1] == 1 self.renormalize_Q_outputs_op += [M.assign(M * self.old_std / new_std)] self.renormalize_Q_outputs_op += [b.assign((b * self.old_std + self.old_mean - new_mean) / new_std)] def setup_stats(self): ops = [] names = [] if self.normalize_returns: ops += [self.ret_rms.mean, self.ret_rms.std] names += ['ret_rms_mean', 'ret_rms_std'] if self.normalize_observations: ops += [tf.reduce_mean(self.obs_rms.mean), tf.reduce_mean(self.obs_rms.std)] names += ['obs_rms_mean', 'obs_rms_std'] ops += [tf.reduce_mean(self.critic_tf)] names += ['reference_Q_mean'] ops += [reduce_std(self.critic_tf)] names += ['reference_Q_std'] ops += [tf.reduce_mean(self.critic_with_actor_tf)] names += ['reference_actor_Q_mean'] ops += [reduce_std(self.critic_with_actor_tf)] names += ['reference_actor_Q_std'] ops += [tf.reduce_mean(self.actor_tf)] names += ['reference_action_mean'] ops += [reduce_std(self.actor_tf)] names += ['reference_action_std'] if self.param_noise: ops += [tf.reduce_mean(self.perturbed_actor_tf)] names += ['reference_perturbed_action_mean'] ops += [reduce_std(self.perturbed_actor_tf)] names += ['reference_perturbed_action_std'] self.stats_ops = ops self.stats_names = names def pi(self, obs, apply_noise=True, compute_Q=True): if self.param_noise is not None and apply_noise: actor_tf = self.perturbed_actor_tf else: actor_tf = self.actor_tf feed_dict = {self.obs0: [obs]} if compute_Q: action, q = self.sess.run([actor_tf, self.critic_with_actor_tf], feed_dict=feed_dict) else: action = self.sess.run(actor_tf, feed_dict=feed_dict) q = None action = action.flatten() if self.action_noise is not None and apply_noise: noise = self.action_noise() assert noise.shape == action.shape action += noise action = np.multiply(action, self.action_range) # action = np.clip(action, self.action_range[0], self.action_range[1]) return action, q def store_transition(self, obs0, action, reward, obs1, terminal1): reward *= self.reward_scale self.memory.append(obs0, action, reward, obs1, terminal1) if self.normalize_observations: self.obs_rms.update(np.array([obs0])) def save_data(self): self.memory.save_data() def train(self, dec_actor_lr, dec_critic_lr): # change the learning rate self.actor_lr = dec_actor_lr self.critic_lr = dec_critic_lr # Get a batch. batch = self.memory.sample(batch_size=self.batch_size) if self.normalize_returns and self.enable_popart: old_mean, old_std, target_Q = self.sess.run([self.ret_rms.mean, self.ret_rms.std, self.target_Q], feed_dict={ self.obs1: batch['obs1'], self.rewards: batch['rewards'], self.terminals1: batch['terminals1'].astype('float32'), }) self.ret_rms.update(target_Q.flatten()) self.sess.run(self.renormalize_Q_outputs_op, feed_dict={ self.old_std : np.array([old_std]), self.old_mean : np.array([old_mean]), }) # Run sanity check. Disabled by default since it slows down things considerably. 
# print('running sanity check') # target_Q_new, new_mean, new_std = self.sess.run([self.target_Q, self.ret_rms.mean, self.ret_rms.std], feed_dict={ # self.obs1: batch['obs1'], # self.rewards: batch['rewards'], # self.terminals1: batch['terminals1'].astype('float32'), # }) # print(target_Q_new, target_Q, new_mean, new_std) # assert (np.abs(target_Q - target_Q_new) < 1e-3).all() else: target_Q = self.sess.run(self.target_Q, feed_dict={ self.obs1: batch['obs1'], self.rewards: batch['rewards'], self.terminals1: batch['terminals1'].astype('float32'), }) # Get all gradients and perform a synced update. ops = [self.actor_grads, self.actor_loss, self.critic_grads, self.critic_loss] actor_grads, actor_loss, critic_grads, critic_loss = self.sess.run(ops, feed_dict={ self.obs0: batch['obs0'], self.actions: batch['actions'], self.critic_target: target_Q, }) self.actor_optimizer.update(actor_grads, stepsize=self.actor_lr) self.critic_optimizer.update(critic_grads, stepsize=self.critic_lr) return critic_loss, actor_loss def initialize(self, sess): self.sess = sess self.sess.run(tf.global_variables_initializer()) self.actor_optimizer.sync() self.critic_optimizer.sync() self.sess.run(self.target_init_updates) ## wirte the graph self.summary_writer = tf.summary.FileWriter(self.summary_dir, self.sess.graph) def restore_model(self, model_directory, saver, sess): ckpt = tf.train.get_checkpoint_state(model_directory) if ckpt and ckpt.model_checkpoint_path: saver.restore(sess, ckpt.model_checkpoint_path) self.sess = sess logger.info('Load the saved model from the directory!!!') self.summary_writer = tf.summary.FileWriter(self.summary_dir) def update_target_net(self): self.sess.run(self.target_soft_updates) def feedback_adptive_explore(self): self.param_noise.adapt_variance() def ou_adaptive_explore(self): self.action_noise.adapt_decrease() def get_stats(self): if self.stats_sample is None: # Get a sample and keep that fixed for all further computations. # This allows us to estimate the change in value for the same set of inputs. self.stats_sample = self.memory.sample(batch_size=self.batch_size) values = self.sess.run(self.stats_ops, feed_dict={ self.obs0: self.stats_sample['obs0'], self.actions: self.stats_sample['actions'], }) names = self.stats_names[:] assert len(names) == len(values) stats = dict(zip(names, values)) if self.param_noise is not None: stats = {**stats, **self.param_noise.get_stats()} return stats def reset(self): # Reset internal state after an episode is complete. if self.action_noise is not None: self.action_noise.reset() if self.param_noise is not None: self.sess.run(self.perturb_policy_ops, feed_dict={ self.param_noise_stddev: self.param_noise.current_stddev, }) def log_scalar(self, name, value, index): summary_value = summary_pb2.Summary.Value(tag=name, simple_value=value) summary_2 = summary_pb2.Summary(value=[summary_value]) self.summary_writer.add_summary(summary_2, global_step=index)
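# adapt_param_noise above measures how far the perturbed policy's actions drift from the
# unperturbed ones and feeds that distance back to the noise spec. The spec class itself
# is not shown in this section, so the rule below is only a sketch of the usual adaptive
# scheme (attribute names such as desired_action_stddev and adoption_coefficient are
# assumptions):


class SimpleAdaptiveParamNoise(object):
    """Grow the parameter perturbation while the policy barely moves, shrink it otherwise."""

    def __init__(self, initial_stddev=0.1, desired_action_stddev=0.1, adoption_coefficient=1.01):
        self.current_stddev = initial_stddev
        self.desired_action_stddev = desired_action_stddev
        self.adoption_coefficient = adoption_coefficient

    def adapt(self, distance):
        if distance > self.desired_action_stddev:
            self.current_stddev /= self.adoption_coefficient  # too much exploration: back off
        else:
            self.current_stddev *= self.adoption_coefficient  # too little: perturb harder


noise = SimpleAdaptiveParamNoise()
noise.adapt(0.5)    # measured action distance above target -> stddev shrinks
noise.adapt(0.01)   # below target -> stddev grows back
print(round(noise.current_stddev, 6))  # 0.1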
class Model(object): def __init__(self, sess, policy, dynamics, ob_space, ac_space, nenvs, nsteps, ent_coef, q_coef, gamma, max_grad_norm, lr, rprop_alpha, rprop_epsilon, total_timesteps, lrschedule, c, trust_region, alpha, delta, scope, goal_shape, residual): self.sess = sess self.nenv = nenvs self.residual = residual self.goal_shape = goal_shape self.goal_as_image = goal_as_image = len(goal_shape) == 3 if self.goal_as_image: assert self.goal_shape == ob_space.shape else: logger.info("normalize goal using RunningMeanStd") with tf.variable_scope("RunningMeanStd", reuse=tf.AUTO_REUSE): self.goal_rms = RunningMeanStd(epsilon=1e-4, shape=self.goal_shape) nact = ac_space.n nbatch = nenvs * nsteps eps = 1e-6 self.dynamics = dynamics self.scope = scope with tf.variable_scope(scope, reuse=tf.AUTO_REUSE): self.A = tf.placeholder(tf.int32, [nbatch], name="action") # actions self.D = tf.placeholder(tf.float32, [nbatch], name="dones") # dones self.R = tf.placeholder(tf.float32, [nbatch], name="rewards") # rewards, not returns self.MU = tf.placeholder(tf.float32, [nbatch, nact], name="mus") # mu's self.LR = tf.placeholder(tf.float32, [], name="lr") self.V_NEXT = tf.placeholder(tf.float32, [nbatch], name="v_next") step_ob_placeholder = tf.placeholder(ob_space.dtype, (nenvs, ) + ob_space.shape, "step_ob") if self.dynamics.dummy: step_goal_placeholder, concat_on_latent, step_goal_encoded = None, None, None else: if goal_as_image: step_goal_placeholder = tf.placeholder( ob_space.dtype, (nenvs, ) + ob_space.shape, "step_goal") concat_on_latent, train_goal_encoded, step_goal_encoded = False, None, None else: step_goal_placeholder = tf.placeholder( tf.float32, (nenvs, ) + goal_shape, "step_goal") step_goal_encoded = tf.clip_by_value( (step_goal_placeholder - self.goal_rms.mean) / self.goal_rms.std, -5., 5.) train_ob_placeholder = tf.placeholder( ob_space.dtype, (nenvs * nsteps, ) + ob_space.shape, "train_ob") if self.dynamics.dummy: train_goal_placeholder, concat_on_latent, train_goal_encoded = None, None, None else: if goal_as_image: train_goal_placeholder = tf.placeholder( ob_space.dtype, (nenvs * nsteps, ) + ob_space.shape, "train_goal") concat_on_latent, train_goal_encoded = False, None else: train_goal_placeholder = tf.placeholder( tf.float32, (nenvs * nsteps, ) + goal_shape, "train_goal") concat_on_latent = True train_goal_encoded = tf.clip_by_value( (train_goal_placeholder - self.goal_rms.mean) / self.goal_rms.std, -5., 5.) self.step_model = policy(nbatch=nenvs, nsteps=1, observ_placeholder=step_ob_placeholder, sess=self.sess, goal_placeholder=step_goal_placeholder, concat_on_latent=concat_on_latent, goal_encoded=step_goal_encoded) self.train_model = policy(nbatch=nbatch, nsteps=nsteps, observ_placeholder=train_ob_placeholder, sess=self.sess, goal_placeholder=train_goal_placeholder, concat_on_latent=concat_on_latent, goal_encoded=train_goal_encoded) variables = find_trainable_variables self.params = params = variables(scope) logger.info( "========================== {} =============================". format(scope)) for var in params: logger.info(var) logger.info( "========================== {} =============================\n". format(scope)) logger.info( "======================={}: Aux & Dyna =========================". 
format(scope)) for var in self.dynamics.params: logger.info(var) logger.info( "======================={}: Aux & Dyna =========================\n" .format(scope)) # create polyak averaged model ema = tf.train.ExponentialMovingAverage(alpha) ema_apply_op = ema.apply(params) # print("========================== Ema =============================") def custom_getter(getter, *args, **kwargs): v = ema.average(getter(*args, **kwargs)) # print(v.name) return v # print("========================== Ema =============================") with tf.variable_scope(scope, custom_getter=custom_getter, reuse=True): self.polyak_model = policy(nbatch=nbatch, nsteps=nsteps, observ_placeholder=train_ob_placeholder, goal_placeholder=train_goal_placeholder, sess=self.sess, concat_on_latent=concat_on_latent, goal_encoded=train_goal_encoded) # Notation: (var) = batch variable, (var)s = seqeuence variable, (var)_i = variable index by action at step i # action probability distributions according to self.train_model, self.polyak_model and self.step_model # poilcy.pi is probability distribution parameters; to obtain distribution that sums to 1 need to take softmax train_model_p = tf.nn.softmax(self.train_model.pi) polyak_model_p = tf.nn.softmax(self.polyak_model.pi) self.step_model_p = tf.nn.softmax(self.step_model.pi) v = self.v = tf.reduce_sum(train_model_p * self.train_model.q, axis=-1) # shape is [nenvs * (nsteps)] # strip off last step f, f_pol, q = map(lambda var: strip(var, nenvs, nsteps), [train_model_p, polyak_model_p, self.train_model.q]) # Get pi and q values for actions taken f_i = get_by_index(f, self.A) q_i = get_by_index(q, self.A) # Compute ratios for importance truncation rho = f / (self.MU + eps) rho_i = get_by_index(rho, self.A) # Calculate Q_retrace targets qret = q_retrace(self.R, self.D, q_i, self.V_NEXT, rho_i, nenvs, nsteps, gamma) # Calculate losses # Entropy # entropy = tf.reduce_mean(strip(self.train_model.pd.entropy(), nenvs, nsteps)) entropy = tf.reduce_mean(cat_entropy_softmax(f)) # Policy Graident loss, with truncated importance sampling & bias correction v = strip(v, nenvs, nsteps, True) check_shape([qret, v, rho_i, f_i], [[nenvs * nsteps]] * 4) check_shape([rho, f, q], [[nenvs * nsteps, nact]] * 2) # Truncated importance sampling adv = qret - v logf = tf.log(f_i + eps) gain_f = logf * tf.stop_gradient( adv * tf.minimum(c, rho_i)) # [nenvs * nsteps] loss_f = -tf.reduce_mean(gain_f) # Bias correction for the truncation adv_bc = (q - tf.reshape(v, [nenvs * nsteps, 1]) ) # [nenvs * nsteps, nact] logf_bc = tf.log(f + eps) # / (f_old + eps) check_shape([adv_bc, logf_bc], [[nenvs * nsteps, nact]] * 2) gain_bc = tf.reduce_sum( logf_bc * tf.stop_gradient(adv_bc * tf.nn.relu(1.0 - (c / (rho + eps))) * f), axis=1) # IMP: This is sum, as expectation wrt f loss_bc = -tf.reduce_mean(gain_bc) loss_policy = loss_f + loss_bc # Value/Q function loss, and explained variance check_shape([qret, q_i], [[nenvs * nsteps]] * 2) ev = q_explained_variance(tf.reshape(q_i, [nenvs, nsteps]), tf.reshape(qret, [nenvs, nsteps])) loss_q = tf.reduce_mean(tf.square(tf.stop_gradient(qret) - q_i) * 0.5) # Net loss check_shape([loss_policy, loss_q, entropy], [[]] * 3) # Goal loss loss = loss_policy + q_coef * loss_q - ent_coef * entropy if trust_region: g = tf.gradients(-(loss_policy - ent_coef * entropy) * nsteps * nenvs, f) # [nenvs * nsteps, nact] # k = tf.gradients(KL(f_pol || f), f) k = -f_pol / ( f + eps ) # [nenvs * nsteps, nact] # Directly computed gradient of KL divergence wrt f k_dot_g = tf.reduce_sum(k * g, axis=-1) adj 
= tf.maximum(0.0, (tf.reduce_sum(k * g, axis=-1) - delta) / (tf.reduce_sum(tf.square(k), axis=-1) + eps)) # [nenvs * nsteps] # Calculate stats (before doing adjustment) for logging. avg_norm_k = avg_norm(k) avg_norm_g = avg_norm(g) avg_norm_k_dot_g = tf.reduce_mean(tf.abs(k_dot_g)) avg_norm_adj = tf.reduce_mean(tf.abs(adj)) g = g - tf.reshape(adj, [nenvs * nsteps, 1]) * k grads_f = -g / ( nenvs * nsteps ) # These are turst region adjusted gradients wrt f ie statistics of policy pi grads_policy = tf.gradients(f, params, grads_f) grads_q = tf.gradients(loss_q * q_coef, params) # print("=========================== gards add ==============================") grads = [ gradient_add(g1, g2, param) for (g1, g2, param) in zip(grads_policy, grads_q, params) ] # print("=========================== gards add ==============================\n") avg_norm_grads_f = avg_norm(grads_f) * (nsteps * nenvs) norm_grads_q = tf.global_norm(grads_q) norm_grads_policy = tf.global_norm(grads_policy) else: grads = tf.gradients(loss, params) if max_grad_norm is not None: grads, norm_grads = tf.clip_by_global_norm(grads, max_grad_norm) grads = list(zip(grads, params)) trainer = tf.train.RMSPropOptimizer(learning_rate=self.LR, decay=rprop_alpha, epsilon=rprop_epsilon) _policy_opt_op = trainer.apply_gradients(grads) if not self.dynamics.dummy: _train_dynamics = trainer.minimize(self.dynamics.loss) self.run_ops_dynamics = [ _train_dynamics, self.dynamics.aux_loss, self.dynamics.dyna_loss, ] self.name_ops_dynamics = ["aux_loss", "dyna_loss"] # so when you call _train, you first do the gradient step, then you apply ema with tf.control_dependencies([_policy_opt_op]): _train_policy = tf.group(ema_apply_op) self.lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule) # Ops/Summaries to run, and their names for logging self.run_ops_policy = [ _train_policy, loss, loss_q, entropy, loss_policy, loss_f, loss_bc, ev, norm_grads ] self.names_ops_policy = [ 'loss', 'loss_q', 'entropy', 'loss_policy', 'loss_f', 'loss_bc', 'explained_variance', 'norm_grads' ] if trust_region: self.run_ops_policy = self.run_ops_policy + [ norm_grads_q, norm_grads_policy, avg_norm_grads_f, avg_norm_k, avg_norm_g, avg_norm_k_dot_g, avg_norm_adj ] self.names_ops_policy = self.names_ops_policy + [ 'norm_grads_q', 'norm_grads_policy', 'avg_norm_grads_f', 'avg_norm_k', 'avg_norm_g', 'avg_norm_k_dot_g', 'avg_norm_adj' ] self.names_ops_policy = [ scope + "_" + x for x in self.names_ops_policy ] # scope as prefix self.save = functools.partial(save_variables, sess=self.sess, variables=params) self.initial_state = self.step_model.initial_state tf.global_variables_initializer().run(session=self.sess) def train_policy(self, obs, next_obs, actions, rewards, dones, mus, states, masks, steps, goal_obs, verbose=False): cur_lr = self.lr.value_steps(steps) # 1. calculate v_{t+1} using obs_{t+1} and g_t td_map = {self.train_model.X: next_obs} if not self.dynamics.dummy: assert hasattr(self.train_model, "goals") if self.residual: td_map[self.train_model.goals] = goal_obs - next_obs else: td_map[self.train_model.goals] = goal_obs v_next = self.sess.run(self.v, feed_dict=td_map) # 2. 
use obs_t, goal_t, v_{t+1} to train policy td_map = { self.train_model.X: obs, self.polyak_model.X: obs, self.A: actions, self.R: rewards, self.D: dones, self.MU: mus, self.LR: cur_lr, self.V_NEXT: v_next } if not self.dynamics.dummy: assert hasattr(self.train_model, "goals") assert hasattr(self.polyak_model, "goals") if hasattr(self, "goal_rms"): self.goal_rms.update(goal_obs) if self.residual: td_map[self.train_model.goals] = goal_obs - obs td_map[self.polyak_model.goals] = goal_obs - obs else: td_map[self.train_model.goals] = goal_obs td_map[self.polyak_model.goals] = goal_obs if states is not None: td_map[self.train_model.S] = states td_map[self.train_model.M] = masks td_map[self.polyak_model.S] = states td_map[self.polyak_model.M] = masks if verbose: names_ops_policy = self.names_ops_policy.copy() values_ops_policy = self.sess.run(self.run_ops_policy, td_map)[1:] # strip off _train else: names_ops_policy = self.names_ops_policy.copy( )[:8] # not including trust region values_ops_policy = self.sess.run(self.run_ops_policy, td_map)[1:][:8] unimportant_key = ["loss_f", "loss_bc"] for name in names_ops_policy.copy(): for suffix in unimportant_key: if name.endswith(suffix): index = names_ops_policy.index(name) names_ops_policy.pop(index) values_ops_policy.pop(index) break return names_ops_policy, values_ops_policy def train_dynamics(self, obs, actions, next_obs, steps, nb_epoch=1): value_ops_dynamics = [] for epoch in range(nb_epoch): cur_lr = self.lr.value_steps(steps) td_map = { self.dynamics.obs: obs, self.dynamics.next_obs: next_obs, self.dynamics.ac: actions, self.LR: cur_lr } value = self.sess.run(self.run_ops_dynamics, td_map)[1:] value_ops_dynamics.append(value) value_ops_dynamics = np.asarray(value_ops_dynamics) value_ops_dynamics = list(np.mean(value_ops_dynamics, axis=0)) return self.name_ops_dynamics.copy(), value_ops_dynamics def step(self, observation, **kwargs): if self.residual and not self.dynamics.dummy: kwargs["goals"] = kwargs["goals"] - observation return self.step_model.evaluate( [self.step_model.action, self.step_model_p, self.step_model.state], observation, **kwargs)
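# The critic target used by Model above comes from q_retrace, i.e. the Retrace estimator
# with truncated importance weights rho_bar = min(c, rho). The in-graph version consumes
# batched per-step next-state values (V_NEXT); the NumPy sketch below shows the same
# backward recursion for a single trajectory, with v holding V(s_0..s_T) and v[T] acting
# as the bootstrap value (illustrative only):
import numpy as np


def retrace_targets(rewards, dones, q_i, rho_i, v, gamma=0.99, c=1.0):
    """Backward Retrace recursion for one trajectory of length T.

    rewards, dones, q_i, rho_i : length-T arrays, where q_i[t] = Q(s_t, a_t) and
    rho_i[t] = pi(a_t | s_t) / mu(a_t | s_t) is the importance ratio of the taken action.
    v : length T + 1 array of state values; v[T] bootstraps the tail of the return.
    """
    T = len(rewards)
    rho_bar = np.minimum(c, rho_i)          # truncated importance weights
    targets = np.zeros(T)
    qret = v[T]
    for t in reversed(range(T)):
        qret = rewards[t] + gamma * (1.0 - dones[t]) * qret
        targets[t] = qret
        # carry the truncated off-policy correction one step further back
        qret = rho_bar[t] * (qret - q_i[t]) + v[t]
    return targets


print(retrace_targets(rewards=np.array([1.0, 1.0]), dones=np.zeros(2),
                      q_i=np.array([0.5, 0.5]), rho_i=np.array([1.2, 0.8]),
                      v=np.array([0.4, 0.4, 0.4])))  # approx [2.11, 1.40]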
class MlpPolicy(object): recurrent = False def __init__(self, name, *args, **kwargs): self.scope = name with tf.variable_scope(name, reuse=tf.AUTO_REUSE): self._init(*args, **kwargs) def _init(self, ob_space, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=False, popart=True): assert isinstance(ob_space, gym.spaces.Box) self.pdtype = pdtype = make_pdtype(ac_space) ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[None] + list(ob_space.shape)) with tf.variable_scope("obfilter"): self.ob_rms = RunningMeanStd(shape=ob_space.shape) with tf.variable_scope("popart"): self.v_rms = RunningMeanStd(shape=[1]) obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0) last_out = obz for i in range(num_hid_layers): last_out = tf.nn.tanh( dense(last_out, hid_size, "vffc%i" % (i + 1), weight_init=U.normc_initializer(1.0))) self.norm_vpred = dense(last_out, 1, "vffinal", weight_init=U.normc_initializer(1.0))[:, 0] if popart: self.vpred = denormalize(self.norm_vpred, self.v_rms) else: self.vpred = self.norm_vpred last_out = obz for i in range(num_hid_layers): last_out = tf.nn.tanh( dense(last_out, hid_size, "polfc%i" % (i + 1), weight_init=U.normc_initializer(1.0))) if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box): mean = dense(last_out, pdtype.param_shape()[0] // 2, "polfinal", U.normc_initializer(0.01)) logstd = tf.get_variable(name="logstd", shape=[1, pdtype.param_shape()[0] // 2], initializer=tf.zeros_initializer()) pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1) else: pdparam = dense(last_out, pdtype.param_shape()[0], "polfinal", U.normc_initializer(0.01)) self.pd = pdtype.pdfromflat(pdparam) self.state_in = [] self.state_out = [] # change for BC stochastic = U.get_placeholder(name="stochastic", dtype=tf.bool, shape=()) ac = U.switch(stochastic, self.pd.sample(), self.pd.mode()) self.mean_and_logstd = U.function([ob], [self.pd.mean, self.pd.logstd]) self.ac = ac self._act = U.function([stochastic, ob], [ac, self.vpred]) self.use_popart = popart if popart: self.init_popart() ret = tf.placeholder(tf.float32, [None]) vferr = tf.reduce_mean(tf.square(self.vpred - ret)) self.vlossandgrad = U.function([ob, ret], U.flatgrad(vferr, self.get_vf_variable())) def init_popart(self): old_std = tf.placeholder(tf.float32, shape=[1], name='old_std') new_std = self.v_rms.std old_mean = tf.placeholder(tf.float32, shape=[1], name='old_mean') new_mean = self.v_rms.mean renormalize_Q_outputs_op = [] vs = self.output_vars M, b = vs renormalize_Q_outputs_op += [M.assign(M * old_std / new_std)] renormalize_Q_outputs_op += [ b.assign((b * old_std + old_mean - new_mean) / new_std) ] self.renorm_v = U.function([old_std, old_mean], [], updates=renormalize_Q_outputs_op) def act(self, stochastic, ob): ac1, vpred1 = self._act(stochastic, ob[None]) return ac1[0], vpred1[0] def get_mu_logstd(self, ob): mean, logstd = self.mean_and_logstd(ob[None]) return mean[0], logstd[0] def get_variables(self): return tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, self.scope) def get_trainable_variables(self): return tf.trainable_variables(self.scope) def get_initial_state(self): return [] def get_vf_variable(self): return tf.trainable_variables(self.scope + "/vf") def update_popart(self, v_targets): old_mean, old_std = U.get_session().run( [self.v_rms.mean, self.v_rms.std]) self.v_rms.update(v_targets) self.renorm_v(old_std, old_mean) @property def output_vars(self): output_vars = [ var for var in self.get_vf_variable() if 'vffinal' in var.name ] return output_vars def save_policy(self, 
name): U.save_variables(name, variables=self.get_variables()) def load_policy(self, name): U.load_variables(name, variables=self.get_variables())
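# update_popart in MlpPolicy rescales the final value layer so that
# denormalize(norm_vpred, v_rms) keeps returning the same values after the return
# statistics move (Pop-Art, https://arxiv.org/pdf/1602.07714.pdf). A NumPy sketch of that
# weight/bias correction for a linear output head v = (W x + b) * std + mean
# (illustrative only):
import numpy as np


def popart_rescale(W, b, old_mean, old_std, new_mean, new_std):
    """Pick W', b' such that (W'x + b') * new_std + new_mean == (W x + b) * old_std + old_mean."""
    W_new = W * old_std / new_std
    b_new = (b * old_std + old_mean - new_mean) / new_std
    return W_new, b_new


rng = np.random.default_rng(0)
W, b, x = rng.normal(size=(3, 1)), rng.normal(size=(1,)), rng.normal(size=(3,))
old_mean, old_std, new_mean, new_std = 0.0, 1.0, 2.0, 3.0
W2, b2 = popart_rescale(W, b, old_mean, old_std, new_mean, new_std)
assert np.allclose((x @ W + b) * old_std + old_mean,
                   (x @ W2 + b2) * new_std + new_mean)  # predictions are preserved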
class DDPG(object): def __init__(self, actor, critic, memory, observation_shape, action_shape, param_noise=None, action_noise=None, gamma=0.99, tau=0.001, normalize_returns=False, enable_popart=False, normalize_observations=True, batch_size=128, observation_range=(-5., 5.), action_range=(-1., 1.), return_range=(-np.inf, np.inf), critic_l2_reg=0., actor_lr=1e-4, critic_lr=1e-3, clip_norm=None, reward_scale=1.): concat_z = np.zeros((observation_shape[0] - 2)) z = np.zeros((119)) # Inputs. self.obs0 = tf.placeholder(tf.float32, shape=(None, ) + observation_shape, name='obs0') self.obs1 = tf.placeholder(tf.float32, shape=(None, ) + observation_shape, name='obs1') self.encoding = tf.placeholder(tf.float32, shape=(None, ) + z.shape, name='encoding') self.kls = tf.placeholder(tf.float32, shape=(None, 1), name='kls') self.terminals1 = tf.placeholder(tf.float32, shape=(None, 1), name='terminals1') self.rewards = tf.placeholder(tf.float32, shape=(None, 1), name='rewards') self.actions = tf.placeholder(tf.float32, shape=(None, ) + action_shape, name='actions') self.critic_target = tf.placeholder(tf.float32, shape=(None, 1), name='critic_target') self.param_noise_stddev = tf.placeholder(tf.float32, shape=(), name='param_noise_stddev') # Parameters. self.gamma = gamma self.tau = tau self.memory = memory self.normalize_observations = normalize_observations self.normalize_returns = normalize_returns self.action_noise = action_noise self.param_noise = param_noise self.action_range = action_range self.return_range = return_range self.observation_range = observation_range self.critic = critic self.actor = actor self.actor_lr = actor_lr self.critic_lr = critic_lr self.clip_norm = clip_norm self.enable_popart = enable_popart self.reward_scale = reward_scale self.batch_size = batch_size self.stats_sample = None self.critic_l2_reg = critic_l2_reg # Observation normalization. if self.normalize_observations: with tf.variable_scope('obs_rms'): self.obs_rms = RunningMeanStd(shape=observation_shape) else: self.obs_rms = None self.normalized_obs0 = tf.clip_by_value( normalize(self.obs0, self.obs_rms), self.observation_range[0], self.observation_range[1]) self.normalized_obs1 = tf.clip_by_value( normalize(self.obs1, self.obs_rms), self.observation_range[0], self.observation_range[1]) # Return normalization. if self.normalize_returns: with tf.variable_scope('ret_rms'): self.ret_rms = RunningMeanStd() else: self.ret_rms = None # Create target networks. target_actor = copy(actor) target_actor.name = 'target_actor' self.target_actor = target_actor # Create target networks. target_critic = copy(critic) target_critic.name = 'target_critic' self.target_critic = target_critic # Create networks and core TF parts that are shared across setup parts. 
# Main Actor Network self.actor_tf = actor(self.normalized_obs0) # Main Actor network based kl for initial state self.encoder_tf_mu, self.encoder_tf_sigma = actor( self.normalized_obs0, True) # Target Actor network based kl for initial state self.t_encoder_tf_mu, self.t_encoder_tf_sigma = target_actor( self.normalized_obs0, True) # Main Critic Network self.normalized_critic_tf = critic(self.normalized_obs0, self.encoder_tf_mu, self.actions) self.normalized_critic_with_actor_tf = critic(self.normalized_obs0, self.encoder_tf_mu, self.actor_tf, reuse=True) # only for stats self.critic_tf = denormalize( tf.clip_by_value(self.normalized_critic_tf, self.return_range[0], self.return_range[1]), self.ret_rms) self.critic_with_actor_tf = denormalize( tf.clip_by_value(self.normalized_critic_with_actor_tf, self.return_range[0], self.return_range[1]), self.ret_rms) # Main network based kl of initial state for actor optimization self.kl = tf.reduce_sum(tf.exp(self.encoder_tf_sigma) + tf.square(self.encoder_tf_mu) - 1. - self.encoder_tf_sigma, axis=1) # Target network based kl of initial state for actor optimization self.t0_kl = tf.reshape( tf.reduce_sum(tf.exp(self.t_encoder_tf_sigma) + tf.square(self.t_encoder_tf_mu) - 1. - self.t_encoder_tf_sigma, axis=1), (-1, 1)) Q_obs1 = denormalize( target_critic(self.normalized_obs1, self.t_encoder_tf_mu, self.target_actor(self.normalized_obs1)), self.ret_rms) self.target_Q = self.rewards + 0.0 * self.kls + ( 1. - self.terminals1) * gamma * Q_obs1 # Set up parts. if self.param_noise is not None: self.setup_param_noise(self.normalized_obs0) self.setup_actor_optimizer() self.setup_critic_optimizer() if self.normalize_returns and self.enable_popart: self.setup_popart() self.setup_stats() self.setup_target_network_updates() self.initial_state = None # recurrent architectures not supported yet def setup_target_network_updates(self): actor_init_updates, actor_soft_updates = get_target_updates( self.actor.vars, self.target_actor.vars, self.tau) critic_init_updates, critic_soft_updates = get_target_updates( self.critic.vars, self.target_critic.vars, self.tau) self.target_init_updates = [actor_init_updates, critic_init_updates] self.target_soft_updates = [actor_soft_updates, critic_soft_updates] def setup_param_noise(self, normalized_obs0): assert self.param_noise is not None # Configure perturbed actor. param_noise_actor = copy(self.actor) param_noise_actor.name = 'param_noise_actor' self.perturbed_actor_tf = param_noise_actor(normalized_obs0) logger.info('setting up param noise') self.perturb_policy_ops = get_perturbed_actor_updates( self.actor, param_noise_actor, self.param_noise_stddev) # Configure separate copy for stddev adoption. 
adaptive_param_noise_actor = copy(self.actor) adaptive_param_noise_actor.name = 'adaptive_param_noise_actor' adaptive_actor_tf = adaptive_param_noise_actor(normalized_obs0) self.perturb_adaptive_policy_ops = get_perturbed_actor_updates( self.actor, adaptive_param_noise_actor, self.param_noise_stddev) self.adaptive_policy_distance = tf.sqrt( tf.reduce_mean(tf.square(self.actor_tf - adaptive_actor_tf))) def setup_actor_optimizer(self): logger.info('setting up actor optimizer') self.actor_loss = -tf.reduce_mean(self.critic_with_actor_tf) actor_shapes = [ var.get_shape().as_list() for var in self.actor.trainable_vars ] actor_nb_params = sum( [reduce(lambda x, y: x * y, shape) for shape in actor_shapes]) logger.info(' actor shapes: {}'.format(actor_shapes)) logger.info(' actor params: {}'.format(actor_nb_params)) self.actor_grads = U.flatgrad(self.actor_loss, self.actor.trainable_vars, clip_norm=self.clip_norm) \ - 0.0 * U.flatgrad(self.kl, self.actor.trainable_vars) self.actor_optimizer = MpiAdam(var_list=self.actor.trainable_vars, beta1=0.9, beta2=0.999, epsilon=1e-08) def setup_critic_optimizer(self): logger.info('setting up critic optimizer') normalized_critic_target_tf = tf.clip_by_value( normalize(self.critic_target, self.ret_rms), self.return_range[0], self.return_range[1]) self.critic_loss = tf.reduce_mean( tf.square(self.normalized_critic_tf - normalized_critic_target_tf)) if self.critic_l2_reg > 0.: critic_reg_vars = [ var for var in self.critic.trainable_vars if var.name.endswith('/w:0') and 'output' not in var.name ] for var in critic_reg_vars: logger.info(' regularizing: {}'.format(var.name)) logger.info(' applying l2 regularization with {}'.format( self.critic_l2_reg)) critic_reg = tc.layers.apply_regularization( tc.layers.l2_regularizer(self.critic_l2_reg), weights_list=critic_reg_vars) self.critic_loss += critic_reg critic_shapes = [ var.get_shape().as_list() for var in self.critic.trainable_vars ] critic_nb_params = sum( [reduce(lambda x, y: x * y, shape) for shape in critic_shapes]) logger.info(' critic shapes: {}'.format(critic_shapes)) logger.info(' critic params: {}'.format(critic_nb_params)) self.critic_grads = U.flatgrad(self.critic_loss, self.critic.trainable_vars, clip_norm=self.clip_norm) self.critic_optimizer = MpiAdam(var_list=self.critic.trainable_vars, beta1=0.9, beta2=0.999, epsilon=1e-08) def setup_popart(self): # See https://arxiv.org/pdf/1602.07714.pdf for details. 
self.old_std = tf.placeholder(tf.float32, shape=[1], name='old_std') new_std = self.ret_rms.std self.old_mean = tf.placeholder(tf.float32, shape=[1], name='old_mean') new_mean = self.ret_rms.mean self.renormalize_Q_outputs_op = [] for vs in [self.critic.output_vars, self.target_critic.output_vars]: assert len(vs) == 2 M, b = vs assert 'kernel' in M.name assert 'bias' in b.name assert M.get_shape()[-1] == 1 assert b.get_shape()[-1] == 1 self.renormalize_Q_outputs_op += [ M.assign(M * self.old_std / new_std) ] self.renormalize_Q_outputs_op += [ b.assign( (b * self.old_std + self.old_mean - new_mean) / new_std) ] def setup_stats(self): ops = [] names = [] if self.normalize_returns: ops += [self.ret_rms.mean, self.ret_rms.std] names += ['ret_rms_mean', 'ret_rms_std'] if self.normalize_observations: ops += [ tf.reduce_mean(self.obs_rms.mean), tf.reduce_mean(self.obs_rms.std) ] names += ['obs_rms_mean', 'obs_rms_std'] ops += [tf.reduce_mean(self.critic_tf)] names += ['reference_Q_mean'] ops += [reduce_std(self.critic_tf)] names += ['reference_Q_std'] ops += [tf.reduce_mean(self.critic_with_actor_tf)] names += ['reference_actor_Q_mean'] ops += [reduce_std(self.critic_with_actor_tf)] names += ['reference_actor_Q_std'] ops += [tf.reduce_mean(self.actor_tf)] names += ['reference_action_mean'] ops += [reduce_std(self.actor_tf)] names += ['reference_action_std'] if self.param_noise: ops += [tf.reduce_mean(self.perturbed_actor_tf)] names += ['reference_perturbed_action_mean'] ops += [reduce_std(self.perturbed_actor_tf)] names += ['reference_perturbed_action_std'] self.stats_ops = ops self.stats_names = names def step(self, obs, apply_noise=True, compute_Q=True): if self.param_noise is not None and apply_noise: actor_tf = self.perturbed_actor_tf else: actor_tf = self.actor_tf # find z here as first thing feed_dict = {self.obs0: U.adjust_shape(self.obs0, [obs])} if compute_Q: action, q = self.sess.run([actor_tf, self.critic_with_actor_tf], feed_dict=feed_dict) else: action = self.sess.run(actor_tf, feed_dict=feed_dict) q = None if self.action_noise is not None and apply_noise: noise = self.action_noise() assert noise.shape == action[0].shape action += noise action = np.clip(action, self.action_range[0], self.action_range[1]) return action, q, None, None def learnt_step(self, obs): actor_tf = self.actor_tf feed_dict = {self.obs0: U.adjust_shape(self.obs0, [obs])} action = self.sess.run(actor_tf, feed_dict=feed_dict) q = None action = np.clip(action, self.action_range[0], self.action_range[1]) return action, q, None, None def train(self): # Get a batch. batch = self.memory.sample(batch_size=self.batch_size) if self.normalize_returns and self.enable_popart: old_mean, old_std, target_Q = self.sess.run( [self.ret_rms.mean, self.ret_rms.std, self.target_Q], feed_dict={ self.obs1: batch['obs1'], self.rewards: batch['rewards'], self.terminals1: batch['terminals1'].astype('float32'), }) self.ret_rms.update(target_Q.flatten()) self.sess.run(self.renormalize_Q_outputs_op, feed_dict={ self.old_std: np.array([old_std]), self.old_mean: np.array([old_mean]), }) # Run sanity check. Disabled by default since it slows down things considerably. 
print('running sanity check') target_Q_new, new_mean, new_std = self.sess.run( [self.target_Q, self.ret_rms.mean, self.ret_rms.std], feed_dict={ self.obs1: batch['obs1'], self.rewards: batch['rewards'], self.terminals1: batch['terminals1'].astype('float32'), }) print(target_Q_new, target_Q, new_mean, new_std) assert (np.abs(target_Q - target_Q_new) < 1e-3).all() else: t0_kl = self.sess.run(self.t0_kl, feed_dict={ self.obs0: batch['obs0'], }) target_Q = self.sess.run(self.target_Q, feed_dict={ self.obs1: batch['obs1'], self.obs0: batch['obs0'], self.rewards: batch['rewards'], self.terminals1: batch['terminals1'].astype('float32'), self.kls: t0_kl, }) # Get all gradients and perform a synced update. actor_grads, actor_loss, critic_grads, critic_loss = self.sess.run( [ self.actor_grads, self.actor_loss, self.critic_grads, self.critic_loss ], feed_dict={ self.obs0: batch['obs0'], self.actions: batch['actions'], self.critic_target: target_Q, }) self.actor_optimizer.update(actor_grads, stepsize=self.actor_lr) self.critic_optimizer.update(critic_grads, stepsize=self.critic_lr) return critic_loss, actor_loss def get_stats(self): if self.stats_sample is None: # Get a sample and keep that fixed for all further computations. # This allows us to estimate the change in value for the same set of inputs. self.stats_sample = self.memory.sample(batch_size=self.batch_size) values = self.sess.run(self.stats_ops, feed_dict={ self.obs0: self.stats_sample['obs0'], self.actions: self.stats_sample['actions'], }) names = self.stats_names[:] assert len(names) == len(values) stats = dict(zip(names, values)) if self.param_noise is not None: stats = {**stats, **self.param_noise.get_stats()} return stats def adapt_param_noise(self): try: from mpi4py import MPI except ImportError: MPI = None if self.param_noise is None: return 0. # Perturb a separate copy of the policy to adjust the scale for the next "real" perturbation. batch = self.memory.sample(batch_size=self.batch_size) self.sess.run(self.perturb_adaptive_policy_ops, feed_dict={ self.param_noise_stddev: self.param_noise.current_stddev, }) distance = self.sess.run(self.adaptive_policy_distance, feed_dict={ self.obs0: batch['obs0'], self.param_noise_stddev: self.param_noise.current_stddev, }) if MPI is not None: mean_distance = MPI.COMM_WORLD.allreduce( distance, op=MPI.SUM) / MPI.COMM_WORLD.Get_size() else: mean_distance = distance self.param_noise.adapt(mean_distance) return mean_distance def reset(self): # Reset internal state after an episode is complete. 
if self.action_noise is not None: self.action_noise.reset() if self.param_noise is not None: self.sess.run(self.perturb_policy_ops, feed_dict={ self.param_noise_stddev: self.param_noise.current_stddev, }) def save(self, save_path): U.save_variables(save_path, None, self.sess) def load(self, sess, save_path): self.sess = sess # initialize first so the restored weights are not overwritten by the initializer self.sess.run(tf.global_variables_initializer()) U.load_variables(save_path, None, self.sess) self.actor_optimizer.sync() self.critic_optimizer.sync() self.sess.run(self.target_init_updates) def store_transition(self, obs0, action, reward, obs1, terminal1): reward *= self.reward_scale B = obs0.shape[0] for b in range(B): self.memory.append(obs0[b], action[b], reward[b], obs1[b], terminal1[b]) if self.normalize_observations: self.obs_rms.update(np.array([obs0[b]])) def initialize(self, sess): self.sess = sess self.sess.run(tf.global_variables_initializer()) self.actor_optimizer.sync() self.critic_optimizer.sync() self.sess.run(self.target_init_updates) def update_target_net(self): self.sess.run(self.target_soft_updates)
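# ---------------------------------------------------------------------------
# setup_target_network_updates / update_target_net above rely on
# get_target_updates to build the usual DDPG target maintenance. A minimal
# NumPy sketch of what those ops are assumed to do (illustrative names only):
# a hard copy when initialize() runs target_init_updates, then Polyak
# averaging with rate tau for every target_soft_updates step.
import numpy as np

def hard_update(target_params, params):
    # target <- source, run once right after variable initialization
    for t, p in zip(target_params, params):
        t[...] = p

def soft_update(target_params, params, tau=0.001):
    # target <- (1 - tau) * target + tau * source, run once per training step
    for t, p in zip(target_params, params):
        t[...] = (1.0 - tau) * t + tau * p

params = [np.ones((2, 2))]
target = [np.zeros((2, 2))]
hard_update(target, params)     # analogue of target_init_updates
soft_update(target, params)     # analogue of target_soft_updates
# ---------------------------------------------------------------------------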
class DDPG_paramnoise(object): """ Implicit Policy Optimization for DDPG noise injected in the middle of blackbox (param noise) """ def __init__(self, maxactor, maxentactor, critic, classifier, memory, fifomemory, observation_shape, action_shape, action_noise=None, gamma=0.99, tau=0.001, normalize_returns=False, enable_popart=False, normalize_observations=True, batch_size=128, observation_range=(-5., 5.), action_range=(-1., 1.), return_range=(-np.inf, np.inf), critic_l2_reg=0., classifier_l2_reg=0., maxactor_lr=1e-4, maxentactor_lr=1e-4, critic_lr=1e-3, classifier_lr=1e-3, clip_norm=None, reward_scale=1., entropy_coeff=1., beta=0.0): # Inputs. self.obs0_act = tf.placeholder(tf.float32, shape=(1, ) + observation_shape, name='obs0_act') self.obs0_train = tf.placeholder(tf.float32, shape=(batch_size, ) + observation_shape, name='obs0_train') self.obs1 = tf.placeholder(tf.float32, shape=(batch_size, ) + observation_shape, name='obs1') self.terminals1 = tf.placeholder(tf.float32, shape=(None, 1), name='terminals1') self.rewards = tf.placeholder(tf.float32, shape=(None, 1), name='rewards') self.actions_act = tf.placeholder(tf.float32, shape=(1, ) + action_shape, name='actions_act') self.actions_train = tf.placeholder(tf.float32, shape=(64, ) + action_shape, name='actions_train') self.critic_target = tf.placeholder(tf.float32, shape=(None, 1), name='critic_target') # Parameters. self.gamma = gamma self.tau = tau self.memory = memory self.fifomemory = fifomemory self.normalize_observations = normalize_observations self.normalize_returns = normalize_returns self.action_noise = action_noise self.action_range = action_range self.return_range = return_range self.observation_range = observation_range self.action_shape = action_shape self.critic = critic self.maxactor = maxactor self.maxentactor = maxentactor self.classifier = classifier self.maxactor_lr = maxactor_lr self.maxentactor_lr = maxentactor_lr self.critic_lr = critic_lr self.classifier_lr = classifier_lr self.clip_norm = clip_norm self.enable_popart = enable_popart self.reward_scale = reward_scale self.batch_size = batch_size self.stats_sample = None self.critic_l2_reg = critic_l2_reg self.classifier_l2_reg = classifier_l2_reg self.entropy_coeff = entropy_coeff # Observation normalization. if self.normalize_observations: with tf.variable_scope('obs_rms'): self.obs_rms = RunningMeanStd(shape=observation_shape) else: self.obs_rms = None normalized_obs0_act = tf.clip_by_value( normalize(self.obs0_act, self.obs_rms), self.observation_range[0], self.observation_range[1]) normalized_obs0_train = tf.clip_by_value( normalize(self.obs0_train, self.obs_rms), self.observation_range[0], self.observation_range[1]) normalized_obs1 = tf.clip_by_value(normalize(self.obs1, self.obs_rms), self.observation_range[0], self.observation_range[1]) self.normalized_obs0_act = normalized_obs0_act # record normalized_obs0 self.normalized_obs0_train = normalized_obs0_train self.normalized_obs1 = normalized_obs1 # Return normalization. if self.normalize_returns: with tf.variable_scope('ret_rms'): self.ret_rms = RunningMeanStd() else: self.ret_rms = None # Create target networks. 
target_maxactor = copy(maxactor) target_maxentactor = copy(maxentactor) target_maxactor.name = 'target_maxactor' self.target_maxactor = target_maxactor target_maxentactor.name = 'target_maxentactor' self.target_maxentactor = target_maxentactor target_critic = copy(critic) target_critic.name = 'target_critic' self.target_critic = target_critic # Create networks and core TF parts that are shared across setup parts. self.maxactor_tf_act = maxactor(normalized_obs0_act) self.maxentactor_tf_act = maxentactor(normalized_obs0_act) self.maxactor_tf_train = maxactor(normalized_obs0_train, reuse=True) self.maxentactor_tf_train = maxentactor(normalized_obs0_train, reuse=True) nb_actions = maxactor.nb_actions # Create interpolated action for act batch_act = self.maxactor_tf_act.get_shape().as_list()[0] mask_act = tf.random_uniform( tf.stack([batch_act]), minval=0, maxval=1, dtype=tf.float32) < beta self.actor_tf_act = tf.where(mask_act, self.maxactor_tf_act, self.maxentactor_tf_act) # Create interpolated action for train batch_train = self.maxactor_tf_train.get_shape().as_list()[0] mask_train = tf.random_uniform( tf.stack([batch_train ]), minval=0, maxval=1, dtype=tf.float32) < beta self.actor_tf_train = tf.where(mask_train, self.maxactor_tf_train, self.maxentactor_tf_train) # Create graphs for critic for train self.normalized_critic_tf = critic(normalized_obs0_train, self.actions_train) self.critic_tf = denormalize( tf.clip_by_value(self.normalized_critic_tf, self.return_range[0], self.return_range[1]), self.ret_rms) self.normalized_critic_with_maxactor_tf = critic( normalized_obs0_train, self.maxactor_tf_train, reuse=True) self.normalized_critic_with_maxentactor_tf = critic( normalized_obs0_train, self.maxentactor_tf_train, reuse=True) self.normalized_critic_with_actor_tf = critic(normalized_obs0_act, self.actor_tf_act, reuse=True) # act self.critic_with_maxactor_tf = denormalize( tf.clip_by_value(self.normalized_critic_with_maxactor_tf, self.return_range[0], self.return_range[1]), self.ret_rms) self.critic_with_maxentactor_tf = denormalize( tf.clip_by_value(self.normalized_critic_with_maxentactor_tf, self.return_range[0], self.return_range[1]), self.ret_rms) self.critic_with_actor_tf = denormalize( tf.clip_by_value(self.normalized_critic_with_actor_tf, self.return_range[0], self.return_range[1]), self.ret_rms) # Create interpolated target action for train batch_train = normalized_obs0_train.get_shape().as_list()[0] mask_train = tf.random_uniform( tf.stack([batch_train ]), minval=0, maxval=1, dtype=tf.float32) < beta self.target_actions = tf.where( mask_train, self.target_maxactor(normalized_obs1), self.target_maxentactor(normalized_obs1)) Q_obs1 = denormalize( target_critic(normalized_obs1, self.target_actions), self.ret_rms) self.target_Q = self.rewards + (1. - self.terminals1) * gamma * Q_obs1 # Create graphs for critic for act #self.normalized_critic_tf_act = critic(normalized_obs0_act, self.actions_act) #self.critic_tf_act = denormalize(tf.clip_by_value(self.normalized_critic_tf_act, self.return_range[0], self.return_range[1]), self.ret_rms) # Classifier Network self.random_actions = tf.placeholder(tf.float32, shape=(None, ) + action_shape, name='random_actions') #self.logit = classifier(normalized_obs0_train, self.actor_tf_train) # actions produced by policy for backprop self.logit = classifier(normalized_obs0_train, self.maxentactor_tf_train) self.random_logit = classifier(normalized_obs0_train, self.random_actions, reuse=True) # Set up parts. 
self.setup_approx_entropy() self.setup_actor_optimizer() self.setup_critic_optimizer() self.setup_classifier_optimizer() if self.normalize_returns and self.enable_popart: self.setup_popart() self.setup_stats() self.setup_target_network_updates() def setup_target_network_updates(self): actor_init_updates, actor_soft_updates = get_target_updates( self.maxactor.vars, self.target_maxactor.vars, self.tau) actor_init_updates_, actor_soft_updates_ = get_target_updates( self.maxentactor.vars, self.target_maxentactor.vars, self.tau) critic_init_updates, critic_soft_updates = get_target_updates( self.critic.vars, self.target_critic.vars, self.tau) self.target_init_updates = [ actor_init_updates, actor_init_updates_, critic_init_updates ] self.target_soft_updates = [ actor_soft_updates, actor_soft_updates_, critic_soft_updates ] def setup_approx_entropy(self): logger.info('setting up approx entropy') self.approx_entropy = -tf.reduce_mean(self.logit) def setup_actor_optimizer(self): # maxactor logger.info('setting up maxactor optimizer') self.maxactor_loss = -tf.reduce_mean(self.critic_with_maxactor_tf) actor_shapes = [ var.get_shape().as_list() for var in self.maxactor.trainable_vars ] actor_nb_params = sum( [reduce(lambda x, y: x * y, shape) for shape in actor_shapes]) logger.info(' actor shapes: {}'.format(actor_shapes)) logger.info(' actor params: {}'.format(actor_nb_params)) # Add entropy into actor loss self.maxactor_grads = U.flatgrad(self.maxactor_loss, self.maxactor.trainable_vars, clip_norm=self.clip_norm) self.maxactor_optimizer = MpiAdam( var_list=self.maxactor.trainable_vars, beta1=0.9, beta2=0.999, epsilon=1e-08) # maxentactor logger.info('setting up maxentactor optimizer') self.maxentactor_loss = -tf.reduce_mean( self.critic_with_maxentactor_tf) actor_shapes = [ var.get_shape().as_list() for var in self.maxentactor.trainable_vars ] actor_nb_params = sum( [reduce(lambda x, y: x * y, shape) for shape in actor_shapes]) logger.info(' actor shapes: {}'.format(actor_shapes)) logger.info(' actor params: {}'.format(actor_nb_params)) logger.info('using entropy coeff {}'.format(self.entropy_coeff)) self.maxentactor_loss += -self.entropy_coeff * self.approx_entropy # Add entropy into actor loss self.maxentactor_grads = U.flatgrad(self.maxentactor_loss, self.maxentactor.trainable_vars, clip_norm=self.clip_norm) self.maxentactor_optimizer = MpiAdam( var_list=self.maxentactor.trainable_vars, beta1=0.9, beta2=0.999, epsilon=1e-08) def setup_critic_optimizer(self): logger.info('setting up critic optimizer') normalized_critic_target_tf = tf.clip_by_value( normalize(self.critic_target, self.ret_rms), self.return_range[0], self.return_range[1]) self.critic_loss = tf.reduce_mean( tf.square(self.normalized_critic_tf - normalized_critic_target_tf)) if self.critic_l2_reg > 0.: critic_reg_vars = [ var for var in self.critic.trainable_vars if 'kernel' in var.name and 'output' not in var.name ] for var in critic_reg_vars: logger.info(' regularizing: {}'.format(var.name)) logger.info(' applying l2 regularization with {}'.format( self.critic_l2_reg)) critic_reg = tc.layers.apply_regularization( tc.layers.l2_regularizer(self.critic_l2_reg), weights_list=critic_reg_vars) self.critic_loss += critic_reg critic_shapes = [ var.get_shape().as_list() for var in self.critic.trainable_vars ] critic_nb_params = sum( [reduce(lambda x, y: x * y, shape) for shape in critic_shapes]) logger.info(' critic shapes: {}'.format(critic_shapes)) logger.info(' critic params: {}'.format(critic_nb_params)) self.critic_grads = 
U.flatgrad(self.critic_loss, self.critic.trainable_vars, clip_norm=self.clip_norm) self.critic_optimizer = MpiAdam(var_list=self.critic.trainable_vars, beta1=0.9, beta2=0.999, epsilon=1e-08) def setup_classifier_optimizer(self): logger.info('setting up classifier optimizer') #self.classifier_loss = - (tf.reduce_mean(tf.log(1e-8 + tf.sigmoid(self.logit))) # + tf.reduce_mean(tf.log(1e-8 + 1 - tf.sigmoid(self.random_logit)))) label_zeros = tf.zeros_like(self.logit) label_ones = tf.ones_like(self.random_logit) self.classifier_loss = (tf.reduce_mean( tf.nn.softmax_cross_entropy_with_logits( logits=self.logit, labels=label_zeros)) + tf.reduce_mean( tf.nn.softmax_cross_entropy_with_logits( logits=self.random_logit, labels=label_ones))) if self.classifier_l2_reg > 0.: classifier_reg_vars = [ var for var in self.classifier.trainable_vars if 'kernel' in var.name and 'output' not in var.name ] for var in classifier_reg_vars: logger.info(' regularizing: {}'.format(var.name)) logger.info(' applying l2 regularization with {}'.format( self.classifier_l2_reg)) classifier_reg = tc.layers.apply_regularization( tc.layers.l2_regularizer(self.classifier_l2_reg), weights_list=classifier_reg_vars) self.classifier_loss += classifier_reg classifier_shapes = [ var.get_shape().as_list() for var in self.classifier.trainable_vars ] classifier_nb_params = sum( [reduce(lambda x, y: x * y, shape) for shape in classifier_shapes]) logger.info(' classifier shapes: {}'.format(classifier_shapes)) logger.info(' classifier params: {}'.format(classifier_nb_params)) self.classifier_grads = U.flatgrad(self.classifier_loss, self.classifier.trainable_vars, clip_norm=self.clip_norm) self.classifier_optimizer = MpiAdam( var_list=self.classifier.trainable_vars, beta1=0.9, beta2=0.999, epsilon=1e-08) def setup_popart(self): # See https://arxiv.org/pdf/1602.07714.pdf for details. self.old_std = tf.placeholder(tf.float32, shape=[1], name='old_std') new_std = self.ret_rms.std self.old_mean = tf.placeholder(tf.float32, shape=[1], name='old_mean') new_mean = self.ret_rms.mean self.renormalize_Q_outputs_op = [] for vs in [self.critic.output_vars, self.target_critic.output_vars]: assert len(vs) == 2 M, b = vs assert 'kernel' in M.name assert 'bias' in b.name assert M.get_shape()[-1] == 1 assert b.get_shape()[-1] == 1 self.renormalize_Q_outputs_op += [ M.assign(M * self.old_std / new_std) ] self.renormalize_Q_outputs_op += [ b.assign( (b * self.old_std + self.old_mean - new_mean) / new_std) ] def setup_stats(self): ops = [] names = [] if self.normalize_returns: ops += [self.ret_rms.mean, self.ret_rms.std] names += ['ret_rms_mean', 'ret_rms_std'] if self.normalize_observations: ops += [ tf.reduce_mean(self.obs_rms.mean), tf.reduce_mean(self.obs_rms.std) ] names += ['obs_rms_mean', 'obs_rms_std'] ops += [tf.reduce_mean(self.critic_tf)] names += ['reference_Q_mean'] ops += [reduce_std(self.critic_tf)] names += ['reference_Q_std'] #ops += [tf.reduce_mean(self.critic_with_actor_tf)] #names += ['reference_actor_Q_mean'] #ops += [reduce_std(self.critic_with_actor_tf)] #names += ['reference_actor_Q_std'] ops += [tf.reduce_mean(self.actor_tf_train)] names += ['reference_action_mean'] ops += [reduce_std(self.actor_tf_train)] names += ['reference_action_std'] self.stats_ops = ops self.stats_names = names def pi(self, obs, apply_noise=True, compute_Q=True): if apply_noise: actor_tf = self.actor_tf_act # TODO: handle apply_noise=False mode else: actor_tf = self.actor_tf_act # should take the mean?? 
probably not feed_dict = {self.obs0_act: [obs]} if compute_Q: action, q = self.sess.run([actor_tf, self.critic_with_actor_tf], feed_dict=feed_dict) else: action = self.sess.run(actor_tf, feed_dict=feed_dict) q = None action = action.flatten() if self.action_noise is not None and apply_noise: noise = self.action_noise() assert noise.shape == action.shape action += noise action = np.clip(action, self.action_range[0], self.action_range[1]) return action, q def store_transition(self, obs0, action, reward, obs1, terminal1): reward *= self.reward_scale self.memory.append(obs0, action, reward, obs1, terminal1) self.fifomemory.append(obs0, action) if self.normalize_observations: self.obs_rms.update(np.array([obs0])) def train(self): # Get a batch. batch = self.memory.sample(batch_size=self.batch_size) if self.normalize_returns and self.enable_popart: old_mean, old_std, target_Q = self.sess.run( [self.ret_rms.mean, self.ret_rms.std, self.target_Q], feed_dict={ self.obs1: batch['obs1'], self.rewards: batch['rewards'], self.terminals1: batch['terminals1'].astype('float32'), }) self.ret_rms.update(target_Q.flatten()) self.sess.run(self.renormalize_Q_outputs_op, feed_dict={ self.old_std: np.array([old_std]), self.old_mean: np.array([old_mean]), }) # Run sanity check. Disabled by default since it slows down things considerably. # print('running sanity check') # target_Q_new, new_mean, new_std = self.sess.run([self.target_Q, self.ret_rms.mean, self.ret_rms.std], feed_dict={ # self.obs1: batch['obs1'], # self.rewards: batch['rewards'], # self.terminals1: batch['terminals1'].astype('float32'), # }) # print(target_Q_new, target_Q, new_mean, new_std) # assert (np.abs(target_Q - target_Q_new) < 1e-3).all() else: target_Q = self.sess.run(self.target_Q, feed_dict={ self.obs1: batch['obs1'], self.rewards: batch['rewards'], self.terminals1: batch['terminals1'].astype('float32'), }) # Get a batch from recent policy then update classifier batch_recent = self.fifomemory.sample(batch_size=self.batch_size) random_actions = np.random.uniform( low=self.action_range[0], high=self.action_range[1], size=[self.batch_size, np.prod(np.array(self.action_shape))]).astype('float32') ops = [ self.classifier_grads, self.classifier_loss, self.approx_entropy ] classifier_grads, classifier_loss, approx_entropy = self.sess.run( ops, feed_dict={ self.obs0_train: batch_recent['obs0'], self.random_actions: random_actions }) self.classifier_optimizer.update(classifier_grads, stepsize=self.classifier_lr) # Get all gradients and perform a synced update. 
ops = [ self.maxactor_grads, self.maxactor_loss, self.maxentactor_grads, self.maxentactor_loss, self.critic_grads, self.critic_loss ] maxactor_grads, maxactor_loss, maxentactor_grads, maxentactor_loss, critic_grads, critic_loss = self.sess.run( ops, feed_dict={ self.obs0_train: batch['obs0'], self.actions_train: batch['actions'], self.critic_target: target_Q, }) self.maxactor_optimizer.update(maxactor_grads, stepsize=self.maxactor_lr) self.maxentactor_optimizer.update(maxentactor_grads, stepsize=self.maxentactor_lr) self.critic_optimizer.update(critic_grads, stepsize=self.critic_lr) return critic_loss, maxactor_loss, maxentactor_loss, classifier_loss, approx_entropy def initialize(self, sess): self.sess = sess self.sess.run(tf.global_variables_initializer()) self.maxactor_optimizer.sync() self.maxentactor_optimizer.sync() self.critic_optimizer.sync() self.classifier_optimizer.sync() self.sess.run(self.target_init_updates) def update_target_net(self): self.sess.run(self.target_soft_updates) def get_stats(self): if self.stats_sample is None: # Get a sample and keep that fixed for all further computations. # This allows us to estimate the change in value for the same set of inputs. self.stats_sample = self.memory.sample(batch_size=self.batch_size) #print(self.stats_sample['obs0'].shape, self.stats_sample['actions'].shape) #print(self.obs0_train, self.actions_train) values = self.sess.run(self.stats_ops, feed_dict={ self.obs0_train: self.stats_sample['obs0'], self.actions_train: self.stats_sample['actions'], }) names = self.stats_names[:] assert len(names) == len(values) stats = dict(zip(names, values)) return stats def reset(self): # Reset internal state after an episode is complete. if self.action_noise is not None: self.action_noise.reset()
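# ---------------------------------------------------------------------------
# Two pieces of DDPG_paramnoise above are easy to miss in the graph code:
# (1) actor_tf_act / actor_tf_train pick, per sample, between maxactor and
#     maxentactor with probability beta via tf.where on a uniform mask, and
# (2) the classifier logit is trained to separate recent policy actions from
#     uniform random actions, so approx_entropy = -mean(logit) can serve as an
#     entropy proxy that maxentactor maximizes (weighted by entropy_coeff).
# Note that tf.nn.softmax_cross_entropy_with_logits over a single logit is
# constant (the softmax of one value is always 1), so the commented-out
# sigmoid form is presumably the intended objective; the sketch below follows
# that sigmoid form. Illustrative names only, not from this file.
import numpy as np

rng = np.random.default_rng(0)
beta = 0.5

# (1) per-sample interpolation between the two actors
a_max = rng.normal(size=(8, 3))              # maxactor(obs)
a_maxent = rng.normal(size=(8, 3))           # maxentactor(obs)
mask = rng.uniform(size=(8,)) < beta         # tf.random_uniform(...) < beta
a_mixed = np.where(mask[:, None], a_max, a_maxent)

# (2) binary classifier: policy actions pushed towards sigmoid(logit) -> 1,
#     uniform random actions towards sigmoid(random_logit) -> 0
def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def classifier_loss(logit_policy, logit_random, eps=1e-8):
    return -(np.log(eps + sigmoid(logit_policy)).mean()
             + np.log(eps + 1.0 - sigmoid(logit_random)).mean())

logit_policy = rng.normal(size=(8, 1))
logit_random = rng.normal(size=(8, 1))
loss = classifier_loss(logit_policy, logit_random)
approx_entropy = -logit_policy.mean()        # mirrors -tf.reduce_mean(logit)
# ---------------------------------------------------------------------------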
class BDDPG(object): def __init__(self, actor, critic, obs_dim, memory, observation_shape, action_shape, param_noise=None, action_noise=None, gamma=0.95, tau=0.001, normalize_returns=False, enable_popart=False, normalize_observations=True, batch_size=128, observation_range=(-5., 5.), action_range=(-1., 1.), return_range=(-np.inf, np.inf), critic_l2_reg=0., actor_lr=1e-4, critic_lr=1e-3, clip_norm=None, reward_scale=1.): # Inputs. self.obs0 = tf.placeholder(tf.float32, shape=(None, ) + observation_shape, name='obs0') self.obs1 = tf.placeholder(tf.float32, shape=(None, ) + observation_shape, name='obs1') self.terminals1 = tf.placeholder(tf.float32, shape=(None, 1), name='terminals1') self.rewards = tf.placeholder(tf.float32, shape=(None, 1), name='rewards') self.expert_qv = tf.placeholder(tf.float32, shape=(None, 1), name='expert_qv') self.expert_qv1 = tf.placeholder(tf.float32, shape=(None, 1), name='expert_qv1') self.actions = tf.placeholder(tf.float32, shape=(None, ) + action_shape, name='actions') self.expert_actions = tf.placeholder(tf.float32, shape=(None, ) + action_shape, name='expert_actions') self.expert_actions1 = tf.placeholder(tf.float32, shape=(None, ) + action_shape, name='expert_actions1') self.critic_target = tf.placeholder(tf.float32, shape=(None, 1), name='critic_target') self.param_noise_stddev = tf.placeholder(tf.float32, shape=(), name='param_noise_stddev') # Parameters. self.gamma = gamma self.tau = tau self.memory = memory self.normalize_observations = normalize_observations self.normalize_returns = normalize_returns self.action_noise = action_noise self.param_noise = param_noise self.action_range = action_range self.return_range = return_range self.observation_range = observation_range self.critic = critic self.actor = copy(actor) self.actor_lr = actor_lr self.critic_lr = critic_lr self.clip_norm = clip_norm self.enable_popart = enable_popart self.reward_scale = reward_scale self.batch_size = batch_size self.stats_sample = None self.critic_l2_reg = critic_l2_reg self.obs_dim = obs_dim # self.critic_obs0 = self.experts[0].obs0 # self.critic_obs1 = self.experts[0].obs1 # self.critic_actor = self.experts[0].use_tf_actor # Observation normalization. if self.normalize_observations: with tf.variable_scope('obs_rms'): self.obs_rms = RunningMeanStd(shape=observation_shape) else: self.obs_rms = None normalized_obs0 = tf.clip_by_value(normalize(self.obs0, self.obs_rms), self.observation_range[0], self.observation_range[1]) normalized_obs1 = tf.clip_by_value(normalize(self.obs1, self.obs_rms), self.observation_range[0], self.observation_range[1]) # Return normalization. if self.normalize_returns: with tf.variable_scope('ret_rms'): self.ret_rms = RunningMeanStd() else: self.ret_rms = None # Create target networks. target_actor = copy(self.actor) target_actor.name = 'target_actor' self.target_actor = target_actor target_critic = copy(critic) target_critic.name = 'target_critic' self.target_critic = target_critic bel0 = self.obs0[:, obs_dim:] bel_dim = observation_shape[0] - obs_dim entropy = -tf.reduce_sum( bel0 * tf.log(bel0 + 1e-3) / math.log(bel_dim), axis=1) / bel_dim # entropy = tf.Print(entropy, [entropy], '>>>> entropy :', summarize=10) entropy = tf.expand_dims(0.1 * entropy, 1) # Create networks and core TF parts that are shared across setup parts. 
self.actor_tf = entropy * self.actor( normalized_obs0, self.expert_qv) + (1 - entropy) * self.expert_actions self.normalized_critic_tf = critic(normalized_obs0, self.actions, self.expert_qv) self.critic_tf = tf.clip_by_value(self.normalized_critic_tf, self.return_range[0], self.return_range[1]) self.normalized_critic_with_actor_tf = critic(normalized_obs0, self.actor_tf, self.expert_qv) self.critic_with_actor_tf = denormalize( tf.clip_by_value(self.normalized_critic_with_actor_tf, self.return_range[0], self.return_range[1]), self.ret_rms) bel1 = self.obs1[:, obs_dim:] entropy1 = -tf.reduce_sum( bel1 * tf.log(bel1 + 1e-3) / math.log(bel_dim), axis=1) / bel_dim entropy1 = tf.expand_dims(0.1 * entropy1, 1) action1 = entropy1 * target_actor(normalized_obs1, self.expert_qv1) + ( 1 - entropy1) * self.expert_actions1 self.Q_obs1 = target_critic(normalized_obs1, action1, self.expert_qv1) # self.Q_obs1 = tf.Print(self.Q_obs1, [self.Q_obs1], '>>>> Q :', summarize=10) # self.terminals1 = tf.Print(self.terminals1, [self.terminals1], '>>>> terminal :', summarize=10) self.target_Q = self.rewards + (1. - self.terminals1) * gamma * self.Q_obs1 # Set up parts. if self.param_noise is not None: self.setup_param_noise(normalized_obs0, self.expert_qv) if self.normalize_returns and self.enable_popart: self.setup_popart() self.setup_stats() self.setup_target_network_updates() self.initial_state = None # recurrent architectures not supported yet def setup_target_network_updates(self): # import IPython; IPython.embed() ; import sys; sys.exit(0) actor_init_updates, actor_soft_updates = get_target_updates( self.actor.perturbable_vars, self.target_actor.vars, self.tau) critic_init_updates, critic_soft_updates = get_target_updates( self.critic.vars, self.target_critic.vars, self.tau) self.target_init_updates = [actor_init_updates, critic_init_updates] self.target_soft_updates = [actor_soft_updates, critic_soft_updates] def setup_param_noise(self, normalized_obs0, expert_qv0): assert self.param_noise is not None # Configure perturbed actor. param_noise_actor = copy(self.actor) param_noise_actor.name = 'param_noise_actor' self.perturbed_actor_tf = param_noise_actor(normalized_obs0, expert_qv0) logger.info('setting up param noise') self.perturb_policy_ops = get_perturbed_actor_updates( self.actor, param_noise_actor, self.param_noise_stddev) # Configure separate copy for stddev adoption. 
adaptive_param_noise_actor = copy(self.actor) adaptive_param_noise_actor.name = 'adaptive_param_noise_actor' adaptive_actor_tf = adaptive_param_noise_actor(normalized_obs0, expert_qv0) self.perturb_adaptive_policy_ops = get_perturbed_actor_updates( self.actor, adaptive_param_noise_actor, self.param_noise_stddev) self.adaptive_policy_distance = tf.sqrt( tf.reduce_mean(tf.square(self.actor_tf - adaptive_actor_tf))) def setup_actor_optimizer(self): logger.info('setting up actor optimizer') self.actor_loss = -tf.reduce_mean(self.critic_with_actor_tf) actor_shapes = [ var.get_shape().as_list() for var in self.actor.trainable_vars ] actor_nb_params = sum( [reduce(lambda x, y: x * y, shape) for shape in actor_shapes]) logger.info(' actor shapes: {}'.format(actor_shapes)) logger.info(' actor params: {}'.format(actor_nb_params)) self.actor_grads = U.flatgrad(self.actor_loss, self.actor.trainable_vars, clip_norm=self.clip_norm) self.actor_optimizer = MpiAdam(var_list=self.actor.trainable_vars, beta1=0.9, beta2=0.999, epsilon=1e-08) def setup_critic_optimizer(self): logger.info('setting up critic optimizer') normalized_critic_target_tf = tf.clip_by_value( normalize(self.critic_target, self.ret_rms), self.return_range[0], self.return_range[1]) self.critic_loss = tf.reduce_mean( tf.square(self.normalized_critic_tf - normalized_critic_target_tf)) if self.critic_l2_reg > 0.: critic_reg_vars = [ var for var in self.critic.trainable_vars if var.name.endswith('/w:0') and 'output' not in var.name ] for var in critic_reg_vars: logger.info(' regularizing: {}'.format(var.name)) logger.info(' applying l2 regularization with {}'.format( self.critic_l2_reg)) critic_reg = tc.layers.apply_regularization( tc.layers.l2_regularizer(self.critic_l2_reg), weights_list=critic_reg_vars) self.critic_loss += critic_reg critic_shapes = [ var.get_shape().as_list() for var in self.critic.trainable_vars ] critic_nb_params = sum( [reduce(lambda x, y: x * y, shape) for shape in critic_shapes]) logger.info(' critic shapes: {}'.format(critic_shapes)) logger.info(' critic params: {}'.format(critic_nb_params)) self.critic_grads = U.flatgrad(self.critic_loss, self.critic.trainable_vars, clip_norm=self.clip_norm) self.critic_optimizer = MpiAdam(var_list=self.critic.trainable_vars, beta1=0.9, beta2=0.999, epsilon=1e-08) def setup_popart(self): # See https://arxiv.org/pdf/1602.07714.pdf for details. 
self.old_std = tf.placeholder(tf.float32, shape=[1], name='old_std') new_std = self.ret_rms.std self.old_mean = tf.placeholder(tf.float32, shape=[1], name='old_mean') new_mean = self.ret_rms.mean self.renormalize_Q_outputs_op = [] for vs in [self.critic.output_vars, self.target_critic.output_vars]: assert len(vs) == 2 M, b = vs assert 'kernel' in M.name assert 'bias' in b.name assert M.get_shape()[-1] == 1 assert b.get_shape()[-1] == 1 self.renormalize_Q_outputs_op += [ M.assign(M * self.old_std / new_std) ] self.renormalize_Q_outputs_op += [ b.assign( (b * self.old_std + self.old_mean - new_mean) / new_std) ] def setup_stats(self): ops = [] names = [] if self.normalize_returns: ops += [self.ret_rms.mean, self.ret_rms.std] names += ['ret_rms_mean', 'ret_rms_std'] if self.normalize_observations: ops += [ tf.reduce_mean(self.obs_rms.mean), tf.reduce_mean(self.obs_rms.std) ] names += ['obs_rms_mean', 'obs_rms_std'] ops += [tf.reduce_mean(self.critic_tf)] names += ['reference_Q_mean'] ops += [reduce_std(self.critic_tf)] names += ['reference_Q_std'] ops += [tf.reduce_mean(self.critic_with_actor_tf)] names += ['reference_actor_Q_mean'] ops += [reduce_std(self.critic_with_actor_tf)] names += ['reference_actor_Q_std'] ops += [tf.reduce_mean(self.actor_tf)] names += ['reference_action_mean'] ops += [reduce_std(self.actor_tf)] names += ['reference_action_std'] if self.param_noise: ops += [tf.reduce_mean(self.perturbed_actor_tf)] names += ['reference_perturbed_action_mean'] ops += [reduce_std(self.perturbed_actor_tf)] names += ['reference_perturbed_action_std'] self.stats_ops = ops self.stats_names = names def step(self, obs, expert_qv, expert_action, apply_noise=True, compute_Q=True): if self.param_noise is not None and apply_noise: actor_tf = self.perturbed_actor_tf else: actor_tf = self.actor_tf feed_dict = { self.obs0: U.adjust_shape(self.obs0, [obs]), self.expert_qv: U.adjust_shape(self.expert_qv, [expert_qv]), self.expert_actions: U.adjust_shape(self.expert_actions, [expert_action]) } if compute_Q: action, q = self.sess.run([actor_tf, self.critic_with_actor_tf], feed_dict=feed_dict) else: action = self.sess.run(actor_tf, feed_dict=feed_dict) q = None if self.action_noise is not None and apply_noise: noise = self.action_noise() assert noise.shape == action[0].shape action += noise action = np.clip(action, self.action_range[0], self.action_range[1]) return action, q, None, None def store_transition(self, obs0, expert_qv, action, expert_action, reward, obs1, expert_qv1, expert_action1, terminal1): reward *= self.reward_scale # B = obs0.shape[0] # for b in range(B): # self.memory.append(obs0[b], action[b], reward[b], obs1[b], terminal1[b]) # if self.normalize_observations: # self.obs_rms.update(np.array([obs0[b]])) self.memory.append(obs0, expert_qv, action, expert_action, reward, obs1, expert_qv1, expert_action1, terminal1) if self.normalize_observations: self.obs_rms.update(obs0) def train(self): # Get a batch. batch = self.memory.sample(batch_size=self.batch_size) # import IPython; IPython.embed(); import sys; sys.exit(0) target_Q, Q_obs1 = self.sess.run( [self.target_Q, self.Q_obs1], feed_dict={ self.obs1: batch['obs1'], self.expert_qv1: batch['expert_qv1'], self.expert_actions1: batch['expert_actions1'], self.rewards: batch['rewards'], self.terminals1: batch['terminals1'].astype('float32'), }) # Get all gradients and perform a synced update. 
ops = [ self.actor_grads, self.actor_loss, self.critic_grads, self.critic_loss ] actor_grads, actor_loss, critic_grads, critic_loss = self.sess.run( ops, feed_dict={ self.obs0: batch['obs0'], self.expert_qv: batch['expert_qv'], self.actions: batch['actions'], self.expert_actions: batch['expert_actions'], self.critic_target: target_Q, }) self.actor_optimizer.update(actor_grads, stepsize=self.actor_lr) self.critic_optimizer.update(critic_grads, stepsize=self.critic_lr) # import IPython; IPython.embed(); exit(0) return critic_loss, actor_loss def initialize(self, sess): self.sess = sess # self.graph = graph self.sess.run(tf.global_variables_initializer()) self.setup_actor_optimizer() self.setup_critic_optimizer() self.actor_optimizer.sync() self.critic_optimizer.sync() self.sess.run(self.target_init_updates) def update_target_net(self): self.sess.run(self.target_soft_updates) def get_stats(self): if self.stats_sample is None: # Get a sample and keep that fixed for all further computations. # This allows us to estimate the change in value for the same set of inputs. self.stats_sample = self.memory.sample(batch_size=self.batch_size) values = self.sess.run(self.stats_ops, feed_dict={ self.obs0: self.stats_sample['obs0'], self.actions: self.stats_sample['actions'], self.expert_qv: self.stats_sample['expert_qv'], self.expert_actions: self.stats_sample['expert_actions'], }) names = self.stats_names[:] assert len(names) == len(values) stats = dict(zip(names, values)) if self.param_noise is not None: stats = {**stats, **self.param_noise.get_stats()} return stats def adapt_param_noise(self): try: from mpi4py import MPI except ImportError: MPI = None if self.param_noise is None: return 0. # Perturb a separate copy of the policy to adjust the scale for the next "real" perturbation. batch = self.memory.sample(batch_size=self.batch_size) self.sess.run(self.perturb_adaptive_policy_ops, feed_dict={ self.param_noise_stddev: self.param_noise.current_stddev, }) distance = self.sess.run(self.adaptive_policy_distance, feed_dict={ self.obs0: batch['obs0'], self.expert_actions: batch['expert_actions'], self.param_noise_stddev: self.param_noise.current_stddev, }) if MPI is not None: mean_distance = MPI.COMM_WORLD.allreduce( distance, op=MPI.SUM) / MPI.COMM_WORLD.Get_size() else: mean_distance = distance self.param_noise.adapt(mean_distance) return mean_distance def reset(self): # Reset internal state after an episode is complete. if self.action_noise is not None: self.action_noise.reset() if self.param_noise is not None: self.sess.run(self.perturb_policy_ops, feed_dict={ self.param_noise_stddev: self.param_noise.current_stddev, }) def save(self, path): save_variables(path) def load(self, path): load_variables(path)
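# ---------------------------------------------------------------------------
# The defining piece of BDDPG above is its belief-entropy gate: the trailing
# observation entries obs[:, obs_dim:] are read as a belief distribution, its
# entropy (normalized by log(bel_dim), then shrunk by 1/bel_dim and 0.1) gates
# how much of the learned actor's action is used versus the supplied expert
# action; the same gate built from obs1 weights the target action. Minimal
# NumPy sketch with illustrative names (not from this file):
import numpy as np

def belief_gate(obs, obs_dim):
    bel = obs[:, obs_dim:]                   # belief part of the observation
    bel_dim = bel.shape[1]
    entropy = -np.sum(bel * np.log(bel + 1e-3) / np.log(bel_dim),
                      axis=1, keepdims=True) / bel_dim
    return 0.1 * entropy                     # same scaling as the graph

def gated_action(obs, actor_action, expert_action, obs_dim):
    w = belief_gate(obs, obs_dim)
    return w * actor_action + (1.0 - w) * expert_action

# A uniform belief yields the largest gate (lean on the learned actor), a
# near-one-hot belief the smallest (lean on the expert).
obs = np.concatenate([np.zeros((2, 4)),                      # obs_dim = 4
                      np.array([[0.25, 0.25, 0.25, 0.25],    # uncertain
                                [0.97, 0.01, 0.01, 0.01]])], # confident
                     axis=1)
actor_a = np.array([[0.5, -0.5], [0.5, -0.5]])
expert_a = np.array([[-1.0, 1.0], [-1.0, 1.0]])
print(belief_gate(obs, obs_dim=4))
print(gated_action(obs, actor_a, expert_a, obs_dim=4))
# ---------------------------------------------------------------------------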
class DDPG(object): def __init__(self, actor, critic, memory, observation_shape, action_shape, param_noise=None, action_noise=None, gamma=0.99, tau=0.001, normalize_returns=False, enable_popart=False, normalize_observations=True, batch_size=128, observation_range=(-5., 5.), action_range=(-1., 1.), return_range=(-np.inf, np.inf), critic_l2_reg=0., actor_l2_reg=0., actor_lr=1e-4, critic_lr=1e-3, clip_norm=None, reward_scale=1., bc_teacher_lambda=0.0, use_qfilter=False): """DDPG. I changed observation_range to (0, 255) for the image-based RL part because we don't divide our images by 255 until later. The action and return range should be OK. """ # Inputs. Daniel: for images, cast to a new variable which gets cast to the float. # Assumes we detect via observation space; I think MuJoCo envs have obs shape length 1. # Then we let the remainder be input to subsequent code that uses observations. if len(observation_shape) > 1: self.obs0 = tf.placeholder(tf.int32, shape=(None, ) + observation_shape, name='obs0_imgs') self.obs1 = tf.placeholder(tf.int32, shape=(None, ) + observation_shape, name='obs1_imgs') self.obs0_f_imgs = tf.cast(self.obs0, tf.float32) / 255.0 self.obs1_f_imgs = tf.cast(self.obs1, tf.float32) / 255.0 assert not normalize_observations, 'Why normalize if we already divide by 255?' observation_range = (-np.inf, np.inf ) # We don't want to clip raw pixels here. self.use_images = True self.bc_teacher_lambda = bc_teacher_lambda self.use_qfilter = use_qfilter else: # Assuming default MuJoCo settings here. self.obs0 = tf.placeholder(tf.float32, shape=(None, ) + observation_shape, name='obs0') self.obs1 = tf.placeholder(tf.float32, shape=(None, ) + observation_shape, name='obs1') self.use_images = False self.bc_teacher_lambda = bc_teacher_lambda self.actor_l2_reg = 0.0 self.use_qfilter = use_qfilter self.terminals1 = tf.placeholder(tf.float32, shape=(None, 1), name='terminals1') self.rewards = tf.placeholder(tf.float32, shape=(None, 1), name='rewards') self.actions = tf.placeholder(tf.float32, shape=(None, ) + action_shape, name='actions') self.critic_target = tf.placeholder(tf.float32, shape=(None, 1), name='critic_target') self.param_noise_stddev = tf.placeholder(tf.float32, shape=(), name='param_noise_stddev') # Daniel: new for demos. self.flag_teacher = tf.placeholder(tf.float32, shape=(None, 1), name='flag_teacher') # Parameters. self.gamma = gamma self.tau = tau self.memory = memory self.normalize_observations = normalize_observations self.normalize_returns = normalize_returns self.action_noise = action_noise self.param_noise = param_noise self.action_range = action_range self.return_range = return_range self.observation_range = observation_range self.critic = critic self.actor = actor self.actor_lr = actor_lr self.critic_lr = critic_lr self.clip_norm = clip_norm self.enable_popart = enable_popart self.reward_scale = reward_scale self.batch_size = batch_size self.stats_sample = None self.critic_l2_reg = critic_l2_reg self.actor_l2_reg = actor_l2_reg # Observation normalization. if self.normalize_observations: with tf.variable_scope('obs_rms'): self.obs_rms = RunningMeanStd(shape=observation_shape) else: self.obs_rms = None # Daniel: this is where all the obs are subsequently passed, thus handle image case. # That way our feed_dicts in later code can still use self.{obs0,obs1}. 
if self.use_images: normalized_obs0 = tf.clip_by_value( normalize(self.obs0_f_imgs, self.obs_rms), self.observation_range[0], self.observation_range[1]) normalized_obs1 = tf.clip_by_value( normalize(self.obs1_f_imgs, self.obs_rms), self.observation_range[0], self.observation_range[1]) else: normalized_obs0 = tf.clip_by_value( normalize(self.obs0, self.obs_rms), self.observation_range[0], self.observation_range[1]) normalized_obs1 = tf.clip_by_value( normalize(self.obs1, self.obs_rms), self.observation_range[0], self.observation_range[1]) # Return normalization. if self.normalize_returns: with tf.variable_scope('ret_rms'): self.ret_rms = RunningMeanStd() else: self.ret_rms = None # Create target networks. target_actor = copy(actor) target_actor.name = 'target_actor' self.target_actor = target_actor target_critic = copy(critic) target_critic.name = 'target_critic' self.target_critic = target_critic # Create networks and core TF parts that are shared across setup parts. # One actor. Two critics: action can be from: # (1) itself (supplied via placeholder) -- for critic update, Q(s,a) sampled from RBuffer. # (2) from actor_tf, supplied by the actor -- for actor update which maximizes Q(s,pi(o)). # Then create two de-normalized versions of those critics. # self.critic_tf : Q(s,a) where a is supplied by placeholder # self.critic_with_actor_tf : Q(s,pi(s)) where pi(s) is the actor # Finally, get target Q values from target critic/actor. self.actor_tf = actor(normalized_obs0) self.normalized_critic_tf = critic(normalized_obs0, self.actions) self.critic_tf = denormalize( tf.clip_by_value(self.normalized_critic_tf, self.return_range[0], self.return_range[1]), self.ret_rms) self.normalized_critic_with_actor_tf = critic(normalized_obs0, self.actor_tf, reuse=True) self.critic_with_actor_tf = denormalize( tf.clip_by_value(self.normalized_critic_with_actor_tf, self.return_range[0], self.return_range[1]), self.ret_rms) Q_obs1 = denormalize( target_critic(normalized_obs1, target_actor(normalized_obs1)), self.ret_rms) self.target_Q = self.rewards + (1. - self.terminals1) * gamma * Q_obs1 # Daniel: add a Q-filter, 1 if Q(s,a) > Q(s,pi(s)) where former has `a` from *demonstrator*. Only after pre-training? self.flag_qfilter = tf.cast(self.critic_tf > self.critic_with_actor_tf, tf.float32) self.during_pretrain = tf.placeholder(tf.float32, (), name="during_pretrain_flag") # Set up parts. if self.param_noise is not None: self.setup_param_noise(normalized_obs0) self.setup_actor_optimizer() self.setup_critic_optimizer() if self.normalize_returns and self.enable_popart: self.setup_popart() self.setup_stats() self.setup_target_network_updates() self.initial_state = None # recurrent architectures not supported yet def setup_target_network_updates(self): actor_init_updates, actor_soft_updates = get_target_updates( self.actor.vars, self.target_actor.vars, self.tau) critic_init_updates, critic_soft_updates = get_target_updates( self.critic.vars, self.target_critic.vars, self.tau) self.target_init_updates = [actor_init_updates, critic_init_updates] self.target_soft_updates = [actor_soft_updates, critic_soft_updates] def setup_param_noise(self, normalized_obs0): assert self.param_noise is not None # Configure perturbed actor. 
param_noise_actor = copy(self.actor) param_noise_actor.name = 'param_noise_actor' self.perturbed_actor_tf = param_noise_actor(normalized_obs0) logger.info('setting up param noise') self.perturb_policy_ops = get_perturbed_actor_updates( self.actor, param_noise_actor, self.param_noise_stddev) # Configure separate copy for stddev adoption. adaptive_param_noise_actor = copy(self.actor) adaptive_param_noise_actor.name = 'adaptive_param_noise_actor' adaptive_actor_tf = adaptive_param_noise_actor(normalized_obs0) self.perturb_adaptive_policy_ops = get_perturbed_actor_updates( self.actor, adaptive_param_noise_actor, self.param_noise_stddev) self.adaptive_policy_distance = tf.sqrt( tf.reduce_mean(tf.square(self.actor_tf - adaptive_actor_tf))) def setup_actor_optimizer(self): """Make actor loss, grads, and optimizer. Several changes: We use a behavior cloning loss (with a Q-filter on top of that), using actor_tf for the current actor's output given the state, and actions as placeholder for what was sampled from the buffer. The latter might have student actions, in which case we ignore these w/the flag. We apply L2 reg if desired (following DeepMind's DDPGfD). Careful w/variable names if we switch network construction code!! (Nair et al., 2018) set the `bc_teacher_lambda` term I'm using to 1, and average out the BC loss by all items in the batch, *regardless* of whether the item passed the Q-filter or not. We're doing the same here by dividing by the sum of the number of teacher flags. """ logger.info('\nsetting up actor optimizer') self.actor_loss = -tf.reduce_mean(self.critic_with_actor_tf) if self.bc_teacher_lambda > 0.: # Daniel: add Behavior cloning loss to the actor, but only on teacher samples! # I'm doing a reduce_sum and dividing by the total in the flag teacher. self._diff_m = self.actor_tf - self.actions self._diff_v = tf.reduce_mean(tf.square(self._diff_m), axis=1, keepdims=True) self._diff_f = self._diff_v * self.flag_teacher # Daniel: another idea is to apply q-filters only if we are past pre-training. if self.use_qfilter: logger.info(' applying Q-filter flag: {}'.format( self.flag_qfilter)) self._diff_f = tf.cond( self.during_pretrain > 0.5, lambda: self._diff_f, # pretrain? identity lambda: self._diff_f * self.flag_qfilter ) # else? 
apply filter self.bc_loss = tf.reduce_sum( self._diff_f) / (tf.reduce_sum(self.flag_teacher) + 1e-6) self.actor_loss += self.bc_loss logger.info(' applying BC loss to actor with {}'.format( self.bc_teacher_lambda)) logger.info(' diff_matrix: {}'.format(self._diff_m)) logger.info(' diff_vector: {}'.format(self._diff_v)) logger.info(' diff_filter: {}'.format(self._diff_f)) if self.actor_l2_reg > 0.: actor_reg_vars = [ var for var in self.actor.trainable_vars if ((var.name.endswith('/w:0') or var.name.endswith( '/kernel:0')) and 'output' not in var.name) ] for var in actor_reg_vars: logger.info(' regularizing: {}'.format(var.name)) logger.info(' applying l2 regularization with {}'.format( self.actor_l2_reg)) actor_reg = tc.layers.apply_regularization( tc.layers.l2_regularizer(self.actor_l2_reg), weights_list=actor_reg_vars) self.actor_loss += actor_reg actor_shapes = [ var.get_shape().as_list() for var in self.actor.trainable_vars ] actor_nb_params = sum( [reduce(lambda x, y: x * y, shape) for shape in actor_shapes]) logger.info(' actor shapes: {}'.format(actor_shapes)) logger.info(' actor params: {}\n'.format(actor_nb_params)) self.actor_grads = U.flatgrad(self.actor_loss, self.actor.trainable_vars, clip_norm=self.clip_norm) self.actor_optimizer = MpiAdam(var_list=self.actor.trainable_vars, beta1=0.9, beta2=0.999, epsilon=1e-08) def setup_critic_optimizer(self): """Make critic loss, grads, and optimizer. Minor change w/L2 regularization. I didn't realize that our custom code would name the variables a bit different. It actually makes a huge difference, as the critic's default L2 is 0.01. Just be careful if we decide to re-name the variables or use a different TF construction. """ logger.info('\nsetting up critic optimizer') normalized_critic_target_tf = tf.clip_by_value( normalize(self.critic_target, self.ret_rms), self.return_range[0], self.return_range[1]) self.critic_loss = tf.reduce_mean( tf.square(self.normalized_critic_tf - normalized_critic_target_tf)) if self.critic_l2_reg > 0.: critic_reg_vars = [ var for var in self.critic.trainable_vars if ((var.name.endswith('/w:0') or var.name.endswith( '/kernel:0')) and 'output' not in var.name) ] for var in critic_reg_vars: logger.info(' regularizing: {}'.format(var.name)) logger.info(' applying l2 regularization with {}'.format( self.critic_l2_reg)) critic_reg = tc.layers.apply_regularization( tc.layers.l2_regularizer(self.critic_l2_reg), weights_list=critic_reg_vars) self.critic_loss += critic_reg critic_shapes = [ var.get_shape().as_list() for var in self.critic.trainable_vars ] critic_nb_params = sum( [reduce(lambda x, y: x * y, shape) for shape in critic_shapes]) logger.info(' critic shapes: {}'.format(critic_shapes)) logger.info(' critic params: {}\n'.format(critic_nb_params)) self.critic_grads = U.flatgrad(self.critic_loss, self.critic.trainable_vars, clip_norm=self.clip_norm) self.critic_optimizer = MpiAdam(var_list=self.critic.trainable_vars, beta1=0.9, beta2=0.999, epsilon=1e-08) def setup_popart(self): # See https://arxiv.org/pdf/1602.07714.pdf for details. 
self.old_std = tf.placeholder(tf.float32, shape=[1], name='old_std') new_std = self.ret_rms.std self.old_mean = tf.placeholder(tf.float32, shape=[1], name='old_mean') new_mean = self.ret_rms.mean self.renormalize_Q_outputs_op = [] for vs in [self.critic.output_vars, self.target_critic.output_vars]: assert len(vs) == 2 M, b = vs assert 'kernel' in M.name assert 'bias' in b.name assert M.get_shape()[-1] == 1 assert b.get_shape()[-1] == 1 self.renormalize_Q_outputs_op += [ M.assign(M * self.old_std / new_std) ] self.renormalize_Q_outputs_op += [ b.assign( (b * self.old_std + self.old_mean - new_mean) / new_std) ] def setup_stats(self): ops = [] names = [] if self.normalize_returns: ops += [self.ret_rms.mean, self.ret_rms.std] names += ['ret_rms_mean', 'ret_rms_std'] if self.normalize_observations: ops += [ tf.reduce_mean(self.obs_rms.mean), tf.reduce_mean(self.obs_rms.std) ] names += ['obs_rms_mean', 'obs_rms_std'] ops += [tf.reduce_mean(self.critic_tf)] names += ['reference_Q_mean'] ops += [reduce_std(self.critic_tf)] names += ['reference_Q_std'] ops += [tf.reduce_mean(self.critic_with_actor_tf)] names += ['reference_actor_Q_mean'] ops += [reduce_std(self.critic_with_actor_tf)] names += ['reference_actor_Q_std'] ops += [tf.reduce_mean(self.actor_tf)] names += ['reference_action_mean'] ops += [reduce_std(self.actor_tf)] names += ['reference_action_std'] if self.param_noise: ops += [tf.reduce_mean(self.perturbed_actor_tf)] names += ['reference_perturbed_action_mean'] ops += [reduce_std(self.perturbed_actor_tf)] names += ['reference_perturbed_action_std'] self.stats_ops = ops self.stats_names = names def step(self, obs, apply_noise=True, compute_Q=True): """Apply the policy. Note the noise: for DDPG if we are *deploying* it, we should probably set the noise to False, such as for the `--play` option. """ if self.param_noise is not None and apply_noise: actor_tf = self.perturbed_actor_tf else: actor_tf = self.actor_tf feed_dict = {self.obs0: U.adjust_shape(self.obs0, [obs])} if compute_Q: action, q = self.sess.run([actor_tf, self.critic_with_actor_tf], feed_dict=feed_dict) else: action = self.sess.run(actor_tf, feed_dict=feed_dict) q = None if self.action_noise is not None and apply_noise: noise = self.action_noise() #assert noise.shape == action[0].shape # daniel: with my fix, both are (numenv, acdim) assert noise.shape == action.shape, '{} {}'.format( noise.shape, action.shape) action += noise action = np.clip(action, self.action_range[0], self.action_range[1]) return action, q, None, None def store_transition(self, obs0, action, reward, obs1, terminal1, is_teacher=False): """Store transitions for DDPG. Daniel: collected via VecEnv, so iterate through batch size and append individual components. It's serial but shouldn't be a time bottleneck. Note that all this seems to be done using one-step returns; I don't see n-step returns anywhere. Also, we should add an indication if this is a teacher sample. """ reward *= self.reward_scale B = obs0.shape[0] for b in range(B): self.memory.append(obs0[b], action[b], reward[b], obs1[b], terminal1[b], is_teacher=is_teacher) if self.normalize_observations: self.obs_rms.update(np.array([obs0[b]])) def train(self, during_pretrain=False): """Daniel: added during_pretrain in case we want to do anything different there. By default it's false (and float(during_pretrain)=0.0) to maintain backwards compatibility. """ # Get a batch. 
batch = self.memory.sample(batch_size=self.batch_size) if self.normalize_returns and self.enable_popart: old_mean, old_std, target_Q = \ self.sess.run([self.ret_rms.mean, self.ret_rms.std, self.target_Q], feed_dict={ self.obs1: batch['obs1'], self.rewards: batch['rewards'], self.terminals1: batch['terminals1'].astype('float32'), }) self.ret_rms.update(target_Q.flatten()) self.sess.run(self.renormalize_Q_outputs_op, feed_dict={ self.old_std: np.array([old_std]), self.old_mean: np.array([old_mean]), }) # Run sanity check. Disabled by default since it slows down things considerably. # print('running sanity check') # target_Q_new, new_mean, new_std = self.sess.run([self.target_Q, self.ret_rms.mean, self.ret_rms.std], feed_dict={ # self.obs1: batch['obs1'], # self.rewards: batch['rewards'], # self.terminals1: batch['terminals1'].astype('float32'), # }) # print(target_Q_new, target_Q, new_mean, new_std) # assert (np.abs(target_Q - target_Q_new) < 1e-3).all() else: target_Q = self.sess.run(self.target_Q, feed_dict={ self.obs1: batch['obs1'], self.rewards: batch['rewards'], self.terminals1: batch['terminals1'].astype('float32'), }) ## Daniel: use this for debugging extra DDPG features we implemented: #ops = [self.actor_grads, self.actor_loss, self.critic_grads, self.critic_loss, # self.critic_tf, self.critic_with_actor_tf, self.flag_teacher, # self.flag_qfilter, self._diff_f, self.actor_tf, self.actions] #actor_grads, actor_loss, critic_grads, critic_loss, Q_demo, Q_actor, flag_t, flag_q, diff_f, act_tf, act_ph = \ # self.sess.run(ops, feed_dict={ # self.obs0: batch['obs0'], # self.actions: batch['actions'], # self.critic_target: target_Q, # self.flag_teacher: batch['flag_teacher'], # self.during_pretrain: float(during_pretrain), #}) #print('\nQ(s,a), Q(s,pi(s)), act_tf, act_ph, diff_f, flag_q, flag_t') #print(Q_demo.T) #print(Q_actor.T) #print('now actors:') #print(act_tf.T) #print(act_ph.T) #print('now diff/flags:') #print(diff_f.T) #print(flag_q.T) #print(flag_t.T) # Get all gradients and perform a synced update. ops = [ self.actor_grads, self.actor_loss, self.critic_grads, self.critic_loss ] actor_grads, actor_loss, critic_grads, critic_loss = self.sess.run( ops, feed_dict={ self.obs0: batch['obs0'], self.actions: batch['actions'], self.critic_target: target_Q, self.flag_teacher: batch['flag_teacher'], self.during_pretrain: float(during_pretrain), }) self.actor_optimizer.update(actor_grads, stepsize=self.actor_lr) self.critic_optimizer.update(critic_grads, stepsize=self.critic_lr) return critic_loss, actor_loss def initialize(self, sess): # Daniel: following PPO2 code outline, hoping to save/load models. self.save = functools.partial(save_variables, sess=sess) self.load = functools.partial(load_variables, sess=sess) # Daniel: back to normal. self.sess = sess self.sess.run(tf.global_variables_initializer()) self.actor_optimizer.sync() self.critic_optimizer.sync() self.sess.run(self.target_init_updates) def update_target_net(self): self.sess.run(self.target_soft_updates) def get_stats(self): if self.stats_sample is None: # Get a sample and keep that fixed for all further computations. # This allows us to estimate the change in value for the same set of inputs. 
self.stats_sample = self.memory.sample(batch_size=self.batch_size) values = self.sess.run(self.stats_ops, feed_dict={ self.obs0: self.stats_sample['obs0'], self.actions: self.stats_sample['actions'], }) names = self.stats_names[:] assert len(names) == len(values) stats = dict(zip(names, values)) if self.param_noise is not None: stats = {**stats, **self.param_noise.get_stats()} return stats def adapt_param_noise(self): try: from mpi4py import MPI except ImportError: MPI = None if self.param_noise is None: return 0. # Perturb a separate copy of the policy to adjust the scale for the next "real" perturbation. batch = self.memory.sample(batch_size=self.batch_size) self.sess.run(self.perturb_adaptive_policy_ops, feed_dict={ self.param_noise_stddev: self.param_noise.current_stddev, }) distance = self.sess.run(self.adaptive_policy_distance, feed_dict={ self.obs0: batch['obs0'], self.param_noise_stddev: self.param_noise.current_stddev, }) if MPI is not None: mean_distance = MPI.COMM_WORLD.allreduce( distance, op=MPI.SUM) / MPI.COMM_WORLD.Get_size() else: mean_distance = distance self.param_noise.adapt(mean_distance) return mean_distance def reset(self): # Reset internal state after an episode is complete. if self.action_noise is not None: self.action_noise.reset() if self.param_noise is not None: self.sess.run(self.perturb_policy_ops, feed_dict={ self.param_noise_stddev: self.param_noise.current_stddev, })
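# The behavior-cloning term above sums a per-sample action difference over teacher
# (demonstration) samples and divides by the number of teacher samples, with a Q-filter
# that keeps the imitation term only where the critic rates the demonstrated action above
# the policy's own action. A minimal NumPy sketch of that reduction; the array names below
# are illustrative stand-ins, not the tensors used in the class.
import numpy as np

def q_filtered_bc_loss(pi_actions, demo_actions, q_demo, q_pi, flag_teacher, eps=1e-6):
    """Squared action difference over demo samples that pass the Q-filter,
    normalized by the number of teacher samples in the batch.

    pi_actions, demo_actions: (B, act_dim) arrays
    q_demo, q_pi, flag_teacher: (B,) arrays; flag_teacher is 1.0 for demonstration samples
    """
    diff = np.sum((pi_actions - demo_actions) ** 2, axis=1)   # per-sample action error
    q_filter = (q_demo > q_pi).astype(np.float64)             # imitate only where the demo looks better
    mask = flag_teacher * q_filter
    return np.sum(mask * diff) / (np.sum(flag_teacher) + eps)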
def learn( *, network, env, total_timesteps, timesteps_per_batch=1024, # what to train on max_kl=0.001, cg_iters=10, gamma=0.99, lam=1.0, # advantage estimation seed=None, ent_coef=0.0, cg_damping=1e-2, vf_stepsize=3e-4, vf_iters=3, max_episodes=0, max_iters=0, # time constraint callback=None, load_path=None, novelty_reward='AE', normalize_int_rew=False, **network_kwargs): ''' learn a policy function with TRPO algorithm Parameters: ---------- network neural network to learn. Can be either string ('mlp', 'cnn', 'lstm', 'lnlstm' for basic types) or function that takes input placeholder and returns tuple (output, None) for feedforward nets or (output, (state_placeholder, state_output, mask_placeholder)) for recurrent nets env environment (one of the gym environments or wrapped via baselines.common.vec_env.VecEnv-type class timesteps_per_batch timesteps per gradient estimation batch max_kl max KL divergence between old policy and new policy ( KL(pi_old || pi) ) ent_coef coefficient of policy entropy term in the optimization objective cg_iters number of iterations of conjugate gradient algorithm cg_damping conjugate gradient damping vf_stepsize learning rate for adam optimizer used to optimie value function loss vf_iters number of iterations of value function optimization iterations per each policy optimization step total_timesteps max number of timesteps max_episodes max number of episodes max_iters maximum number of policy optimization iterations callback function to be called with (locals(), globals()) each policy optimization step load_path str, path to load the model from (default: None, i.e. no model is loaded) **network_kwargs keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and arguments to a particular type of network Returns: ------- learnt model ''' if MPI is not None: nworkers = MPI.COMM_WORLD.Get_size() rank = MPI.COMM_WORLD.Get_rank() else: nworkers = 1 rank = 0 cpus_per_worker = 1 U.get_session( config=tf.ConfigProto(allow_soft_placement=True, inter_op_parallelism_threads=cpus_per_worker, intra_op_parallelism_threads=cpus_per_worker)) policy = build_policy(env, network, value_network='copy', **network_kwargs) set_global_seeds(seed) np.set_printoptions(precision=3) # Setup losses and stuff # ---------------------------------------- ob_space = env.observation_space ac_space = env.action_space ob = observation_placeholder(ob_space) with tf.variable_scope("pi"): pi = policy(observ_placeholder=ob) with tf.variable_scope("oldpi"): oldpi = policy(observ_placeholder=ob) atarg = tf.placeholder( dtype=tf.float32, shape=[None]) # Target advantage function (if applicable) ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return ac = pi.pdtype.sample_placeholder([None]) kloldnew = oldpi.pd.kl(pi.pd) ent = pi.pd.entropy() meankl = tf.reduce_mean(kloldnew) meanent = tf.reduce_mean(ent) entbonus = ent_coef * meanent vferr = tf.reduce_mean(tf.square(pi.vf - ret)) ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) # advantage * pnew / pold surrgain = tf.reduce_mean(ratio * atarg) optimgain = surrgain + entbonus losses = [optimgain, meankl, entbonus, surrgain, meanent] loss_names = ["optimgain", "meankl", "entloss", "surrgain", "entropy"] dist = meankl all_var_list = get_trainable_variables("pi") # var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("pol")] # vf_var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("vf")] var_list = get_pi_trainable_variables("pi") vf_var_list = 
get_vf_trainable_variables("pi") vfadam = MpiAdam(vf_var_list) get_flat = U.GetFlat(var_list) set_from_flat = U.SetFromFlat(var_list) klgrads = tf.gradients(dist, var_list) flat_tangent = tf.placeholder(dtype=tf.float32, shape=[None], name="flat_tan") shapes = [var.get_shape().as_list() for var in var_list] start = 0 tangents = [] for shape in shapes: sz = U.intprod(shape) tangents.append(tf.reshape(flat_tangent[start:start + sz], shape)) start += sz gvp = tf.add_n([ tf.reduce_sum(g * tangent) for (g, tangent) in zipsame(klgrads, tangents) ]) #pylint: disable=E1111 fvp = U.flatgrad(gvp, var_list) assign_old_eq_new = U.function( [], [], updates=[ tf.assign(oldv, newv) for (oldv, newv) in zipsame(get_variables("oldpi"), get_variables("pi")) ]) compute_losses = U.function([ob, ac, atarg], losses) compute_lossandgrad = U.function([ob, ac, atarg], losses + [U.flatgrad(optimgain, var_list)]) compute_fvp = U.function([flat_tangent, ob, ac, atarg], fvp) compute_vflossandgrad = U.function([ob, ret], U.flatgrad(vferr, vf_var_list)) rff_rms_int = RunningMeanStd() nr = NOVELTY_REWARDS[novelty_reward](env.observation_space) @contextmanager def timed(msg): if rank == 0: print(colorize(msg, color='magenta')) tstart = time.time() yield print( colorize("done in %.3f seconds" % (time.time() - tstart), color='magenta')) else: yield def allmean(x): assert isinstance(x, np.ndarray) if MPI is not None: out = np.empty_like(x) MPI.COMM_WORLD.Allreduce(x, out, op=MPI.SUM) out /= nworkers else: out = np.copy(x) return out U.initialize() if load_path is not None: pi.load(load_path) th_init = get_flat() if MPI is not None: MPI.COMM_WORLD.Bcast(th_init, root=0) set_from_flat(th_init) vfadam.sync() print("Init param sum", th_init.sum(), flush=True) # Prepare for rollouts # ---------------------------------------- seg_gen = traj_segment_generator(pi, env, timesteps_per_batch, stochastic=True) episodes_so_far = 0 timesteps_so_far = 0 iters_so_far = 0 tstart = time.time() lenbuffer = deque(maxlen=40) # rolling buffer for episode lengths rewbuffer = deque(maxlen=40) # rolling buffer for episode rewards if sum([max_iters > 0, total_timesteps > 0, max_episodes > 0]) == 0: # noththing to be done return pi assert sum([max_iters>0, total_timesteps>0, max_episodes>0]) < 2, \ 'out of max_iters, total_timesteps, and max_episodes only one should be specified' while True: if callback: callback(locals(), globals()) if total_timesteps and timesteps_so_far >= total_timesteps: break elif max_episodes and episodes_so_far >= max_episodes: break elif max_iters and iters_so_far >= max_iters: break logger.log("********** Iteration %i ************" % iters_so_far) with timed("sampling"): seg = seg_gen.__next__() # Calculate novelty rewards bonus = nr.get_batch_bonus_and_update(seg["ob"]) if normalize_int_rew: rff_rms_int.update(bonus.ravel()) bonus = bonus / rff_rms_int.std.eval() seg["orig_rew"] = seg["rew"] seg["rew"] = seg["rew"] + bonus add_vtarg_and_adv(seg, gamma, lam) # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets)) ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg[ "tdlamret"] vpredbefore = seg["vpred"] # predicted value function before udpate atarg = (atarg - atarg.mean() ) / atarg.std() # standardized advantage function estimate if hasattr(pi, "ret_rms"): pi.ret_rms.update(tdlamret) if hasattr(pi, "ob_rms"): pi.ob_rms.update(ob) # update running mean/std for policy args = seg["ob"], seg["ac"], atarg fvpargs = [arr[::5] for arr in args] def fisher_vector_product(p): return 
allmean(compute_fvp(p, *fvpargs)) + cg_damping * p assign_old_eq_new() # set old parameter values to new parameter values with timed("computegrad"): *lossbefore, g = compute_lossandgrad(*args) lossbefore = allmean(np.array(lossbefore)) g = allmean(g) if np.allclose(g, 0): logger.log("Got zero gradient. not updating") else: with timed("cg"): stepdir = cg(fisher_vector_product, g, cg_iters=cg_iters, verbose=rank == 0) assert np.isfinite(stepdir).all() shs = .5 * stepdir.dot(fisher_vector_product(stepdir)) lm = np.sqrt(shs / max_kl) # logger.log("lagrange multiplier:", lm, "gnorm:", np.linalg.norm(g)) fullstep = stepdir / lm expectedimprove = g.dot(fullstep) surrbefore = lossbefore[0] stepsize = 1.0 thbefore = get_flat() for _ in range(10): thnew = thbefore + fullstep * stepsize set_from_flat(thnew) meanlosses = surr, kl, *_ = allmean( np.array(compute_losses(*args))) improve = surr - surrbefore logger.log("Expected: %.3f Actual: %.3f" % (expectedimprove, improve)) if not np.isfinite(meanlosses).all(): logger.log("Got non-finite value of losses -- bad!") elif kl > max_kl * 1.5: logger.log("violated KL constraint. shrinking step.") elif improve < 0: logger.log("surrogate didn't improve. shrinking step.") else: logger.log("Stepsize OK!") break stepsize *= .5 else: logger.log("couldn't compute a good step") set_from_flat(thbefore) if nworkers > 1 and iters_so_far % 20 == 0: paramsums = MPI.COMM_WORLD.allgather( (thnew.sum(), vfadam.getflat().sum())) # list of tuples assert all( np.allclose(ps, paramsums[0]) for ps in paramsums[1:]) for (lossname, lossval) in zip(loss_names, meanlosses): logger.record_tabular(lossname, lossval) with timed("vf"): for _ in range(vf_iters): for (mbob, mbret) in dataset.iterbatches( (seg["ob"], seg["tdlamret"]), include_final_partial_batch=False, batch_size=64): g = allmean(compute_vflossandgrad(mbob, mbret)) vfadam.update(g, vf_stepsize) logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret)) lrlocal = (seg["ep_lens"], seg["ep_rets"]) # local values if MPI is not None: listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) # list of tuples else: listoflrpairs = [lrlocal] lens, rews = map(flatten_lists, zip(*listoflrpairs)) lenbuffer.extend(lens) rewbuffer.extend(rews) logger.record_tabular("EpLenMean", np.mean(lenbuffer)) logger.record_tabular("EpRewMean", np.mean(rewbuffer)) logger.record_tabular("EpThisIter", len(lens)) episodes_so_far += len(lens) timesteps_so_far += sum(lens) iters_so_far += 1 logger.record_tabular("EpisodesSoFar", episodes_so_far) logger.record_tabular("TimestepsSoFar", timesteps_so_far) logger.record_tabular("TimeElapsed", time.time() - tstart) if rank == 0: logger.dump_tabular() return pi
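# The `cg(fisher_vector_product, g, ...)` call above approximately solves F x = g for the
# natural-gradient step direction without ever forming the Fisher matrix F; only
# Fisher-vector products are needed. A minimal NumPy sketch of that conjugate-gradient
# loop; `f_Ax` is a stand-in for the damped Fisher-vector product closure built in `learn`.
import numpy as np

def conjugate_gradient(f_Ax, b, cg_iters=10, residual_tol=1e-10):
    x = np.zeros_like(b)
    r = b.copy()                                   # residual of F x = b at x = 0
    p = r.copy()                                   # current search direction
    rdotr = r.dot(r)
    for _ in range(cg_iters):
        Ap = f_Ax(p)
        alpha = rdotr / (p.dot(Ap) + 1e-8)
        x += alpha * p
        r -= alpha * Ap
        new_rdotr = r.dot(r)
        if new_rdotr < residual_tol:
            break
        p = r + (new_rdotr / rdotr) * p            # update direction with the new residual
        rdotr = new_rdotr
    return x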
class RND(object): def __init__(self, name, ph_ob, args): self.convfeat = args.convfeat self.rep_size = args.rep_size self.enlargement = args.enlargement self.proportion_of_exp_used_for_predictor_update = args.proportion_of_exp_used_for_predictor_update self.scope = name with tf.variable_scope(self.scope): self.build_graph = self.build_graph(ph_ob) def build_graph(self, ph_ob): ob = ph_ob[-1] assert len(ob.shape.as_list()) == 4 #B, H, W, C with tf.variable_scope("obfilter"): self.ob_rms = RunningMeanStd(shape=ob.shape.as_list()[1:3] + [1]) ob_norm = ob[:, :, :, -1:] ob_norm = tf.cast(ob_norm, tf.float32) ob_norm = tf.clip_by_value( (ob_norm - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0) # Random target network xr = tf.nn.leaky_relu( conv(ob_norm, "c1r", nf=self.convfeat * 1, rf=8, stride=4, init_scale=np.sqrt(2))) xr = tf.nn.leaky_relu( conv(xr, 'c2r', nf=self.convfeat * 2 * 1, rf=4, stride=2, init_scale=np.sqrt(2))) xr = tf.nn.leaky_relu( conv(xr, 'c3r', nf=self.convfeat * 2 * 1, rf=3, stride=1, init_scale=np.sqrt(2))) rgbr = [to2d(xr)] X_r = fc(rgbr[0], 'fc1r', nh=self.rep_size, init_scale=np.sqrt(2)) # Predictor network xrp = tf.nn.leaky_relu( conv(ob_norm, 'c1rp_pred', nf=self.convfeat, rf=8, stride=4, init_scale=np.sqrt(2))) xrp = tf.nn.leaky_relu( conv(xrp, 'c2rp_pred', nf=self.convfeat * 2, rf=4, stride=2, init_scale=np.sqrt(2))) xrp = tf.nn.leaky_relu( conv(xrp, 'c3rp_pred', nf=self.convfeat * 2, rf=3, stride=1, init_scale=np.sqrt(2))) rgbrp = to2d(xrp) X_r_hat = tf.nn.relu( fc(rgbrp, 'fc1r_hat1_pred', nh=256 * self.enlargement, init_scale=np.sqrt(2))) X_r_hat = tf.nn.relu( fc(X_r_hat, 'fc1r_hat2_pred', nh=256 * self.enlargement, init_scale=np.sqrt(2))) X_r_hat = fc(X_r_hat, 'fc1r_hat3_pred', nh=self.rep_size, init_scale=np.sqrt(2)) self.feat_var = tf.reduce_mean(tf.nn.moments(X_r, axes=[0])[1]) self.max_feat = tf.reduce_max(tf.abs(X_r)) self.int_rew = tf.reduce_mean( tf.square(tf.stop_gradient(X_r) - X_r_hat), axis=-1, keep_dims=True) targets = tf.stop_gradient(X_r) # self.aux_loss = tf.reduce_mean(tf.square(noisy_targets-X_r_hat)) self.aux_loss = tf.reduce_mean(tf.square(targets - X_r_hat), -1) mask = tf.random_uniform(shape=tf.shape(self.aux_loss), minval=0., maxval=1., dtype=tf.float32) mask = tf.cast(mask < self.proportion_of_exp_used_for_predictor_update, tf.float32) self.aux_loss = tf.reduce_sum(mask * self.aux_loss) / tf.maximum( tf.reduce_sum(mask), 1.) self._predictor = U.function([ob], [self.int_rew]) def predict(self, ob): obf = ob[-1] if len(obf.shape) == 3: obf = np.expand_dims(obf, 0) int_rew = self._predictor(obf)[0] return int_rew def update_obs_rms(self, ob): obf = np.array(list(zip(*ob.tolist()))[1]) self.ob_rms.update(obf) def get_variables(self): return tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, self.scope) def get_trainable_variables(self): return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, self.scope)
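# RND in brief: a fixed, randomly initialized target network embeds each observation and a
# predictor network is trained to match that embedding; the squared prediction error serves
# as the intrinsic reward, which tends to be large on rarely visited observations. A tiny
# NumPy sketch with linear "networks" (purely illustrative; the class above uses conv/fc
# stacks and only trains the predictor on a random fraction of each batch).
import numpy as np

_rng = np.random.RandomState(0)
_OBS_DIM, _REP_SIZE, _LR = 16, 8, 1e-2
_W_TARGET = _rng.randn(_OBS_DIM, _REP_SIZE)        # fixed random target, never trained
_W_PRED = 0.1 * _rng.randn(_OBS_DIM, _REP_SIZE)    # predictor, trained toward the target

def intrinsic_reward(obs):
    """Per-sample squared error between target and predictor embeddings; obs is (B, obs_dim)."""
    err = obs @ _W_TARGET - obs @ _W_PRED
    return np.mean(err ** 2, axis=-1)

def update_predictor(obs):
    """One SGD step on 0.5 * mean-over-batch of the squared embedding error."""
    global _W_PRED
    err = obs @ _W_PRED - obs @ _W_TARGET          # (B, rep_size)
    grad = obs.T @ err / obs.shape[0]
    _W_PRED -= _LR * grad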
class DDPG(object): def __init__(self, actor, critic, memory, observation_shape, action_shape, state_shape, aux_shape, lambda_obj_conf_predict, lambda_gripper_predict, lambda_target_predict, action_noise=None, gamma=0.99, tau=0.001, enable_popart=False, normalize_observations=True, normalize_state=True, normalize_aux=True, batch_size=128, observation_range=(-10., 10.), action_range=(-1., 1.), state_range=(-4, 4), return_range=(-250, 10), aux_range=(-10, 10), critic_l2_reg=0.001, actor_lr=1e-4, critic_lr=1e-3, clip_norm=None, reward_scale=1., replay_beta=0.4, lambda_1step=1.0, lambda_nstep=1.0, nsteps=10, run_name="unnamed_run", lambda_pretrain=0.0, target_policy_noise=0.2, target_policy_noise_clip=0.5, policy_and_target_update_period=2, num_critics=2, **kwargs): # Inputs. self.obs0 = tf.placeholder( tf.float32, shape=(None,) + observation_shape, name='obs0') self.obs1 = tf.placeholder( tf.float32, shape=(None,) + observation_shape, name='obs1') self.state0 = tf.placeholder( tf.float32, shape=(None,) + state_shape, name='state0') self.state1 = tf.placeholder( tf.float32, shape=(None,) + state_shape, name='state1') self.terminals1 = tf.placeholder( tf.float32, shape=(None, 1), name='terminals1') self.rewards = tf.placeholder( tf.float32, shape=(None, 1), name='rewards') self.actions = tf.placeholder( tf.float32, shape=(None,) + action_shape, name='actions') self.critic_target = tf.placeholder( tf.float32, shape=(None, 1), name='critic_target') self.nstep_steps = tf.placeholder( tf.float32, shape=(None, 1), name='nstep_reached') self.nstep_critic_target = tf.placeholder( tf.float32, shape=(None, 1), name='nstep_critic_target') # Memory debug variables - memory and resident set size. Used # for tensorboard plotting. self.memory_size = tf.placeholder( tf.float32, shape=None, name='memory_size') self.rss = tf.placeholder(tf.float32, shape=None, name='rss') self.aux0 = tf.placeholder( tf.float32, shape=(None,) + aux_shape, name='aux0') self.aux1 = tf.placeholder( tf.float32, shape=(None,) + aux_shape, name='aux1') self.pretraining_tf = tf.placeholder( tf.float32, shape=(None, 1), name='pretraining_tf') self.aux_shape = aux_shape self.gamma = gamma self.tau = tau self.memory = memory self.normalize_observations = normalize_observations self.normalize_state = normalize_state self.normalize_aux = normalize_aux self.action_noise = action_noise self.action_range = action_range self.return_range = return_range self.observation_range = observation_range self.actor = actor self.actor_lr = actor_lr self.state_range = state_range self.aux_range = aux_range self.critic_lr = critic_lr self.clip_norm = clip_norm self.enable_popart = enable_popart self.reward_scale = reward_scale self.batch_size = batch_size self.stats_sample = None self.critic_l2_reg = critic_l2_reg self.lambda_nstep = lambda_nstep self.lambda_1step = lambda_1step self.lambda_obj_conf_predict = lambda_obj_conf_predict self.lambda_gripper_predict = lambda_gripper_predict self.lambda_target_predict = lambda_target_predict self.nsteps = nsteps self.replay_beta = replay_beta self.run_name = run_name self.lambda_pretrain = lambda_pretrain self.target_policy_noise = target_policy_noise self.target_policy_noise_clip = target_policy_noise_clip self.ep = 0 self.policy_and_target_update_period = policy_and_target_update_period if self.normalize_observations: with tf.variable_scope('obs_rms'): self.obs_rms = RunningMeanStd(shape=observation_shape) else: self.obs_rms = None if self.normalize_state: with tf.variable_scope('state_rms'): self.state_rms = 
RunningMeanStd(shape=state_shape) else: self.state_rms = None if self.normalize_aux: with tf.variable_scope('normalize_aux'): self.aux_rms = RunningMeanStd(shape=aux_shape) else: self.aux_rms = None with tf.name_scope('obs_preprocess'): self.normalized_obs0 = tf.clip_by_value( normalize(self.obs0, self.obs_rms), self.observation_range[0], self.observation_range[1]) self.normalized_obs1 = tf.clip_by_value( normalize(self.obs1, self.obs_rms), self.observation_range[0], self.observation_range[1]) with tf.name_scope('state_preprocess'): self.normalized_state0 = tf.clip_by_value( normalize(self.state0, self.state_rms), self.state_range[0], self.state_range[1]) self.normalized_state1 = tf.clip_by_value( normalize(self.state1, self.state_rms), self.state_range[0], self.state_range[1]) with tf.name_scope('aux_preprocess'): self.normalized_aux0 = tf.clip_by_value( normalize(self.aux0, self.aux_rms), self.aux_range[0], self.aux_range[1]) self.normalized_aux1 = tf.clip_by_value( normalize(self.aux1, self.aux_rms), self.aux_range[0], self.aux_range[1]) target_actor = copy(actor) target_actor.name = 'target_actor' self.target_actor = target_actor self.actor_tf, self.obj_conf, self.gripper, self.target = actor( self.normalized_obs0, self.normalized_aux0) next_actions, _, _, _ = target_actor(self.normalized_obs1, self.normalized_aux1) noise = tf.distributions.Normal( tf.zeros_like(next_actions), self.target_policy_noise).sample() noise = tf.clip_by_value( noise, -self.target_policy_noise_clip, self.target_policy_noise_clip, ) # Initialize single/twin critics. self.num_critics = num_critics assert (num_critics == 1 or num_critics == 2) self.critics = [None] * num_critics self.target_critics = [None] * num_critics self.critic_tfs = [None] * num_critics self.critic_with_actor_tfs = [None] * num_critics self.step_1_td_losses = [None] * num_critics self.n_step_td_losses = [None] * num_critics self.td_errors = [None] * num_critics self.critic_losses = [None] * num_critics self.critic_grads = [None] * num_critics self.critic_optimizers = [None] * num_critics Q_obs1s = [None] * num_critics for i in range(num_critics): current_critic = copy(critic) current_critic.name = "critic" + str(i) self.critics[i] = current_critic self.target_critics[i] = copy(current_critic) self.target_critics[i].name = 'target_critic' + str(i) self.critic_tfs[i] = tf.clip_by_value( current_critic(self.normalized_state0, self.actions, self.normalized_aux0), self.return_range[0], self.return_range[1]) self.critic_with_actor_tfs[i] = tf.clip_by_value( current_critic( self.normalized_state0, self.actor_tf, self.normalized_aux0, reuse=True), self.return_range[0], self.return_range[1]) Q_obs1s[i] = self.target_critics[i](self.normalized_state1, next_actions + noise, self.normalized_aux1) if num_critics == 2: minQ = tf.minimum(Q_obs1s[0], Q_obs1s[1]) else: minQ = Q_obs1s[0] self.target_Q = self.rewards + \ (1. 
- self.terminals1) * tf.pow(gamma, self.nstep_steps) * minQ self.importance_weights = tf.placeholder( tf.float32, shape=(None, 1), name='importance_weights') self.setup_actor_optimizer() self.setup_stats() self.setup_target_network_updates() for i in range(num_critics): self.setup_critic_optimizer(i) self.setup_summaries() def setup_target_network_updates(self): with tf.name_scope('target_net_updates'): actor_init_updates, actor_soft_updates = get_target_updates( self.actor.vars, self.target_actor.vars, self.tau) target_init_updates = [actor_init_updates] target_soft_updates = [actor_soft_updates] for i in range(self.num_critics): init, soft = get_target_updates(self.critics[i].vars, self.target_critics[i].vars, self.tau) target_init_updates.append(init) target_soft_updates.append(soft) self.target_init_updates = target_init_updates self.target_soft_updates = target_soft_updates def setup_actor_optimizer(self): logger.info('setting up actor optimizer') with tf.name_scope('actor_optimizer'): self.action_diffs = tf.reduce_mean( tf.square(self.actions - self.actor_tf), 1) demo_better_than_actor = self.critic_tfs[ 0] > self.critic_with_actor_tfs[0] demo_better_than_actor = self.pretraining_tf * \ tf.cast(demo_better_than_actor, tf.float32) self.bc_loss = ( tf.reduce_sum(demo_better_than_actor * self.action_diffs) * self.lambda_pretrain / (tf.reduce_sum(self.pretraining_tf) + 1e-6)) self.original_actor_loss = - tf.reduce_mean(self.critic_with_actor_tfs[0]) self.obj_conf_loss = tf.reduce_mean( tf.square(self.obj_conf - self.state0[:, 8:11])) * self.lambda_obj_conf_predict self.gripper_loss = tf.reduce_mean( tf.square(self.gripper - self.state0[:, 0:3])) * self.lambda_gripper_predict self.target_loss = tf.reduce_mean( tf.square(self.target - self.state0[:, 3:6])) * self.lambda_target_predict self.actor_loss = self.original_actor_loss + self.bc_loss + \ self.obj_conf_loss + self.gripper_loss + self.target_loss self.number_of_demos_better = tf.reduce_sum( demo_better_than_actor) actor_shapes = [ var.get_shape().as_list() for var in self.actor.trainable_vars ] actor_nb_params = sum( [reduce(lambda x, y: x * y, shape) for shape in actor_shapes]) logger.info(' actor shapes: {}'.format(actor_shapes)) logger.info(' actor params: {}'.format(actor_nb_params)) self.actor_grads = U.flatgrad( self.actor_loss, self.actor.trainable_vars, clip_norm=self.clip_norm) self.actor_optimizer = MpiAdam( var_list=self.actor.trainable_vars, beta1=0.9, beta2=0.999, epsilon=1e-08) def setup_critic_optimizer(self, i): with tf.name_scope('critic_optimizer' + str(i)): critic_target_tf = tf.clip_by_value( self.critic_target, self.return_range[0], self.return_range[1]) nstep_critic_target_tf = tf.clip_by_value(self.nstep_critic_target, self.return_range[0], self.return_range[1]) td_error = tf.square(self.critic_tfs[i] - critic_target_tf) self.step_1_td_losses[i] = tf.reduce_mean( self.importance_weights * td_error) * self.lambda_1step nstep_td_error = tf.square(self.critic_tfs[i] - nstep_critic_target_tf) self.n_step_td_losses[i] = tf.reduce_mean( self.importance_weights * nstep_td_error) * self.lambda_nstep self.td_errors[i] = td_error + nstep_td_error self.critic_losses[i] = self.step_1_td_losses[i] + \ self.n_step_td_losses[i] if self.critic_l2_reg > 0.: critic_reg_vars = [ var for var in self.critics[i].trainable_vars if 'kernel' in var.name and 'output' not in var.name ] for var in critic_reg_vars: logger.info(' regularizing: {}'.format(var.name)) logger.info(' applying l2 regularization with {}'.format( 
self.critic_l2_reg)) critic_reg = tc.layers.apply_regularization( tc.layers.l2_regularizer(self.critic_l2_reg), weights_list=critic_reg_vars) self.critic_losses[i] += critic_reg critic_shapes = [ var.get_shape().as_list() for var in self.critics[i].trainable_vars ] critic_nb_params = sum( [reduce(lambda x, y: x * y, shape) for shape in critic_shapes]) logger.info(' critic shapes: {}'.format(critic_shapes)) logger.info(' critic params: {}'.format(critic_nb_params)) self.critic_grads[i] = U.flatgrad( self.critic_losses[i], self.critics[i].trainable_vars, clip_norm=self.clip_norm) self.critic_optimizers[i] = MpiAdam( var_list=self.critics[i].trainable_vars, beta1=0.9, beta2=0.999, epsilon=1e-08) def setup_summaries(self): tf.summary.scalar("actor_loss", self.actor_loss) for i in range(self.num_critics): name_sufffix = str(i) tf.summary.scalar("critic_loss" + name_sufffix, self.critic_losses[i]) tf.summary.scalar("1step_loss" + name_sufffix, self.step_1_td_losses[i]) tf.summary.scalar("nstep_loss" + name_sufffix, self.n_step_td_losses[i]) tf.summary.scalar("percentage_of_demonstrations", tf.reduce_sum(self.pretraining_tf) / self.batch_size) tf.summary.scalar("number_of_demos_better_than_actor", self.number_of_demos_better) tf.summary.histogram("pretrain_samples", self.pretraining_tf) tf.summary.scalar("bc_loss", self.bc_loss) tf.summary.scalar("obj_conf_loss", self.obj_conf_loss) tf.summary.scalar("target_loss", self.target_loss) tf.summary.scalar("gripper_loss", self.gripper_loss) tf.summary.scalar("original_actor_loss", self.original_actor_loss) tf.summary.scalar("memory_size", self.memory_size) tf.summary.scalar("rss", self.rss) self.scalar_summaries = tf.summary.merge_all() # reward self.r_plot_in = tf.placeholder(tf.float32, name='r_plot_in') self.r_plot = tf.summary.scalar("returns", self.r_plot_in) self.r_plot_in_eval = tf.placeholder(tf.float32, name='r_plot_in_eval') self.r_plot_eval = tf.summary.scalar("returns_eval", self.r_plot_in_eval) self.obj_conf_in_eval = tf.placeholder( tf.float32, name='obj_conf_in_eval') self.obj_conf_eval = tf.summary.scalar("obj_conf_eval", self.obj_conf_in_eval) self.grip_in_eval = tf.placeholder(tf.float32, name='grip_in_eval') self.grip_eval = tf.summary.scalar("grip_eval", self.grip_in_eval) self.target_in_eval = tf.placeholder(tf.float32, name='target_in_eval') self.target_eval = tf.summary.scalar("target_eval", self.target_in_eval) self.writer = tf.summary.FileWriter( tmp + '/summaries/' + self.run_name, graph=tf.get_default_graph()) def save_reward(self, r): self.ep += 1 summary = self.sess.run(self.r_plot, feed_dict={self.r_plot_in: r}) self.writer.add_summary(summary, self.ep) def save_aux_prediction(self, obj_conf, grip, target): self.ep += 1 obj_conf_summ, grip_summ, target_summ = self.sess.run( [self.obj_conf_eval, self.grip_eval, self.target_eval], feed_dict={ self.obj_conf_in_eval: obj_conf, self.grip_in_eval: grip, self.target_in_eval: target }) self.writer.add_summary(obj_conf_summ, self.ep) self.writer.add_summary(grip_summ, self.ep) self.writer.add_summary(target_summ, self.ep) def save_eval_reward(self, r, ep): summary = self.sess.run( self.r_plot_eval, feed_dict={self.r_plot_in_eval: r}) self.writer.add_summary(summary, ep) def setup_stats(self): with tf.name_scope('stats'): ops = [] names = [] if self.normalize_observations: ops += [ tf.reduce_mean(self.obs_rms.mean), tf.reduce_mean(self.obs_rms.std) ] names += ['obs_rms_mean', 'obs_rms_std'] ops += [tf.reduce_mean(self.critic_tfs[0])] names += ['reference_Q_mean'] ops += 
[reduce_std(self.critic_tfs[0])] names += ['reference_Q_std'] ops += [tf.reduce_mean(self.critic_with_actor_tfs[0])] names += ['reference_actor_Q_mean'] ops += [reduce_std(self.critic_with_actor_tfs[0])] names += ['reference_actor_Q_std'] ops += [tf.reduce_mean(self.actor_tf)] names += ['reference_action_mean'] ops += [reduce_std(self.actor_tf)] names += ['reference_action_std'] self.stats_ops = ops self.stats_names = names def pi(self, obs, aux, state0, apply_noise=True, compute_Q=True): actor_tf = self.actor_tf feed_dict = {self.obs0: [obs], self.aux0: [aux], self.state0: [state0]} if compute_Q: action, q, obj_conf, gripper, target = self.sess.run( [ actor_tf, self.critic_with_actor_tfs[0], self.obj_conf, self.gripper, self.target ], feed_dict=feed_dict) else: action, obj_conf, gripper, target = self.sess.run( [actor_tf, self.obj_conf, self.gripper, self.target], feed_dict=feed_dict) q = None action = action.flatten() if self.action_noise is not None and apply_noise: noise = self.action_noise() assert noise.shape == action.shape action += noise action = np.clip(action, self.action_range[0], self.action_range[1]) return action, q, obj_conf, gripper, target def store_transition(self, state, obs0, action, reward, state1, obs1, terminal1, aux0, aux1, i, demo=False): reward *= self.reward_scale if demo: self.memory.append_demonstration(state, obs0, action, reward, state1, obs1, terminal1, aux0, aux1, i) else: assert i is None self.memory.append(state, obs0, action, reward, state1, obs1, terminal1, aux0, aux1, i) if self.normalize_observations: self.obs_rms.update(np.array([obs0])) if self.normalize_state: self.state_rms.update(np.array([state])) if self.normalize_aux: self.aux_rms.update(np.array([aux0])) def train(self, iteration, pretrain=False): batch, n_step_batch, percentage = self.memory.sample_rollout( batch_size=self.batch_size, nsteps=self.nsteps, beta=self.replay_beta, gamma=self.gamma, pretrain=pretrain) target_Q_1step = self.sess.run( self.target_Q, feed_dict={ self.obs1: batch['obs1'], self.state1: batch['states1'], self.aux1: batch['aux1'], self.rewards: batch['rewards'], self.terminals1: batch['terminals1'].astype('float32'), self.nstep_steps: np.ones((self.batch_size, 1)), }) target_Q_nstep = self.sess.run( self.target_Q, feed_dict={ self.obs1: n_step_batch['obs1'], self.state1: n_step_batch['states1'], self.aux1: n_step_batch['aux1'], self.rewards: n_step_batch['rewards'], self.nstep_steps: n_step_batch['step_reached'], self.terminals1: n_step_batch['terminals1'].astype('float32'), }) critic_grads = [None] * self.num_critics critic_losses = [None] * self.num_critics td_errors = [None] * self.num_critics # Get all gradients and perform a synced update. 
ops = [ self.actor_grads, self.actor_loss, *self.critic_grads, *self.critic_losses, *self.td_errors, self.scalar_summaries ] ret = self.sess.run( ops, feed_dict={ self.obs0: batch['obs0'], self.importance_weights: batch['weights'], self.state0: batch['states0'], self.aux0: batch['aux0'], self.actions: batch['actions'], self.critic_target: target_Q_1step, self.nstep_critic_target: target_Q_nstep, self.pretraining_tf: batch['demos'].astype('float32'), self.memory_size: len(self.memory.storage), self.rss: resource.getrusage(resource.RUSAGE_SELF).ru_maxrss }) if self.num_critics == 2: actor_grads, actor_loss, critic_grads[0], critic_grads[1], critic_losses[0], critic_losses[1], td_errors[0], \ td_errors[1], scalar_summaries = ret else: actor_grads, actor_loss, critic_grads[0], critic_losses[ 0], td_errors[0], scalar_summaries = ret self.memory.update_priorities(batch['idxes'], td_errors[0]) for i in range(self.num_critics): self.critic_optimizers[i].update( critic_grads[i], stepsize=self.critic_lr) self.writer.add_summary(scalar_summaries, iteration) if iteration % self.policy_and_target_update_period == 0: self.actor_optimizer.update(actor_grads, stepsize=self.actor_lr) return critic_losses[0], actor_loss def set_sess(self, sess): self.sess = sess def initialize_vars(self): self.sess.run(tf.global_variables_initializer()) def sync_optimizers(self): self.actor_optimizer.sync() for i in range(self.num_critics): self.critic_optimizers[i].sync() self.sess.run(self.target_init_updates) def update_target_net(self): self.sess.run(self.target_soft_updates) def get_stats(self): if self.stats_sample is None: self.stats_sample = self.memory.sample_prioritized( batch_size=self.batch_size, replay_beta=self.replay_beta) values = self.sess.run( self.stats_ops, feed_dict={ self.obs0: self.stats_sample['obs0'], self.actions: self.stats_sample['actions'], self.aux0: self.stats_sample['aux0'], self.state0: self.stats_sample['states0'], }) names = self.stats_names[:] assert len(names) == len(values) stats = dict(zip(names, values)) return stats def reset(self): # Reset internal state after an episode is complete. 
if self.action_noise is not None: self.action_noise.reset() def write_summary(self, summary): agent_summary = { "gamma": self.gamma, "tau": self.tau, "normalize_observations": self.normalize_observations, "normalize_state": self.normalize_state, "normalize_aux": self.normalize_aux, "action_noise": self.action_noise, "action_range": self.action_range, "return_range": self.return_range, "observation_range": self.observation_range, "actor_lr": self.actor_lr, "state_range": self.state_range, "critic_lr": self.critic_lr, "clip_norm": self.clip_norm, "enable_popart": self.enable_popart, "reward_scale": self.reward_scale, "batch_size": self.batch_size, "critic_l2_reg": self.critic_l2_reg, "lambda_nstep": self.lambda_nstep, "lambda_1step": self.lambda_1step, "nsteps": self.nsteps, "replay_beta": self.replay_beta, "run_name": self.run_name, "lambda_pretrain": self.lambda_pretrain, "target_policy_noise": self.target_policy_noise, "target_policy_noise_clip": self.target_policy_noise_clip, "lambda_obj_conf_predict": self.lambda_obj_conf_predict, "lambda_target_predict": self.lambda_target_predict, "lambda_gripper_predict": self.lambda_gripper_predict, } summary["agent_summary"] = agent_summary md_string = self._markdownize_summary(summary) summary_op = tf.summary.text("param_info", tf.convert_to_tensor(md_string)) text = self.sess.run(summary_op) self.writer.add_summary(text) self.writer.flush() print(md_string) @staticmethod def _markdownize_summary(data): result = [] for section, params in data.items(): result.append("### " + section) for param, value in params.items(): result.append("* {} : {}".format(str(param), str(value))) return "\n".join(result)
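# The class above regresses each critic toward a mix of a 1-step and an n-step target of
# the form r + gamma**k * min_i Q_i'(s_k, pi'(s_k)), where k is how many steps the n-step
# rollout actually reached before terminating (k = 1 for the 1-step batch). A minimal NumPy
# sketch of that target; it assumes `summed_rewards` already holds the discounted reward
# sum over the rollout, and the names are illustrative rather than the buffer's real fields.
import numpy as np

def mixed_td_target(summed_rewards, bootstrap_q, steps_reached, terminal, gamma=0.99):
    """summed_rewards, bootstrap_q, steps_reached, terminal: (B, 1) arrays.
    bootstrap_q is the min over the twin target critics at the state where the rollout
    stopped; terminal is 1.0 if that state was terminal (no bootstrap).
    """
    return summed_rewards + (1.0 - terminal) * np.power(gamma, steps_reached) * bootstrap_q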
class DDPG(object): def __init__(self, prefix, actor, critic, memory, observation_shape, action_shape, param_noise=None, action_noise=None, gamma=0.99, tau=0.001, normalize_returns=False, enable_popart=False, normalize_observations=True, batch_size=128, dis_batch_size=512, observation_range=(-5., 5.), action_range=(-1., 1.), return_range=(-np.inf, np.inf), adaptive_param_noise=True, adaptive_param_noise_policy_threshold=.1, critic_l2_reg=0., actor_lr=1e-4, actor_dis_lr=1e-4, critic_lr=1e-3, exp_scale=1.0, clip_norm=None, reward_scale=1.): # Inputs. self.obs0 = tf.placeholder(tf.float32, shape=(None, ) + observation_shape, name='obs0_' + prefix) self.obs1 = tf.placeholder(tf.float32, shape=(None, ) + observation_shape, name='obs1_' + prefix) self.terminals1 = tf.placeholder(tf.float32, shape=(None, 1), name='terminals1_' + prefix) self.rewards = tf.placeholder(tf.float32, shape=(None, 1), name='rewards_' + prefix) self.actions = tf.placeholder(tf.float32, shape=(None, ) + action_shape, name='actions_' + prefix) self.critic_target = tf.placeholder(tf.float32, shape=(None, 1), name='critic_target_' + prefix) self.param_noise_stddev = tf.placeholder(tf.float32, shape=(), name='param_noise_stddev_' + prefix) self.EXP_SCALE = tf.placeholder(tf.float32, []) # For distillation #self.dis_obs = tf.placeholder(tf.float32, shape=(None,) + observation_shape, name='dis_obs_' + prefix) self.dis_actions = tf.placeholder(tf.float32, shape=(None, ) + action_shape, name='dis_actions_' + prefix) self.dis_qs = tf.placeholder(tf.float32, shape=(None, 1), name='dis_qs_' + prefix) self.prefix = prefix # Parameters. self.gamma = gamma self.tau = tau self.memory = memory self.normalize_observations = normalize_observations self.normalize_returns = normalize_returns self.action_noise = action_noise self.param_noise = param_noise self.action_range = action_range self.return_range = return_range self.observation_range = observation_range self.critic = critic self.actor = actor self.actor_lr = actor_lr self.actor_dis_lr = actor_dis_lr self.critic_lr = critic_lr self.exp_scale = exp_scale self.clip_norm = clip_norm self.enable_popart = enable_popart self.reward_scale = reward_scale self.batch_size = batch_size self.dis_batch_size = dis_batch_size self.stats_sample = None self.critic_l2_reg = critic_l2_reg # Observation normalization. if self.normalize_observations: with tf.variable_scope('obs_rms_' + self.prefix): self.obs_rms = RunningMeanStd(shape=observation_shape) else: self.obs_rms = None normalized_obs0 = tf.clip_by_value(normalize(self.obs0, self.obs_rms), self.observation_range[0], self.observation_range[1]) normalized_obs1 = tf.clip_by_value(normalize(self.obs1, self.obs_rms), self.observation_range[0], self.observation_range[1]) # Return normalization. if self.normalize_returns: with tf.variable_scope('ret_rms_' + self.prefix): self.ret_rms = RunningMeanStd() else: self.ret_rms = None # Create target networks. target_actor = copy(actor) target_actor.name = 'target_actor_' + self.prefix self.target_actor = target_actor target_critic = copy(critic) target_critic.name = 'target_critic_' + self.prefix self.target_critic = target_critic # Create networks and core TF parts that are shared across setup parts. 
self.actor_tf = actor(normalized_obs0) self.normalized_critic_tf = critic(normalized_obs0, self.actions) self.critic_tf = denormalize( tf.clip_by_value(self.normalized_critic_tf, self.return_range[0], self.return_range[1]), self.ret_rms) self.normalized_critic_with_actor_tf = critic(normalized_obs0, self.actor_tf, reuse=True) self.critic_with_actor_tf = denormalize( tf.clip_by_value(self.normalized_critic_with_actor_tf, self.return_range[0], self.return_range[1]), self.ret_rms) Q_obs1 = denormalize( target_critic(normalized_obs1, target_actor(normalized_obs1)), self.ret_rms) self.target_Q = self.rewards + (1. - self.terminals1) * gamma * Q_obs1 # Set up parts. if self.param_noise is not None: self.setup_param_noise(normalized_obs0) self.setup_actor_optimizer() self.setup_actor_dis_optimizer() self.setup_critic_optimizer() if self.normalize_returns and self.enable_popart: self.setup_popart() self.setup_stats() self.setup_target_network_updates() def setup_target_network_updates(self): actor_init_updates, actor_soft_updates = get_target_updates( self.actor.vars, self.target_actor.vars, self.tau) critic_init_updates, critic_soft_updates = get_target_updates( self.critic.vars, self.target_critic.vars, self.tau) self.target_init_updates = [actor_init_updates, critic_init_updates] self.target_soft_updates = [actor_soft_updates, critic_soft_updates] def setup_param_noise(self, normalized_obs0): assert self.param_noise is not None # Configure perturbed actor. param_noise_actor = copy(self.actor) param_noise_actor.name = 'param_noise_actor_' + self.prefix self.perturbed_actor_tf = param_noise_actor(normalized_obs0) logger.info('setting up param noise') self.perturb_policy_ops = get_perturbed_actor_updates( self.actor, param_noise_actor, self.param_noise_stddev) # Configure separate copy for stddev adoption. 
adaptive_param_noise_actor = copy(self.actor) adaptive_param_noise_actor.name = 'adaptive_param_noise_actor_' + self.prefix adaptive_actor_tf = adaptive_param_noise_actor(normalized_obs0) self.perturb_adaptive_policy_ops = get_perturbed_actor_updates( self.actor, adaptive_param_noise_actor, self.param_noise_stddev) self.adaptive_policy_distance = tf.sqrt( tf.reduce_mean(tf.square(self.actor_tf - adaptive_actor_tf))) def setup_actor_optimizer(self): logger.info('setting up actor optimizer') self.actor_loss = -tf.reduce_mean(self.critic_with_actor_tf) actor_shapes = [ var.get_shape().as_list() for var in self.actor.trainable_vars ] actor_nb_params = sum( [reduce(lambda x, y: x * y, shape) for shape in actor_shapes]) logger.info(' actor shapes: {}'.format(actor_shapes)) logger.info(' actor params: {}'.format(actor_nb_params)) self.actor_grads = U.flatgrad(self.actor_loss, self.actor.trainable_vars, clip_norm=self.clip_norm) self.actor_optimizer = MpiAdam(var_list=self.actor.trainable_vars, beta1=0.9, beta2=0.999, epsilon=1e-08) def setup_actor_dis_optimizer(self): logger.info('setting up actor distillation optimizer') self.weights = tf.stop_gradient( tf.clip_by_value( tf.exp( tf.math.scalar_mul(self.EXP_SCALE, self.dis_qs - self.critic_tf)), 0.01, 100)) self.weights = self.weights / tf.reduce_sum(self.weights) self.actor_dis_loss = tf.reduce_sum( tf.math.multiply( self.weights, tf.reduce_mean(tf.square(self.actor_tf - self.dis_actions), axis=1))) actor_dis_shapes = [ var.get_shape().as_list() for var in self.actor.trainable_vars ] self.actor_dis_grads = U.flatgrad(self.actor_dis_loss, self.actor.trainable_vars, clip_norm=self.clip_norm) self.actor_dis_optimizer = MpiAdam(var_list=self.actor.trainable_vars, beta1=0.9, beta2=0.999, epsilon=1e-08) def setup_critic_optimizer(self): logger.info('setting up critic optimizer') normalized_critic_target_tf = tf.clip_by_value( normalize(self.critic_target, self.ret_rms), self.return_range[0], self.return_range[1]) self.critic_loss = tf.reduce_mean( tf.square(self.normalized_critic_tf - normalized_critic_target_tf)) if self.critic_l2_reg > 0.: critic_reg_vars = [ var for var in self.critic.trainable_vars if 'kernel' in var.name and 'output' not in var.name ] for var in critic_reg_vars: logger.info(' regularizing: {}'.format(var.name)) logger.info(' applying l2 regularization with {}'.format( self.critic_l2_reg)) critic_reg = tc.layers.apply_regularization( tc.layers.l2_regularizer(self.critic_l2_reg), weights_list=critic_reg_vars) self.critic_loss += critic_reg critic_shapes = [ var.get_shape().as_list() for var in self.critic.trainable_vars ] critic_nb_params = sum( [reduce(lambda x, y: x * y, shape) for shape in critic_shapes]) logger.info(' critic shapes: {}'.format(critic_shapes)) logger.info(' critic params: {}'.format(critic_nb_params)) self.critic_grads = U.flatgrad(self.critic_loss, self.critic.trainable_vars, clip_norm=self.clip_norm) self.critic_optimizer = MpiAdam(var_list=self.critic.trainable_vars, beta1=0.9, beta2=0.999, epsilon=1e-08) def setup_popart(self): # See https://arxiv.org/pdf/1602.07714.pdf for details. 
self.old_std = tf.placeholder(tf.float32, shape=[1], name='old_std_' + self.prefix) new_std = self.ret_rms.std self.old_mean = tf.placeholder(tf.float32, shape=[1], name='old_mean_' + self.prefix) new_mean = self.ret_rms.mean self.renormalize_Q_outputs_op = [] for vs in [self.critic.output_vars, self.target_critic.output_vars]: assert len(vs) == 2 M, b = vs assert 'kernel' in M.name assert 'bias' in b.name assert M.get_shape()[-1] == 1 assert b.get_shape()[-1] == 1 self.renormalize_Q_outputs_op += [ M.assign(M * self.old_std / new_std) ] self.renormalize_Q_outputs_op += [ b.assign( (b * self.old_std + self.old_mean - new_mean) / new_std) ] def setup_stats(self): ops = [] names = [] if self.normalize_returns: ops += [self.ret_rms.mean, self.ret_rms.std] names += ['ret_rms_mean', 'ret_rms_std'] if self.normalize_observations: ops += [ tf.reduce_mean(self.obs_rms.mean), tf.reduce_mean(self.obs_rms.std) ] names += ['obs_rms_mean', 'obs_rms_std'] ops += [tf.reduce_mean(self.critic_tf)] names += ['reference_Q_mean'] ops += [reduce_std(self.critic_tf)] names += ['reference_Q_std'] ops += [tf.reduce_mean(self.critic_with_actor_tf)] names += ['reference_actor_Q_mean'] ops += [reduce_std(self.critic_with_actor_tf)] names += ['reference_actor_Q_std'] ops += [tf.reduce_mean(self.actor_tf)] names += ['reference_action_mean'] ops += [reduce_std(self.actor_tf)] names += ['reference_action_std'] if self.param_noise: ops += [tf.reduce_mean(self.perturbed_actor_tf)] names += ['reference_perturbed_action_mean'] ops += [reduce_std(self.perturbed_actor_tf)] names += ['reference_perturbed_action_std'] self.stats_ops = ops self.stats_names = names def pi(self, obs, apply_noise=True, compute_Q=True): if self.param_noise is not None and apply_noise: actor_tf = self.perturbed_actor_tf else: actor_tf = self.actor_tf feed_dict = {self.obs0: [obs]} if compute_Q: action, q = self.sess.run([actor_tf, self.critic_with_actor_tf], feed_dict=feed_dict) else: action = self.sess.run(actor_tf, feed_dict=feed_dict) q = None action = action.flatten() if self.action_noise is not None and apply_noise: noise = self.action_noise() assert noise.shape == action.shape action += noise action = np.clip(action, self.action_range[0], self.action_range[1]) return action, q def pi_batch(self, obs_batch): actor_tf = self.actor_tf feed_dict = {self.obs0: obs_batch} action, q = self.sess.run([actor_tf, self.critic_with_actor_tf], feed_dict=feed_dict) action = np.clip(action, self.action_range[0], self.action_range[1]) return action, q def store_transition(self, obs0, action, reward, obs1, terminal1): reward *= self.reward_scale self.memory.append(obs0, action, reward, obs1, terminal1) if self.normalize_observations: self.obs_rms.update(np.array([obs0])) def train(self): # Get a batch. batch = self.memory.sample(batch_size=self.batch_size) if self.normalize_returns and self.enable_popart: old_mean, old_std, target_Q = self.sess.run( [self.ret_rms.mean, self.ret_rms.std, self.target_Q], feed_dict={ self.obs1: batch['obs1'], self.rewards: batch['rewards'], self.terminals1: batch['terminals1'].astype('float32'), }) self.ret_rms.update(target_Q.flatten()) self.sess.run(self.renormalize_Q_outputs_op, feed_dict={ self.old_std: np.array([old_std]), self.old_mean: np.array([old_mean]), }) # Run sanity check. Disabled by default since it slows down things considerably. 
# print('running sanity check') # target_Q_new, new_mean, new_std = self.sess.run([self.target_Q, self.ret_rms.mean, self.ret_rms.std], feed_dict={ # self.obs1: batch['obs1'], # self.rewards: batch['rewards'], # self.terminals1: batch['terminals1'].astype('float32'), # }) # print(target_Q_new, target_Q, new_mean, new_std) # assert (np.abs(target_Q - target_Q_new) < 1e-3).all() else: target_Q = self.sess.run(self.target_Q, feed_dict={ self.obs1: batch['obs1'], self.rewards: batch['rewards'], self.terminals1: batch['terminals1'].astype('float32'), }) # Get all gradients and perform a synced update. ops = [ self.actor_grads, self.actor_loss, self.critic_grads, self.critic_loss ] actor_grads, actor_loss, critic_grads, critic_loss = self.sess.run( ops, feed_dict={ self.obs0: batch['obs0'], self.actions: batch['actions'], self.critic_target: target_Q, }) self.actor_optimizer.update(actor_grads, stepsize=self.actor_lr) self.critic_optimizer.update(critic_grads, stepsize=self.critic_lr) cl0 = critic_loss al0 = actor_loss return cl0, al0 def dis_train(self): if self.partner_agent.memory.nb_entries > 0: batch = self.partner_agent.memory.sample( batch_size=self.dis_batch_size) #print('############ Checking ##############') #print('Batch: ', batch) #print('Batch shape: ', batch['obs0'].shape) obs_batch = batch['obs0'] # Note that, here, the q is denormalized partner_action_batch, partner_q_batch = self.partner_agent.pi_batch( obs_batch) # Actor Distillation ops = [ self.actor_dis_grads, self.actor_dis_loss, self.weights, self.critic_tf ] actor_dis_grads, actor_dis_loss, weights, qs = self.sess.run( ops, feed_dict={ self.obs0: batch['obs0'], self.dis_actions: partner_action_batch, self.actions: partner_action_batch, self.dis_qs: partner_q_batch, self.EXP_SCALE: self.exp_scale, }) #print('########## Checking ###########') #for i in range(weights.shape[0]): # print(weights[i], ' ', partner_q_batch[0], ' ', qs[0]) #print('Sum: ', np.sum(weights)) #print('Action Batch: ', action_batch) #print('Actor Distiallation Loss: ', actor_dis_loss) #print('Action Batch: ', action_batch) #print('Q Batch: ', q_batch) self.actor_dis_optimizer.update(actor_dis_grads, stepsize=self.actor_dis_lr) return actor_dis_loss return 0.0 def initialize(self, sess): self.sess = sess self.sess.run(tf.global_variables_initializer()) self.actor_optimizer.sync() self.critic_optimizer.sync() self.sess.run(self.target_init_updates) def update_target_net(self): self.sess.run(self.target_soft_updates) def get_stats(self): if self.stats_sample is None: # Get a sample and keep that fixed for all further computations. # This allows us to estimate the change in value for the same set of inputs. self.stats_sample = self.memory.sample(batch_size=self.batch_size) values = self.sess.run(self.stats_ops, feed_dict={ self.obs0: self.stats_sample['obs0'], self.actions: self.stats_sample['actions'], }) names = self.stats_names[:] assert len(names) == len(values) stats = dict(zip(names, values)) if self.param_noise is not None: stats = {**stats, **self.param_noise.get_stats()} return stats def adapt_param_noise(self): if self.param_noise is None: return 0. # Perturb a separate copy of the policy to adjust the scale for the next "real" perturbation. 
batch = self.memory.sample(batch_size=self.batch_size) self.sess.run(self.perturb_adaptive_policy_ops, feed_dict={ self.param_noise_stddev: self.param_noise.current_stddev, }) distance = self.sess.run(self.adaptive_policy_distance, feed_dict={ self.obs0: batch['obs0'], self.param_noise_stddev: self.param_noise.current_stddev, }) mean_distance = MPI.COMM_WORLD.allreduce( distance, op=MPI.SUM) / MPI.COMM_WORLD.Get_size() self.param_noise.adapt(mean_distance) return mean_distance def reset(self): # Reset internal state after an episode is complete. if self.action_noise is not None: self.action_noise.reset() if self.param_noise is not None: self.sess.run(self.perturb_policy_ops, feed_dict={ self.param_noise_stddev: self.param_noise.current_stddev, }) ### For dual imitation def set_partner_agent(self, agent): self.partner_agent = agent
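# The distillation loss above imitates the partner agent's actions with per-sample weights
# w = clip(exp(scale * (Q_partner - Q_own)), 0.01, 100), normalized over the batch, so
# samples where the partner's value estimate beats this agent's own critic count more.
# A NumPy sketch of that weighting; the argument names are illustrative.
import numpy as np

def q_weighted_distillation_loss(own_actions, partner_actions, q_partner, q_own, exp_scale=1.0):
    """own_actions, partner_actions: (B, act_dim) arrays; q_partner, q_own: (B,) arrays."""
    w = np.clip(np.exp(exp_scale * (q_partner - q_own)), 0.01, 100.0)
    w = w / np.sum(w)                                                   # normalize over the batch
    per_sample = np.mean((own_actions - partner_actions) ** 2, axis=1)  # action MSE per sample
    return np.sum(w * per_sample)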
class DDPG(object): def __init__(self, actor, critic, memory, observation_shape, action_shape, param_noise=None, action_noise=None, gamma=0.99, tau=0.001, normalize_returns=False, enable_popart=False, normalize_observations=True, batch_size=128, observation_range=(-5., 5.), action_range=(-1., 1.), return_range=(-np.inf, np.inf), adaptive_param_noise=True, adaptive_param_noise_policy_threshold=.1, critic_l2_reg=0., actor_lr=1e-4, critic_lr=1e-3, clip_norm=None, reward_scale=1.): # Inputs. self.obs0 = tf.placeholder(tf.float32, shape=(None, ) + observation_shape, name='obs0') self.obs1 = tf.placeholder(tf.float32, shape=(None, ) + observation_shape, name='obs1') self.terminals1 = tf.placeholder(tf.float32, shape=(None, 1), name='terminals1') self.rewards = tf.placeholder(tf.float32, shape=(None, 1), name='rewards') self.actions = tf.placeholder(tf.float32, shape=(None, ) + action_shape, name='actions') self.param_noise_stddev = tf.placeholder(tf.float32, shape=(), name='param_noise_stddev') # Parameters. self.gamma = gamma self.tau = tau self.memory = memory self.normalize_observations = normalize_observations self.normalize_returns = normalize_returns self.action_noise = action_noise self.param_noise = param_noise self.action_range = action_range self.return_range = return_range self.observation_range = observation_range self.critic = critic self.actor = actor self.actor_lr = actor_lr self.critic_lr = critic_lr self.clip_norm = clip_norm self.enable_popart = enable_popart self.reward_scale = reward_scale self.batch_size = batch_size self.stats_sample = None self.critic_l2_reg = critic_l2_reg # Observation normalization. if self.normalize_observations: with tf.variable_scope('obs_rms'): self.obs_rms = RunningMeanStd(shape=observation_shape) else: self.obs_rms = None normalized_obs0 = tf.clip_by_value(normalize(self.obs0, self.obs_rms), self.observation_range[0], self.observation_range[1]) normalized_obs1 = tf.clip_by_value(normalize(self.obs1, self.obs_rms), self.observation_range[0], self.observation_range[1]) # Return normalization. if self.normalize_returns: with tf.variable_scope('ret_rms'): self.ret_rms = RunningMeanStd() else: self.ret_rms = None # Create networks and core TF parts that are shared across setup parts. self.actor_tf = actor(normalized_obs0) self.normalized_critic_tf = critic(normalized_obs0, self.actions) self.critic_tf = denormalize( tf.clip_by_value(self.normalized_critic_tf, self.return_range[0], self.return_range[1]), self.ret_rms) self.normalized_critic_with_actor_tf = critic(normalized_obs0, self.actor_tf, reuse=True) self.critic_with_actor_tf = denormalize( tf.clip_by_value(self.normalized_critic_with_actor_tf, self.return_range[0], self.return_range[1]), self.ret_rms) Q_obs1 = denormalize( critic(normalized_obs1, actor(normalized_obs1, reuse=True), reuse=True), self.ret_rms) self.target_Q = self.rewards + (1. - self.terminals1) * gamma * Q_obs1 self.lag_mult = tf.Variable(1.0, name='lag_mult', trainable=True, import_scope='lag_mult') # Set up parts. if self.param_noise is not None: self.setup_param_noise(normalized_obs0) self.setup_optimizer() if self.normalize_returns and self.enable_popart: self.setup_popart() self.setup_stats() def setup_param_noise(self, normalized_obs0): assert self.param_noise is not None # Configure perturbed actor. 
param_noise_actor = copy(self.actor) param_noise_actor.name = 'param_noise_actor' self.perturbed_actor_tf = param_noise_actor(normalized_obs0) logger.info('setting up param noise') self.perturb_policy_ops = get_perturbed_actor_updates( self.actor, param_noise_actor, self.param_noise_stddev) # Configure separate copy for stddev adoption. adaptive_param_noise_actor = copy(self.actor) adaptive_param_noise_actor.name = 'adaptive_param_noise_actor' adaptive_actor_tf = adaptive_param_noise_actor(normalized_obs0) self.perturb_adaptive_policy_ops = get_perturbed_actor_updates( self.actor, adaptive_param_noise_actor, self.param_noise_stddev) self.adaptive_policy_distance = tf.sqrt( tf.reduce_mean(tf.square(self.actor_tf - adaptive_actor_tf))) def setup_optimizer(self): logger.info('setting up actor and critic optimizer') self.actor_loss = -tf.reduce_mean(self.critic_with_actor_tf) actor_shapes = [ var.get_shape().as_list() for var in self.actor.trainable_vars ] actor_nb_params = sum( [reduce(lambda x, y: x * y, shape) for shape in actor_shapes]) logger.info(' actor shapes: {}'.format(actor_shapes)) logger.info(' actor params: {}'.format(actor_nb_params)) normalized_critic_target_tf = tf.clip_by_value( normalize(self.target_Q, self.ret_rms), self.return_range[0], self.return_range[1]) self.critic_loss = tf.reduce_mean( tf.square(self.normalized_critic_tf - normalized_critic_target_tf)) if self.critic_l2_reg > 0.: critic_reg_vars = [ var for var in self.critic.trainable_vars if 'kernel' in var.name and 'output' not in var.name ] for var in critic_reg_vars: logger.info(' regularizing: {}'.format(var.name)) logger.info(' applying l2 regularization with {}'.format( self.critic_l2_reg)) critic_reg = tc.layers.apply_regularization( tc.layers.l2_regularizer(self.critic_l2_reg), weights_list=critic_reg_vars) self.critic_loss += critic_reg critic_shapes = [ var.get_shape().as_list() for var in self.critic.trainable_vars ] critic_nb_params = sum( [reduce(lambda x, y: x * y, shape) for shape in critic_shapes]) logger.info(' critic shapes: {}'.format(critic_shapes)) logger.info(' critic params: {}'.format(critic_nb_params)) self.actor_grads = U.flatgrad(self.actor_loss + self.lag_mult * self.critic_loss, self.actor.trainable_vars, clip_norm=self.clip_norm) self.actor_optimizer = MpiAdam(var_list=self.actor.trainable_vars, beta1=0.9, beta2=0.999, epsilon=1e-08) self.critic_grads = U.flatgrad(self.actor_loss + self.lag_mult * self.critic_loss, self.critic.trainable_vars, clip_norm=self.clip_norm) self.critic_optimizer = MpiAdam(var_list=self.critic.trainable_vars, beta1=0.9, beta2=0.999, epsilon=1e-08) self.lag_grads = U.flatgrad( self.actor_loss + self.lag_mult * self.critic_loss, tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='lag_mult')) self.lag_optimizer = MpiAdam(var_list=tf.get_collection( tf.GraphKeys.TRAINABLE_VARIABLES, scope='lag_mult'), beta1=0.9, beta2=0.999, epsilon=1e-08) def setup_actor_optimizer(self): logger.info('setting up actor optimizer') self.actor_loss = -tf.reduce_mean(self.critic_with_actor_tf) actor_shapes = [ var.get_shape().as_list() for var in self.actor.trainable_vars ] actor_nb_params = sum( [reduce(lambda x, y: x * y, shape) for shape in actor_shapes]) logger.info(' actor shapes: {}'.format(actor_shapes)) logger.info(' actor params: {}'.format(actor_nb_params)) self.actor_grads = U.flatgrad(self.actor_loss, self.actor.trainable_vars, clip_norm=self.clip_norm) self.actor_optimizer = MpiAdam(var_list=self.actor.trainable_vars, beta1=0.9, beta2=0.999, epsilon=1e-08) def 
setup_critic_optimizer(self): logger.info('setting up critic optimizer') normalized_critic_target_tf = tf.clip_by_value( normalize(self.target_Q, self.ret_rms), self.return_range[0], self.return_range[1]) self.critic_loss = tf.reduce_mean( tf.square(self.normalized_critic_tf - normalized_critic_target_tf)) if self.critic_l2_reg > 0.: critic_reg_vars = [ var for var in self.critic.trainable_vars if 'kernel' in var.name and 'output' not in var.name ] for var in critic_reg_vars: logger.info(' regularizing: {}'.format(var.name)) logger.info(' applying l2 regularization with {}'.format( self.critic_l2_reg)) critic_reg = tc.layers.apply_regularization( tc.layers.l2_regularizer(self.critic_l2_reg), weights_list=critic_reg_vars) self.critic_loss += critic_reg critic_shapes = [ var.get_shape().as_list() for var in self.critic.trainable_vars ] critic_nb_params = sum( [reduce(lambda x, y: x * y, shape) for shape in critic_shapes]) logger.info(' critic shapes: {}'.format(critic_shapes)) logger.info(' critic params: {}'.format(critic_nb_params)) self.critic_grads = U.flatgrad(self.critic_loss, self.critic.trainable_vars, clip_norm=self.clip_norm) self.critic_optimizer = MpiAdam(var_list=self.critic.trainable_vars, beta1=0.9, beta2=0.999, epsilon=1e-08) def setup_stats(self): ops = [] names = [] if self.normalize_returns: ops += [self.ret_rms.mean, self.ret_rms.std] names += ['ret_rms_mean', 'ret_rms_std'] if self.normalize_observations: ops += [ tf.reduce_mean(self.obs_rms.mean), tf.reduce_mean(self.obs_rms.std) ] names += ['obs_rms_mean', 'obs_rms_std'] ops += [tf.reduce_mean(self.critic_tf)] names += ['reference_Q_mean'] ops += [reduce_std(self.critic_tf)] names += ['reference_Q_std'] ops += [tf.reduce_mean(self.critic_with_actor_tf)] names += ['reference_actor_Q_mean'] ops += [reduce_std(self.critic_with_actor_tf)] names += ['reference_actor_Q_std'] ops += [tf.reduce_mean(self.actor_tf)] names += ['reference_action_mean'] ops += [reduce_std(self.actor_tf)] names += ['reference_action_std'] if self.param_noise: ops += [tf.reduce_mean(self.perturbed_actor_tf)] names += ['reference_perturbed_action_mean'] ops += [reduce_std(self.perturbed_actor_tf)] names += ['reference_perturbed_action_std'] self.stats_ops = ops self.stats_names = names def pi(self, obs, apply_noise=True, compute_Q=True): if self.param_noise is not None and apply_noise: actor_tf = self.perturbed_actor_tf else: actor_tf = self.actor_tf feed_dict = {self.obs0: [obs]} if compute_Q: action, q = self.sess.run([actor_tf, self.critic_with_actor_tf], feed_dict=feed_dict) else: action = self.sess.run(actor_tf, feed_dict=feed_dict) q = None action = action.flatten() if self.action_noise is not None and apply_noise: noise = self.action_noise() assert noise.shape == action.shape action += noise action = np.clip(action, self.action_range[0], self.action_range[1]) return action, q def store_transition(self, obs0, action, reward, obs1, terminal1): reward *= self.reward_scale self.memory.append(obs0, action, reward, obs1, terminal1) if self.normalize_observations: self.obs_rms.update(np.array([obs0])) def train(self): # Get a batch. 
batch = self.memory.sample(batch_size=self.batch_size) if self.normalize_returns and self.enable_popart: old_mean, old_std, target_Q = self.sess.run( [self.ret_rms.mean, self.ret_rms.std, self.target_Q], feed_dict={ self.obs1: batch['obs1'], self.rewards: batch['rewards'], self.terminals1: batch['terminals1'].astype('float32'), }) self.ret_rms.update(target_Q.flatten()) self.sess.run(self.renormalize_Q_outputs_op, feed_dict={ self.old_std: np.array([old_std]), self.old_mean: np.array([old_mean]), }) # Run sanity check. Disabled by default since it slows down things considerably. # print('running sanity check') # target_Q_new, new_mean, new_std = self.sess.run([self.target_Q, self.ret_rms.mean, self.ret_rms.std], feed_dict={ # self.obs1: batch['obs1'], # self.rewards: batch['rewards'], # self.terminals1: batch['terminals1'].astype('float32'), # }) # print(target_Q_new, target_Q, new_mean, new_std) # assert (np.abs(target_Q - target_Q_new) < 1e-3).all() else: target_Q = self.sess.run(self.target_Q, feed_dict={ self.obs1: batch['obs1'], self.rewards: batch['rewards'], self.terminals1: batch['terminals1'].astype('float32'), }) # Get all gradients and perform a synced update. ops = [ self.actor_grads, self.actor_loss, self.critic_grads, self.critic_loss, self.lag_grads, self.lag_mult ] actor_grads, actor_loss, critic_grads, critic_loss, lag_grads, lag_mult = self.sess.run( ops, feed_dict={ self.obs0: batch['obs0'], self.actions: batch['actions'], self.obs1: batch['obs1'], self.rewards: batch['rewards'], self.terminals1: batch['terminals1'].astype('float32'), }) self.actor_optimizer.update(actor_grads, stepsize=self.actor_lr) self.critic_optimizer.update(critic_grads, stepsize=self.critic_lr) self.lag_optimizer.update(lag_grads, stepsize=self.actor_lr) return critic_loss, actor_loss def initialize(self, sess): self.sess = sess self.sess.run(tf.global_variables_initializer()) self.actor_optimizer.sync() self.critic_optimizer.sync() self.lag_optimizer.sync() def get_stats(self): if self.stats_sample is None: # Get a sample and keep that fixed for all further computations. # This allows us to estimate the change in value for the same set of inputs. self.stats_sample = self.memory.sample(batch_size=self.batch_size) values = self.sess.run(self.stats_ops, feed_dict={ self.obs0: self.stats_sample['obs0'], self.actions: self.stats_sample['actions'], }) names = self.stats_names[:] assert len(names) == len(values) stats = dict(zip(names, values)) if self.param_noise is not None: stats = {**stats, **self.param_noise.get_stats()} return stats def adapt_param_noise(self): if self.param_noise is None: return 0. # Perturb a separate copy of the policy to adjust the scale for the next "real" perturbation. batch = self.memory.sample(batch_size=self.batch_size) self.sess.run(self.perturb_adaptive_policy_ops, feed_dict={ self.param_noise_stddev: self.param_noise.current_stddev, }) distance = self.sess.run(self.adaptive_policy_distance, feed_dict={ self.obs0: batch['obs0'], self.param_noise_stddev: self.param_noise.current_stddev, }) mean_distance = mpi_mean(distance) self.param_noise.adapt(mean_distance) return mean_distance def reset(self): # Reset internal state after an episode is complete. if self.action_noise is not None: self.action_noise.reset() if self.param_noise is not None: self.sess.run(self.perturb_policy_ops, feed_dict={ self.param_noise_stddev: self.param_noise.current_stddev, })
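# Note on the variant above: unlike standard DDPG, it drives the actor, the critic, and a
# trainable multiplier `lag_mult` off a single scalarized objective
#     L(theta_actor, theta_critic, lambda) = actor_loss + lambda * critic_loss,
# taking flat gradients of that same expression with respect to each variable group.
# The snippet below is a minimal, self-contained sketch of that coupling only. It is an
# illustration under stated assumptions (toy one-layer networks, plain TF 1.x ops), not the
# helper-based implementation above; every `toy_*` name is hypothetical.
import tensorflow as tf

toy_obs_ph = tf.placeholder(tf.float32, shape=(None, 3), name='toy_obs')
toy_q_target_ph = tf.placeholder(tf.float32, shape=(None, 1), name='toy_q_target')

with tf.variable_scope('toy_actor'):
    toy_action = tf.layers.dense(toy_obs_ph, 1, activation=tf.nn.tanh)
with tf.variable_scope('toy_critic'):
    toy_q = tf.layers.dense(tf.concat([toy_obs_ph, toy_action], axis=-1), 1)

toy_actor_loss = -tf.reduce_mean(toy_q)                            # push actions toward higher Q
toy_critic_loss = tf.reduce_mean(tf.square(toy_q - toy_q_target_ph))  # TD regression term
toy_lag_mult = tf.get_variable('toy_lag_mult', initializer=1.0)

toy_combined = toy_actor_loss + toy_lag_mult * toy_critic_loss

# Same scalar objective, differentiated w.r.t. three different variable groups,
# mirroring actor_grads / critic_grads / lag_grads in setup_optimizer above.
toy_actor_grads = tf.gradients(toy_combined, tf.trainable_variables('toy_actor'))
toy_critic_grads = tf.gradients(toy_combined, tf.trainable_variables('toy_critic'))
toy_lag_grads = tf.gradients(toy_combined, [toy_lag_mult])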
class DDPG(object): def __init__(self, actor, critic, memory, observation_shape, action_shape, param_noise=None, action_noise=None, gamma=0.99, tau=0.001, normalize_returns=False, enable_popart=False, normalize_observations=True, batch_size=128, observation_range=(-5., 5.), action_range=(-1., 1.), return_range=(-np.inf, np.inf), adaptive_param_noise=True, adaptive_param_noise_policy_threshold=.1, critic_l2_reg=0., actor_lr=1e-4, critic_lr=1e-3, clip_norm=None, reward_scale=1.): # Inputs. self.obs0 = tf.placeholder(tf.float32, shape=(None,) + observation_shape, name='obs0') self.obs1 = tf.placeholder(tf.float32, shape=(None,) + observation_shape, name='obs1') self.terminals1 = tf.placeholder(tf.float32, shape=(None, 1), name='terminals1') self.rewards = tf.placeholder(tf.float32, shape=(None, 1), name='rewards') self.actions = tf.placeholder(tf.float32, shape=(None,) + action_shape, name='actions') self.critic_target = tf.placeholder(tf.float32, shape=(None, 1), name='critic_target') self.param_noise_stddev = tf.placeholder(tf.float32, shape=(), name='param_noise_stddev') # Parameters. self.gamma = gamma # discount factor self.tau = tau # stepsize for (smooth) updating the target network weights self.memory = memory self.normalize_observations = normalize_observations self.normalize_returns = normalize_returns self.action_noise = action_noise self.param_noise = param_noise self.action_range = action_range self.return_range = return_range self.observation_range = observation_range self.critic = critic self.actor = actor self.actor_lr = actor_lr # learning rate for network of actor self.critic_lr = critic_lr # learning rate for network of critic self.clip_norm = clip_norm self.enable_popart = enable_popart self.reward_scale = reward_scale self.batch_size = batch_size self.stats_sample = None self.critic_l2_reg = critic_l2_reg # regularization coefficient for network of critic # Observation normalization. if self.normalize_observations: with tf.variable_scope('obs_rms'): self.obs_rms = RunningMeanStd(shape=observation_shape) else: self.obs_rms = None normalized_obs0 = tf.clip_by_value(normalize(self.obs0, self.obs_rms), self.observation_range[0], self.observation_range[1]) normalized_obs1 = tf.clip_by_value(normalize(self.obs1, self.obs_rms), self.observation_range[0], self.observation_range[1]) # Return (= reward) normalization. if self.normalize_returns: with tf.variable_scope('ret_rms'): self.ret_rms = RunningMeanStd() else: self.ret_rms = None # Create target networks. target_actor = copy(actor) target_actor.name = 'target_actor' self.target_actor = target_actor target_critic = copy(critic) target_critic.name = 'target_critic' self.target_critic = target_critic # Create networks and core TF parts that are shared across setup parts. self.actor_tf = actor(normalized_obs0) self.normalized_critic_tf = critic(normalized_obs0, self.actions) self.critic_tf = denormalize(tf.clip_by_value(self.normalized_critic_tf, self.return_range[0], self.return_range[1]), self.ret_rms) self.normalized_critic_with_actor_tf = critic(normalized_obs0, self.actor_tf, reuse=True) self.critic_with_actor_tf = denormalize(tf.clip_by_value(self.normalized_critic_with_actor_tf, self.return_range[0], self.return_range[1]), self.ret_rms) Q_obs1 = denormalize(target_critic(normalized_obs1, target_actor(normalized_obs1)), self.ret_rms) self.target_Q = self.rewards + (1. - self.terminals1) * gamma * Q_obs1 # TODO: 1- terminals1 ?? # Set up parts. 
if self.param_noise is not None: self.setup_param_noise(normalized_obs0) self.setup_actor_optimizer() self.setup_critic_optimizer() if self.normalize_returns and self.enable_popart: self.setup_popart() self.setup_stats() self.setup_target_network_updates() def setup_target_network_updates(self): actor_init_updates, actor_soft_updates = get_target_updates(self.actor.vars, self.target_actor.vars, self.tau) critic_init_updates, critic_soft_updates = get_target_updates(self.critic.vars, self.target_critic.vars, self.tau) self.target_init_updates = [actor_init_updates, critic_init_updates] self.target_soft_updates = [actor_soft_updates, critic_soft_updates] def setup_param_noise(self, normalized_obs0): assert self.param_noise is not None # Configure perturbed actor. param_noise_actor = copy(self.actor) param_noise_actor.name = 'param_noise_actor' self.perturbed_actor_tf = param_noise_actor(normalized_obs0) logger.info('setting up param noise') self.perturb_policy_ops = get_perturbed_actor_updates(self.actor, param_noise_actor, self.param_noise_stddev) # Configure separate copy for stddev adoption. adaptive_param_noise_actor = copy(self.actor) adaptive_param_noise_actor.name = 'adaptive_param_noise_actor' adaptive_actor_tf = adaptive_param_noise_actor(normalized_obs0) self.perturb_adaptive_policy_ops = get_perturbed_actor_updates(self.actor, adaptive_param_noise_actor, self.param_noise_stddev) self.adaptive_policy_distance = tf.sqrt(tf.reduce_mean(tf.square(self.actor_tf - adaptive_actor_tf))) def setup_actor_optimizer(self): logger.info('setting up actor optimizer') self.actor_loss = -tf.reduce_mean(self.critic_with_actor_tf) actor_shapes = [var.get_shape().as_list() for var in self.actor.trainable_vars] actor_nb_params = sum([reduce(lambda x, y: x * y, shape) for shape in actor_shapes]) logger.info(' actor shapes: {}'.format(actor_shapes)) logger.info(' actor params: {}'.format(actor_nb_params)) self.actor_grads = U.flatgrad(self.actor_loss, self.actor.trainable_vars, clip_norm=self.clip_norm) self.actor_optimizer = MpiAdam(var_list=self.actor.trainable_vars, beta1=0.9, beta2=0.999, epsilon=1e-08) def setup_critic_optimizer(self): logger.info('setting up critic optimizer') normalized_critic_target_tf = tf.clip_by_value(normalize(self.critic_target, self.ret_rms), self.return_range[0], self.return_range[1]) self.critic_loss = tf.reduce_mean(tf.square(self.normalized_critic_tf - normalized_critic_target_tf)) if self.critic_l2_reg > 0.: critic_reg_vars = [var for var in self.critic.trainable_vars if 'kernel' in var.name and 'output' not in var.name] for var in critic_reg_vars: logger.info(' regularizing: {}'.format(var.name)) logger.info(' applying l2 regularization with {}'.format(self.critic_l2_reg)) critic_reg = tc.layers.apply_regularization( tc.layers.l2_regularizer(self.critic_l2_reg), weights_list=critic_reg_vars ) self.critic_loss += critic_reg critic_shapes = [var.get_shape().as_list() for var in self.critic.trainable_vars] critic_nb_params = sum([reduce(lambda x, y: x * y, shape) for shape in critic_shapes]) logger.info(' critic shapes: {}'.format(critic_shapes)) logger.info(' critic params: {}'.format(critic_nb_params)) self.critic_grads = U.flatgrad(self.critic_loss, self.critic.trainable_vars, clip_norm=self.clip_norm) self.critic_optimizer = MpiAdam(var_list=self.critic.trainable_vars, beta1=0.9, beta2=0.999, epsilon=1e-08) def setup_popart(self): # See https://arxiv.org/pdf/1602.07714.pdf for details. 
self.old_std = tf.placeholder(tf.float32, shape=[1], name='old_std') new_std = self.ret_rms.std self.old_mean = tf.placeholder(tf.float32, shape=[1], name='old_mean') new_mean = self.ret_rms.mean self.renormalize_Q_outputs_op = [] for vs in [self.critic.output_vars, self.target_critic.output_vars]: assert len(vs) == 2 M, b = vs assert 'kernel' in M.name assert 'bias' in b.name assert M.get_shape()[-1] == 1 assert b.get_shape()[-1] == 1 self.renormalize_Q_outputs_op += [M.assign(M * self.old_std / new_std)] self.renormalize_Q_outputs_op += [b.assign((b * self.old_std + self.old_mean - new_mean) / new_std)] def setup_stats(self): ops = [] names = [] if self.normalize_returns: ops += [self.ret_rms.mean, self.ret_rms.std] names += ['ret_rms_mean', 'ret_rms_std'] if self.normalize_observations: ops += [tf.reduce_mean(self.obs_rms.mean), tf.reduce_mean(self.obs_rms.std)] names += ['obs_rms_mean', 'obs_rms_std'] ops += [tf.reduce_mean(self.critic_tf)] names += ['reference_Q_mean'] ops += [reduce_std(self.critic_tf)] names += ['reference_Q_std'] ops += [tf.reduce_mean(self.critic_with_actor_tf)] names += ['reference_actor_Q_mean'] ops += [reduce_std(self.critic_with_actor_tf)] names += ['reference_actor_Q_std'] ops += [tf.reduce_mean(self.actor_tf)] names += ['reference_action_mean'] ops += [reduce_std(self.actor_tf)] names += ['reference_action_std'] if self.param_noise: ops += [tf.reduce_mean(self.perturbed_actor_tf)] names += ['reference_perturbed_action_mean'] ops += [reduce_std(self.perturbed_actor_tf)] names += ['reference_perturbed_action_std'] self.stats_ops = ops self.stats_names = names def pi(self, obs, apply_noise=True, compute_Q=True): if self.param_noise is not None and apply_noise: actor_tf = self.perturbed_actor_tf else: actor_tf = self.actor_tf feed_dict = {self.obs0: [obs]} if compute_Q: action, q = self.sess.run([actor_tf, self.critic_with_actor_tf], feed_dict=feed_dict) else: action = self.sess.run(actor_tf, feed_dict=feed_dict) q = None action = action.flatten() if self.action_noise is not None and apply_noise: noise = self.action_noise() assert noise.shape == action.shape action += noise action = np.clip(action, self.action_range[0], self.action_range[1]) # TODO: maybe comment this line return action, q def store_transition(self, obs0, action, reward, obs1, terminal1): reward *= self.reward_scale self.memory.append(obs0, action, reward, obs1, terminal1) if self.normalize_observations: self.obs_rms.update(np.array([obs0])) def train(self): # Get a batch. batch = self.memory.sample(batch_size=self.batch_size) if self.normalize_returns and self.enable_popart: old_mean, old_std, target_Q = self.sess.run([self.ret_rms.mean, self.ret_rms.std, self.target_Q], feed_dict={ self.obs1: batch['obs1'], self.rewards: batch['rewards'], self.terminals1: batch['terminals1'].astype('float32'), }) self.ret_rms.update(target_Q.flatten()) self.sess.run(self.renormalize_Q_outputs_op, feed_dict={ self.old_std : np.array([old_std]), self.old_mean : np.array([old_mean]), }) # Run sanity check. Disabled by default since it slows down things considerably. 
# print('running sanity check') # target_Q_new, new_mean, new_std = self.sess.run([self.target_Q, self.ret_rms.mean, self.ret_rms.std], feed_dict={ # self.obs1: batch['obs1'], # self.rewards: batch['rewards'], # self.terminals1: batch['terminals1'].astype('float32'), # }) # print(target_Q_new, target_Q, new_mean, new_std) # assert (np.abs(target_Q - target_Q_new) < 1e-3).all() else: target_Q = self.sess.run(self.target_Q, feed_dict={ self.obs1: batch['obs1'], self.rewards: batch['rewards'], self.terminals1: batch['terminals1'].astype('float32'), }) # Get all gradients and perform a synced update. ops = [self.actor_grads, self.actor_loss, self.critic_grads, self.critic_loss] actor_grads, actor_loss, critic_grads, critic_loss = self.sess.run(ops, feed_dict={ self.obs0: batch['obs0'], self.actions: batch['actions'], self.critic_target: target_Q, }) self.actor_optimizer.update(actor_grads, stepsize=self.actor_lr) self.critic_optimizer.update(critic_grads, stepsize=self.critic_lr) return critic_loss, actor_loss def initialize(self, sess): self.sess = sess # self.sess.run(tf.global_variables_initializer()) # is done in /baselines/ddpg/training.py self.actor_optimizer.sync() self.critic_optimizer.sync() self.sess.run(self.target_init_updates) def update_target_net(self): self.sess.run(self.target_soft_updates) def get_stats(self): if self.stats_sample is None: # Get a sample and keep that fixed for all further computations. # This allows us to estimate the change in value for the same set of inputs. self.stats_sample = self.memory.sample(batch_size=self.batch_size) values = self.sess.run(self.stats_ops, feed_dict={ self.obs0: self.stats_sample['obs0'], self.actions: self.stats_sample['actions'], }) # values = [241.93886, 5.4122686, -3.6965165, 0.00028637942, -3.4581754, 2.3841858e-07, 0.25359547, 0.3071089, -0.18079321, 0.5772085] names = self.stats_names[:] # ['obs_rms_mean', 'obs_rms_std', 'reference_Q_mean', 'reference_Q_std', 'reference_actor_Q_mean', 'reference_actor_Q_std', 'reference_action_mean', 'reference_action_std', 'reference_perturbed...tion_mean', 'reference_perturbed...ction_std'] assert len(names) == len(values) stats = dict(zip(names, values)) # {'obs_rms_mean': 241.93886, 'obs_rms_std': 5.4122686, 'reference_Q_mean': -3.6965165, 'reference_Q_std': 0.00028637942, 'reference_action_mean': 0.25359547, 'reference_action_std': 0.3071089, 'reference_actor_Q_mean': -3.4581754, 'reference_actor_Q_std': 2.3841858e-07, 'reference_perturbed...tion_mean': -0.18079321, 'reference_perturbed...ction_std': 0.5772085} if self.param_noise is not None: stats.update(self.param_noise.get_stats()) return stats def adapt_param_noise(self): if self.param_noise is None: return 0. # Perturb a separate copy of the policy to adjust the scale for the next "real" perturbation. batch = self.memory.sample(batch_size=self.batch_size) self.sess.run(self.perturb_adaptive_policy_ops, feed_dict={ self.param_noise_stddev: self.param_noise.current_stddev, }) distance = self.sess.run(self.adaptive_policy_distance, feed_dict={ self.obs0: batch['obs0'], self.param_noise_stddev: self.param_noise.current_stddev, }) mean_distance = MPI.COMM_WORLD.allreduce(distance, op=MPI.SUM) / MPI.COMM_WORLD.Get_size() self.param_noise.adapt(mean_distance) return mean_distance def reset(self): # Reset internal state after an episode is complete. 
        if self.action_noise is not None:
            self.action_noise.reset()
        if self.param_noise is not None:
            self.sess.run(self.perturb_policy_ops, feed_dict={
                self.param_noise_stddev: self.param_noise.current_stddev,
            })
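# The class above follows the reference baselines DDPG: target copies of the actor and
# critic are refreshed through get_target_updates, which is not shown in this section.
# Below is a minimal sketch of what that helper is assumed to build (assumption: source
# and target variable lists correspond one-to-one, in creation order); it is illustrative,
# not the actual library implementation.
import tensorflow as tf

def sketch_get_target_updates(source_vars, target_vars, tau):
    assert len(source_vars) == len(target_vars)
    init_updates = []   # hard copy, run once from initialize() via target_init_updates
    soft_updates = []   # Polyak averaging, run from update_target_net() via target_soft_updates
    for src, tgt in zip(source_vars, target_vars):
        init_updates.append(tf.assign(tgt, src))
        soft_updates.append(tf.assign(tgt, (1. - tau) * tgt + tau * src))
    return tf.group(*init_updates), tf.group(*soft_updates)
# With tau=0.001 the target networks trail the online networks with a time constant of
# roughly 1/tau = 1000 update calls, which is what keeps the bootstrapped target_Q stable.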
class DDPG(object): def __init__(self, actor, critic, memory, observation_shape, action_shape, param_noise=None, action_noise=None, gamma=0.99, tau=0.001, normalize_returns=False, enable_popart=False, normalize_observations=True, batch_size=128, observation_range=(-5., 5.), action_range=(-1., 1.), return_range=(-np.inf, np.inf), adaptive_param_noise=True, adaptive_param_noise_policy_threshold=.1, critic_l2_reg=0., actor_lr=1e-4, critic_lr=1e-3, clip_norm=None, reward_scale=1., sigma=None, surrogate=False, expected=False, sigma_num_samples=10, random_actor=False, grad_num_samples=10): # Inputs. self.obs0 = tf.placeholder(tf.float32, shape=(None, ) + observation_shape, name='obs0') self.obs1 = tf.placeholder(tf.float32, shape=(None, ) + observation_shape, name='obs1') self.terminals1 = tf.placeholder(tf.float32, shape=(None, 1), name='terminals1') self.rewards = tf.placeholder(tf.float32, shape=(None, 1), name='rewards') self.actions = tf.placeholder(tf.float32, shape=(None, ) + action_shape, name='actions') self.noises = tf.placeholder(tf.float32, shape=(None, ) + action_shape, name='noises') self.prev_noises = tf.placeholder(tf.float32, shape=(None, ) + action_shape, name='prev_noises') self.critic_target = tf.placeholder(tf.float32, shape=(None, 1), name='critic_target') self.param_noise_stddev = tf.placeholder(tf.float32, shape=(), name='param_noise_stddev') # Parameters. self.gamma = gamma self.tau = tau self.memory = memory self.normalize_observations = normalize_observations self.normalize_returns = normalize_returns self.action_noise = action_noise self.param_noise = param_noise self.action_range = action_range self.return_range = return_range self.observation_range = observation_range self.critic = critic self.actor = actor self.actor_lr = actor_lr self.critic_lr = critic_lr self.clip_norm = clip_norm self.enable_popart = enable_popart self.reward_scale = reward_scale self.batch_size = batch_size self.stats_sample = None self.critic_l2_reg = critic_l2_reg self.sigma = sigma self.expected = expected self.surrogate = surrogate self.random_actor = random_actor # Observation normalization. if self.normalize_observations: with tf.variable_scope('obs_rms'): self.obs_rms = RunningMeanStd(shape=observation_shape) else: self.obs_rms = None normalized_obs0 = tf.clip_by_value(normalize(self.obs0, self.obs_rms), self.observation_range[0], self.observation_range[1]) normalized_obs1 = tf.clip_by_value(normalize(self.obs1, self.obs_rms), self.observation_range[0], self.observation_range[1]) # Return normalization. if self.normalize_returns: with tf.variable_scope('ret_rms'): self.ret_rms = RunningMeanStd() else: self.ret_rms = None # Create target networks. target_actor = copy(actor) target_actor.name = 'target_actor' self.target_actor = target_actor target_critic = copy(critic) target_critic.name = 'target_critic' self.target_critic = target_critic # Create networks and core TF parts that are shared across setup parts. 
self.actor_tf = actor(normalized_obs0) self.normalized_critic_tf = critic(normalized_obs0, self.actions) self.critic_tf = denormalize( tf.clip_by_value(self.normalized_critic_tf, self.return_range[0], self.return_range[1]), self.ret_rms) if self.sigma is not None and self.action_noise is not None and expected and self.random_actor: critic_with_actor_tf_list = [] for i in range(grad_num_samples): # noise = self.action_noise.memory_noise(self.prev_noises) noise = tf.random_normal(tf.shape(self.actor_tf), mean=0.0, stddev=0.2) noisy_action = self.actor_tf + noise clipped_action = tf.clip_by_value(noisy_action, self.action_range[0], self.action_range[1]) current_critic = critic(normalized_obs0, clipped_action, reuse=True) critic_with_actor_tf = denormalize( tf.clip_by_value(current_critic, self.return_range[0], self.return_range[1]), self.ret_rms) critic_with_actor_tf_list.append(critic_with_actor_tf) self.critic_with_actor_tf = tf.reduce_mean(tf.concat( critic_with_actor_tf_list, axis=1), axis=1, keepdims=True) else: self.normalized_critic_with_actor_tf = critic(normalized_obs0, self.actor_tf, reuse=True) self.critic_with_actor_tf = denormalize( tf.clip_by_value(self.normalized_critic_with_actor_tf, self.return_range[0], self.return_range[1]), self.ret_rms) action = target_actor(normalized_obs1) if self.sigma is not None and self.action_noise is not None and expected: # noise = self.action_noise.memory_noise(self.noises, num_samples=num_samples) # preparation for OU noise Q_obs1_list = [] for i in range(sigma_num_samples): if i > 0: reuse = True else: reuse = False # noise = self.action_noise.memory_noise(self.noises) noise = tf.random_normal(tf.shape(action), mean=0.0, stddev=0.2) noisy_action = action + noise clipped_action = tf.clip_by_value(noisy_action, self.action_range[0], self.action_range[1]) Q_obs1_list.append( denormalize( target_critic(normalized_obs1, clipped_action, reuse=reuse), self.ret_rms)) Q_obs1 = tf.reduce_mean(tf.concat(Q_obs1_list, axis=1), axis=1, keepdims=True) else: action = tf.clip_by_value(action, self.action_range[0], self.action_range[1]) Q_obs1 = denormalize(target_critic(normalized_obs1, action), self.ret_rms) self.target_Q = self.rewards + (1. - self.terminals1) * gamma * Q_obs1 # Set up parts. if self.param_noise is not None: self.setup_param_noise(normalized_obs0) self.setup_actor_optimizer() self.setup_critic_optimizer() if self.normalize_returns and self.enable_popart: self.setup_popart() self.setup_stats() self.setup_target_network_updates() def setup_target_network_updates(self): actor_init_updates, actor_soft_updates = get_target_updates( self.actor.vars, self.target_actor.vars, self.tau) critic_init_updates, critic_soft_updates = get_target_updates( self.critic.vars, self.target_critic.vars, self.tau) self.target_init_updates = [actor_init_updates, critic_init_updates] self.target_soft_updates = [actor_soft_updates, critic_soft_updates] def setup_param_noise(self, normalized_obs0): assert self.param_noise is not None # Configure perturbed actor. param_noise_actor = copy(self.actor) param_noise_actor.name = 'param_noise_actor' self.perturbed_actor_tf = param_noise_actor(normalized_obs0) logger.info('setting up param noise') self.perturb_policy_ops = get_perturbed_actor_updates( self.actor, param_noise_actor, self.param_noise_stddev) # Configure separate copy for stddev adoption. 
adaptive_param_noise_actor = copy(self.actor) adaptive_param_noise_actor.name = 'adaptive_param_noise_actor' adaptive_actor_tf = adaptive_param_noise_actor(normalized_obs0) self.perturb_adaptive_policy_ops = get_perturbed_actor_updates( self.actor, adaptive_param_noise_actor, self.param_noise_stddev) self.adaptive_policy_distance = tf.sqrt( tf.reduce_mean(tf.square(self.actor_tf - adaptive_actor_tf))) def setup_actor_optimizer(self): logger.info('setting up actor optimizer') self.actor_loss = -tf.reduce_mean(self.critic_with_actor_tf) actor_shapes = [ var.get_shape().as_list() for var in self.actor.trainable_vars ] actor_nb_params = sum( [reduce(lambda x, y: x * y, shape) for shape in actor_shapes]) logger.info(' actor shapes: {}'.format(actor_shapes)) logger.info(' actor params: {}'.format(actor_nb_params)) self.actor_grads = U.flatgrad(self.actor_loss, self.actor.trainable_vars, clip_norm=self.clip_norm) self.actor_optimizer = MpiAdam(var_list=self.actor.trainable_vars, beta1=0.9, beta2=0.999, epsilon=1e-08) def setup_critic_optimizer(self): logger.info('setting up critic optimizer') normalized_critic_target_tf = tf.clip_by_value( normalize(self.critic_target, self.ret_rms), self.return_range[0], self.return_range[1]) self.critic_loss = tf.reduce_mean( tf.square(self.normalized_critic_tf - normalized_critic_target_tf)) if self.critic_l2_reg > 0.: critic_reg_vars = [ var for var in self.critic.trainable_vars if 'kernel' in var.name and 'output' not in var.name ] for var in critic_reg_vars: logger.info(' regularizing: {}'.format(var.name)) logger.info(' applying l2 regularization with {}'.format( self.critic_l2_reg)) critic_reg = tc.layers.apply_regularization( tc.layers.l2_regularizer(self.critic_l2_reg), weights_list=critic_reg_vars) self.critic_loss += critic_reg critic_shapes = [ var.get_shape().as_list() for var in self.critic.trainable_vars ] critic_nb_params = sum( [reduce(lambda x, y: x * y, shape) for shape in critic_shapes]) logger.info(' critic shapes: {}'.format(critic_shapes)) logger.info(' critic params: {}'.format(critic_nb_params)) self.critic_grads = U.flatgrad(self.critic_loss, self.critic.trainable_vars, clip_norm=self.clip_norm) self.critic_optimizer = MpiAdam(var_list=self.critic.trainable_vars, beta1=0.9, beta2=0.999, epsilon=1e-08) def setup_popart(self): # See https://arxiv.org/pdf/1602.07714.pdf for details. 
self.old_std = tf.placeholder(tf.float32, shape=[1], name='old_std') new_std = self.ret_rms.std self.old_mean = tf.placeholder(tf.float32, shape=[1], name='old_mean') new_mean = self.ret_rms.mean self.renormalize_Q_outputs_op = [] for vs in [self.critic.output_vars, self.target_critic.output_vars]: assert len(vs) == 2 M, b = vs assert 'kernel' in M.name assert 'bias' in b.name assert M.get_shape()[-1] == 1 assert b.get_shape()[-1] == 1 self.renormalize_Q_outputs_op += [ M.assign(M * self.old_std / new_std) ] self.renormalize_Q_outputs_op += [ b.assign( (b * self.old_std + self.old_mean - new_mean) / new_std) ] def setup_stats(self): ops = [] names = [] if self.normalize_returns: ops += [self.ret_rms.mean, self.ret_rms.std] names += ['ret_rms_mean', 'ret_rms_std'] if self.normalize_observations: ops += [ tf.reduce_mean(self.obs_rms.mean), tf.reduce_mean(self.obs_rms.std) ] names += ['obs_rms_mean', 'obs_rms_std'] ops += [tf.reduce_mean(self.critic_tf)] names += ['reference_Q_mean'] ops += [reduce_std(self.critic_tf)] names += ['reference_Q_std'] ops += [tf.reduce_mean(self.critic_with_actor_tf)] names += ['reference_actor_Q_mean'] ops += [reduce_std(self.critic_with_actor_tf)] names += ['reference_actor_Q_std'] ops += [tf.reduce_mean(self.actor_tf)] names += ['reference_action_mean'] ops += [reduce_std(self.actor_tf)] names += ['reference_action_std'] if self.param_noise: ops += [tf.reduce_mean(self.perturbed_actor_tf)] names += ['reference_perturbed_action_mean'] ops += [reduce_std(self.perturbed_actor_tf)] names += ['reference_perturbed_action_std'] self.stats_ops = ops self.stats_names = names def pi(self, obs, apply_noise=True, compute_Q=True): if self.param_noise is not None and apply_noise: actor_tf = self.perturbed_actor_tf else: actor_tf = self.actor_tf feed_dict = {self.obs0: [obs]} action = self.sess.run(actor_tf, feed_dict=feed_dict) action = action.flatten() if self.action_noise is not None and apply_noise: prev_noise = self.action_noise.prev_noise() noise = self.action_noise() assert noise.shape == action.shape action += noise else: noise = None prev_noise = None action = np.clip(action, self.action_range[0], self.action_range[1]) if compute_Q: feed_dict = {self.obs0: [obs], self.actions: [action]} q = self.sess.run([actor_tf], feed_dict=feed_dict) else: q = None return action, q, noise, prev_noise def pi_surrogate(self, obs, apply_noise=True, compute_Q=True): if self.param_noise is not None and apply_noise: actor_tf = self.perturbed_actor_tf else: actor_tf = self.actor_tf feed_dict = {self.obs0: [obs]} actor_action = self.sess.run(actor_tf, feed_dict=feed_dict) actor_action = actor_action.flatten() if self.action_noise is not None and apply_noise: prev_noise = self.action_noise.prev_noise() noise = self.action_noise() assert noise.shape == actor_action.shape action = actor_action + noise else: noise = None prev_noise = None action = np.clip(action, self.action_range[0], self.action_range[1]) if compute_Q: feed_dict = {self.obs0: [obs], self.actions: [action]} q = self.sess.run([actor_tf], feed_dict=feed_dict) else: q = None return action, q, noise, prev_noise, actor_action # # def pi(self, obs, apply_noise=True, compute_Q=True): # if self.param_noise is not None and apply_noise: # actor_tf = self.perturbed_actor_tf # else: # actor_tf = self.actor_tf # feed_dict = {self.obs0: [obs]} # action = self.sess.run(actor_tf, feed_dict=feed_dict) # action = action.flatten() # if self.action_noise is not None and apply_noise: # noise = self.action_noise() # assert noise.shape == 
action.shape # else: # noise = None # action = np.clip(action, self.action_range[0], self.action_range[1]) # if compute_Q: # feed_dict = {self.obs0: [obs], self.actions: [action]} # q = self.sess.run([actor_tf], feed_dict=feed_dict) # else: # q = None # if self.action_noise is not None and apply_noise: # noise = self.action_noise() # assert noise.shape == action.shape # action += noise # action = np.clip(action, self.action_range[0], self.action_range[1]) # return action , q, noise # def pi(self, obs, apply_noise=True, compute_Q=True): # if self.param_noise is not None and apply_noise: # actor_tf = self.perturbed_actor_tf # else: # actor_tf = self.actor_tf # feed_dict = {self.obs0: [obs]} # if compute_Q: # self.sess.run # action, q = self.sess.run([actor_tf, self.critic_with_actor_tf], feed_dict=feed_dict) # else: # action = self.sess.run(actor_tf, feed_dict=feed_dict) # q = None # action = action.flatten() # if self.action_noise is not None and apply_noise: # noise = self.action_noise() # assert noise.shape == action.shape # action += noise # action = np.clip(action, self.action_range[0], self.action_range[1]) # return action, q, noise def store_transition(self, obs0, action, reward, obs1, terminal1, noise, prev_noise): reward *= self.reward_scale self.memory.append(obs0, action, reward, obs1, terminal1, noise, prev_noise) if self.normalize_observations: self.obs_rms.update(np.array([obs0])) def train(self): # Get a batch. batch = self.memory.sample(batch_size=self.batch_size) if self.normalize_returns and self.enable_popart: old_mean, old_std, target_Q = self.sess.run( [self.ret_rms.mean, self.ret_rms.std, self.target_Q], feed_dict={ self.obs1: batch['obs1'], self.rewards: batch['rewards'], self.terminals1: batch['terminals1'].astype('float32'), self.noises: batch['noises'], self.prev_noises: batch['prev_noises'] }) self.ret_rms.update(target_Q.flatten()) self.sess.run(self.renormalize_Q_outputs_op, feed_dict={ self.old_std: np.array([old_std]), self.old_mean: np.array([old_mean]), }) # Run sanity check. Disabled by default since it slows down things considerably. # print('running sanity check') # target_Q_new, new_mean, new_std = self.sess.run([self.target_Q, self.ret_rms.mean, self.ret_rms.std], feed_dict={ # self.obs1: batch['obs1'], # self.rewards: batch['rewards'], # self.terminals1: batch['terminals1'].astype('float32'), # }) # print(target_Q_new, target_Q, new_mean, new_std) # assert (np.abs(target_Q - target_Q_new) < 1e-3).all() else: target_Q = self.sess.run(self.target_Q, feed_dict={ self.obs1: batch['obs1'], self.rewards: batch['rewards'], self.terminals1: batch['terminals1'].astype('float32'), self.noises: batch['noises'], self.prev_noises: batch['prev_noises'] }) # Get all gradients and perform a synced update. 
ops = [ self.actor_grads, self.actor_loss, self.critic_grads, self.critic_loss ] actor_grads, actor_loss, critic_grads, critic_loss = self.sess.run( ops, feed_dict={ self.obs0: batch['obs0'], self.actions: batch['actions'], self.noises: batch['noises'], self.prev_noises: batch['prev_noises'], self.critic_target: target_Q, }) self.actor_optimizer.update(actor_grads, stepsize=self.actor_lr) self.critic_optimizer.update(critic_grads, stepsize=self.critic_lr) return critic_loss, actor_loss def initialize(self, sess): self.sess = sess self.sess.run(tf.global_variables_initializer()) self.actor_optimizer.sync() self.critic_optimizer.sync() self.sess.run(self.target_init_updates) def update_target_net(self): self.sess.run(self.target_soft_updates) def get_stats(self): if self.stats_sample is None: # Get a sample and keep that fixed for all further computations. # This allows us to estimate the change in value for the same set of inputs. self.stats_sample = self.memory.sample(batch_size=self.batch_size) values = self.sess.run(self.stats_ops, feed_dict={ self.obs0: self.stats_sample['obs0'], self.actions: self.stats_sample['actions'], self.noises: self.stats_sample['noises'], self.prev_noises: self.stats_sample['prev_noises'] }) names = self.stats_names[:] assert len(names) == len(values) stats = dict(zip(names, values)) if self.param_noise is not None: stats = {**stats, **self.param_noise.get_stats()} return stats def adapt_param_noise(self): if self.param_noise is None: return 0. # Perturb a separate copy of the policy to adjust the scale for the next "real" perturbation. batch = self.memory.sample(batch_size=self.batch_size) self.sess.run(self.perturb_adaptive_policy_ops, feed_dict={ self.param_noise_stddev: self.param_noise.current_stddev, }) distance = self.sess.run(self.adaptive_policy_distance, feed_dict={ self.obs0: batch['obs0'], self.param_noise_stddev: self.param_noise.current_stddev, }) mean_distance = MPI.COMM_WORLD.allreduce( distance, op=MPI.SUM) / MPI.COMM_WORLD.Get_size() self.param_noise.adapt(mean_distance) return mean_distance def reset(self): # Reset internal state after an episode is complete. if self.action_noise is not None: self.action_noise.reset() if self.param_noise is not None: self.sess.run(self.perturb_policy_ops, feed_dict={ self.param_noise_stddev: self.param_noise.current_stddev, })
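# The variant above replaces the single-point bootstrap Q(s', mu'(s')) with an average over
# several noise-perturbed target actions when `expected` is set. The function below isolates
# that construction as a minimal sketch (assumptions: Gaussian noise with the hard-coded
# stddev 0.2 used above, clipping to the action range, and a callable `target_critic_fn`
# standing in for the reused target critic; names are illustrative).
import tensorflow as tf

def sketch_smoothed_target_q(target_critic_fn, normalized_obs1, target_action,
                             action_range=(-1., 1.), num_samples=10, noise_std=0.2):
    q_samples = []
    for i in range(num_samples):
        noise = tf.random_normal(tf.shape(target_action), mean=0.0, stddev=noise_std)
        noisy_action = tf.clip_by_value(target_action + noise,
                                        action_range[0], action_range[1])
        # Reuse the target critic weights after the first sample, as in the loop above.
        q_samples.append(target_critic_fn(normalized_obs1, noisy_action, reuse=(i > 0)))
    # Monte Carlo estimate of E_noise[Q(s', clip(mu'(s') + noise))], shape (batch, 1).
    return tf.reduce_mean(tf.concat(q_samples, axis=1), axis=1, keepdims=True)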
class DDPG(object): def __init__(self, actor, critic, memory, observation_shape, action_shape, param_noise=None, action_noise=None, gamma=0.99, tau=0.001, normalize_returns=False, enable_popart=False, normalize_observations=True, batch_size=128, observation_range=(-5., 5.), action_range=(-1., 1.), return_range=(-np.inf, np.inf), adaptive_param_noise=True, adaptive_param_noise_policy_threshold=.1, critic_l2_reg=0., actor_lr=1e-4, critic_lr=1e-3, clip_norm=None, reward_scale=1.): # Inputs. self.obs0 = tf.placeholder(tf.float32, shape=(None,) + observation_shape, name='obs0') self.obs1 = tf.placeholder(tf.float32, shape=(None,) + observation_shape, name='obs1') self.terminals1 = tf.placeholder(tf.float32, shape=(None, 1), name='terminals1') self.rewards = tf.placeholder(tf.float32, shape=(None, 1), name='rewards') self.actions = tf.placeholder(tf.float32, shape=(None,) + action_shape, name='actions') self.critic_target = tf.placeholder(tf.float32, shape=(None, 1), name='critic_target') self.param_noise_stddev = tf.placeholder(tf.float32, shape=(), name='param_noise_stddev') # Parameters. self.gamma = gamma self.tau = tau self.memory = memory self.normalize_observations = normalize_observations self.normalize_returns = normalize_returns self.action_noise = action_noise self.param_noise = param_noise self.action_range = action_range self.return_range = return_range self.observation_range = observation_range self.critic = critic self.actor = actor self.actor_lr = actor_lr self.critic_lr = critic_lr self.clip_norm = clip_norm self.enable_popart = enable_popart self.reward_scale = reward_scale self.batch_size = batch_size self.stats_sample = None self.critic_l2_reg = critic_l2_reg # Observation normalization. if self.normalize_observations: with tf.variable_scope('obs_rms'): self.obs_rms = RunningMeanStd(shape=observation_shape) else: self.obs_rms = None normalized_obs0 = tf.clip_by_value(normalize(self.obs0, self.obs_rms), self.observation_range[0], self.observation_range[1]) normalized_obs1 = tf.clip_by_value(normalize(self.obs1, self.obs_rms), self.observation_range[0], self.observation_range[1]) # Return normalization. if self.normalize_returns: with tf.variable_scope('ret_rms'): self.ret_rms = RunningMeanStd() else: self.ret_rms = None # Create target networks. target_actor = copy(actor) target_actor.name = 'target_actor' self.target_actor = target_actor target_critic = copy(critic) target_critic.name = 'target_critic' self.target_critic = target_critic # Create networks and core TF parts that are shared across setup parts. self.actor_tf = actor(normalized_obs0) self.normalized_critic_tf = critic(normalized_obs0, self.actions) self.critic_tf = denormalize(tf.clip_by_value(self.normalized_critic_tf, self.return_range[0], self.return_range[1]), self.ret_rms) self.normalized_critic_with_actor_tf = critic(normalized_obs0, self.actor_tf, reuse=True) self.critic_with_actor_tf = denormalize(tf.clip_by_value(self.normalized_critic_with_actor_tf, self.return_range[0], self.return_range[1]), self.ret_rms) Q_obs1 = denormalize(target_critic(normalized_obs1, target_actor(normalized_obs1)), self.ret_rms) self.target_Q = self.rewards + (1. - self.terminals1) * gamma * Q_obs1 # Set up parts. 
if self.param_noise is not None: self.setup_param_noise(normalized_obs0) self.setup_actor_optimizer() self.setup_critic_optimizer() if self.normalize_returns and self.enable_popart: self.setup_popart() self.setup_stats() self.setup_target_network_updates() def setup_target_network_updates(self): actor_init_updates, actor_soft_updates = get_target_updates(self.actor.vars, self.target_actor.vars, self.tau) critic_init_updates, critic_soft_updates = get_target_updates(self.critic.vars, self.target_critic.vars, self.tau) self.target_init_updates = [actor_init_updates, critic_init_updates] self.target_soft_updates = [actor_soft_updates, critic_soft_updates] def setup_param_noise(self, normalized_obs0): assert self.param_noise is not None # Configure perturbed actor. param_noise_actor = copy(self.actor) param_noise_actor.name = 'param_noise_actor' self.perturbed_actor_tf = param_noise_actor(normalized_obs0) logger.info('setting up param noise') self.perturb_policy_ops = get_perturbed_actor_updates(self.actor, param_noise_actor, self.param_noise_stddev) # Configure separate copy for stddev adoption. adaptive_param_noise_actor = copy(self.actor) adaptive_param_noise_actor.name = 'adaptive_param_noise_actor' adaptive_actor_tf = adaptive_param_noise_actor(normalized_obs0) self.perturb_adaptive_policy_ops = get_perturbed_actor_updates(self.actor, adaptive_param_noise_actor, self.param_noise_stddev) self.adaptive_policy_distance = tf.sqrt(tf.reduce_mean(tf.square(self.actor_tf - adaptive_actor_tf))) def setup_actor_optimizer(self): logger.info('setting up actor optimizer') self.actor_loss = -tf.reduce_mean(self.critic_with_actor_tf) actor_shapes = [var.get_shape().as_list() for var in self.actor.trainable_vars] actor_nb_params = sum([reduce(lambda x, y: x * y, shape) for shape in actor_shapes]) logger.info(' actor shapes: {}'.format(actor_shapes)) logger.info(' actor params: {}'.format(actor_nb_params)) self.actor_grads = U.flatgrad(self.actor_loss, self.actor.trainable_vars, clip_norm=self.clip_norm) self.actor_optimizer = MpiAdam(var_list=self.actor.trainable_vars, beta1=0.9, beta2=0.999, epsilon=1e-08) def setup_critic_optimizer(self): logger.info('setting up critic optimizer') normalized_critic_target_tf = tf.clip_by_value(normalize(self.critic_target, self.ret_rms), self.return_range[0], self.return_range[1]) self.critic_loss = tf.reduce_mean(tf.square(self.normalized_critic_tf - normalized_critic_target_tf)) if self.critic_l2_reg > 0.: critic_reg_vars = [var for var in self.critic.trainable_vars if 'kernel' in var.name and 'output' not in var.name] for var in critic_reg_vars: logger.info(' regularizing: {}'.format(var.name)) logger.info(' applying l2 regularization with {}'.format(self.critic_l2_reg)) critic_reg = tc.layers.apply_regularization( tc.layers.l2_regularizer(self.critic_l2_reg), weights_list=critic_reg_vars ) self.critic_loss += critic_reg critic_shapes = [var.get_shape().as_list() for var in self.critic.trainable_vars] critic_nb_params = sum([reduce(lambda x, y: x * y, shape) for shape in critic_shapes]) logger.info(' critic shapes: {}'.format(critic_shapes)) logger.info(' critic params: {}'.format(critic_nb_params)) self.critic_grads = U.flatgrad(self.critic_loss, self.critic.trainable_vars, clip_norm=self.clip_norm) self.critic_optimizer = MpiAdam(var_list=self.critic.trainable_vars, beta1=0.9, beta2=0.999, epsilon=1e-08) def setup_popart(self): # See https://arxiv.org/pdf/1602.07714.pdf for details. 
self.old_std = tf.placeholder(tf.float32, shape=[1], name='old_std') new_std = self.ret_rms.std self.old_mean = tf.placeholder(tf.float32, shape=[1], name='old_mean') new_mean = self.ret_rms.mean self.renormalize_Q_outputs_op = [] for vs in [self.critic.output_vars, self.target_critic.output_vars]: assert len(vs) == 2 M, b = vs assert 'kernel' in M.name assert 'bias' in b.name assert M.get_shape()[-1] == 1 assert b.get_shape()[-1] == 1 self.renormalize_Q_outputs_op += [M.assign(M * self.old_std / new_std)] self.renormalize_Q_outputs_op += [b.assign((b * self.old_std + self.old_mean - new_mean) / new_std)] def setup_stats(self): ops = [] names = [] if self.normalize_returns: ops += [self.ret_rms.mean, self.ret_rms.std] names += ['ret_rms_mean', 'ret_rms_std'] if self.normalize_observations: ops += [tf.reduce_mean(self.obs_rms.mean), tf.reduce_mean(self.obs_rms.std)] names += ['obs_rms_mean', 'obs_rms_std'] ops += [tf.reduce_mean(self.critic_tf)] names += ['reference_Q_mean'] ops += [reduce_std(self.critic_tf)] names += ['reference_Q_std'] ops += [tf.reduce_mean(self.critic_with_actor_tf)] names += ['reference_actor_Q_mean'] ops += [reduce_std(self.critic_with_actor_tf)] names += ['reference_actor_Q_std'] ops += [tf.reduce_mean(self.actor_tf)] names += ['reference_action_mean'] ops += [reduce_std(self.actor_tf)] names += ['reference_action_std'] if self.param_noise: ops += [tf.reduce_mean(self.perturbed_actor_tf)] names += ['reference_perturbed_action_mean'] ops += [reduce_std(self.perturbed_actor_tf)] names += ['reference_perturbed_action_std'] self.stats_ops = ops self.stats_names = names def pi(self, obs, apply_noise=True, compute_Q=True): if self.param_noise is not None and apply_noise: actor_tf = self.perturbed_actor_tf else: actor_tf = self.actor_tf feed_dict = {self.obs0: [obs]} if compute_Q: action, q = self.sess.run([actor_tf, self.critic_with_actor_tf], feed_dict=feed_dict) else: action = self.sess.run(actor_tf, feed_dict=feed_dict) q = None action = action.flatten() if self.action_noise is not None and apply_noise: noise = self.action_noise() assert noise.shape == action.shape action += noise action = np.clip(action, self.action_range[0], self.action_range[1]) return action, q def store_transition(self, obs0, action, reward, obs1, terminal1): reward *= self.reward_scale self.memory.append(obs0, action, reward, obs1, terminal1) if self.normalize_observations: self.obs_rms.update(np.array([obs0])) def train(self): # Get a batch. batch = self.memory.sample(batch_size=self.batch_size) if self.normalize_returns and self.enable_popart: old_mean, old_std, target_Q = self.sess.run([self.ret_rms.mean, self.ret_rms.std, self.target_Q], feed_dict={ self.obs1: batch['obs1'], self.rewards: batch['rewards'], self.terminals1: batch['terminals1'].astype('float32'), }) self.ret_rms.update(target_Q.flatten()) self.sess.run(self.renormalize_Q_outputs_op, feed_dict={ self.old_std : np.array([old_std]), self.old_mean : np.array([old_mean]), }) # Run sanity check. Disabled by default since it slows down things considerably. 
# print('running sanity check') # target_Q_new, new_mean, new_std = self.sess.run([self.target_Q, self.ret_rms.mean, self.ret_rms.std], feed_dict={ # self.obs1: batch['obs1'], # self.rewards: batch['rewards'], # self.terminals1: batch['terminals1'].astype('float32'), # }) # print(target_Q_new, target_Q, new_mean, new_std) # assert (np.abs(target_Q - target_Q_new) < 1e-3).all() else: target_Q = self.sess.run(self.target_Q, feed_dict={ self.obs1: batch['obs1'], self.rewards: batch['rewards'], self.terminals1: batch['terminals1'].astype('float32'), }) # Get all gradients and perform a synced update. ops = [self.actor_grads, self.actor_loss, self.critic_grads, self.critic_loss] actor_grads, actor_loss, critic_grads, critic_loss = self.sess.run(ops, feed_dict={ self.obs0: batch['obs0'], self.actions: batch['actions'], self.critic_target: target_Q, }) self.actor_optimizer.update(actor_grads, stepsize=self.actor_lr) self.critic_optimizer.update(critic_grads, stepsize=self.critic_lr) return critic_loss, actor_loss def initialize(self, sess): self.sess = sess self.sess.run(tf.global_variables_initializer()) self.actor_optimizer.sync() self.critic_optimizer.sync() self.sess.run(self.target_init_updates) def update_target_net(self): self.sess.run(self.target_soft_updates) def get_stats(self): if self.stats_sample is None: # Get a sample and keep that fixed for all further computations. # This allows us to estimate the change in value for the same set of inputs. self.stats_sample = self.memory.sample(batch_size=self.batch_size) values = self.sess.run(self.stats_ops, feed_dict={ self.obs0: self.stats_sample['obs0'], self.actions: self.stats_sample['actions'], }) names = self.stats_names[:] assert len(names) == len(values) stats = dict(zip(names, values)) if self.param_noise is not None: stats = {**stats, **self.param_noise.get_stats()} return stats def adapt_param_noise(self): if self.param_noise is None: return 0. # Perturb a separate copy of the policy to adjust the scale for the next "real" perturbation. batch = self.memory.sample(batch_size=self.batch_size) self.sess.run(self.perturb_adaptive_policy_ops, feed_dict={ self.param_noise_stddev: self.param_noise.current_stddev, }) distance = self.sess.run(self.adaptive_policy_distance, feed_dict={ self.obs0: batch['obs0'], self.param_noise_stddev: self.param_noise.current_stddev, }) mean_distance = mpi_mean(distance) self.param_noise.adapt(mean_distance) return mean_distance def reset(self): # Reset internal state after an episode is complete. if self.action_noise is not None: self.action_noise.reset() if self.param_noise is not None: self.sess.run(self.perturb_policy_ops, feed_dict={ self.param_noise_stddev: self.param_noise.current_stddev, })
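# normalize, denormalize and RunningMeanStd are used by every variant above but are not
# defined in this section. The sketch below matches how they are called (assumptions:
# the rms object exposes .mean and .std tensors, and a None rms disables normalization);
# it is a minimal stand-in, not the exact library code.
import tensorflow as tf

def sketch_normalize(x, rms):
    if rms is None:
        return x
    return (x - rms.mean) / rms.std

def sketch_denormalize(x, rms):
    if rms is None:
        return x
    return x * rms.std + rms.mean
# In the graphs above, observations are clipped to observation_range after normalization,
# and critic outputs are clipped to return_range before denormalization.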