def _init(self, obs_space, batch_size, time_steps, LSTM_size, laten_size, gaussian_fixed_var=True):
    # TODO: check carefully later whether the variance is actually being updated
    self.pdtype = pdtype = make_pdtype(laten_size)
    obs = U.get_placeholder("en_ob", dtype=tf.float32,
                            shape=[batch_size, time_steps, obs_space.shape[0]])

    # Normalize observations
    with tf.variable_scope("obfilter"):
        # TODO: verify that this filter actually has an effect
        self.obs_rms = RunningMeanStd(shape=obs_space.shape)
    obz = tf.clip_by_value((obs - self.obs_rms.mean) / self.obs_rms.std, -5.0, 5.0)

    lstm_fw_cell = rnn.BasicLSTMCell(LSTM_size, forget_bias=1.0)
    lstm_bw_cell = rnn.BasicLSTMCell(LSTM_size, forget_bias=1.0)
    outputs, output_state = tf.nn.bidirectional_dynamic_rnn(lstm_fw_cell, lstm_bw_cell, obz, dtype=tf.float32)
    outputs_average = tf.reduce_mean(outputs[0], axis=1)

    if gaussian_fixed_var and isinstance(laten_size, int):
        self.mean = U.dense(outputs_average, pdtype.param_shape()[0] // 2, "dblstmfin", U.normc_initializer(1.0))
        self.logstd = U.dense(outputs_average, pdtype.param_shape()[0] // 2, "dblstm_logstd", U.normc_initializer(1.0))
        # self.logstd = tf.get_variable(name="logstd", shape=[1, pdtype.param_shape()[0] // 2],
        #                               initializer=tf.constant_initializer(0.1))  # TODO: this may also be problematic
        pdparam = U.concatenate([self.mean, self.mean * 0.0 + self.logstd], axis=1)
    else:
        pdparam = U.dense(outputs_average, pdtype.param_shape()[0], "dblstmfin", U.normc_initializer(0.1))

    self.pd = pdtype.pdfromflat(pdparam)
    self._encode = U.function([obs], self.pd.sample())
    self._get_mean = U.function([obs], self.mean)
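# --- Illustrative sketch -----------------------------------------------------
# The encoder head above is just a diagonal Gaussian over the latent: BiLSTM
# outputs are averaged over time, mapped to a mean and log-std, and
# pd.sample() draws from that distribution. Below is a minimal NumPy sketch of
# that sampling step only (it does NOT use the repo's pdtype machinery; the
# weights and shapes are made up for illustration).
import numpy as np

def sample_latent(lstm_outputs, w_mean, b_mean, w_logstd, b_logstd, rng=np.random):
    """Average BiLSTM features over time, then draw z = mean + exp(logstd) * eps."""
    feat = lstm_outputs.mean(axis=1)        # [batch, time, lstm] -> [batch, lstm]
    mean = feat @ w_mean + b_mean           # [batch, latent]
    logstd = feat @ w_logstd + b_logstd     # [batch, latent]
    eps = rng.standard_normal(mean.shape)
    return mean + np.exp(logstd) * eps

# toy shapes: batch=2, time=5, lstm=8, latent=3
outs = np.random.randn(2, 5, 8)
z = sample_latent(outs, np.random.randn(8, 3), np.zeros(3),
                  np.random.randn(8, 3) * 0.01, np.zeros(3))
print(z.shape)  # (2, 3)
# -----------------------------------------------------------------------------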
def _init(self, obs_space, ac_space, embedding_shape, hid_size, num_hid_layers, gaussian_fixed_var=True):
    self.pdtype = pdtype = make_pdtype(ac_space.shape[0])
    batch_size = None

    ob = U.get_placeholder(name="ac_de_ob", dtype=tf.float32, shape=[batch_size, obs_space.shape[0]])
    embedding = U.get_placeholder(name="ac_de_embedding", dtype=tf.float32,
                                  shape=[batch_size, embedding_shape])
    # NOTE: the embedding is a single latent value tiled to the sequence length;
    # leave it for now and handle it properly when we get to that part.

    # Normalize the input
    last_out = U.concatenate([ob, embedding], axis=1)
    with tf.variable_scope("ac_de_filter"):
        self.ac_rms = RunningMeanStd(shape=obs_space.shape[0] + embedding_shape)
    last_out = tf.clip_by_value((last_out - self.ac_rms.mean) / self.ac_rms.std, -5.0, 5.0)

    for i in range(num_hid_layers):
        last_out = tf.nn.relu(
            U.dense(last_out, hid_size[i], "ac_de%i" % (i + 1), weight_init=U.normc_initializer(1.0)))

    if gaussian_fixed_var and isinstance(ac_space.shape[0], int):
        self.mean = U.dense(last_out, pdtype.param_shape()[0] // 2, "ac_de_final", U.normc_initializer(1.0))
        logstd = tf.get_variable(name="logstd", shape=[1, pdtype.param_shape()[0] // 2],
                                 initializer=tf.zeros_initializer())
        pdparam = U.concatenate([self.mean, self.mean * 0.0 + logstd], axis=1)
    else:
        pdparam = U.dense(last_out, pdtype.param_shape()[0], "ac_de_final", U.normc_initializer(0.01))

    self.pd = pdtype.pdfromflat(pdparam)
    self.state_in = []
    self.state_out = []

    stochastic = U.get_placeholder(name="stochastic", dtype=tf.bool, shape=())
    ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
    self.ac = ac
    self._act = U.function([stochastic, ob, embedding], ac)
    self._get_pol_mean = U.function([ob, embedding], self.mean)
def _init(self, ob_space, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=True,
          num_options=2, dc=0, w_intfc=True, k=0.):
    assert isinstance(ob_space, gym.spaces.Box)
    self.k = k
    self.w_intfc = w_intfc
    self.state_in = []
    self.state_out = []
    self.dc = dc
    self.num_options = num_options
    self.pdtype = pdtype = DiagGaussianPdType(ac_space.shape[0])

    sequence_length = None
    ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape))
    option = U.get_placeholder(name="option", dtype=tf.int32, shape=[None])

    with tf.variable_scope("obfilter"):
        self.ob_rms = RunningMeanStd(shape=ob_space.shape)
    obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)

    # Per-option value function
    last_out = obz
    for i in range(num_hid_layers):
        last_out = tf.nn.tanh(tf.layers.dense(last_out, hid_size, name="vffc%i" % (i + 1),
                                              kernel_initializer=U.normc_initializer(1.0)))
    self.vpred = dense3D2(last_out, 1, "vffinal", option, num_options=num_options,
                          weight_init=U.normc_initializer(1.0))[:, 0]

    # Termination head
    last_out = obz
    for i in range(num_hid_layers):
        last_out = tf.nn.tanh(tf.layers.dense(last_out, hid_size, name="termfc%i" % (i + 1),
                                              kernel_initializer=U.normc_initializer(1.0)))
    self.tpred = tf.nn.sigmoid(dense3D2(tf.stop_gradient(last_out), 1, "termhead", option,
                                        num_options=num_options,
                                        weight_init=U.normc_initializer(1.0)))[:, 0]
    termination_sample = tf.greater(self.tpred, tf.random_uniform(shape=tf.shape(self.tpred), maxval=1.))

    # Intra-option policy
    last_out = obz
    for i in range(num_hid_layers):
        last_out = tf.nn.tanh(tf.layers.dense(last_out, hid_size, name="polfc%i" % (i + 1),
                                              kernel_initializer=U.normc_initializer(1.0)))
    if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
        mean = dense3D2(last_out, pdtype.param_shape()[0] // 2, "polfinal", option,
                        num_options=num_options, weight_init=U.normc_initializer(0.5))
        logstd = tf.get_variable(name="logstd",
                                 shape=[num_options, 1, pdtype.param_shape()[0] // 2],
                                 initializer=U.normc_initializer(0.1), trainable=True)
        pdparam = tf.concat([mean, mean * 0.0 + logstd[option[0]]], axis=1)
    self.pd = pdtype.pdfromflat(pdparam)

    stochastic = tf.placeholder(dtype=tf.bool, shape=())
    ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())

    # Interest function
    last_out = obz
    for i in range(num_hid_layers):
        last_out = tf.nn.tanh(tf.layers.dense(last_out, hid_size, name="intfc%i" % (i + 1),
                                              kernel_initializer=U.normc_initializer(1.0)))
    self.intfc = tf.sigmoid(tf.layers.dense(last_out, num_options, name="intfcfinal",
                                            kernel_initializer=U.normc_initializer(1.0)))

    # Policy over options
    last_out = obz
    for i in range(num_hid_layers):
        last_out = tf.nn.tanh(tf.layers.dense(last_out, hid_size, name="OP%i" % (i + 1),
                                              kernel_initializer=U.normc_initializer(1.0)))
    self.op_pi = tf.nn.softmax(tf.layers.dense(last_out, num_options, name="OPfinal",
                                               kernel_initializer=U.normc_initializer(1.0)))

    self._act = U.function([stochastic, ob, option], [ac])
    self.get_term = U.function([ob, option], [termination_sample])
    self.get_tpred = U.function([ob, option], [self.tpred])
    self.get_vpred = U.function([ob, option], [self.vpred])
    self._get_op_int = U.function([ob], [self.op_pi, self.intfc])
    self._get_intfc = U.function([ob], [self.intfc])
    self._get_op = U.function([ob], [self.op_pi])
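# --- Illustrative sketch -----------------------------------------------------
# The policy over options (op_pi) and the interest function (intfc) are exposed
# separately via _get_op_int, so option selection happens in the caller. The
# sketch below shows ONE plausible way to combine them when w_intfc is set
# (interest-weighted renormalization); this is an assumption for illustration,
# not necessarily the rule this repository's caller uses.
import numpy as np

def sample_option(op_pi, intfc, w_intfc=True, rng=np.random):
    """Illustrative option sampling for a single state; arrays of shape [num_options]."""
    if w_intfc:
        weighted = op_pi * intfc          # weight each option by its interest
        probs = weighted / weighted.sum() # renormalize to a distribution
    else:
        probs = op_pi
    return rng.choice(len(probs), p=probs)

opt = sample_option(np.array([0.6, 0.4]), np.array([0.9, 0.2]))
print(opt)
# -----------------------------------------------------------------------------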
def _init(self, obs_space, embedding_shape, hid_size, num_hid_layers, gaussian_fixed_var=True):
    self.pdtype = pdtype = make_pdtype(obs_space.shape[0])
    batch_size = None

    ob_input = U.get_placeholder(name="ob", dtype=tf.float32, shape=[batch_size, obs_space.shape[0]])
    embedding = U.get_placeholder(name="embedding", dtype=tf.float32, shape=[batch_size, embedding_shape])
    # NOTE: the embedding is a single latent value tiled to the sequence length;
    # leave it for now and handle it properly when we get to that part.

    # This network is a policy only (no value function); also double-check that
    # the concatenation below is correct.
    last_out = U.concatenate([ob_input, embedding], axis=1)

    # Normalize the input
    with tf.variable_scope("state_de_filter"):
        self.state_rms = RunningMeanStd(shape=obs_space.shape[0] + embedding_shape)
    input_z = tf.clip_by_value((last_out - self.state_rms.mean) / self.state_rms.std, -5.0, 5.0)

    for i in range(num_hid_layers):
        input_z = tf.nn.tanh(
            U.dense(input_z, hid_size[i], "state_de%i" % (i + 1), weight_init=U.normc_initializer(1.0)))

    if gaussian_fixed_var and isinstance(obs_space.shape[0], int):
        self.mean = U.dense(input_z, pdtype.param_shape()[0] // 2, "state_de_final", U.normc_initializer(0.01))
        self.logstd = tf.get_variable(name="logstd", shape=[1, pdtype.param_shape()[0] // 2],
                                      initializer=tf.zeros_initializer())
        pdparam = U.concatenate([self.mean, self.mean * 0.0 + self.logstd], axis=1)
    else:
        pdparam = U.dense(last_out, pdtype.param_shape()[0], "state_de_final", U.normc_initializer(0.01))

    self.pd = pdtype.pdfromflat(pdparam)
    self.state_in = []
    self.state_out = []
    self._act = U.function([ob_input, embedding], self.pd.sample())
    self.get_mean = U.function([ob_input, embedding], self.mean)
def build_graph(self, obs_ph, embedding_z, reuse=False):
    # ===== embedding_z needs to be preprocessed first =====
    batch_size = obs_ph.shape[0]
    embeddings = np.array([embedding_z[0] for _ in range(batch_size)])

    with tf.variable_scope(self.scope):
        if reuse:
            tf.get_variable_scope().reuse_variables()

        with tf.variable_scope("obfilter"):
            self.obs_rms = RunningMeanStd(shape=self.observation_shape)
        obs = (obs_ph - self.obs_rms.mean) / self.obs_rms.std

        # Concatenate observation and embedding to form a transition input
        _input = tf.concat([obs, embeddings], axis=1)
        # _input = tf.concat([_input, embedding_z], axis=1)

        _outputs = _input
        for i in range(self.hidden_layers):
            _outputs = tf.contrib.layers.fully_connected(_outputs, self.hidden_size[i],
                                                         activation_fn=tf.nn.tanh)
        logits = tf.contrib.layers.fully_connected(_outputs, 1, activation_fn=tf.identity)
    return logits
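# --- Illustrative sketch -----------------------------------------------------
# build_graph above returns raw discriminator logits; in GAIL-style setups the
# generator reward is usually derived from them outside this function. Below is
# a sketch of one common choice, -log(1 - sigmoid(logit)), offered as an
# assumption for illustration rather than what this repository necessarily uses.
import numpy as np

def discriminator_reward(logits, eps=1e-8):
    """Illustrative GAIL-style reward from logits: higher logit -> more expert-like."""
    p = 1.0 / (1.0 + np.exp(-logits))  # sigmoid
    return -np.log(1.0 - p + eps)

print(discriminator_reward(np.array([-2.0, 0.0, 2.0])))
# -----------------------------------------------------------------------------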
def _normalize_clip_observation(x, clip_range=[-5.0, 5.0]):
    rms = RunningMeanStd(shape=x.shape[1:])
    norm_x = tf.clip_by_value((x - rms.mean) / rms.std, min(clip_range), max(clip_range))
    return norm_x, rms
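# --- Illustrative sketch -----------------------------------------------------
# The helper above builds a clipped-normalization op from RunningMeanStd's
# mean/std tensors, but the statistics themselves are updated from data
# elsewhere. Below is a simplified NumPy stand-in for that update/apply cycle
# (illustrative only, not baselines' MPI-backed RunningMeanStd).
import numpy as np

class SimpleRunningMeanStd:
    """Simplified running mean/std tracker using the parallel-variance update."""
    def __init__(self, shape, epsilon=1e-4):
        self.mean = np.zeros(shape)
        self.var = np.ones(shape)
        self.count = epsilon

    def update(self, x):
        batch_mean, batch_var, n = x.mean(axis=0), x.var(axis=0), x.shape[0]
        delta = batch_mean - self.mean
        tot = self.count + n
        new_mean = self.mean + delta * n / tot
        m_a = self.var * self.count
        m_b = batch_var * n
        new_var = (m_a + m_b + delta ** 2 * self.count * n / tot) / tot
        self.mean, self.var, self.count = new_mean, new_var, tot

    @property
    def std(self):
        return np.sqrt(self.var + 1e-8)

rms = SimpleRunningMeanStd(shape=(3,))
rms.update(np.random.randn(128, 3) * 2.0 + 1.0)
obs = np.random.randn(3)
print(np.clip((obs - rms.mean) / rms.std, -5.0, 5.0))
# -----------------------------------------------------------------------------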
def _init(self, ob_space, ac_space, hid_size, num_hid_layers, vae_pol_mean, gaussian_fixed_var=True):
    assert isinstance(ob_space, gym.spaces.Box)

    self.pdtype = pdtype = make_pdtype(ac_space)
    sequence_length = None

    ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape))

    with tf.variable_scope("obfilter"):
        self.ob_rms = RunningMeanStd(shape=ob_space.shape)
    obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)

    last_out = obz
    for i in range(num_hid_layers):
        last_out = tf.nn.tanh(
            U.dense(last_out, hid_size[i], "vffc%i" % (i + 1), weight_init=U.normc_initializer(1.0)))
    self.vpred = U.dense(last_out, 1, "vffinal", weight_init=U.normc_initializer(1.0))[:, 0]

    last_out = obz
    for i in range(num_hid_layers):
        last_out = tf.nn.tanh(
            U.dense(last_out, hid_size[i], "polfc%i" % (i + 1), weight_init=U.normc_initializer(1.0)))
    if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
        mean = U.dense(last_out, pdtype.param_shape()[0] // 2, "polfinal", U.normc_initializer(0.01)) + vae_pol_mean
        logstd = tf.get_variable(name="logstd", shape=[1, pdtype.param_shape()[0] // 2],
                                 initializer=tf.constant_initializer(0.1))
        pdparam = U.concatenate([mean, mean * 0.0 + logstd], axis=1)
    else:
        pdparam = U.dense(last_out, pdtype.param_shape()[0], "polfinal", U.normc_initializer(0.01))

    self.pd = pdtype.pdfromflat(pdparam)
    self.state_in = []
    self.state_out = []

    # change for BC
    # stochastic = tf.placeholder(dtype=tf.bool, shape=())
    stochastic = U.get_placeholder(name="stochastic", dtype=tf.bool, shape=())
    ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
    self.ac = ac
    self._act = U.function([stochastic, ob], [ac, self.vpred])
def _init(self, ob_space, ac_space, model, hid_size, num_hid_layers, num_options=2,
          term_prob=0.5, eps=0.0005):
    assert isinstance(ob_space, gym.spaces.Box)

    self.state_in = []
    self.state_out = []
    self.term_prob = term_prob
    self.num_options = num_options

    # Creating the policy network
    sequence_length = None
    self.ac_dim = ac_space.shape[0]
    self.model = model
    self.eps = eps
    self.trained_options = []

    ob = U.get_placeholder(name="ob", dtype=tf1.float32, shape=[sequence_length] + list(ob_space.shape))
    option = U.get_placeholder(name="option", dtype=tf1.int32, shape=[None])

    self.pdtype = pdtype = DiagGaussianPdType(ac_space.shape[0])

    with tf1.variable_scope("obfilter"):
        self.ob_rms = RunningMeanStd(shape=ob_space.shape)
    obz = tf1.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)
    last_out = obz

    # Value function
    for i in range(num_hid_layers[0]):
        last_out = tf1.nn.tanh(
            tf1.layers.dense(last_out, hid_size[0], name="vffc%i" % (i + 1),
                             kernel_initializer=U.normc_initializer(1.0)))
    self.vpred = dense3D2(last_out, 1, "vffinal", option, num_options=num_options,
                          weight_init=U.normc_initializer(1.0))[:, 0]

    # Intra option policy
    last_out = ob
    for i in range(num_hid_layers[1]):
        last_out = tf1.nn.tanh(
            tf1.layers.dense(last_out, hid_size[1], name="polfc%i" % (i + 1),
                             kernel_initializer=U.normc_initializer(1.0)))
    mean = dense3D2(last_out, pdtype.param_shape()[0] // 2, "polfinal", option,
                    num_options=num_options, weight_init=U.normc_initializer(-0.2))
    logstd = tf1.get_variable(name="logstd",
                              shape=[num_options, 1, pdtype.param_shape()[0] // 2],
                              initializer=U.normc_initializer(0.1), trainable=True)
    pdparam = tf1.concat([mean, mean * 0.0 + logstd[option[0]]], axis=1)
    # pdparam = dense3D2(last_out, pdtype.param_shape()[0], "polfinal", option,
    #                    num_options=num_options, weight_init=U.normc_initializer(-0.6))

    self.pd = pdtype.pdfromflat(pdparam)

    stochastic = tf1.placeholder(dtype=tf1.bool, shape=())
    ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())

    self._act = U.function([stochastic, ob, option], [ac])
    self.get_vpred = U.function([ob, option], [self.vpred])
    self.action_pd = U.function([ob, option], [self.pd.mode(), self.pd.variance()])
def _init(self, ob_space, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=True):
    assert isinstance(ob_space, gym.spaces.Box)

    self.pdtype = pdtype = DiagGaussianPdType(ac_space.shape[0])
    sequence_length = None

    ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape))

    with tf.variable_scope("obfilter"):
        self.ob_rms = RunningMeanStd(shape=ob_space.shape)

    # Critic Network
    with tf.variable_scope('vf'):
        obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)
        # obz = (ob - self.ob_rms.mean) / self.ob_rms.std
        last_out = obz
        for i in range(num_hid_layers):
            last_out = tf.nn.tanh(
                tf.layers.dense(last_out, hid_size, name="fc%i" % (i + 1),
                                kernel_initializer=U.normc_initializer(1.0)))
        self.vpred = tf.layers.dense(last_out, 1, name='final',
                                     kernel_initializer=U.normc_initializer(1.0))[:, 0]

    # Actor Network
    with tf.variable_scope('pol'):
        last_out = obz
        for i in range(num_hid_layers):
            last_out = tf.nn.tanh(
                tf.layers.dense(last_out, hid_size, name='fc%i' % (i + 1),
                                kernel_initializer=U.normc_initializer(1.0)))
        if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
            mean = tf.layers.dense(last_out, pdtype.param_shape()[0] // 2, name='final',
                                   kernel_initializer=U.normc_initializer(0.01))
            logstd = tf.get_variable(name="logstd", shape=[1, pdtype.param_shape()[0] // 2],
                                     initializer=tf.zeros_initializer())
            pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)
        else:
            pdparam = tf.layers.dense(last_out, pdtype.param_shape()[0], name='final',
                                      kernel_initializer=U.normc_initializer(0.01))

    self.pd = pdtype.pdfromflat(pdparam)
    self.state_in = []
    self.state_out = []

    stochastic = tf.placeholder(dtype=tf.bool, shape=())
    ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
    self._act = U.function([stochastic, ob], [ac, self.vpred])
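# --- Illustrative sketch -----------------------------------------------------
# In all of these policies the action node is
# U.switch(stochastic, pd.sample(), pd.mode()); for a diagonal Gaussian the
# mode is the mean, and sampling adds exp(logstd)-scaled noise. A small NumPy
# sketch of that switch (illustrative, outside the TF graph).
import numpy as np

def select_action(mean, logstd, stochastic, rng=np.random):
    """Mirror of U.switch(stochastic, pd.sample(), pd.mode()) for a diagonal Gaussian."""
    if stochastic:
        return mean + np.exp(logstd) * rng.standard_normal(mean.shape)
    return mean  # the mode of a Gaussian is its mean

mean, logstd = np.array([0.1, -0.3]), np.array([-1.0, -1.0])
print(select_action(mean, logstd, stochastic=True))
print(select_action(mean, logstd, stochastic=False))
# -----------------------------------------------------------------------------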
class DDPG(object): def __init__(self, observation_shape, action_shape, nb_demo_kine, nb_key_states, batch_size=128, noise_type='', actor=None, critic=None, layer_norm=True, observation_range=(-5., 5.), action_range=(-1., 1.), return_range=(-np.inf, np.inf), normalize_returns=False, normalize_observations=True, reward_scale=1., clip_norm=None, demo_l2_reg=0., critic_l2_reg=0., actor_lr=1e-4, critic_lr=1e-3, demo_lr=5e-3, gamma=0.99, tau=0.001, enable_popart=False, save_ckpt=True): # Noise nb_actions = action_shape[-1] param_noise, action_noise = process_noise_type(noise_type, nb_actions) logger.info('param_noise', param_noise) logger.info('action_noise', action_noise) # States recording self.memory = Memory(limit=int(2e5), action_shape=action_shape, observation_shape=observation_shape) # Models self.nb_demo_kine = nb_demo_kine self.actor = actor or Actor( nb_actions, nb_demo_kine, layer_norm=layer_norm) self.nb_key_states = nb_key_states self.critic = critic or Critic(nb_key_states, layer_norm=layer_norm) self.nb_obs_org = nb_key_states # Inputs. self.obs0 = tf.placeholder(tf.float32, shape=(None, ) + observation_shape, name='obs0') self.obs1 = tf.placeholder(tf.float32, shape=(None, ) + observation_shape, name='obs1') self.terminals1 = tf.placeholder(tf.float32, shape=(None, 1), name='terminals1') self.rewards = tf.placeholder(tf.float32, shape=(None, 1), name='rewards') self.actions = tf.placeholder(tf.float32, shape=(None, ) + action_shape, name='actions') # self.critic_target_Q: value assigned by self.target_Q_obs0 self.critic_target_Q = tf.placeholder(tf.float32, shape=(None, 1), name='critic_target_Q') self.param_noise_stddev = tf.placeholder(tf.float32, shape=(), name='param_noise_stddev') # change in observations self.obs_delta_kine = (self.obs1 - self.obs0)[:, :self.nb_demo_kine] self.obs_delta_kstates = (self.obs1 - self.obs0)[:, :self.nb_key_states] # Parameters. self.gamma = gamma self.tau = tau self.normalize_observations = normalize_observations self.normalize_returns = normalize_returns self.action_noise = action_noise self.param_noise = param_noise self.action_range = action_range self.return_range = return_range self.observation_range = observation_range self.actor_lr = actor_lr self.critic_lr = critic_lr self.demo_lr = demo_lr self.clip_norm = clip_norm self.enable_popart = enable_popart self.reward_scale = reward_scale self.batch_size = batch_size self.stats_sample = None self.critic_l2_reg = critic_l2_reg self.demo_l2_reg = demo_l2_reg # Observation normalization. if self.normalize_observations: with tf.variable_scope('obs_rms'): self.obs_rms = RunningMeanStd(shape=observation_shape) else: self.obs_rms = None self.normalized_obs0 = tf.clip_by_value( obs_norm_partial(self.obs0, self.obs_rms, self.nb_obs_org), self.observation_range[0], self.observation_range[1]) normalized_obs1 = tf.clip_by_value( obs_norm_partial(self.obs1, self.obs_rms, self.nb_obs_org), self.observation_range[0], self.observation_range[1]) # Return normalization. if self.normalize_returns: with tf.variable_scope('ret_rms'): self.ret_rms = RunningMeanStd() else: self.ret_rms = None # Create target networks. target_actor = copy(self.actor) target_actor.name = 'target_actor' self.target_actor = target_actor target_critic = copy(self.critic) target_critic.name = 'target_critic' self.target_critic = target_critic # Create networks and core TF parts that are shared across set-up parts. 
# the actor output is [0,1], need to normalised to [-1,1] before feeding into critic self.actor_tf, self.demo_aprx = self.actor(self.normalized_obs0) # critic loss # normalized_critic_tf, pred_rwd, pred_obs_delta: critic_loss self.normalized_critic_tf, self.pred_rwd, self.pred_obs_delta = self.critic( self.normalized_obs0, act_norm(self.actions)) # self.critic_tf: only in logging [reference_Q_mean/std] self.critic_tf = ret_denormalize( tf.clip_by_value(self.normalized_critic_tf, self.return_range[0], self.return_range[1]), self.ret_rms) # actor loss normalized_critic_with_actor_tf = self.critic(self.normalized_obs0, act_norm(self.actor_tf), reuse=True)[0] # self.critic_with_actor_tf: actor loss, and logging [reference_Q_tf_mean/std] self.critic_with_actor_tf = ret_denormalize( tf.clip_by_value(normalized_critic_with_actor_tf, self.return_range[0], self.return_range[1]), self.ret_rms) # target Q self.target_action = tf.clip_by_value( target_actor(normalized_obs1)[0], self.action_range[0], self.action_range[1]) self.target_Q_obs1 = ret_denormalize( target_critic(normalized_obs1, act_norm(self.target_action))[0], self.ret_rms) self.target_Q_obs0 = self.rewards + ( 1. - self.terminals1) * gamma * self.target_Q_obs1 # Set up parts. if self.param_noise is not None: self.setup_param_noise(self.normalized_obs0) self.setup_actor_optimizer() self.setup_critic_optimizer() if self.normalize_returns and self.enable_popart: self.setup_popart() self.setup_stats() self.setup_target_network_updates() self.dbg_vars = self.actor.dbg_vars + self.critic.dbg_vars self.sess = None # Set up checkpoint saver self.save_ckpt = save_ckpt if save_ckpt: self.saver = tf.train.Saver(tf.global_variables(), max_to_keep=20) else: # saver for loading ckpt self.saver = tf.train.Saver() self.main_summaries = tf.summary.merge_all() logdir = logger.get_dir() if logdir: self.train_writer = tf.summary.FileWriter( os.path.join(logdir, 'tb'), tf.get_default_graph()) else: self.train_writer = None def setup_target_network_updates(self): actor_init_updates, actor_soft_updates = get_target_updates( self.actor.vars, self.target_actor.vars, self.tau) critic_init_updates, critic_soft_updates = get_target_updates( self.critic.vars, self.target_critic.vars, self.tau) self.target_init_updates = [actor_init_updates, critic_init_updates] self.target_soft_updates = [actor_soft_updates, critic_soft_updates] def setup_param_noise(self, normalized_obs0): assert self.param_noise is not None # Configure perturbed actor. param_noise_actor = copy(self.actor) param_noise_actor.name = 'param_noise_actor' self.perturbed_actor_tf = param_noise_actor(normalized_obs0)[0] logger.debug('setting up param noise') self.perturb_policy_ops = get_perturbed_actor_updates( self.actor, param_noise_actor, self.param_noise_stddev) # Configure separate copy for stddev adoption. 
adaptive_param_noise_actor = copy(self.actor) adaptive_param_noise_actor.name = 'adaptive_param_noise_actor' adaptive_actor_tf = adaptive_param_noise_actor(normalized_obs0)[0] self.perturb_adaptive_policy_ops = get_perturbed_actor_updates( self.actor, adaptive_param_noise_actor, self.param_noise_stddev) self.adaptive_policy_distance = tf.sqrt( tf.reduce_mean(tf.square(self.actor_tf - adaptive_actor_tf))) def setup_actor_optimizer(self): logger.info('setting up actor optimizer') # loss_normed = -tf.reduce_mean(self.normalized_critic_with_actor_tf) self.actor_Q = tf.reduce_mean(self.critic_with_actor_tf) self.actor_loss = -self.actor_Q tf.summary.scalar('actor/Q', self.actor_Q) # setting up actor vars/grads/optimizer self.actor_vars = self.actor.active_vars self.actor_grads = tf_util.flatgrad(self.actor_loss, self.actor_vars, clip_norm=self.clip_norm) self.actor_optimizer = MpiAdam(var_list=self.actor_vars, beta1=0.9, beta2=0.999, epsilon=1e-08) actor_shapes = [ var.get_shape().as_list() for var in self.actor.trainable_vars ] self.actor_params = actor_params = [0] * ( len(self.actor.trainable_vars) + 1) for i, shape in enumerate(actor_shapes): actor_params[i + 1] = actor_params[i] + np.prod(shape) n_inact = len(actor_shapes) - len(self.actor_vars) active_params = actor_params[n_inact:] - actor_params[n_inact] logger.info(' actor shapes: {}'.format(actor_shapes)) logger.info(' actor params: {}'.format(actor_params)) logger.info(' actor total: {}'.format(actor_params[-1])) logger.info(' actor active: {}'.format(active_params)) grad = self.actor_grads[active_params[0]:active_params[1]] tf.summary.scalar( 'grads/actor_layer%d_%d' % (n_inact // 2, active_params[1] - active_params[0]), tf.reduce_mean(grad)) grad = self.actor_grads[active_params[-3]:active_params[-2]] tf.summary.scalar( 'grads/actor_layer%d_%d' % (-1, active_params[-2] - active_params[-3]), tf.reduce_mean(grad)) # for train_demo() self.demo_loss = tf.reduce_mean( tf.square(self.obs_delta_kine - self.demo_aprx)) self.demo_max_loss = tf.reduce_max( tf.square(self.obs_delta_kine - self.demo_aprx)) if self.demo_l2_reg > 0.: demo_reg_vars = self.actor.demo_reg_vars for var in demo_reg_vars: logger.info(' regularizing: {}'.format(var.name)) logger.info( ' applying l2 regularization for demo_aprx with {}'.format( self.demo_l2_reg)) self.demo_reg = tc.layers.apply_regularization( tc.layers.l2_regularizer(self.demo_l2_reg), weights_list=demo_reg_vars) self.demo_loss += self.demo_reg else: self.demo_reg = None self.demo_grads = tf_util.flatgrad(self.demo_loss, self.actor.trainable_vars, clip_norm=self.clip_norm) self.demo_optimizer = MpiAdam(var_list=self.actor.trainable_vars, beta1=0.9, beta2=0.999, epsilon=1e-08) # mimic rwd self.mimic_rwd = -self.demo_loss tf.summary.scalar('actor/mimic_rwd', self.mimic_rwd) def setup_critic_optimizer(self): logger.info('setting up critic optimizer') self.normalized_critic_target_tf = tf.clip_by_value( ret_normalize(self.critic_target_Q, self.ret_rms), self.return_range[0], self.return_range[1]) self.critic_loss = tf.reduce_mean( tf.square(self.normalized_critic_tf - self.normalized_critic_target_tf)) tf.summary.scalar('critic_loss/Q_diff', self.critic_loss) if self.normalize_returns: tf.summary.scalar('critic_loss/Q_normed_critic', tf.reduce_mean(self.normalized_critic_tf)) tf.summary.scalar('critic_loss/Q_normed_target', tf.reduce_mean(self.normalized_critic_target_tf)) self.critic_loss_step = 0 diff_rwd = tf.reduce_mean(tf.square(self.pred_rwd - self.rewards)) self.critic_loss_step += diff_rwd 
tf.summary.scalar('critic_loss/step_rwd', self.critic_loss_step) critic_kine_factor = 100 diff_obs = tf.reduce_mean(tf.square(self.pred_obs_delta - self.obs_delta_kstates), axis=0) diff_obs_kine = tf.reduce_mean( diff_obs[:self.nb_demo_kine]) * critic_kine_factor diff_obs_rest = tf.reduce_mean(diff_obs[self.nb_demo_kine:]) self.critic_loss_step += (diff_obs_kine + diff_obs_rest) tf.summary.scalar( 'critic_loss/step_kstates_kine_x%d' % critic_kine_factor, diff_obs_kine) tf.summary.scalar('critic_loss/step_kstates_rest', diff_obs_rest) tf.summary.scalar('critic_loss/step_total', self.critic_loss_step) self.critic_loss += self.critic_loss_step if self.critic_l2_reg > 0.: critic_reg_vars = self.critic.reg_vars for var in critic_reg_vars: logger.debug(' regularizing: {}'.format(var.name)) logger.info(' applying l2 regularization with {}'.format( self.critic_l2_reg)) critic_reg = tc.layers.apply_regularization( tc.layers.l2_regularizer(self.critic_l2_reg), weights_list=critic_reg_vars) self.critic_loss += critic_reg tf.summary.scalar('critic_loss/reg', critic_reg) critic_shapes = [ var.get_shape().as_list() for var in self.critic.trainable_vars ] critic_params = [0] * (len(self.critic.trainable_vars) + 1) for i, shape in enumerate(critic_shapes): critic_params[i + 1] = critic_params[i] + np.prod(shape) logger.info(' critic shapes: {}'.format(critic_shapes)) logger.info(' critic params: {}'.format(critic_params)) logger.info(' critic total: {}'.format(critic_params[-1])) self.critic_grads = tf_util.flatgrad(self.critic_loss, self.critic.trainable_vars, clip_norm=self.clip_norm) self.critic_optimizer = MpiAdam(var_list=self.critic.trainable_vars, beta1=0.9, beta2=0.999, epsilon=1e-08) # todo: make the following general grad = self.critic_grads[critic_params[0]:critic_params[1]] tf.summary.scalar( 'grads/critic_layer%d_%d' % (0, critic_params[1] - critic_params[0]), tf.reduce_mean(grad)) grad = self.critic_grads[critic_params[-3]:critic_params[-2]] tf.summary.scalar( 'grads/critic_layer%d_rwd_%d' % (-1, critic_params[-2] - critic_params[-3]), tf.reduce_mean(grad)) grad = self.critic_grads[critic_params[-7]:critic_params[-6]] tf.summary.scalar( 'grads/critic_layer%d_q_%d' % (-1, critic_params[-6] - critic_params[-7]), tf.reduce_mean(grad)) def setup_popart(self): # See https://arxiv.org/pdf/1602.07714.pdf for details. 
self.old_std = tf.placeholder(tf.float32, shape=[1], name='old_std') new_std = self.ret_rms.std self.old_mean = tf.placeholder(tf.float32, shape=[1], name='old_mean') new_mean = self.ret_rms.mean self.renormalize_Q_outputs_op = [] for vs in [self.critic.output_vars, self.target_critic.output_vars]: assert len(vs) == 2 M, b = vs assert 'kernel' in M.name assert 'bias' in b.name assert M.get_shape()[-1] == 1 assert b.get_shape()[-1] == 1 self.renormalize_Q_outputs_op += [ M.assign(M * self.old_std / new_std) ] self.renormalize_Q_outputs_op += [ b.assign( (b * self.old_std + self.old_mean - new_mean) / new_std) ] def setup_stats(self): ops = [] names = [] if self.normalize_returns: ops += [self.ret_rms.mean, self.ret_rms.std] names += ['zrms/ret_mean', 'zrms/ret_std'] if self.normalize_observations: ops += [ tf.reduce_mean(self.obs_rms.mean[:self.nb_demo_kine]), tf.reduce_mean(self.obs_rms.std[:self.nb_demo_kine]) ] names += ['zrms/obs_kine_mean', 'zrms/obs_kine_std'] ops += [ tf.reduce_mean(self.obs_rms.mean[:self.nb_key_states]), tf.reduce_mean(self.obs_rms.std[:self.nb_key_states]) ] names += ['zrms/obs_kstates_mean', 'zrms/obs_kstates_std'] ops += [ tf.reduce_mean(self.obs_rms.mean), tf.reduce_mean(self.obs_rms.std) ] names += ['zrms/obs_mean', 'zrms/obs_std'] # for debugging partial normalisation for o_i in [self.nb_obs_org - 1, self.nb_obs_org]: ops += [self.obs0[0, o_i], self.normalized_obs0[0, o_i]] names += ['zobs_dbg_%d' % o_i, 'zobs_dbg_%d_normalized' % o_i] ops += [tf.reduce_mean(self.critic_tf)] names += ['zref/Q_mean'] ops += [reduce_std(self.critic_tf)] names += ['zref/Q_std'] ops += [tf.reduce_mean(self.critic_with_actor_tf)] names += ['zref/Q_tf_mean'] ops += [reduce_std(self.critic_with_actor_tf)] names += ['zref/Q_tf_std'] ops += [tf.reduce_mean(self.actor_tf)] names += ['zref/action_mean'] ops += [reduce_std(self.actor_tf)] names += ['zref/action_std'] ops += [tf.reduce_mean(self.mimic_rwd)] names += ['zref/mimic_rwd'] if self.param_noise: ops += [tf.reduce_mean(self.perturbed_actor_tf)] names += ['zref/action_ptb_mean'] ops += [reduce_std(self.perturbed_actor_tf)] names += ['zref/action_ptb_std'] self.stats_ops = ops self.stats_names = names def pi(self, obs, step, apply_param_noise=True, apply_action_noise=True, compute_Q=True, rollout_log=False): if self.param_noise is not None and apply_param_noise: actor_tf = self.perturbed_actor_tf info = 'ptb' else: actor_tf = self.actor_tf info = 'org' feed_dict = {self.obs0: [obs]} if compute_Q: action, q = self.sess.run([actor_tf, self.critic_with_actor_tf], feed_dict=feed_dict) else: action = self.sess.run(actor_tf, feed_dict=feed_dict) q = None action = action.flatten() # actor output is [0,1], no need to denormalise. 
# action = act_denorm(action) if rollout_log: summary_list = [('the_action/%d_rollout_%s' % (i, info), a) for i, a in enumerate(action)] if self.action_noise is not None and apply_action_noise: noise = self.action_noise() assert noise.shape == action.shape action += noise else: noise = None action = np.clip(action, self.action_range[0], self.action_range[1]) if rollout_log: if noise is not None: summary_list += [('the_action/%d_rollout_noise' % i, a) for i, a in enumerate(noise)] self.add_list_summary(summary_list, step) return action, q def store_transition(self, storage, obs0, action, reward, obs1, terminal1): '''store one experience''' reward *= self.reward_scale storage.append(obs0, action, reward, obs1, terminal1) if self.normalize_observations: self.obs_rms.update(np.array([obs0])) def store_multrans(self, storage, obs0, action, reward, obs1, terminal1): '''store multiple experiences''' for i in range(len(reward)): storage.append(obs0[i], action[i], reward[i] * self.reward_scale, obs1[i], terminal1[i]) if self.normalize_observations: self.obs_rms.update(np.vstack(obs0)) def train_demo(self, obs0_pos, obs1_pos, obs0_neg, obs1_neg, step, neg_pct=1.0, lr_decay=1.0): # gradients calculated for pos and neg data separately, then combined for gradient update, # because only positive data are used in eval modes # the loss evaluated here are those before gradient update ops = [ self.demo_grads, self.demo_loss, self.demo_max_loss, self.actor_Q ] pos_grads, demo_loss, max_loss, actor_Q = self.sess.run(ops, feed_dict={ self.obs0: obs0_pos, self.obs1: obs1_pos, }) ops = [self.demo_grads, self.demo_loss] neg_grads, neg_loss = self.sess.run(ops, feed_dict={ self.obs0: obs0_neg, self.obs1: obs1_neg, }) comb_grads = pos_grads - neg_grads * neg_pct self.demo_optimizer.update(comb_grads, stepsize=self.demo_lr * lr_decay) if self.demo_reg is not None: demo_reg = self.sess.run(self.demo_reg) else: demo_reg = 0 # sanity check the training pos_g = pos_grads[self.actor_params[2]:self.actor_params[3]] neg_g = neg_grads[self.actor_params[2]:self.actor_params[3]] comb_g = comb_grads[self.actor_params[2]:self.actor_params[3]] summary_list = [ ('demo_loss/train_pos', demo_loss), ('demo_loss/train_max', max_loss), ('demo_loss/train_neg', neg_loss), ('grads/demo_pos_layer%d_%d' % (1, len(pos_g)), np.mean(pos_g)), ('grads/demo_neg_layer%d_%d' % (1, len(neg_g)), np.mean(neg_g)), ('grads/demo_comb_layer%d_%d' % (1, len(comb_g)), np.mean(comb_g)), ('actor/Q', actor_Q), ('demo_loss/reg', demo_reg) ] self.add_list_summary(summary_list, step) return demo_loss def test_demo(self, obs0, obs1): loss_mean, loss_max = self.sess.run( [self.demo_loss, self.demo_max_loss], feed_dict={ self.obs0: obs0, self.obs1: obs1, }) return loss_mean, loss_max def eval_demo(self, obs0): return self.sess.run(self.demo_aprx, feed_dict={self.obs0: obs0}) def get_mimic_rwd(self, obs0, obs1): mimic_rwd, demo_aprx = self.sess.run([self.mimic_rwd, self.demo_aprx], feed_dict={ self.obs0: obs0, self.obs1: obs1 }) return mimic_rwd, demo_aprx def train_main(self, step): batch = self.memory.sample(batch_size=self.batch_size) if self.normalize_returns and self.enable_popart: ops = [ self.ret_rms.mean, self.ret_rms.std, self.target_Q_obs0, self.target_Q_obs1 ] old_mean, old_std, target_Q_obs0, target_Q_obs1 = self.sess.run( ops, feed_dict={ self.obs1: batch['obs1'], self.rewards: batch['rewards'], self.terminals1: batch['terminals1'].astype('float32'), }) self.ret_rms.update(target_Q_obs0.flatten()) self.sess.run(self.renormalize_Q_outputs_op, feed_dict={ 
self.old_std: np.array([old_std]), self.old_mean: np.array([old_mean]), }) # Run sanity check. Disabled by default since it slows down things considerably. # print('running sanity check') # target_Q_new, new_mean, new_std = self.sess.run([self.target_Q_obs0, self.ret_rms.mean, self.ret_rms.std], # feed_dict={ # self.obs1: batch['obs1'], # self.rewards: batch['rewards'], # self.terminals1: batch['terminals1'].astype('float32'), # }) # print(target_Q_new, target_Q_obs0, new_mean, new_std) # assert (np.abs(target_Q_obs0 - target_Q_new) < 1e-3).all() else: ops = [self.target_Q_obs0, self.target_Q_obs1] target_Q_obs0, target_Q_obs1 = self.sess.run( ops, feed_dict={ self.obs1: batch['obs1'], self.rewards: batch['rewards'], self.terminals1: batch['terminals1'].astype('float32') }) summary_list = [ ('critic_loss/Q_target_obs1_mean', np.mean(target_Q_obs1)), ('critic_loss/Q_target_obs1_std', np.std(target_Q_obs1)), ('critic_loss/Q_target_obs0_mean', np.mean(target_Q_obs0)), ('critic_loss/Q_target_obs0_std', np.std(target_Q_obs0)) ] self.add_list_summary(summary_list, step) # Get all gradients and perform a synced update. ops = [ self.main_summaries, self.actor_grads, self.actor_loss, self.critic_grads, self.critic_loss ] main_summaries, actor_grads, actor_loss, critic_grads, critic_loss = self.sess.run( ops, feed_dict={ self.obs0: batch['obs0'], self.actions: batch['actions'], self.critic_target_Q: target_Q_obs0, self.rewards: batch['rewards'], self.obs1: batch['obs1'] }) self.actor_optimizer.update(actor_grads, stepsize=self.actor_lr) self.critic_optimizer.update(critic_grads, stepsize=self.critic_lr) if self.train_writer: self.train_writer.add_summary(main_summaries, step) return critic_loss, actor_loss def initialize(self, sess, start_ckpt=None): self.sess = sess if start_ckpt: self.saver.restore(sess, start_ckpt) else: self.sess.run(tf.global_variables_initializer()) self.actor_optimizer.sync() self.demo_optimizer.sync() self.critic_optimizer.sync() self.sess.run(self.target_init_updates) def store_ckpt(self, save_path, epoch): if self.save_ckpt: self.saver.save(self.sess, save_path, global_step=epoch) def update_target_net(self): self.sess.run(self.target_soft_updates) def get_stats(self, storage): if self.stats_sample is None: # Get a sample and keep that fixed for all further computations. # This allows us to estimate the change in value for the same set of inputs. self.stats_sample = storage.sample(batch_size=self.batch_size) values = self.sess.run(self.stats_ops, feed_dict={ self.obs0: self.stats_sample['obs0'], self.obs1: self.stats_sample['obs1'], self.actions: self.stats_sample['actions'], }) names = self.stats_names[:] assert len(names) == len(values) stats = dict(zip(names, values)) if self.param_noise is not None: stats = {**stats, **self.param_noise.get_stats()} return stats def adapt_param_noise(self, step): if self.param_noise is None: return 0. # Perturb a separate copy of the policy to adjust the scale for the next "real" perturbation. 
batch = self.memory.sample(batch_size=self.batch_size) self.sess.run(self.perturb_adaptive_policy_ops, feed_dict={ self.param_noise_stddev: self.param_noise.current_stddev, }) distance = self.sess.run(self.adaptive_policy_distance, feed_dict={ self.obs0: batch['obs0'], self.param_noise_stddev: self.param_noise.current_stddev, }) mean_distance = MPI.COMM_WORLD.allreduce( distance, op=MPI.SUM) / MPI.COMM_WORLD.Get_size() self.param_noise.adapt(mean_distance) self.add_list_summary([('param_noise/distance', mean_distance)], step) self.add_list_summary( [('param_noise/std', self.param_noise.current_stddev)], step) return mean_distance def reset(self): '''Reset internal state after an episode is complete.''' if self.action_noise is not None: self.action_noise.reset() if self.param_noise is not None: self.sess.run(self.perturb_policy_ops, feed_dict={ self.param_noise_stddev: self.param_noise.current_stddev, }) def add_list_summary(self, summary_raw, step): def summary_val(k, v): kwargs = {'tag': k, 'simple_value': v} return tf.Summary.Value(**kwargs) if self.train_writer: summary_list = [summary_val(tag, val) for tag, val in summary_raw] self.train_writer.add_summary(tf.Summary(value=summary_list), step)
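# --- Illustrative sketch -----------------------------------------------------
# This DDPG variant builds the critic's regression target as
# target_Q_obs0 = r + (1 - done) * gamma * Q_target(s', pi_target(s')), evaluated
# on a replay batch and fed back in through critic_target_Q. A self-contained
# NumPy sketch of that one-step TD target (the real q_next comes from the target
# networks).
import numpy as np

def td_target(rewards, terminals, q_next, gamma=0.99):
    """One-step TD target used to train the critic; arrays of shape [batch, 1]."""
    return rewards + (1.0 - terminals) * gamma * q_next

rewards = np.array([[1.0], [0.5]])
terminals = np.array([[0.0], [1.0]])   # second transition ends the episode
q_next = np.array([[10.0], [10.0]])
print(td_target(rewards, terminals, q_next))  # [[10.9], [0.5]]
# -----------------------------------------------------------------------------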
class DDPG(object): def __init__(self, actor, critic, memory, observation_shape, action_shape, param_noise=None, action_noise=None, gamma=0.99, tau=0.001, normalize_returns=False, enable_popart=False, normalize_observations=True, batch_size=128, observation_range=(-1000., 1000.), action_range=(-360., 360.), return_range=(-np.inf, np.inf), adaptive_param_noise=True, adaptive_param_noise_policy_threshold=.1, critic_l2_reg=0., actor_lr=1e-4, critic_lr=1e-3, clip_norm=None, reward_scale=1.): # Inputs. self.obs0 = tf.placeholder(tf.float32, shape=(None, ) + observation_shape, name='obs0') self.obs1 = tf.placeholder(tf.float32, shape=(None, ) + observation_shape, name='obs1') self.terminals1 = tf.placeholder(tf.float32, shape=(None, 1), name='terminals1') self.rewards = tf.placeholder(tf.float32, shape=(None, 1), name='rewards') self.actions = tf.placeholder(tf.float32, shape=(None, ) + action_shape, name='actions') self.critic_target = tf.placeholder(tf.float32, shape=(None, 1), name='critic_target') self.param_noise_stddev = tf.placeholder(tf.float32, shape=(), name='param_noise_stddev') # Parameters. self.gamma = gamma self.tau = tau self.memory = memory self.normalize_observations = normalize_observations self.normalize_returns = normalize_returns self.action_noise = action_noise self.param_noise = param_noise self.action_range = action_range self.return_range = return_range self.observation_range = observation_range self.critic = critic self.actor = actor self.actor_lr = actor_lr self.critic_lr = critic_lr self.clip_norm = clip_norm self.enable_popart = enable_popart self.reward_scale = reward_scale self.batch_size = batch_size self.stats_sample = None self.critic_l2_reg = critic_l2_reg # Observation normalization. if self.normalize_observations: with tf.variable_scope('obs_rms', reuse=tf.AUTO_REUSE): self.obs_rms = RunningMeanStd(shape=observation_shape) else: self.obs_rms = None normalized_obs0 = tf.clip_by_value(normalize(self.obs0, self.obs_rms), self.observation_range[0], self.observation_range[1]) normalized_obs1 = tf.clip_by_value(normalize(self.obs1, self.obs_rms), self.observation_range[0], self.observation_range[1]) # Return normalization. if self.normalize_returns: with tf.variable_scope('ret_rms', reuse=tf.AUTO_REUSE): self.ret_rms = RunningMeanStd() else: self.ret_rms = None # Create target networks. target_actor = copy(actor) target_actor.name = 'target_actor' self.target_actor = target_actor target_critic = copy(critic) target_critic.name = 'target_critic' self.target_critic = target_critic # Create networks and core TF parts that are shared across setup parts. '''the normalization affect intialized policy to be effective, therefore remove it''' # self.actor_tf = actor(normalized_obs0) self.actor_tf, self.res_actor_tf = actor(self.obs0) self.normalized_critic_tf = critic(normalized_obs0, self.actions) self.critic_tf = denormalize( tf.clip_by_value(self.normalized_critic_tf, self.return_range[0], self.return_range[1]), self.ret_rms) self.normalized_critic_with_actor_tf = critic(normalized_obs0, self.res_actor_tf, reuse=True) self.critic_with_actor_tf = denormalize( tf.clip_by_value(self.normalized_critic_with_actor_tf, self.return_range[0], self.return_range[1]), self.ret_rms) _, res_target_actor_action = target_actor(normalized_obs1) Q_obs1 = denormalize( target_critic(normalized_obs1, res_target_actor_action), self.ret_rms) self.target_Q = self.rewards + (1. - self.terminals1) * gamma * Q_obs1 # Set up parts. 
if self.param_noise is not None: self.setup_param_noise(normalized_obs0) self.setup_actor_optimizer() self.setup_critic_optimizer() if self.normalize_returns and self.enable_popart: self.setup_popart() self.setup_stats() self.setup_target_network_updates() self.initial_state = None # recurrent architectures not supported yet def setup_target_network_updates(self): actor_init_updates, actor_soft_updates = get_target_updates( self.actor.vars, self.target_actor.vars, self.tau) critic_init_updates, critic_soft_updates = get_target_updates( self.critic.vars, self.target_critic.vars, self.tau) self.target_init_updates = [actor_init_updates, critic_init_updates] self.target_soft_updates = [actor_soft_updates, critic_soft_updates] def setup_param_noise(self, normalized_obs0): assert self.param_noise is not None # Configure perturbed actor. param_noise_actor = copy(self.actor) param_noise_actor.name = 'param_noise_actor' self.perturbed_actor_tf, self.perturbed_res_actor_tf = param_noise_actor( normalized_obs0) logger.info('setting up param noise') self.perturb_policy_ops = get_perturbed_actor_updates( self.actor, param_noise_actor, self.param_noise_stddev) # Configure separate copy for stddev adoption. adaptive_param_noise_actor = copy(self.actor) adaptive_param_noise_actor.name = 'adaptive_param_noise_actor' adaptive_actor_tf, adaptive_res_actor_tf = adaptive_param_noise_actor( normalized_obs0) self.perturb_adaptive_policy_ops = get_perturbed_actor_updates( self.actor, adaptive_param_noise_actor, self.param_noise_stddev) self.adaptive_policy_distance = tf.sqrt( tf.reduce_mean(tf.square(self.res_actor_tf - adaptive_res_actor_tf))) def setup_actor_optimizer(self): logger.info('setting up actor optimizer') self.actor_loss = -tf.reduce_mean(self.critic_with_actor_tf) actor_shapes = [ var.get_shape().as_list() for var in self.actor.trainable_vars ] actor_nb_params = sum( [reduce(lambda x, y: x * y, shape) for shape in actor_shapes]) logger.info(' actor shapes: {}'.format(actor_shapes)) logger.info(' actor params: {}'.format(actor_nb_params)) self.actor_grads = U.flatgrad(self.actor_loss, self.actor.trainable_vars, clip_norm=self.clip_norm) # print('var:', self.actor.trainable_vars) self.actor_optimizer = MpiAdam(var_list=self.actor.trainable_vars, beta1=0.9, beta2=0.999, epsilon=1e-08) def setup_critic_optimizer(self): logger.info('setting up critic optimizer') normalized_critic_target_tf = tf.clip_by_value( normalize(self.critic_target, self.ret_rms), self.return_range[0], self.return_range[1]) self.critic_loss = tf.reduce_mean( tf.square(self.normalized_critic_tf - normalized_critic_target_tf)) if self.critic_l2_reg > 0.: critic_reg_vars = [ var for var in self.critic.trainable_vars if 'kernel' in var.name and 'output' not in var.name ] for var in critic_reg_vars: logger.info(' regularizing: {}'.format(var.name)) logger.info(' applying l2 regularization with {}'.format( self.critic_l2_reg)) critic_reg = tc.layers.apply_regularization( tc.layers.l2_regularizer(self.critic_l2_reg), weights_list=critic_reg_vars) self.critic_loss += critic_reg critic_shapes = [ var.get_shape().as_list() for var in self.critic.trainable_vars ] critic_nb_params = sum( [reduce(lambda x, y: x * y, shape) for shape in critic_shapes]) logger.info(' critic shapes: {}'.format(critic_shapes)) logger.info(' critic params: {}'.format(critic_nb_params)) self.critic_grads = U.flatgrad(self.critic_loss, self.critic.trainable_vars, clip_norm=self.clip_norm) self.critic_optimizer = MpiAdam(var_list=self.critic.trainable_vars, beta1=0.9, 
beta2=0.999, epsilon=1e-08) def setup_popart(self): # See https://arxiv.org/pdf/1602.07714.pdf for details. self.old_std = tf.placeholder(tf.float32, shape=[1], name='old_std') new_std = self.ret_rms.std self.old_mean = tf.placeholder(tf.float32, shape=[1], name='old_mean') new_mean = self.ret_rms.mean self.renormalize_Q_outputs_op = [] for vs in [self.critic.output_vars, self.target_critic.output_vars]: assert len(vs) == 2 M, b = vs assert 'kernel' in M.name assert 'bias' in b.name assert M.get_shape()[-1] == 1 assert b.get_shape()[-1] == 1 self.renormalize_Q_outputs_op += [ M.assign(M * self.old_std / new_std) ] self.renormalize_Q_outputs_op += [ b.assign( (b * self.old_std + self.old_mean - new_mean) / new_std) ] def setup_stats(self): ops = [] names = [] if self.normalize_returns: ops += [self.ret_rms.mean, self.ret_rms.std] names += ['ret_rms_mean', 'ret_rms_std'] if self.normalize_observations: ops += [ tf.reduce_mean(self.obs_rms.mean), tf.reduce_mean(self.obs_rms.std) ] names += ['obs_rms_mean', 'obs_rms_std'] ops += [tf.reduce_mean(self.critic_tf)] names += ['reference_Q_mean'] ops += [reduce_std(self.critic_tf)] names += ['reference_Q_std'] ops += [tf.reduce_mean(self.critic_with_actor_tf)] names += ['reference_actor_Q_mean'] ops += [reduce_std(self.critic_with_actor_tf)] names += ['reference_actor_Q_std'] ops += [tf.reduce_mean(self.res_actor_tf)] names += ['reference_action_mean'] ops += [reduce_std(self.res_actor_tf)] names += ['reference_action_std'] if self.param_noise: ops += [tf.reduce_mean(self.perturbed_res_actor_tf)] names += ['reference_perturbed_action_mean'] ops += [reduce_std(self.perturbed_res_actor_tf)] names += ['reference_perturbed_action_std'] self.stats_ops = ops self.stats_names = names def step(self, obs, noise_factor=1., apply_noise=True, compute_Q=True): if self.param_noise is not None and apply_noise: res_actor_tf = self.perturbed_res_actor_tf else: res_actor_tf = self.res_actor_tf feed_dict = {self.obs0: U.adjust_shape(self.obs0, [obs])} if compute_Q: action, action_res, q = self.sess.run( [self.actor_tf, res_actor_tf, self.critic_with_actor_tf], feed_dict=feed_dict) else: action, action_res = self.sess.run([self.actor_tf, res_actor_tf], feed_dict=feed_dict) q = None print('action res: ', action_res) if self.action_noise is not None and apply_noise: noise = self.action_noise() # print('noise: ', noise.shape, action.shape) # assert noise.shape == action.shape #(1,3), (3,) correct addition, no need to assert # print('action, noise: ',action_res, noise) action_res += noise_factor * noise # print(action) # print(action, action_res) action_res = np.clip(action_res, self.action_range[0], self.action_range[1]) action = np.clip(action, self.action_range[0], self.action_range[1]) return action, action_res, q, None, None def store_transition(self, obs0, action, reward, obs1, terminal1): # print('rs: ', self.reward_scale*np.array([-1])) # reward *= self.reward_scale B = obs0.shape[0] for b in range(B): self.memory.append(obs0[b], action[b], reward[b], obs1[b], terminal1[b]) if self.normalize_observations: self.obs_rms.update(np.array([obs0[b]])) def train(self): # Get a batch. 
batch = self.memory.sample(batch_size=self.batch_size) if self.normalize_returns and self.enable_popart: old_mean, old_std, target_Q = self.sess.run( [self.ret_rms.mean, self.ret_rms.std, self.target_Q], feed_dict={ self.obs1: batch['obs1'], self.rewards: batch['rewards'], self.terminals1: batch['terminals1'].astype('float32'), }) self.ret_rms.update(target_Q.flatten()) self.sess.run(self.renormalize_Q_outputs_op, feed_dict={ self.old_std: np.array([old_std]), self.old_mean: np.array([old_mean]), }) # Run sanity check. Disabled by default since it slows down things considerably. # print('running sanity check') # target_Q_new, new_mean, new_std = self.sess.run([self.target_Q, self.ret_rms.mean, self.ret_rms.std], feed_dict={ # self.obs1: batch['obs1'], # self.rewards: batch['rewards'], # self.terminals1: batch['terminals1'].astype('float32'), # }) # print(target_Q_new, target_Q, new_mean, new_std) # assert (np.abs(target_Q - target_Q_new) < 1e-3).all() else: target_Q = self.sess.run(self.target_Q, feed_dict={ self.obs1: batch['obs1'], self.rewards: batch['rewards'], self.terminals1: batch['terminals1'].astype('float32'), }) # print('batch actions: ', batch['actions']) # Get all gradients and perform a synced update. ops = [ self.actor_grads, self.actor_loss, self.critic_grads, self.critic_loss ] actor_grads, actor_loss, critic_grads, critic_loss = self.sess.run( ops, feed_dict={ self.obs0: batch['obs0'], self.actions: batch['actions'], self.critic_target: target_Q, }) print('grads:', actor_grads[0:3]) self.actor_optimizer.update(actor_grads, stepsize=self.actor_lr) self.critic_optimizer.update(critic_grads, stepsize=self.critic_lr) ac = tf.get_default_graph().get_tensor_by_name('actor/dense/kernel:0') ini_ac = tf.get_default_graph().get_tensor_by_name( 'ini_actor/dense/kernel:0') print('weights: ', self.sess.run(ac)[0][0:3], self.sess.run(ini_ac)[0][0:3]) print('loss: ', actor_loss, critic_loss) return critic_loss, actor_loss def update_critic(self): # Get a batch. batch = self.memory.sample(batch_size=self.batch_size) if self.normalize_returns and self.enable_popart: old_mean, old_std, target_Q = self.sess.run( [self.ret_rms.mean, self.ret_rms.std, self.target_Q], feed_dict={ self.obs1: batch['obs1'], self.rewards: batch['rewards'], self.terminals1: batch['terminals1'].astype('float32'), }) self.ret_rms.update(target_Q.flatten()) self.sess.run(self.renormalize_Q_outputs_op, feed_dict={ self.old_std: np.array([old_std]), self.old_mean: np.array([old_mean]), }) # Run sanity check. Disabled by default since it slows down things considerably. # print('running sanity check') # target_Q_new, new_mean, new_std = self.sess.run([self.target_Q, self.ret_rms.mean, self.ret_rms.std], feed_dict={ # self.obs1: batch['obs1'], # self.rewards: batch['rewards'], # self.terminals1: batch['terminals1'].astype('float32'), # }) # print(target_Q_new, target_Q, new_mean, new_std) # assert (np.abs(target_Q - target_Q_new) < 1e-3).all() else: target_Q = self.sess.run(self.target_Q, feed_dict={ self.obs1: batch['obs1'], self.rewards: batch['rewards'], self.terminals1: batch['terminals1'].astype('float32'), }) # Get all gradients and perform a synced update. ops = [self.critic_grads, self.critic_loss] critic_grads, critic_loss = self.sess.run(ops, feed_dict={ self.obs0: batch['obs0'], self.actions: batch['actions'], self.critic_target: target_Q, }) # 1. update the eval critic self.critic_optimizer.update(critic_grads, stepsize=self.critic_lr) # 2. 
update the target critic critic_init_updates, critic_soft_updates = get_target_updates( self.critic.vars, self.target_critic.vars, self.tau) self.sess.run(critic_soft_updates) return critic_loss def initialize(self, sess): self.sess = sess self.sess.run(tf.global_variables_initializer()) self.actor_optimizer.sync() self.critic_optimizer.sync() self.sess.run(self.target_init_updates) def update_target_net(self): self.sess.run(self.target_soft_updates) def get_stats(self): if self.stats_sample is None: # Get a sample and keep that fixed for all further computations. # This allows us to estimate the change in value for the same set of inputs. self.stats_sample = self.memory.sample(batch_size=self.batch_size) values = self.sess.run(self.stats_ops, feed_dict={ self.obs0: self.stats_sample['obs0'], self.actions: self.stats_sample['actions'], }) names = self.stats_names[:] assert len(names) == len(values) stats = dict(zip(names, values)) if self.param_noise is not None: stats = {**stats, **self.param_noise.get_stats()} return stats def adapt_param_noise(self): if self.param_noise is None: return 0. # Perturb a separate copy of the policy to adjust the scale for the next "real" perturbation. batch = self.memory.sample(batch_size=self.batch_size) self.sess.run(self.perturb_adaptive_policy_ops, feed_dict={ self.param_noise_stddev: self.param_noise.current_stddev, }) distance = self.sess.run(self.adaptive_policy_distance, feed_dict={ self.obs0: batch['obs0'], self.param_noise_stddev: self.param_noise.current_stddev, }) mean_distance = MPI.COMM_WORLD.allreduce( distance, op=MPI.SUM) / MPI.COMM_WORLD.Get_size() self.param_noise.adapt(mean_distance) return mean_distance def reset(self): # Reset internal state after an episode is complete. if self.action_noise is not None: self.action_noise.reset() if self.param_noise is not None: self.sess.run(self.perturb_policy_ops, feed_dict={ self.param_noise_stddev: self.param_noise.current_stddev, }) #added def save(self, save_path): """ Save the model """ saver = tf.train.Saver() saver.save(self.sess, save_path) def load(self, sess, load_path): self.sess = sess saver = tf.train.Saver() saver.restore(self.sess, load_path) # self.sess=sess def load_ini(self, sess, load_path): """ Load the model """ # variables = tf.contrib.framework.get_variables_to_restore() # non_actor = [v for v in variables if v.name.split('/')[0]!='actor'] # saver = tf.train.Saver(non_actor) # print('Loading ' + load_path) # saver.restore(sess, load_path) self.sess = sess # for v in tf.get_default_graph().as_graph_def().node: # print(v.name) '''Initialize actor policy with supervised policy!''' try: # from the ddpg tensor graph: actor, critic, target_actor, target_critic actor_var_list = tf.contrib.framework.get_variables('ini_actor') except: print('Cannot get variables list!') # print('actor_var:',actor_var_list) try: actor_saver = tf.train.Saver(actor_var_list) actor_saver.restore(self.sess, './model/small/ini') print('Actor Load Succeed!') except: print('Actor Load Failed!') #check if the actor initialization policy has been loaded correctly, i.e. equal to \ # directly ouput values in checkpoint files # loaded_weights=tf.get_default_graph().get_tensor_by_name('actor/mlp_fc0/w:0') # print('loaded_weights:', self.sess.run(loaded_weights)) #init-update once the target_actor network(init_update is fully copy, soft-update accords to tau) self.sess.run(self.target_init_updates)
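# --- Illustrative sketch -----------------------------------------------------
# setup_target_network_updates (via get_target_updates) and update_target_net
# implement the usual Polyak averaging of target parameters:
# theta_target <- (1 - tau) * theta_target + tau * theta. A minimal NumPy sketch
# of that rule, operating on plain arrays rather than TF variables.
import numpy as np

def soft_update(target_params, source_params, tau=0.001):
    """Polyak averaging: move each target parameter a small step toward the online one."""
    return [(1.0 - tau) * t + tau * s for t, s in zip(target_params, source_params)]

target = [np.zeros(3), np.zeros((2, 2))]
source = [np.ones(3), np.full((2, 2), 4.0)]
target = soft_update(target, source, tau=0.1)
print(target[0])  # [0.1 0.1 0.1]
# -----------------------------------------------------------------------------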
def __init__(self, actor, critic, memory, observation_shape, action_shape, param_noise=None, action_noise=None, gamma=0.99, tau=0.001, normalize_returns=False, enable_popart=False, normalize_observations=True, batch_size=128, observation_range=(-5., 5.), action_range=(-1., 1.), return_range=(-np.inf, np.inf), critic_l2_reg=0., actor_lr=1e-4, critic_lr=1e-3, clip_norm=None, reward_scale=1.): # Inputs. self.obs0 = tf.placeholder(tf.float32, shape=(None,) + observation_shape, name='obs0') self.obs1 = tf.placeholder(tf.float32, shape=(None,) + observation_shape, name='obs1') self.terminals1 = tf.placeholder(tf.float32, shape=(None, 1), name='terminals1') self.rewards = tf.placeholder(tf.float32, shape=(None, 1), name='rewards') self.actions = tf.placeholder(tf.float32, shape=(None,) + action_shape, name='actions') self.critic_target = tf.placeholder(tf.float32, shape=(None, 1), name='critic_target') self.param_noise_stddev = tf.placeholder(tf.float32, shape=(), name='param_noise_stddev') # Parameters. self.gamma = gamma self.tau = tau self.memory = memory self.normalize_observations = normalize_observations self.normalize_returns = normalize_returns self.action_noise = action_noise self.param_noise = param_noise self.action_range = action_range self.return_range = return_range self.observation_range = observation_range self.critic = critic self.actor = actor self.actor_lr = actor_lr self.critic_lr = critic_lr self.clip_norm = clip_norm self.enable_popart = enable_popart self.reward_scale = reward_scale self.batch_size = batch_size self.stats_sample = None self.critic_l2_reg = critic_l2_reg # Observation normalization. if self.normalize_observations: with tf.variable_scope('obs_rms'): self.obs_rms = RunningMeanStd(shape=observation_shape) else: self.obs_rms = None normalized_obs0 = tf.clip_by_value(normalize(self.obs0, self.obs_rms), self.observation_range[0], self.observation_range[1]) normalized_obs1 = tf.clip_by_value(normalize(self.obs1, self.obs_rms), self.observation_range[0], self.observation_range[1]) # Return normalization. if self.normalize_returns: with tf.variable_scope('ret_rms'): self.ret_rms = RunningMeanStd() else: self.ret_rms = None # Create target networks. target_actor = copy(actor) target_actor.name = 'target_actor' self.target_actor = target_actor target_critic = copy(critic) target_critic.name = 'target_critic' self.target_critic = target_critic # Create networks and core TF parts that are shared across setup parts. self.actor_tf = actor(normalized_obs0) self.normalized_critic_tf = critic(normalized_obs0, self.actions) self.critic_tf = denormalize( tf.clip_by_value(self.normalized_critic_tf, self.return_range[0], self.return_range[1]), self.ret_rms) self.normalized_critic_with_actor_tf = critic(normalized_obs0, self.actor_tf, reuse=True) self.critic_with_actor_tf = denormalize( tf.clip_by_value(self.normalized_critic_with_actor_tf, self.return_range[0], self.return_range[1]), self.ret_rms) Q_obs1 = denormalize(target_critic(normalized_obs1, target_actor(normalized_obs1)), self.ret_rms) self.target_Q = self.rewards + (1. - self.terminals1) * gamma * Q_obs1 # Set up parts. if self.param_noise is not None: self.setup_param_noise(normalized_obs0) self.setup_actor_optimizer() self.setup_critic_optimizer() if self.normalize_returns and self.enable_popart: self.setup_popart() self.setup_stats() self.setup_target_network_updates()
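# --- Sketch (not part of the original file) ----------------------------------
# The graph above clips normalize(self.obs0, self.obs_rms) into observation_range
# and denormalizes critic outputs with ret_rms. normalize()/denormalize() are
# helpers imported elsewhere; a minimal sketch of what they are assumed to
# compute (a pass-through when no RunningMeanStd is attached):

def normalize_sketch(x, stats):
    # Standardise x with the running mean/std; no-op if stats is None.
    if stats is None:
        return x
    return (x - stats.mean) / stats.std

def denormalize_sketch(x, stats):
    # Inverse of normalize_sketch: map standardised values back to the raw scale.
    if stats is None:
        return x
    return x * stats.std + stats.mean
# ------------------------------------------------------------------------------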
class DDPG(object): def __init__(self, actor, critic, memory, observation_shape, action_shape, param_noise=None, action_noise=None, gamma=0.99, tau=0.001, normalize_returns=False, enable_popart=False, normalize_observations=True, batch_size=128, observation_range=(-5., 5.), action_range=(-1., 1.), return_range=(-np.inf, np.inf), critic_l2_reg=0., actor_lr=1e-4, critic_lr=1e-3, clip_norm=None, reward_scale=1.): # Inputs. self.obs0 = tf.placeholder(tf.float32, shape=(None,) + observation_shape, name='obs0') self.obs1 = tf.placeholder(tf.float32, shape=(None,) + observation_shape, name='obs1') self.terminals1 = tf.placeholder(tf.float32, shape=(None, 1), name='terminals1') self.rewards = tf.placeholder(tf.float32, shape=(None, 1), name='rewards') self.actions = tf.placeholder(tf.float32, shape=(None,) + action_shape, name='actions') self.critic_target = tf.placeholder(tf.float32, shape=(None, 1), name='critic_target') self.param_noise_stddev = tf.placeholder(tf.float32, shape=(), name='param_noise_stddev') # Parameters. self.gamma = gamma self.tau = tau self.memory = memory self.normalize_observations = normalize_observations self.normalize_returns = normalize_returns self.action_noise = action_noise self.param_noise = param_noise self.action_range = action_range self.return_range = return_range self.observation_range = observation_range self.critic = critic self.actor = actor self.actor_lr = actor_lr self.critic_lr = critic_lr self.clip_norm = clip_norm self.enable_popart = enable_popart self.reward_scale = reward_scale self.batch_size = batch_size self.stats_sample = None self.critic_l2_reg = critic_l2_reg # Observation normalization. if self.normalize_observations: with tf.variable_scope('obs_rms'): self.obs_rms = RunningMeanStd(shape=observation_shape) else: self.obs_rms = None normalized_obs0 = tf.clip_by_value(normalize(self.obs0, self.obs_rms), self.observation_range[0], self.observation_range[1]) normalized_obs1 = tf.clip_by_value(normalize(self.obs1, self.obs_rms), self.observation_range[0], self.observation_range[1]) # Return normalization. if self.normalize_returns: with tf.variable_scope('ret_rms'): self.ret_rms = RunningMeanStd() else: self.ret_rms = None # Create target networks. target_actor = copy(actor) target_actor.name = 'target_actor' self.target_actor = target_actor target_critic = copy(critic) target_critic.name = 'target_critic' self.target_critic = target_critic # Create networks and core TF parts that are shared across setup parts. self.actor_tf = actor(normalized_obs0) self.normalized_critic_tf = critic(normalized_obs0, self.actions) self.critic_tf = denormalize( tf.clip_by_value(self.normalized_critic_tf, self.return_range[0], self.return_range[1]), self.ret_rms) self.normalized_critic_with_actor_tf = critic(normalized_obs0, self.actor_tf, reuse=True) self.critic_with_actor_tf = denormalize( tf.clip_by_value(self.normalized_critic_with_actor_tf, self.return_range[0], self.return_range[1]), self.ret_rms) Q_obs1 = denormalize(target_critic(normalized_obs1, target_actor(normalized_obs1)), self.ret_rms) self.target_Q = self.rewards + (1. - self.terminals1) * gamma * Q_obs1 # Set up parts. 
if self.param_noise is not None: self.setup_param_noise(normalized_obs0) self.setup_actor_optimizer() self.setup_critic_optimizer() if self.normalize_returns and self.enable_popart: self.setup_popart() self.setup_stats() self.setup_target_network_updates() def setup_target_network_updates(self): actor_init_updates, actor_soft_updates = get_target_updates(self.actor.vars, self.target_actor.vars, self.tau) critic_init_updates, critic_soft_updates = get_target_updates(self.critic.vars, self.target_critic.vars, self.tau) self.target_init_updates = [actor_init_updates, critic_init_updates] self.target_soft_updates = [actor_soft_updates, critic_soft_updates] def setup_param_noise(self, normalized_obs0): assert self.param_noise is not None # Configure perturbed actor. param_noise_actor = copy(self.actor) param_noise_actor.name = 'param_noise_actor' self.perturbed_actor_tf = param_noise_actor(normalized_obs0) logger.info('setting up param noise') self.perturb_policy_ops = get_perturbed_actor_updates(self.actor, param_noise_actor, self.param_noise_stddev) # Configure separate copy for stddev adoption. adaptive_param_noise_actor = copy(self.actor) adaptive_param_noise_actor.name = 'adaptive_param_noise_actor' adaptive_actor_tf = adaptive_param_noise_actor(normalized_obs0) self.perturb_adaptive_policy_ops = get_perturbed_actor_updates(self.actor, adaptive_param_noise_actor, self.param_noise_stddev) self.adaptive_policy_distance = tf.sqrt(tf.reduce_mean(tf.square(self.actor_tf - adaptive_actor_tf))) def setup_actor_optimizer(self): logger.info('setting up actor optimizer') self.actor_loss = -tf.reduce_mean(self.critic_with_actor_tf) actor_shapes = [var.get_shape().as_list() for var in self.actor.trainable_vars] actor_nb_params = sum([reduce(lambda x, y: x * y, shape) for shape in actor_shapes]) logger.info(' actor shapes: {}'.format(actor_shapes)) logger.info(' actor params: {}'.format(actor_nb_params)) self.actor_grads = U.flatgrad(self.actor_loss, self.actor.trainable_vars, clip_norm=self.clip_norm) self.actor_optimizer = MpiAdam(var_list=self.actor.trainable_vars, beta1=0.9, beta2=0.999, epsilon=1e-08) def setup_critic_optimizer(self): logger.info('setting up critic optimizer') normalized_critic_target_tf = tf.clip_by_value(normalize(self.critic_target, self.ret_rms), self.return_range[0], self.return_range[1]) self.critic_loss = tf.reduce_mean(tf.square(self.normalized_critic_tf - normalized_critic_target_tf)) if self.critic_l2_reg > 0.: critic_reg_vars = [var for var in self.critic.trainable_vars if 'kernel' in var.name and 'output' not in var.name] for var in critic_reg_vars: logger.info(' regularizing: {}'.format(var.name)) logger.info(' applying l2 regularization with {}'.format(self.critic_l2_reg)) critic_reg = tc.layers.apply_regularization( tc.layers.l2_regularizer(self.critic_l2_reg), weights_list=critic_reg_vars ) self.critic_loss += critic_reg critic_shapes = [var.get_shape().as_list() for var in self.critic.trainable_vars] critic_nb_params = sum([reduce(lambda x, y: x * y, shape) for shape in critic_shapes]) logger.info(' critic shapes: {}'.format(critic_shapes)) logger.info(' critic params: {}'.format(critic_nb_params)) self.critic_grads = U.flatgrad(self.critic_loss, self.critic.trainable_vars, clip_norm=self.clip_norm) self.critic_optimizer = MpiAdam(var_list=self.critic.trainable_vars, beta1=0.9, beta2=0.999, epsilon=1e-08) def setup_popart(self): # See https://arxiv.org/pdf/1602.07714.pdf for details. 
self.old_std = tf.placeholder(tf.float32, shape=[1], name='old_std') new_std = self.ret_rms.std self.old_mean = tf.placeholder(tf.float32, shape=[1], name='old_mean') new_mean = self.ret_rms.mean self.renormalize_Q_outputs_op = [] for vs in [self.critic.output_vars, self.target_critic.output_vars]: assert len(vs) == 2 M, b = vs assert 'kernel' in M.name assert 'bias' in b.name assert M.get_shape()[-1] == 1 assert b.get_shape()[-1] == 1 self.renormalize_Q_outputs_op += [M.assign(M * self.old_std / new_std)] self.renormalize_Q_outputs_op += [b.assign((b * self.old_std + self.old_mean - new_mean) / new_std)] def setup_stats(self): ops = [] names = [] if self.normalize_returns: ops += [self.ret_rms.mean, self.ret_rms.std] names += ['ret_rms_mean', 'ret_rms_std'] if self.normalize_observations: ops += [tf.reduce_mean(self.obs_rms.mean), tf.reduce_mean(self.obs_rms.std)] names += ['obs_rms_mean', 'obs_rms_std'] ops += [tf.reduce_mean(self.critic_tf)] names += ['reference_Q_mean'] ops += [reduce_std(self.critic_tf)] names += ['reference_Q_std'] ops += [tf.reduce_mean(self.critic_with_actor_tf)] names += ['reference_actor_Q_mean'] ops += [reduce_std(self.critic_with_actor_tf)] names += ['reference_actor_Q_std'] ops += [tf.reduce_mean(self.actor_tf)] names += ['reference_action_mean'] ops += [reduce_std(self.actor_tf)] names += ['reference_action_std'] if self.param_noise: ops += [tf.reduce_mean(self.perturbed_actor_tf)] names += ['reference_perturbed_action_mean'] ops += [reduce_std(self.perturbed_actor_tf)] names += ['reference_perturbed_action_std'] self.stats_ops = ops self.stats_names = names def pi(self, obs, apply_noise=True, compute_Q=True): if self.param_noise is not None and apply_noise: actor_tf = self.perturbed_actor_tf else: actor_tf = self.actor_tf feed_dict = {self.obs0: [obs]} if compute_Q: action, q = self.sess.run([actor_tf, self.critic_with_actor_tf], feed_dict=feed_dict) else: action = self.sess.run(actor_tf, feed_dict=feed_dict) q = None action = action.flatten() if self.action_noise is not None and apply_noise: noise = self.action_noise() assert noise.shape == action.shape action += noise action = np.clip(action, self.action_range[0], self.action_range[1]) return action, q def store_transition(self, obs0, action, reward, obs1, terminal1): reward *= self.reward_scale self.memory.append(obs0, action, reward, obs1, terminal1) if self.normalize_observations: self.obs_rms.update(np.array([obs0])) def train(self): # Get a batch. batch = self.memory.sample(batch_size=self.batch_size) if self.normalize_returns and self.enable_popart: old_mean, old_std, target_Q = self.sess.run([self.ret_rms.mean, self.ret_rms.std, self.target_Q], feed_dict={ self.obs1: batch['obs1'], self.rewards: batch['rewards'], self.terminals1: batch['terminals1'].astype('float32'), }) self.ret_rms.update(target_Q.flatten()) self.sess.run(self.renormalize_Q_outputs_op, feed_dict={ self.old_std: np.array([old_std]), self.old_mean: np.array([old_mean]), }) else: target_Q = self.sess.run(self.target_Q, feed_dict={ self.obs1: batch['obs1'], self.rewards: batch['rewards'], self.terminals1: batch['terminals1'].astype('float32'), }) # Get all gradients and perform a synced update. 
ops = [self.actor_grads, self.actor_loss, self.critic_grads, self.critic_loss] actor_grads, actor_loss, critic_grads, critic_loss = self.sess.run(ops, feed_dict={ self.obs0: batch['obs0'], self.actions: batch['actions'], self.critic_target: target_Q, }) self.actor_optimizer.update(actor_grads, stepsize=self.actor_lr) self.critic_optimizer.update(critic_grads, stepsize=self.critic_lr) return critic_loss, actor_loss def initialize(self, sess): self.sess = sess self.sess.run(tf.global_variables_initializer()) self.actor_optimizer.sync() self.critic_optimizer.sync() self.sess.run(self.target_init_updates) def update_target_net(self): self.sess.run(self.target_soft_updates) def get_stats(self): if self.stats_sample is None: # Get a sample and keep that fixed for all further computations. # This allows us to estimate the change in value for the same set of inputs. self.stats_sample = self.memory.sample(batch_size=self.batch_size) values = self.sess.run(self.stats_ops, feed_dict={ self.obs0: self.stats_sample['obs0'], self.actions: self.stats_sample['actions'], }) names = self.stats_names[:] assert len(names) == len(values) stats = dict(zip(names, values)) if self.param_noise is not None: stats = {**stats, **self.param_noise.get_stats()} return stats def adapt_param_noise(self): if self.param_noise is None: return 0. # Perturb a separate copy of the policy to adjust the scale for the next "real" perturbation. batch = self.memory.sample(batch_size=self.batch_size) self.sess.run(self.perturb_adaptive_policy_ops, feed_dict={ self.param_noise_stddev: self.param_noise.current_stddev, }) distance = self.sess.run(self.adaptive_policy_distance, feed_dict={ self.obs0: batch['obs0'], self.param_noise_stddev: self.param_noise.current_stddev, }) mean_distance = MPI.COMM_WORLD.allreduce(distance, op=MPI.SUM) / MPI.COMM_WORLD.Get_size() self.param_noise.adapt(mean_distance) return mean_distance def reset(self): # Reset internal state after an episode is complete. if self.action_noise is not None: self.action_noise.reset() if self.param_noise is not None: self.sess.run(self.perturb_policy_ops, feed_dict={ self.param_noise_stddev: self.param_noise.current_stddev, })
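# --- Usage sketch (not part of the original file) -----------------------------
# A hypothetical driver loop showing how the DDPG class above is typically used.
# `env`, `agent`, and the step/warm-up counts are assumptions, not defined in
# this file; U and the session helper are assumed to come from the tf_util
# module already imported as U above.
def run_ddpg_sketch(env, agent, total_steps=100000, warmup_steps=1000):
    with U.single_threaded_session() as sess:
        agent.initialize(sess)            # init vars, sync MpiAdam, hard-copy target nets
        obs = env.reset()
        for t in range(total_steps):
            action, q = agent.pi(obs, apply_noise=True, compute_Q=True)
            new_obs, reward, done, _ = env.step(action)
            agent.store_transition(obs, action, reward, new_obs, done)
            obs = env.reset() if done else new_obs
            if done:
                agent.reset()             # re-draw action/parameter noise for the next episode
            if t >= warmup_steps:
                critic_loss, actor_loss = agent.train()   # one synced gradient step
                agent.update_target_net()                 # Polyak update of the target nets
                agent.adapt_param_noise()                 # no-op when param_noise is None
# ------------------------------------------------------------------------------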
class DDPG(object): def __init__(self, actor, critic, memory, observation_shape, action_shape, param_noise=None, action_noise=None, gamma=0.99, tau=0.001, normalize_returns=False, enable_popart=False, normalize_observations=True, batch_size=128, observation_range=(-500., 500.), action_range=(-1., 1.), return_range=(-np.inf, np.inf), adaptive_param_noise=True, adaptive_param_noise_policy_threshold=.1, critic_l2_reg=0., actor_lr=1e-4, critic_lr=1e-3, clip_norm=None, reward_scale=1.): # Inputs. self.obs0 = tf.placeholder(tf.float32, shape=(None,) + observation_shape, name='obs0') self.obs1 = tf.placeholder(tf.float32, shape=(None,) + observation_shape, name='obs1') self.terminals1 = tf.placeholder(tf.float32, shape=(None, 1), name='terminals1') self.rewards = tf.placeholder(tf.float32, shape=(None, 1), name='rewards') self.actions = tf.placeholder(tf.float32, shape=(None,) + action_shape, name='actions') self.critic_target = tf.placeholder(tf.float32, shape=(None, 1), name='critic_target') self.param_noise_stddev = tf.placeholder(tf.float32, shape=(), name='param_noise_stddev') # Parameters. self.gamma = gamma self.tau = tau self.memory = memory self.normalize_observations = normalize_observations self.normalize_returns = normalize_returns self.action_noise = action_noise self.param_noise = param_noise self.action_range = action_range self.return_range = return_range self.observation_range = observation_range self.critic = critic self.actor = actor self.actor_lr = actor_lr self.critic_lr = critic_lr self.clip_norm = clip_norm self.enable_popart = enable_popart self.reward_scale = reward_scale self.batch_size = batch_size self.stats_sample = None self.critic_l2_reg = critic_l2_reg # Observation normalization. if self.normalize_observations: with tf.variable_scope('obs_rms'): self.obs_rms = RunningMeanStd(shape=observation_shape) else: self.obs_rms = None normalized_obs0 = tf.clip_by_value(normalize(self.obs0, self.obs_rms), self.observation_range[0], self.observation_range[1]) normalized_obs1 = tf.clip_by_value(normalize(self.obs1, self.obs_rms), self.observation_range[0], self.observation_range[1]) # Return normalization. if self.normalize_returns: with tf.variable_scope('ret_rms'): self.ret_rms = RunningMeanStd() else: self.ret_rms = None # Create target networks. target_actor = copy(actor) target_actor.name = 'target_actor' self.target_actor = target_actor target_critic = copy(critic) target_critic.name = 'target_critic' self.target_critic = target_critic # Create networks and core TF parts that are shared across setup parts. self.actor_tf = actor(normalized_obs0) self.normalized_critic_tf = critic(normalized_obs0, self.actions) self.critic_tf = denormalize(tf.clip_by_value(self.normalized_critic_tf, self.return_range[0], self.return_range[1]), self.ret_rms) self.normalized_critic_with_actor_tf = critic(normalized_obs0, self.actor_tf, reuse=True) self.critic_with_actor_tf = denormalize(tf.clip_by_value(self.normalized_critic_with_actor_tf, self.return_range[0], self.return_range[1]), self.ret_rms) Q_obs1 = denormalize(target_critic(normalized_obs1, target_actor(normalized_obs1)), self.ret_rms) self.target_Q = self.rewards + (1. - self.terminals1) * gamma * Q_obs1 # Set up parts. 
if self.param_noise is not None: self.setup_param_noise(normalized_obs0) self.setup_actor_optimizer() self.setup_critic_optimizer() if self.normalize_returns and self.enable_popart: self.setup_popart() self.setup_stats() self.setup_target_network_updates() self.initial_state = None # recurrent architectures not supported yet def setup_target_network_updates(self): actor_init_updates, actor_soft_updates = get_target_updates(self.actor.vars, self.target_actor.vars, self.tau) critic_init_updates, critic_soft_updates = get_target_updates(self.critic.vars, self.target_critic.vars, self.tau) self.target_init_updates = [actor_init_updates, critic_init_updates] self.target_soft_updates = [actor_soft_updates, critic_soft_updates] def setup_param_noise(self, normalized_obs0): assert self.param_noise is not None # Configure perturbed actor. param_noise_actor = copy(self.actor) param_noise_actor.name = 'param_noise_actor' self.perturbed_actor_tf = param_noise_actor(normalized_obs0) logger.info('setting up param noise') self.perturb_policy_ops = get_perturbed_actor_updates(self.actor, param_noise_actor, self.param_noise_stddev) # Configure separate copy for stddev adoption. adaptive_param_noise_actor = copy(self.actor) adaptive_param_noise_actor.name = 'adaptive_param_noise_actor' adaptive_actor_tf = adaptive_param_noise_actor(normalized_obs0) self.perturb_adaptive_policy_ops = get_perturbed_actor_updates(self.actor, adaptive_param_noise_actor, self.param_noise_stddev) self.adaptive_policy_distance = tf.sqrt(tf.reduce_mean(tf.square(self.actor_tf - adaptive_actor_tf))) def setup_actor_optimizer(self): logger.info('setting up actor optimizer') self.actor_loss = -tf.reduce_mean(self.critic_with_actor_tf) actor_shapes = [var.get_shape().as_list() for var in self.actor.trainable_vars] actor_nb_params = sum([reduce(lambda x, y: x * y, shape) for shape in actor_shapes]) logger.info(' actor shapes: {}'.format(actor_shapes)) logger.info(' actor params: {}'.format(actor_nb_params)) self.actor_grads = U.flatgrad(self.actor_loss, self.actor.trainable_vars, clip_norm=self.clip_norm) self.actor_optimizer = MpiAdam(var_list=self.actor.trainable_vars, beta1=0.9, beta2=0.999, epsilon=1e-08) def setup_critic_optimizer(self): logger.info('setting up critic optimizer') normalized_critic_target_tf = tf.clip_by_value(normalize(self.critic_target, self.ret_rms), self.return_range[0], self.return_range[1]) self.critic_loss = tf.reduce_mean(tf.square(self.normalized_critic_tf - normalized_critic_target_tf)) if self.critic_l2_reg > 0.: critic_reg_vars = [var for var in self.critic.trainable_vars if 'kernel' in var.name and 'output' not in var.name] for var in critic_reg_vars: logger.info(' regularizing: {}'.format(var.name)) logger.info(' applying l2 regularization with {}'.format(self.critic_l2_reg)) critic_reg = tc.layers.apply_regularization( tc.layers.l2_regularizer(self.critic_l2_reg), weights_list=critic_reg_vars ) self.critic_loss += critic_reg critic_shapes = [var.get_shape().as_list() for var in self.critic.trainable_vars] critic_nb_params = sum([reduce(lambda x, y: x * y, shape) for shape in critic_shapes]) logger.info(' critic shapes: {}'.format(critic_shapes)) logger.info(' critic params: {}'.format(critic_nb_params)) self.critic_grads = U.flatgrad(self.critic_loss, self.critic.trainable_vars, clip_norm=self.clip_norm) self.critic_optimizer = MpiAdam(var_list=self.critic.trainable_vars, beta1=0.9, beta2=0.999, epsilon=1e-08) def setup_popart(self): # See https://arxiv.org/pdf/1602.07714.pdf for details. 
self.old_std = tf.placeholder(tf.float32, shape=[1], name='old_std') new_std = self.ret_rms.std self.old_mean = tf.placeholder(tf.float32, shape=[1], name='old_mean') new_mean = self.ret_rms.mean self.renormalize_Q_outputs_op = [] for vs in [self.critic.output_vars, self.target_critic.output_vars]: assert len(vs) == 2 M, b = vs assert 'kernel' in M.name assert 'bias' in b.name assert M.get_shape()[-1] == 1 assert b.get_shape()[-1] == 1 self.renormalize_Q_outputs_op += [M.assign(M * self.old_std / new_std)] self.renormalize_Q_outputs_op += [b.assign((b * self.old_std + self.old_mean - new_mean) / new_std)] def setup_stats(self): ops = [] names = [] if self.normalize_returns: ops += [self.ret_rms.mean, self.ret_rms.std] names += ['ret_rms_mean', 'ret_rms_std'] if self.normalize_observations: ops += [tf.reduce_mean(self.obs_rms.mean), tf.reduce_mean(self.obs_rms.std)] names += ['obs_rms_mean', 'obs_rms_std'] ops += [tf.reduce_mean(self.critic_tf)] names += ['reference_Q_mean'] ops += [reduce_std(self.critic_tf)] names += ['reference_Q_std'] ops += [tf.reduce_mean(self.critic_with_actor_tf)] names += ['reference_actor_Q_mean'] ops += [reduce_std(self.critic_with_actor_tf)] names += ['reference_actor_Q_std'] ops += [tf.reduce_mean(self.actor_tf)] names += ['reference_action_mean'] ops += [reduce_std(self.actor_tf)] names += ['reference_action_std'] if self.param_noise: ops += [tf.reduce_mean(self.perturbed_actor_tf)] names += ['reference_perturbed_action_mean'] ops += [reduce_std(self.perturbed_actor_tf)] names += ['reference_perturbed_action_std'] self.stats_ops = ops self.stats_names = names def step(self, obs, apply_noise=True, compute_Q=True): if self.param_noise is not None and apply_noise: actor_tf = self.perturbed_actor_tf else: actor_tf = self.actor_tf feed_dict = {self.obs0: U.adjust_shape(self.obs0, [obs])} if compute_Q: action, q = self.sess.run([actor_tf, self.critic_with_actor_tf], feed_dict=feed_dict) else: action = self.sess.run(actor_tf, feed_dict=feed_dict) q = None # print(action) if self.action_noise is not None and apply_noise: noise = self.action_noise() # assert noise.shape == action.shape # print('ac: ', action, noise) action += noise #no need for clip here # action = np.clip(action, self.action_range[0], self.action_range[1]) # print(action) '''added''' action_set=[] print('action_before_binarization: ', action[0]) #discrete the action to be 0, 1 (binarization) for i in range (int(len(action[0]))): # '''tanh as output''' # # if action[0][i]>0: # # action_set.append(1) # # else: # # action_set.append(0) # '''sigmoid as output''' if action[0][i]>0.5: action_set.append(1) else: action_set.append(0) # print('action: ', action) ''' #DDPG doesnt use argmax to determine action like DQN!!! for i in range (int(len(action[0])/2)): # print(action[0][2*i:2*i+2]) action_set.append(np.argmax(action[0][2*i:2*i+2])) ''' # print('action_set: ', action_set) # action = np.argmax(action[0]) return action_set, q, None, None def store_transition(self, obs0, action, reward, obs1, terminal1): # print('rs: ', self.reward_scale*np.array([-1])) # reward *= self.reward_scale B = obs0.shape[0] for b in range(B): self.memory.append(obs0[b], action[b], reward[b], obs1[b], terminal1[b]) if self.normalize_observations: self.obs_rms.update(np.array([obs0[b]])) def train(self): # Get a batch. 
batch = self.memory.sample(batch_size=self.batch_size) if self.normalize_returns and self.enable_popart: old_mean, old_std, target_Q = self.sess.run([self.ret_rms.mean, self.ret_rms.std, self.target_Q], feed_dict={ self.obs1: batch['obs1'], self.rewards: batch['rewards'], self.terminals1: batch['terminals1'].astype('float32'), }) self.ret_rms.update(target_Q.flatten()) self.sess.run(self.renormalize_Q_outputs_op, feed_dict={ self.old_std : np.array([old_std]), self.old_mean : np.array([old_mean]), }) # Run sanity check. Disabled by default since it slows down things considerably. # print('running sanity check') # target_Q_new, new_mean, new_std = self.sess.run([self.target_Q, self.ret_rms.mean, self.ret_rms.std], feed_dict={ # self.obs1: batch['obs1'], # self.rewards: batch['rewards'], # self.terminals1: batch['terminals1'].astype('float32'), # }) # print(target_Q_new, target_Q, new_mean, new_std) # assert (np.abs(target_Q - target_Q_new) < 1e-3).all() else: target_Q = self.sess.run(self.target_Q, feed_dict={ self.obs1: batch['obs1'], self.rewards: batch['rewards'], self.terminals1: batch['terminals1'].astype('float32'), }) # Get all gradients and perform a synced update. ops = [self.actor_grads, self.actor_loss, self.critic_grads, self.critic_loss] actor_grads, actor_loss, critic_grads, critic_loss = self.sess.run(ops, feed_dict={ self.obs0: batch['obs0'], self.actions: batch['actions'], self.critic_target: target_Q, }) self.actor_optimizer.update(actor_grads, stepsize=self.actor_lr) self.critic_optimizer.update(critic_grads, stepsize=self.critic_lr) return critic_loss, actor_loss def initialize(self, sess): self.sess = sess self.sess.run(tf.global_variables_initializer()) self.actor_optimizer.sync() self.critic_optimizer.sync() self.sess.run(self.target_init_updates) def update_target_net(self): self.sess.run(self.target_soft_updates) def get_stats(self): if self.stats_sample is None: # Get a sample and keep that fixed for all further computations. # This allows us to estimate the change in value for the same set of inputs. self.stats_sample = self.memory.sample(batch_size=self.batch_size) values = self.sess.run(self.stats_ops, feed_dict={ self.obs0: self.stats_sample['obs0'], self.actions: self.stats_sample['actions'], }) names = self.stats_names[:] assert len(names) == len(values) stats = dict(zip(names, values)) if self.param_noise is not None: stats = {**stats, **self.param_noise.get_stats()} return stats def adapt_param_noise(self): if self.param_noise is None: return 0. # Perturb a separate copy of the policy to adjust the scale for the next "real" perturbation. batch = self.memory.sample(batch_size=self.batch_size) self.sess.run(self.perturb_adaptive_policy_ops, feed_dict={ self.param_noise_stddev: self.param_noise.current_stddev, }) distance = self.sess.run(self.adaptive_policy_distance, feed_dict={ self.obs0: batch['obs0'], self.param_noise_stddev: self.param_noise.current_stddev, }) mean_distance = MPI.COMM_WORLD.allreduce(distance, op=MPI.SUM) / MPI.COMM_WORLD.Get_size() self.param_noise.adapt(mean_distance) return mean_distance def reset(self): # Reset internal state after an episode is complete. 
if self.action_noise is not None: self.action_noise.reset() if self.param_noise is not None: self.sess.run(self.perturb_policy_ops, feed_dict={ self.param_noise_stddev: self.param_noise.current_stddev, }) #added def save(self, save_path): """ Save the model """ saver = tf.train.Saver() saver.save(self.sess, save_path) def load(self, sess, load_path): """ Load the model """ saver = tf.train.Saver() print('Loading ' + load_path) saver.restore(sess, load_path) self.sess = sess
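# --- Sketch (not part of the original file) ----------------------------------
# step() in the class above maps the sigmoid actor output to a binary action
# vector by thresholding each element at 0.5 (not by an argmax as in DQN).
# A minimal numpy illustration of that mapping; the example values are made up.
import numpy as np

def binarize_action_sketch(action_row, threshold=0.5):
    # 1 where the (noisy) sigmoid output exceeds the threshold, else 0,
    # mirroring the per-element loop in step().
    return [1 if a > threshold else 0 for a in np.asarray(action_row).flatten()]

# binarize_action_sketch([0.12, 0.73, 0.50, 0.91]) -> [0, 1, 0, 1]
# ------------------------------------------------------------------------------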
def __init__(self, actor, critic, memory, observation_shape, action_shape, param_noise=None, action_noise=None, gamma=0.99, tau=0.001, normalize_returns=False, enable_popart=False, normalize_observations=True, batch_size=128, observation_range=(-500., 500.), action_range=(-1., 1.), return_range=(-np.inf, np.inf), adaptive_param_noise=True, adaptive_param_noise_policy_threshold=.1, critic_l2_reg=0., actor_lr=1e-4, critic_lr=1e-3, clip_norm=None, reward_scale=1.): # Inputs. self.obs0 = tf.placeholder(tf.float32, shape=(None,) + observation_shape, name='obs0') self.obs1 = tf.placeholder(tf.float32, shape=(None,) + observation_shape, name='obs1') self.terminals1 = tf.placeholder(tf.float32, shape=(None, 1), name='terminals1') self.rewards = tf.placeholder(tf.float32, shape=(None, 1), name='rewards') self.actions = tf.placeholder(tf.float32, shape=(None,) + action_shape, name='actions') self.critic_target = tf.placeholder(tf.float32, shape=(None, 1), name='critic_target') self.param_noise_stddev = tf.placeholder(tf.float32, shape=(), name='param_noise_stddev') # Parameters. self.gamma = gamma self.tau = tau self.memory = memory self.normalize_observations = normalize_observations self.normalize_returns = normalize_returns self.action_noise = action_noise self.param_noise = param_noise self.action_range = action_range self.return_range = return_range self.observation_range = observation_range self.critic = critic self.actor = actor self.actor_lr = actor_lr self.critic_lr = critic_lr self.clip_norm = clip_norm self.enable_popart = enable_popart self.reward_scale = reward_scale self.batch_size = batch_size self.stats_sample = None self.critic_l2_reg = critic_l2_reg # Observation normalization. if self.normalize_observations: with tf.variable_scope('obs_rms'): self.obs_rms = RunningMeanStd(shape=observation_shape) else: self.obs_rms = None normalized_obs0 = tf.clip_by_value(normalize(self.obs0, self.obs_rms), self.observation_range[0], self.observation_range[1]) normalized_obs1 = tf.clip_by_value(normalize(self.obs1, self.obs_rms), self.observation_range[0], self.observation_range[1]) # Return normalization. if self.normalize_returns: with tf.variable_scope('ret_rms'): self.ret_rms = RunningMeanStd() else: self.ret_rms = None # Create target networks. target_actor = copy(actor) target_actor.name = 'target_actor' self.target_actor = target_actor target_critic = copy(critic) target_critic.name = 'target_critic' self.target_critic = target_critic # Create networks and core TF parts that are shared across setup parts. self.actor_tf = actor(normalized_obs0) self.normalized_critic_tf = critic(normalized_obs0, self.actions) self.critic_tf = denormalize(tf.clip_by_value(self.normalized_critic_tf, self.return_range[0], self.return_range[1]), self.ret_rms) self.normalized_critic_with_actor_tf = critic(normalized_obs0, self.actor_tf, reuse=True) self.critic_with_actor_tf = denormalize(tf.clip_by_value(self.normalized_critic_with_actor_tf, self.return_range[0], self.return_range[1]), self.ret_rms) Q_obs1 = denormalize(target_critic(normalized_obs1, target_actor(normalized_obs1)), self.ret_rms) self.target_Q = self.rewards + (1. - self.terminals1) * gamma * Q_obs1 # Set up parts. if self.param_noise is not None: self.setup_param_noise(normalized_obs0) self.setup_actor_optimizer() self.setup_critic_optimizer() if self.normalize_returns and self.enable_popart: self.setup_popart() self.setup_stats() self.setup_target_network_updates() self.initial_state = None # recurrent architectures not supported yet
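# --- Sketch (not part of the original file) ----------------------------------
# obs_rms / ret_rms above are RunningMeanStd instances imported elsewhere; in the
# original codebase they keep their statistics in TF variables and synchronise
# them over MPI. The numpy-only sketch below only illustrates the running
# mean/variance update they are assumed to perform (standard parallel merge).
import numpy as np

class RunningMeanStdSketch(object):
    def __init__(self, shape=(), epsilon=1e-2):
        self.mean = np.zeros(shape, dtype=np.float64)
        self.var = np.ones(shape, dtype=np.float64)
        self.count = epsilon                 # avoids division by zero before the first update

    @property
    def std(self):
        return np.sqrt(self.var)

    def update(self, x):
        # Merge the statistics of batch x (shape [N, ...]) into the running ones.
        batch_mean = np.mean(x, axis=0)
        batch_var = np.var(x, axis=0)
        batch_count = x.shape[0]
        delta = batch_mean - self.mean
        total = self.count + batch_count
        new_mean = self.mean + delta * batch_count / total
        m_a = self.var * self.count
        m_b = batch_var * batch_count
        m2 = m_a + m_b + np.square(delta) * self.count * batch_count / total
        self.mean, self.var, self.count = new_mean, m2 / total, total
# ------------------------------------------------------------------------------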
class DDPG(object): def __init__(self, actor, critic, memory, demon_buffer, observation_shape, action_shape, param_noise=None, action_noise=None, gamma=0.99, tau=0.001, normalize_returns=False, enable_popart=False, normalize_observations=True, batch_size=128, observation_range=(-1000., 1000.), action_range=(-50., 50.), return_range=(-np.inf, np.inf), adaptive_param_noise=True, adaptive_param_noise_policy_threshold=.1, critic_l2_reg=0., actor_lr=1e-4, critic_lr=1e-3, clip_norm=None, reward_scale=1.): # Inputs. self.obs0 = tf.placeholder(tf.float32, shape=(None, ) + observation_shape, name='obs0') self.obs1 = tf.placeholder(tf.float32, shape=(None, ) + observation_shape, name='obs1') self.terminals1 = tf.placeholder(tf.float32, shape=(None, 1), name='terminals1') self.rewards = tf.placeholder(tf.float32, shape=(None, 1), name='rewards') self.actions = tf.placeholder(tf.float32, shape=(None, ) + action_shape, name='actions') self.critic_target = tf.placeholder(tf.float32, shape=(None, 1), name='critic_target') self.param_noise_stddev = tf.placeholder(tf.float32, shape=(), name='param_noise_stddev') # Parameters. self.gamma = gamma self.tau = tau self.memory = memory '''have to use 2 memory here, simply demon_memory = memory will cause a common instantiated memory shared by two variables''' self.demon_memory = demon_buffer self.normalize_observations = normalize_observations self.normalize_returns = normalize_returns self.action_noise = action_noise self.param_noise = param_noise self.action_range = action_range self.return_range = return_range self.observation_range = observation_range self.critic = critic self.actor = actor self.actor_lr = actor_lr self.critic_lr = critic_lr self.clip_norm = clip_norm self.enable_popart = enable_popart self.reward_scale = reward_scale self.batch_size = batch_size self.stats_sample = None self.critic_l2_reg = critic_l2_reg # Observation normalization. if self.normalize_observations: with tf.variable_scope('obs_rms'): self.obs_rms = RunningMeanStd(shape=observation_shape) else: self.obs_rms = None normalized_obs0 = tf.clip_by_value(normalize(self.obs0, self.obs_rms), self.observation_range[0], self.observation_range[1]) normalized_obs1 = tf.clip_by_value(normalize(self.obs1, self.obs_rms), self.observation_range[0], self.observation_range[1]) # Return normalization. if self.normalize_returns: with tf.variable_scope('ret_rms'): self.ret_rms = RunningMeanStd() else: self.ret_rms = None # Create target networks. target_actor = copy(actor) target_actor.name = 'target_actor' self.target_actor = target_actor target_critic = copy(critic) target_critic.name = 'target_critic' self.target_critic = target_critic # Create networks and core TF parts that are shared across setup parts. self.actor_tf = actor(normalized_obs0) self.normalized_critic_tf = critic(normalized_obs0, self.actions) self.critic_tf = denormalize( tf.clip_by_value(self.normalized_critic_tf, self.return_range[0], self.return_range[1]), self.ret_rms) self.normalized_critic_with_actor_tf = critic(normalized_obs0, self.actor_tf, reuse=True) self.critic_with_actor_tf = denormalize( tf.clip_by_value(self.normalized_critic_with_actor_tf, self.return_range[0], self.return_range[1]), self.ret_rms) Q_obs1 = denormalize( target_critic(normalized_obs1, target_actor(normalized_obs1)), self.ret_rms) self.target_Q = self.rewards + (1. - self.terminals1) * gamma * Q_obs1 # Set up parts. 
if self.param_noise is not None: self.setup_param_noise(normalized_obs0) self.setup_actor_optimizer() self.setup_critic_optimizer() if self.normalize_returns and self.enable_popart: self.setup_popart() self.setup_stats() self.setup_target_network_updates() self.initial_state = None # recurrent architectures not supported yet def setup_target_network_updates(self): actor_init_updates, actor_soft_updates = get_target_updates( self.actor.vars, self.target_actor.vars, self.tau) critic_init_updates, critic_soft_updates = get_target_updates( self.critic.vars, self.target_critic.vars, self.tau) self.target_init_updates = [actor_init_updates, critic_init_updates] self.target_soft_updates = [actor_soft_updates, critic_soft_updates] def setup_param_noise(self, normalized_obs0): assert self.param_noise is not None # Configure perturbed actor. param_noise_actor = copy(self.actor) param_noise_actor.name = 'param_noise_actor' self.perturbed_actor_tf = param_noise_actor(normalized_obs0) logger.info('setting up param noise') self.perturb_policy_ops = get_perturbed_actor_updates( self.actor, param_noise_actor, self.param_noise_stddev) # Configure separate copy for stddev adoption. adaptive_param_noise_actor = copy(self.actor) adaptive_param_noise_actor.name = 'adaptive_param_noise_actor' adaptive_actor_tf = adaptive_param_noise_actor(normalized_obs0) self.perturb_adaptive_policy_ops = get_perturbed_actor_updates( self.actor, adaptive_param_noise_actor, self.param_noise_stddev) self.adaptive_policy_distance = tf.sqrt( tf.reduce_mean(tf.square(self.actor_tf - adaptive_actor_tf))) def setup_actor_optimizer(self): logger.info('setting up actor optimizer') self.actor_loss = -tf.reduce_mean(self.critic_with_actor_tf) actor_shapes = [ var.get_shape().as_list() for var in self.actor.trainable_vars ] actor_nb_params = sum( [reduce(lambda x, y: x * y, shape) for shape in actor_shapes]) logger.info(' actor shapes: {}'.format(actor_shapes)) logger.info(' actor params: {}'.format(actor_nb_params)) self.actor_grads = U.flatgrad(self.actor_loss, self.actor.trainable_vars, clip_norm=self.clip_norm) self.actor_optimizer = MpiAdam(var_list=self.actor.trainable_vars, beta1=0.9, beta2=0.999, epsilon=1e-08) def setup_critic_optimizer(self): logger.info('setting up critic optimizer') normalized_critic_target_tf = tf.clip_by_value( normalize(self.critic_target, self.ret_rms), self.return_range[0], self.return_range[1]) self.critic_loss = tf.reduce_mean( tf.square(self.normalized_critic_tf - normalized_critic_target_tf)) if self.critic_l2_reg > 0.: critic_reg_vars = [ var for var in self.critic.trainable_vars if 'kernel' in var.name and 'output' not in var.name ] for var in critic_reg_vars: logger.info(' regularizing: {}'.format(var.name)) logger.info(' applying l2 regularization with {}'.format( self.critic_l2_reg)) critic_reg = tc.layers.apply_regularization( tc.layers.l2_regularizer(self.critic_l2_reg), weights_list=critic_reg_vars) self.critic_loss += critic_reg critic_shapes = [ var.get_shape().as_list() for var in self.critic.trainable_vars ] critic_nb_params = sum( [reduce(lambda x, y: x * y, shape) for shape in critic_shapes]) logger.info(' critic shapes: {}'.format(critic_shapes)) logger.info(' critic params: {}'.format(critic_nb_params)) self.critic_grads = U.flatgrad(self.critic_loss, self.critic.trainable_vars, clip_norm=self.clip_norm) self.critic_optimizer = MpiAdam(var_list=self.critic.trainable_vars, beta1=0.9, beta2=0.999, epsilon=1e-08) def setup_popart(self): # See https://arxiv.org/pdf/1602.07714.pdf for 
details. self.old_std = tf.placeholder(tf.float32, shape=[1], name='old_std') new_std = self.ret_rms.std self.old_mean = tf.placeholder(tf.float32, shape=[1], name='old_mean') new_mean = self.ret_rms.mean self.renormalize_Q_outputs_op = [] for vs in [self.critic.output_vars, self.target_critic.output_vars]: assert len(vs) == 2 M, b = vs assert 'kernel' in M.name assert 'bias' in b.name assert M.get_shape()[-1] == 1 assert b.get_shape()[-1] == 1 self.renormalize_Q_outputs_op += [ M.assign(M * self.old_std / new_std) ] self.renormalize_Q_outputs_op += [ b.assign( (b * self.old_std + self.old_mean - new_mean) / new_std) ] def setup_stats(self): ops = [] names = [] if self.normalize_returns: ops += [self.ret_rms.mean, self.ret_rms.std] names += ['ret_rms_mean', 'ret_rms_std'] if self.normalize_observations: ops += [ tf.reduce_mean(self.obs_rms.mean), tf.reduce_mean(self.obs_rms.std) ] names += ['obs_rms_mean', 'obs_rms_std'] ops += [tf.reduce_mean(self.critic_tf)] names += ['reference_Q_mean'] ops += [reduce_std(self.critic_tf)] names += ['reference_Q_std'] ops += [tf.reduce_mean(self.critic_with_actor_tf)] names += ['reference_actor_Q_mean'] ops += [reduce_std(self.critic_with_actor_tf)] names += ['reference_actor_Q_std'] ops += [tf.reduce_mean(self.actor_tf)] names += ['reference_action_mean'] ops += [reduce_std(self.actor_tf)] names += ['reference_action_std'] if self.param_noise: ops += [tf.reduce_mean(self.perturbed_actor_tf)] names += ['reference_perturbed_action_mean'] ops += [reduce_std(self.perturbed_actor_tf)] names += ['reference_perturbed_action_std'] self.stats_ops = ops self.stats_names = names def step(self, obs, apply_noise=True, compute_Q=True): if self.param_noise is not None and apply_noise: actor_tf = self.perturbed_actor_tf else: actor_tf = self.actor_tf feed_dict = {self.obs0: U.adjust_shape(self.obs0, [obs])} if compute_Q: action, q = self.sess.run([actor_tf, self.critic_with_actor_tf], feed_dict=feed_dict) else: action = self.sess.run(actor_tf, feed_dict=feed_dict) q = None if self.action_noise is not None and apply_noise: noise = self.action_noise() # print('noise: ', noise.shape, action.shape) # assert noise.shape == action.shape #(1,3), (3,) correct addition, no need to assert # print(action, noise) action += noise # print(action) action = np.clip(action, self.action_range[0], self.action_range[1]) return action, q, None, None def store_transition(self, obs0, action, reward, obs1, terminal1): B = obs0.shape[0] for b in range(B): self.memory.append(obs0[b], action[b], reward[b], obs1[b], terminal1[b]) if self.normalize_observations: self.obs_rms.update(np.array([obs0[b]])) def train(self): demons_memory_ratio = 0.5 # the ratio of demonstrations over all batches sampled # Get a batch from memory batch = self.memory.sample(batch_size=int(2 * self.batch_size * (1 - demons_memory_ratio))) # Get a batch from demonstration buffer demon_batch = self.demon_memory.sample( batch_size=int(2 * self.batch_size * demons_memory_ratio)) # print('memory: ', batch['obs1'].shape, 'demons: ', demon_batch['obs1'].shape) # concatenate two sampled batches batch['obs0'] = np.concatenate((batch['obs0'], demon_batch['obs0'])) batch['rewards'] = np.concatenate( (batch['rewards'], demon_batch['rewards'])) batch['terminals1'] = np.concatenate( (batch['terminals1'], demon_batch['terminals1'])) batch['obs1'] = np.concatenate((batch['obs1'], demon_batch['obs1'])) batch['actions'] = np.concatenate( (batch['actions'], demon_batch['actions'])) # batch = demon_batch if self.normalize_returns and 
self.enable_popart: old_mean, old_std, target_Q = self.sess.run( [self.ret_rms.mean, self.ret_rms.std, self.target_Q], feed_dict={ self.obs1: batch['obs1'], self.rewards: batch['rewards'], self.terminals1: batch['terminals1'].astype('float32'), }) self.ret_rms.update(target_Q.flatten()) self.sess.run(self.renormalize_Q_outputs_op, feed_dict={ self.old_std: np.array([old_std]), self.old_mean: np.array([old_mean]), }) else: target_Q = self.sess.run(self.target_Q, feed_dict={ self.obs1: batch['obs1'], self.rewards: batch['rewards'], self.terminals1: batch['terminals1'].astype('float32'), }) # Get all gradients and perform a synced update. ops = [ self.actor_grads, self.actor_loss, self.critic_grads, self.critic_loss ] actor_grads, actor_loss, critic_grads, critic_loss = self.sess.run( ops, feed_dict={ self.obs0: batch['obs0'], self.actions: batch['actions'], self.critic_target: target_Q, }) self.actor_optimizer.update(actor_grads, stepsize=self.actor_lr) self.critic_optimizer.update(critic_grads, stepsize=self.critic_lr) # print('loss: ', actor_loss, critic_loss) return critic_loss, actor_loss def initialize(self, sess): self.sess = sess self.sess.run(tf.global_variables_initializer()) self.actor_optimizer.sync() self.critic_optimizer.sync() self.sess.run(self.target_init_updates) def update_target_net(self): self.sess.run(self.target_soft_updates) def get_stats(self): if self.stats_sample is None: # Get a sample and keep that fixed for all further computations. # This allows us to estimate the change in value for the same set of inputs. self.stats_sample = self.memory.sample(batch_size=self.batch_size) values = self.sess.run(self.stats_ops, feed_dict={ self.obs0: self.stats_sample['obs0'], self.actions: self.stats_sample['actions'], }) names = self.stats_names[:] assert len(names) == len(values) stats = dict(zip(names, values)) if self.param_noise is not None: stats = {**stats, **self.param_noise.get_stats()} return stats def adapt_param_noise(self): if self.param_noise is None: return 0. # Perturb a separate copy of the policy to adjust the scale for the next "real" perturbation. batch = self.memory.sample(batch_size=self.batch_size) self.sess.run(self.perturb_adaptive_policy_ops, feed_dict={ self.param_noise_stddev: self.param_noise.current_stddev, }) distance = self.sess.run(self.adaptive_policy_distance, feed_dict={ self.obs0: batch['obs0'], self.param_noise_stddev: self.param_noise.current_stddev, }) mean_distance = MPI.COMM_WORLD.allreduce( distance, op=MPI.SUM) / MPI.COMM_WORLD.Get_size() self.param_noise.adapt(mean_distance) return mean_distance def reset(self): # Reset internal state after an episode is complete. 
if self.action_noise is not None: self.action_noise.reset() if self.param_noise is not None: self.sess.run(self.perturb_policy_ops, feed_dict={ self.param_noise_stddev: self.param_noise.current_stddev, }) #added def save(self, save_path): """ Save the model """ saver = tf.train.Saver() saver.save(self.sess, save_path) def load(self, sess, load_path): """ Load the model """ saver = tf.train.Saver() print('Loading ' + load_path) saver.restore(sess, load_path) self.sess = sess # def feed_demon2memory(self): # """ # feed demonstrations from data file into memory # """ # with open('data_memory2_21steps.p', 'rb') as f: # data = pickle.load(f) # for _, episode in enumerate(data): # for _, step in enumerate(episode): # # state, action, reward, new_state, done # self.store_transition(np.array(step[0]), step[1], step[2], step[3], step[4]) def store_transition2demon(self, obs0, action, reward, obs1, terminal1): B = obs0.shape[0] for b in range(B): self.demon_memory.append(obs0[b], action[b], reward[b], obs1[b], terminal1[b]) if self.normalize_observations: self.obs_rms.update(np.array([obs0[b]])) def feed_demon_buffer(self): ''' load demonstrations from the data file into the separate demonstration buffer, so train() can sample from them instead of feeding them into the regular replay memory ''' with open('data_memory2_21steps.p', 'rb') as f: data = pickle.load(f) for _, episode in enumerate(data): for _, step in enumerate(episode): # state, action, reward, new_state, done self.store_transition2demon(np.array(step[0]), step[1], step[2], step[3], step[4])
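# --- Usage sketch (not part of the original file) -----------------------------
# A hypothetical driver for the demonstration-augmented agent above: the
# demonstration buffer is filled once from the pickle file, after which every
# train() call samples half of each batch from the replay memory and half from
# the demonstrations (demons_memory_ratio = 0.5). `env`, `agent` and the loop
# bounds are assumptions, not defined in this file; np and U are assumed to be
# the numpy / tf_util imports already used above.
def run_ddpg_with_demos_sketch(env, agent, total_steps=100000, warmup_steps=1000):
    with U.single_threaded_session() as sess:
        agent.initialize(sess)
        agent.feed_demon_buffer()          # load data_memory2_21steps.p into demon_memory
        obs = env.reset()
        for t in range(total_steps):
            action, q, _, _ = agent.step(obs)          # action keeps its leading batch dim
            new_obs, reward, done, _ = env.step(action[0])
            # store_transition() in this variant expects batched arrays (leading dim B).
            agent.store_transition(np.array([obs]), action,
                                   np.array([reward]), np.array([new_obs]),
                                   np.array([done]))
            obs = env.reset() if done else new_obs
            if done:
                agent.reset()
            if t >= warmup_steps:
                agent.train()              # mixed replay/demonstration batch
                agent.update_target_net()
# ------------------------------------------------------------------------------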