def get_loss(self):
    """Build the forward-dynamics prediction error.

    The action in ``self.ac`` is one-hot encoded and concatenated onto the
    input of every dense layer of a residual MLP that maps ``self.features``
    to a prediction of ``self.out_features``.

    Returns:
        The squared difference between the prediction and the
        gradient-stopped ``self.out_features``, averaged over the last axis.
    """
    onehot_ac = tf.one_hot(self.ac, self.ac_space.n, axis=2)
    lead_shape = tf.shape(onehot_ac)
    flat_ac = flatten_two_dims(onehot_ac)

    def with_action(tensor):
        # The action is re-injected in front of every dense layer.
        return tf.concat([tensor, flat_ac], axis=-1)

    def residual_block(tensor):
        inner = tf.layers.dense(with_action(tensor), self.hidsize,
                                activation=tf.nn.leaky_relu)
        inner = tf.layers.dense(with_action(inner), self.hidsize,
                                activation=None)
        return tensor + inner

    with tf.variable_scope(self.scope):
        hidden = flatten_two_dims(self.features)
        hidden = tf.layers.dense(with_action(hidden), self.hidsize,
                                 activation=tf.nn.leaky_relu)
        for _ in range(4):
            hidden = residual_block(hidden)
        out_dim = self.out_features.get_shape()[-1].value
        hidden = tf.layers.dense(with_action(hidden), out_dim, activation=None)
        hidden = unflatten_first_dim(hidden, lead_shape)
    target = tf.stop_gradient(self.out_features)
    return tf.reduce_mean((hidden - target) ** 2, -1)
def predict_next(self, reuse):
    """Build the forward-dynamics prediction of the next features.

    Args:
        reuse: Passed as ``reuse`` to the ``tf.variable_scope`` that holds
            the network weights.

    Returns:
        The network output after ``unflatten_first_dim`` restores the
        leading dimensions of the action tensor.
    """
    if isinstance(self.ac_space, gym.spaces.Discrete):
        action = tf.one_hot(self.ac, get_action_n(self.ac_space), axis=2)
    else:
        # Non-discrete actions are fed to the network as they are.
        action = self.ac
    lead_shape = tf.shape(action)
    flat_ac = flatten_two_dims(action)

    def with_action(tensor):
        return tf.concat([tensor, flat_ac], axis=-1)

    def residual_block(tensor):
        inner = tf.layers.dense(with_action(tensor), self.hidsize,
                                activation=tf.nn.leaky_relu)
        inner = tf.layers.dense(with_action(inner), self.hidsize,
                                activation=None)
        return tensor + inner

    with tf.variable_scope(self.scope, reuse=reuse):
        hidden = flatten_two_dims(self.features)
        hidden = tf.layers.dense(with_action(hidden), self.hidsize,
                                 activation=tf.nn.leaky_relu)
        for _ in range(4):
            hidden = residual_block(hidden)
        out_dim = self.out_features.get_shape()[-1].value
        hidden = tf.layers.dense(with_action(hidden), out_dim, activation=None)
        hidden = unflatten_first_dim(hidden, lead_shape)
    return hidden
def get_loss(self):
    """Build the UNet dynamics prediction error.

    Side effect: stores the de-normalised prediction
    (``prediction * self.ob_std + self.ob_mean``) in
    ``self.prediction_pixels``.

    Returns:
        The squared difference between the prediction and the
        gradient-stopped ``self.out_features``, averaged over axes 2, 3, 4.
    """
    activation = tf.nn.leaky_relu
    onehot_ac = tf.one_hot(self.ac, self.ac_space.n, axis=2)
    lead_shape = tf.shape(onehot_ac)
    flat_ac = flatten_two_dims(onehot_ac)
    # Two singleton axes are inserted at position 1 so the action can be
    # broadcast against rank-4 tensors.
    spatial_ac = tf.expand_dims(tf.expand_dims(flat_ac, 1), 1)

    def condition_on_action(tensor):
        """Append the action to a rank-2 or rank-4 tensor along the last axis."""
        rank = tensor.get_shape().ndims
        if rank == 2:
            return tf.concat([tensor, flat_ac], axis=-1)
        if rank == 4:
            dims = tf.shape(tensor)
            n_ac = spatial_ac.get_shape()[3].value
            # Adding zeros of the target shape tiles the action over axes 1 and 2.
            tiled_ac = spatial_ac + tf.zeros([dims[0], dims[1], dims[2], n_ac],
                                             tf.float32)
            return tf.concat([tensor, tiled_ac], axis=-1)
        # Other ranks are not handled; as before, the result is None.
        return None

    with tf.variable_scope(self.scope):
        pred = flatten_two_dims(self.features)
        pred = unet(pred, nl=activation, feat_dim=self.feat_dim,
                    cond=condition_on_action)
        pred = unflatten_first_dim(pred, lead_shape)
    self.prediction_pixels = pred * self.ob_std + self.ob_mean
    target = tf.stop_gradient(self.out_features)
    return tf.reduce_mean((pred - target) ** 2, [2, 3, 4])
def predict_next(self, reuse):
    """Build the UNet dynamics prediction.

    Side effect: stores the de-normalised prediction
    (``prediction * self.ob_std + self.ob_mean``) in
    ``self.prediction_pixels``.

    Args:
        reuse: Passed as ``reuse`` to the ``tf.variable_scope`` that holds
            the network weights.

    Returns:
        The UNet output after ``unflatten_first_dim`` restores the leading
        dimensions of the action tensor.
    """
    activation = tf.nn.leaky_relu
    if isinstance(self.ac_space, gym.spaces.Discrete):
        action = tf.one_hot(self.ac, get_action_n(self.ac_space), axis=2)
    else:
        action = self.ac
    lead_shape = tf.shape(action)
    flat_ac = flatten_two_dims(action)
    # Two singleton axes are inserted at position 1 so the action can be
    # broadcast against rank-4 tensors.
    spatial_ac = tf.expand_dims(tf.expand_dims(flat_ac, 1), 1)

    def condition_on_action(tensor):
        """Append the action to a rank-2 or rank-4 tensor along the last axis."""
        rank = tensor.get_shape().ndims
        if rank == 2:
            return tf.concat([tensor, flat_ac], axis=-1)
        if rank == 4:
            dims = tf.shape(tensor)
            n_ac = spatial_ac.get_shape()[3].value
            tiled_ac = spatial_ac + tf.zeros([dims[0], dims[1], dims[2], n_ac],
                                             tf.float32)
            return tf.concat([tensor, tiled_ac], axis=-1)
        # Other ranks are not handled; as before, the result is None.
        return None

    with tf.variable_scope(self.scope, reuse=reuse):
        pred = flatten_two_dims(self.features)
        pred = unet(pred, nl=activation, feat_dim=self.feat_dim,
                    cond=condition_on_action)
        pred = unflatten_first_dim(pred, lead_shape)
    self.prediction_pixels = pred * self.ob_std + self.ob_mean
    return pred
def get_loss(self, ac):
    """Predict ``self.out_features`` from ``self.features`` and ``ac``.

    Despite its name this method returns the prediction, not a loss. Its
    dense layers use ``reuse=tf.AUTO_REUSE`` so that calling it several times
    shares weights between the calls.

    The ``reuse`` argument of ``tf.layers.dense`` is defined in terms of the
    layer *name*, so every layer is given an explicit, stable name. Without
    one, repeated calls cannot be relied on to map each layer to its own
    weights.

    NOTE(review): naming the layers changes the variable names, so
    checkpoints written by the unnamed version will not load.

    Args:
        ac: Action tensor, already encoded by the caller. Its shape is used
            to restore the leading dimensions of the output.

    Returns:
        The network output after ``unflatten_first_dim``.
    """
    lead_shape = tf.shape(ac)
    flat_ac = flatten_two_dims(ac)

    def add_ac(x):
        # The action is re-injected in front of every dense layer.
        return tf.concat([x, flat_ac], axis=-1)

    def dense(x, units, name, activation=None):
        # Explicit name + AUTO_REUSE: created on first use, shared afterwards.
        return tf.layers.dense(add_ac(x), units, activation=activation,
                               name=name, reuse=tf.AUTO_REUSE)

    with tf.variable_scope(self.scope):
        x = flatten_two_dims(self.features)
        x = dense(x, self.hidsize, "fwd_in", activation=tf.nn.leaky_relu)
        for idx in range(4):
            res = dense(x, self.hidsize, "fwd_res%d_a" % idx,
                        activation=tf.nn.leaky_relu)
            res = dense(res, self.hidsize, "fwd_res%d_b" % idx)
            x = x + res
        n_out_features = self.out_features.get_shape()[-1].value
        x = dense(x, n_out_features, "fwd_out")
        x = unflatten_first_dim(x, lead_shape)
    return x
def get_loss(self):
    """Build the UNet dynamics loss, optionally with predicted uncertainty.

    ``unet`` returns a mean and a log-variance. ``self.ama`` (the string
    ``"true"`` or ``"false"``) selects between the uncertainty-aware
    objective and a plain squared error.

    Returns:
        A 4-tuple ``(loss, dynamics_reward, prediction_pixels,
        log_sigma_squared)``.

    Raises:
        ValueError: If ``self.ama`` is neither ``"true"`` nor ``"false"``.
    """
    activation = tf.nn.leaky_relu
    onehot_ac = tf.one_hot(self.ac, self.ac_space.n, axis=2)
    lead_shape = tf.shape(onehot_ac)
    flat_ac = flatten_two_dims(onehot_ac)
    # Two singleton axes are inserted at position 1 so the action can be
    # broadcast against rank-4 tensors.
    spatial_ac = tf.expand_dims(tf.expand_dims(flat_ac, 1), 1)

    def condition_on_action(tensor):
        """Append the action to a rank-2 or rank-4 tensor along the last axis."""
        rank = tensor.get_shape().ndims
        if rank == 2:
            return tf.concat([tensor, flat_ac], axis=-1)
        if rank == 4:
            dims = tf.shape(tensor)
            n_ac = spatial_ac.get_shape()[3].value
            tiled_ac = spatial_ac + tf.zeros([dims[0], dims[1], dims[2], n_ac],
                                             tf.float32)
            return tf.concat([tensor, tiled_ac], axis=-1)
        # Other ranks are not handled; as before, the result is None.
        return None

    with tf.variable_scope(self.scope):
        flat_in = flatten_two_dims(self.features)
        mu, log_sigma_squared = unet(flat_in, nl=activation,
                                     feat_dim=self.feat_dim,
                                     cond=condition_on_action)
        mu = unflatten_first_dim(mu, lead_shape)
        log_sigma_squared = unflatten_first_dim(log_sigma_squared, lead_shape)
    prediction_pixels = mu * self.ob_std + self.ob_mean

    pixel_axes = [2, 3, 4]
    if self.ama == "true":
        # NOTE(review): the target is scaled by 2 here but not in the
        # non-AMA branch below - confirm this factor is intentional.
        sq_err = tf.square(mu - 2 * tf.stop_gradient(self.out_features))
        dynamics_reward = tf.reduce_mean(sq_err - tf.exp(log_sigma_squared),
                                         axis=pixel_axes)
        if self.clip_ama == "true":
            dynamics_reward = tf.clip_by_value(dynamics_reward, 0, 1e6)
        # Squared error weighted by the inverse predicted variance, plus a
        # penalty on the log-variance.
        weighted = (tf.exp(-log_sigma_squared) * sq_err
                    + self.uncertainty_penalty * log_sigma_squared)
        loss = tf.reduce_mean(weighted, axis=pixel_axes)
    elif self.ama == "false":
        sq_err = tf.square(mu - tf.stop_gradient(self.out_features))
        dynamics_reward = tf.reduce_mean(sq_err, axis=pixel_axes)
        loss = dynamics_reward
    else:
        raise ValueError("Please specify whether to use AMA or not")

    return loss, dynamics_reward, prediction_pixels, log_sigma_squared
def prior_regularization(self, prior, sigma_mu=1e4, sigma_sigma=1e-4):
    """Regularise the distribution produced by the prior network.

    Original author's notes (translated): the model usually does not diverge
    even without this constraint; the term has very little influence on the
    original loss and barely affects learning; using it is recommended. It
    corresponds to section 4.3.2 of the paper.

    Args:
        prior: Distribution object exposing ``mean()`` and ``stddev()``.
        sigma_mu: Scale used in the penalty on the mean.
        sigma_sigma: Weight of the term on the standard deviation.

    Returns:
        The regulariser reshaped to ``(self.sh[0], self.sh[1])``.
    """
    # Author's note: both flattened tensors are (None, 128).
    flat_mean = flatten_two_dims(prior.mean())
    flat_std = flatten_two_dims(prior.stddev())
    mean_term = -tf.reduce_sum(flat_mean ** 2, axis=-1) / (2 * (sigma_mu ** 2))
    std_term = tf.reduce_sum(tf.math.log(flat_std) - flat_std, axis=-1) * sigma_sigma
    total = mean_term + std_term
    return tf.reshape(total, (self.sh[0], self.sh[1]))
def decoder(self, z):
    """Decode a latent tensor into a Normal distribution.

    Author's note (translated): ``z`` is the mean of the VAE posterior,
    shape (None, None, 512).

    Args:
        z: Latent tensor. When it has rank 3, its two leading dimensions are
            flattened for the network and restored afterwards.

    Returns:
        ``tf.distributions.Normal`` whose ``loc`` is the de-convolution
        output and whose ``scale`` depends on ``self.spherical_obs``.
    """
    activation = tf.nn.leaky_relu
    has_time_axis = z.get_shape().ndims == 3
    if has_time_axis:
        lead_shape = tf.shape(z)
        z = flatten_two_dims(z)
    # In the non-spherical case the output is split in two halves
    # (location and scale), so twice as many channels are requested.
    n_channels = 4 if self.spherical_obs else 8
    with tf.variable_scope(self.scope + "decoder"):
        # De-convolution network. Author's note: with spherical_obs=True the
        # output is (None, 84, 84, 4).
        z = small_deconvnet(z, nl=activation, ch=n_channels, positional_bias=True)
        if has_time_axis:
            z = unflatten_first_dim(z, lead_shape)
        if not self.spherical_obs:
            # Split the output into location and raw scale.
            z, raw_scale = tf.split(z, 2, -1)
            scale = tf.nn.softplus(raw_scale)
        else:
            # Spherical case: one learned scalar is shared by every
            # dimension, which simplifies the computation.
            raw_scale = tf.get_variable(name="scale", shape=(), dtype=tf.float32,
                                        initializer=tf.ones_initializer())
            raw_scale = tf.maximum(raw_scale, -4.)
            scale = tf.nn.softplus(raw_scale)
            scale = scale * tf.ones_like(z)
        return tf.distributions.Normal(loc=z, scale=scale)
def get_last_features(self, x, reuse):
    """Embed observations with the conv net, then run them through the LSTM(s).

    NOTE(review): the original indentation of this method was lost. The
    nesting below is a reconstruction - confirm it against the upstream file,
    in particular the placement of the reshape and of the aux-input
    concatenations.

    Args:
        x: Observation tensor; rank 5 is treated as having a time axis.
        reuse: Passed as ``reuse`` to the feature-extractor variable scope.

    Returns:
        The output of the last LSTM that was applied.
    """
    x_has_timesteps = (x.get_shape().ndims == 5)
    if x_has_timesteps:
        sh = tf.shape(x)
        x = flatten_two_dims(x)
    with tf.variable_scope(self.scope+"_features", reuse=reuse):
        x = (tf.to_float(x) - self.ob_mean) / self.ob_std
        x = small_convnet(x, nl=self.nl, feat_dim=self.feat_dim, last_nl=None, layernormalize=self.layernormalize)
    if x_has_timesteps:
        x = unflatten_first_dim(x, sh)
        # `sh` only exists when x had timesteps, so the reshape is kept
        # under the same condition.
        x = tf.reshape(x, [-1, sh[1], self.feat_dim])
    with tf.variable_scope(self.scope, reuse=tf.AUTO_REUSE):
        # The initial LSTM states are read from attributes on self.
        init_1 = tf.contrib.rnn.LSTMStateTuple(self.last_c_in_1, self.last_h_in_1)
        if self.lstm2_size:
            init_2 = tf.contrib.rnn.LSTMStateTuple(self.last_c_in_2, self.last_h_in_2)
        if self.aux_input:
            # The first LSTM additionally receives the previous reward.
            prev_rews = tf.expand_dims(self.ph_last_rew, -1)
            x = tf.concat([x, prev_rews], -1)
        # The returned cell/hidden states are not used in this method.
        x, c_out_1, h_out_1 = lstm(self.lstm1_size)(x, initial_state=init_1)
        if self.lstm2_size:
            if self.aux_input:
                # The second LSTM additionally receives the one-hot previous
                # action and the velocity placeholder.
                prev_acs = tf.one_hot(self.ph_last_ac, depth=self.num_actions)
                x = tf.concat([x, tf.cast(prev_acs, tf.float32)], -1)
                x = tf.concat([x, self.ph_last_vel], -1)
            x, c_out_2, h_out_2 = lstm(self.lstm2_size)(x, initial_state=init_2)
    return x
def __init__(self, ob_space, ac_space, hidsize, feat_dim, layernormalize, nl, scope="policy"):
    """Build the policy and value heads on top of ``self.get_features``.

    Args:
        ob_space: Observation space; its ``shape`` sizes the observation
            placeholder.
        ac_space: Action space handed to ``make_pdtype``.
        hidsize: Number of units in each of the two hidden dense layers.
        feat_dim: Stored as ``self.feat_dim``.
        layernormalize: Stored as ``self.layernormalize``; a warning is
            printed when it is truthy.
        nl: Stored as ``self.nl``.
        scope: Variable-scope name. The heads are built inside a second,
            nested scope of the same name.
    """
    if layernormalize:
        print("Warning: policy is operating on top of layer-normed features. It might slow down the training.")

    # Plain attributes; these must be set before get_features() is called.
    self.layernormalize = layernormalize
    self.nl = nl
    self.ob_space = ob_space
    self.ac_space = ac_space
    self.hidsize = hidsize
    self.feat_dim = feat_dim
    self.scope = scope
    self.pd = self.vpred = None

    with tf.variable_scope(scope):
        self.ac_pdtype = make_pdtype(ac_space)
        self.ph_ob = tf.placeholder(dtype=tf.int32,
                                    shape=(None, None) + ob_space.shape,
                                    name='ob')
        self.ph_ac = self.ac_pdtype.sample_placeholder([None, None], name='ac')
        n_pd_params = self.ac_pdtype.param_shape()[0]

        ob_shape = tf.shape(self.ph_ob)
        flat_ob = flatten_two_dims(self.ph_ob)
        self.flat_features = self.get_features(flat_ob, reuse=False)
        self.features = unflatten_first_dim(self.flat_features, ob_shape)

        with tf.variable_scope(scope, reuse=False):
            hidden = fc(self.flat_features, units=hidsize, activation=activ)
            hidden = fc(hidden, units=hidsize, activation=activ)
            flat_pdparam = fc(hidden, name='pd', units=n_pd_params, activation=None)
            flat_value = fc(hidden, name='value_function_output', units=1, activation=None)

        pdparam = unflatten_first_dim(flat_pdparam, ob_shape)
        # Drop the trailing unit axis of the value head.
        self.vpred = unflatten_first_dim(flat_value, ob_shape)[:, :, 0]
        self.pd = pd = self.ac_pdtype.pdfromflat(pdparam)
        self.a_samp = pd.sample()
        self.entropy = pd.entropy()
        self.nlp_samp = pd.neglogp(self.a_samp)
def decoder(self, z):
    """Decode a latent tensor into a Normal distribution.

    Args:
        z: Latent tensor. When it has rank 3, its two leading dimensions are
            flattened for the network and restored afterwards.

    Returns:
        ``tf.distributions.Normal`` whose ``loc`` is the de-convolution
        output and whose ``scale`` depends on ``self.spherical_obs``.
    """
    activation = tf.nn.leaky_relu
    has_time_axis = z.get_shape().ndims == 3
    if has_time_axis:
        lead_shape = tf.shape(z)
        z = flatten_two_dims(z)
    # In the non-spherical case the output is split in two halves
    # (location and scale), so twice as many channels are requested.
    n_channels = 4 if self.spherical_obs else 8
    with tf.variable_scope(self.scope + "decoder"):
        z = small_deconvnet(z, nl=activation, ch=n_channels, positional_bias=True)
        if has_time_axis:
            z = unflatten_first_dim(z, lead_shape)
        if not self.spherical_obs:
            z, raw_scale = tf.split(z, 2, -1)
            scale = tf.nn.softplus(raw_scale)
        else:
            # One learned scalar, floored at -4 before the softplus, is
            # broadcast to the shape of the location.
            raw_scale = tf.get_variable(name="scale", shape=(), dtype=tf.float32,
                                        initializer=tf.ones_initializer())
            raw_scale = tf.maximum(raw_scale, -4.)
            scale = tf.nn.softplus(raw_scale)
            scale = scale * tf.ones_like(z)
        return tf.distributions.Normal(loc=z, scale=scale)
def __init__(self, ob_space, ac_space, hidsize, ob_mean, ob_std, feat_dim, layernormalize, nl, scope="policy", nlstm=256):
    """Build the policy and value heads on top of ``self.get_features``.

    Args:
        ob_space: Observation space; its ``shape`` sizes the observation
            placeholder.
        ac_space: Action space handed to ``make_pdtype``.
        hidsize: Number of units in each of the two hidden dense layers.
        ob_mean: Stored as ``self.ob_mean``.
        ob_std: Stored as ``self.ob_std``.
        feat_dim: Stored as ``self.feat_dim``.
        layernormalize: Stored as ``self.layernormalize``; a warning is
            printed when it is truthy.
        nl: Stored as ``self.nl``.
        scope: Variable-scope name. The heads are built inside a second,
            nested scope of the same name.
        nlstm: Not used by this constructor.
    """
    if layernormalize:
        print(
            "Warning: policy is operating on top of layer-normed features. It might slow down the training."
        )

    # Plain attributes; these must be set before get_features() is called.
    self.layernormalize = layernormalize
    self.nl = nl
    self.ob_mean = ob_mean
    self.ob_std = ob_std
    self.ob_space = ob_space
    self.ac_space = ac_space
    self.hidsize = hidsize
    self.feat_dim = feat_dim
    self.scope = scope
    self.pd = self.vpred = None

    with tf.variable_scope(scope):
        self.ac_pdtype = make_pdtype(ac_space)
        self.ph_ob = tf.placeholder(dtype=tf.int32,
                                    shape=(None, None) + ob_space.shape,
                                    name='ob')
        self.ph_ac = self.ac_pdtype.sample_placeholder([None, None], name='ac')
        n_pd_params = self.ac_pdtype.param_shape()[0]

        ob_shape = tf.shape(self.ph_ob)
        flat_ob = flatten_two_dims(self.ph_ob)
        self.flat_features = self.get_features(flat_ob, reuse=False)
        self.features = unflatten_first_dim(self.flat_features, ob_shape)

        with tf.variable_scope(scope, reuse=False):
            hidden = fc(self.flat_features, units=hidsize, activation=activ)
            hidden = fc(hidden, units=hidsize, activation=activ)
            flat_pdparam = fc(hidden, name='pd', units=n_pd_params, activation=None)
            flat_value = fc(hidden, name='value_function_output', units=1, activation=None)

        pdparam = unflatten_first_dim(flat_pdparam, ob_shape)
        # Drop the trailing unit axis of the value head.
        self.vpred = unflatten_first_dim(flat_value, ob_shape)[:, :, 0]
        self.pd = pd = self.ac_pdtype.pdfromflat(pdparam)
        self.a_samp = pd.sample()
        self.entropy = pd.entropy()
        self.nlp_samp = pd.neglogp(self.a_samp)
def get_loss_t2(self):
    """Prediction error for the policy's alternate sampled action.

    One-hot encodes ``self.auxiliary_task.policy.a_samp_alt`` and passes it
    to ``self.get_loss``. The result is stored in ``self.next_pred`` and its
    flattened form in ``self.next_pred_flat``.

    Returns:
        The squared difference between ``self.next_pred`` and the
        gradient-stopped ``self.out_features``, averaged over the last axis.
    """
    alt_actions = self.auxiliary_task.policy.a_samp_alt
    onehot_ac = tf.one_hot(alt_actions, self.ac_space.n, axis=2)
    self.next_pred = self.get_loss(onehot_ac)
    self.next_pred_flat = flatten_two_dims(self.next_pred)
    error = self.next_pred - tf.stop_gradient(self.out_features)
    return tf.reduce_mean(error ** 2, -1)
def __init__(self, ob_space, ac_space, hidsize, ob_mean, ob_std, feat_dim, layernormalize, nl, n_env, n_steps, reuse, n_lstm=256, scope="policy"):
    """Create the placeholders and observation features of the policy.

    Unlike the variants with ``(None, None)`` leading dimensions, every
    placeholder here has the static leading shape ``(n_env, n_steps)``.

    Args:
        ob_space: Observation space; its ``shape`` sizes the observation
            placeholder.
        ac_space: Action space handed to ``make_proba_dist_type``.
        hidsize: Stored as ``self.hidsize``.
        ob_mean: Stored as ``self.ob_mean``.
        ob_std: Stored as ``self.ob_std``.
        feat_dim: Stored as ``self.feat_dim``.
        layernormalize: Stored as ``self.layernormalize``; a warning is
            printed when it is truthy.
        nl: Stored as ``self.nl``.
        n_env: First static dimension of the placeholders.
        n_steps: Second static dimension of the placeholders.
        reuse: Stored as ``self.reuse`` and forwarded to ``get_features``.
        n_lstm: Stored as ``self.n_lstm``; the state placeholder is
            ``n_lstm * 2`` wide.
        scope: Variable-scope name.
    """
    if layernormalize:
        print(
            "Warning: policy is operating on top of layer-normed features. It might slow down the training."
        )

    # Plain attributes; these must be set before get_features() is called.
    self.layernormalize = layernormalize
    self.nl = nl
    self.ob_mean = ob_mean
    self.ob_std = ob_std
    self.n_env = n_env
    self.n_steps = n_steps
    self.n_batch = n_env * n_steps
    self.n_lstm = n_lstm
    self.reuse = reuse
    self.ob_space = ob_space
    self.ac_space = ac_space
    self.hidsize = hidsize
    self.feat_dim = feat_dim
    self.scope = scope
    self.pd = self.vpred = None

    with tf.variable_scope(scope):
        self.ac_pdtype = make_proba_dist_type(ac_space)
        lead_dims = (self.n_env, self.n_steps)
        self.ph_ob = tf.placeholder(dtype=tf.int32,
                                    shape=lead_dims + ob_space.shape,
                                    name='ob')
        self.ph_ac = self.ac_pdtype.sample_placeholder(
            [self.n_env, self.n_steps], name='ac')
        # Author's note: mask (done t-1).
        self.masks_ph = tf.placeholder(tf.float32,
                                       [self.n_env, self.n_steps],
                                       name="masks_ph")
        self.flat_masks_ph = tf.reshape(self.masks_ph,
                                        [self.n_env * self.n_steps])
        # Author's note: states.
        self.states_ph = tf.placeholder(tf.float32,
                                        [self.n_env, n_lstm * 2],
                                        name="states_ph")
        self.pdparamsize = self.ac_pdtype.param_shape()[0]

        self.sh = tf.shape(self.ph_ob)
        flat_ob = flatten_two_dims(self.ph_ob)
        self.flat_features = self.get_features(flat_ob, reuse=self.reuse)
        self.features = unflatten_first_dim(self.flat_features, self.sh)
def get_loss(self):
    """Build the inverse-dynamics loss.

    ``self.features`` and ``self.next_features`` are concatenated along
    axis 2 and passed through two dense layers. The output parameterises a
    distribution over actions.

    Returns:
        ``neglogp(self.ac)`` under that distribution.
    """
    with tf.variable_scope(self.scope):
        pair = tf.concat([self.features, self.next_features], 2)
        lead_shape = tf.shape(pair)
        hidden = flatten_two_dims(pair)
        hidden = fc(hidden, units=self.policy.hidsize, activation=activ)
        flat_params = fc(hidden, units=self.ac_space.n, activation=None)
        params = unflatten_first_dim(flat_params, lead_shape)
        action_dist = self.policy.ac_pdtype.pdfromflat(params)
        return action_dist.neglogp(self.ac)
def get_loss(self, reuse=False):
    """Build the inverse-dynamics loss.

    ``self.features`` and ``self.next_features`` are concatenated along
    axis 2 and passed through two dense layers. The output parameterises a
    distribution over actions.

    Args:
        reuse: Passed as ``reuse`` to the ``tf.variable_scope`` that holds
            the network weights.

    Returns:
        ``neglogp(self.ac)`` under that distribution.
    """
    with tf.variable_scope(self.scope, reuse=reuse):
        pair = tf.concat([self.features, self.next_features], 2)
        lead_shape = tf.shape(pair)
        hidden = flatten_two_dims(pair)
        hidden = fc(hidden, units=self.policy.hidsize, activation=activ)
        # The output width comes from get_action_n, not from ac_space.n.
        flat_params = fc(hidden, units=get_action_n(self.ac_space), activation=None)
        params = unflatten_first_dim(flat_params, lead_shape)
        action_dist = self.policy.ac_pdtype.proba_distribution_from_flat(params)
        return action_dist.neglogp(self.ac)
def get_features(self, x, reuse):
    """Embed observations with the conv net of this feature extractor.

    Args:
        x: Observation tensor. A rank-5 input has its two leading
            dimensions flattened for the conv net and restored afterwards.
        reuse: Passed as ``reuse`` to the ``"<scope>_features"`` scope.

    Returns:
        The conv-net features, with the leading dimensions restored when the
        input was rank 5.
    """
    activation = tf.nn.leaky_relu

    def embed(frames):
        with tf.variable_scope(self.scope + "_features", reuse=reuse):
            normalised = (tf.to_float(frames) - self.ob_mean) / self.ob_std
            return small_convnet(normalised, nl=activation,
                                 feat_dim=self.feat_dim, last_nl=activation,
                                 layernormalize=False)

    if x.get_shape().ndims != 5:
        return embed(x)
    lead_shape = tf.shape(x)
    flat_features = embed(flatten_two_dims(x))
    return unflatten_first_dim(flat_features, lead_shape)
def get_features(self, x, reuse):
    """Embed observations with the policy's conv net.

    This variant casts the input to float but does not normalise it.

    Args:
        x: Observation tensor. A rank-5 input has its two leading
            dimensions flattened for the conv net and restored afterwards.
        reuse: Passed as ``reuse`` to the ``"<scope>_features"`` scope.

    Returns:
        The conv-net features, with the leading dimensions restored when the
        input was rank 5.
    """
    def embed(frames):
        with tf.variable_scope(self.scope + "_features", reuse=reuse):
            as_float = tf.to_float(frames)
            return small_convnet(as_float, nl=self.nl, feat_dim=self.feat_dim,
                                 last_nl=None,
                                 layernormalize=self.layernormalize)

    if x.get_shape().ndims != 5:
        return embed(x)
    lead_shape = tf.shape(x)
    flat_features = embed(flatten_two_dims(x))
    return unflatten_first_dim(flat_features, lead_shape)
def get_loss(self):
    """Build the forward-dynamics prediction error and return the input features.

    The action in ``self.ac`` is one-hot encoded and concatenated onto the
    input of every dense layer of a residual MLP that maps ``self.features``
    to a prediction of ``self.out_features``.

    Returns:
        A 2-tuple ``(error, features)``. ``error`` is the squared difference
        between the prediction and the gradient-stopped
        ``self.out_features``, averaged over the last axis. ``features`` is
        the gradient-stopped ``self.features``.
    """
    onehot_ac = tf.one_hot(self.ac, self.ac_space.n, axis=2)
    lead_shape = tf.shape(onehot_ac)
    flat_ac = flatten_two_dims(onehot_ac)

    def with_action(tensor):
        return tf.concat([tensor, flat_ac], axis=-1)

    def residual_block(tensor):
        inner = tf.layers.dense(with_action(tensor), self.hidsize,
                                activation=tf.nn.leaky_relu)
        inner = tf.layers.dense(with_action(inner), self.hidsize,
                                activation=None)
        return tensor + inner

    with tf.variable_scope(self.scope):
        hidden = flatten_two_dims(self.features)
        hidden = tf.layers.dense(with_action(hidden), self.hidsize,
                                 activation=tf.nn.leaky_relu)
        for _ in range(4):
            hidden = residual_block(hidden)
        out_dim = self.out_features.get_shape()[-1].value
        hidden = tf.layers.dense(with_action(hidden), out_dim, activation=None)
        hidden = unflatten_first_dim(hidden, lead_shape)

    # Author's note: the error is used as the reward, one value per
    # (parallel thread, rollout step).
    error = tf.reduce_mean(
        (hidden - tf.stop_gradient(self.out_features)) ** 2, -1)
    frozen_features = tf.stop_gradient(self.features)
    return error, frozen_features
def get_features(self, x, reuse):
    """Embed observations with the policy's conv net.

    Args:
        x: Observation tensor. A rank-5 input has its two leading
            dimensions flattened for the conv net and restored afterwards.
        reuse: Passed as ``reuse`` to the ``"<scope>_features"`` scope.

    Returns:
        The conv-net features, with the leading dimensions restored when the
        input was rank 5.
    """
    # Remember the rank of the *input*. The conv-net output is built from the
    # flattened tensor, so re-testing `ndims == 5` after the network (as the
    # previous version did) never matched and the features stayed flattened.
    has_timesteps = x.get_shape().ndims == 5
    if has_timesteps:
        shape = tf.shape(x)
        x = flatten_two_dims(x)
    with tf.variable_scope(self.scope + '_features', reuse=reuse):
        x = (tf.cast(x, tf.float32) - self.ob_mean) / self.ob_std
        x = small_convnet(x,
                          nl=self.nl,
                          feat_dim=self.feat_dim,
                          last_nl=None,
                          layernormalize=self.layernormalize)
    if has_timesteps:
        x = unflatten_first_dim(x, shape)
    return x
def __init__(self, ob_space, ac_space, hidsize, ob_mean, ob_std, feat_dim, layernormalize, nl, scope='policy'):
    """Build the policy and value heads on top of ``self.get_features``.

    Args:
        ob_space: Observation space; its ``shape`` sizes the observation
            placeholder.
        ac_space: Action space handed to ``make_pdtype``.
        hidsize: Number of units in each of the two hidden dense layers.
        ob_mean: Stored as ``self.ob_mean``.
        ob_std: Stored as ``self.ob_std``.
        feat_dim: Stored as ``self.feat_dim``.
        layernormalize: Stored as ``self.layernormalize``.
        nl: Stored as ``self.nl``.
        scope: Variable-scope name. The heads are built inside a second,
            nested scope of the same name.
    """
    # Plain attributes; these must be set before get_features() is called.
    self.layernormalize = layernormalize
    self.nl = nl
    self.ob_mean = ob_mean
    self.ob_std = ob_std
    self.hidsize = hidsize
    self.feat_dim = feat_dim
    self.ob_space = ob_space
    self.ac_space = ac_space
    self.scope = scope
    self.pd = self.vpred = None

    with tf.variable_scope(scope):
        self.ac_pdtype = make_pdtype(ac_space)
        self.placeholder_observation = tf.placeholder(
            dtype=tf.int32,
            shape=(None, None) + ob_space.shape,
            name='observation')
        self.placeholder_action = self.ac_pdtype.sample_placeholder(
            [None, None], name='action')
        n_pd_params = self.ac_pdtype.param_shape()[0]

        ob_shape = tf.shape(self.placeholder_observation)
        flat_ob = flatten_two_dims(self.placeholder_observation)
        self.flat_features = self.get_features(flat_ob, reuse=False)
        self.features = unflatten_first_dim(self.flat_features, ob_shape)

        with tf.variable_scope(scope, reuse=False):
            hidden = fc(self.flat_features, units=hidsize, activation=activ)
            hidden = fc(hidden, units=hidsize, activation=activ)
            flat_pdparam = fc(hidden, name='pd', units=n_pd_params, activation=None)
            flat_value = fc(hidden, name='value_func_output', units=1, activation=None)

        pdparam = unflatten_first_dim(flat_pdparam, ob_shape)
        # Drop the trailing unit axis of the value head.
        self.vpred = unflatten_first_dim(flat_value, ob_shape)[:, :, 0]
        self.pd = pd = self.ac_pdtype.pdfromflat(pdparam)
        self.a_samp = pd.sample()
        self.entropy = pd.entropy()
        self.nlp_samp = pd.neglogp(self.a_samp)
def get_loss(self):
    """Build the inverse-dynamics loss.

    Author's notes (translated): the inverse model takes
    ``[feature(obs), feature(obs_next)]`` and outputs action parameters. A
    Gaussian or softmax distribution over actions is built from them and its
    negative log-probability is the inverse-dynamics loss. For continuous
    actions this is a Gaussian log loss; for discrete actions it is a
    softmax loss.

    Returns:
        ``neglogp(self.ac)`` under the predicted action distribution.
        Author's note: its shape equals the two leading dimensions,
        (None, None).
    """
    with tf.variable_scope(self.scope):
        # Author's note: features and next_features are each (None, None, 512),
        # so the concatenation is (None, None, 1024).
        pair = tf.concat([self.features, self.next_features], 2)
        lead_shape = tf.shape(pair)
        hidden = flatten_two_dims(pair)
        hidden = fc(hidden, units=self.policy.hidsize, activation=activ)
        # One output per action (logits in the discrete case).
        flat_params = fc(hidden, units=self.ac_space.n, activation=None)
        params = unflatten_first_dim(flat_params, lead_shape)
        action_dist = self.policy.ac_pdtype.pdfromflat(params)
        return action_dist.neglogp(self.ac)
def set_dynamics(self, dynamics):
    """Attach a dynamics model and build the action distribution from its features.

    Sets ``self.dynamics``, ``self.pd``, ``self.a_samp``, ``self.entropy``,
    ``self.nlp_samp`` and ``self.a_samp_alt``.

    Args:
        dynamics: Object whose ``auxiliary_task.get_features`` provides the
            features that the policy head is built on.
    """
    self.dynamics = dynamics
    with tf.variable_scope(self.scope):
        ob_shape = tf.shape(self.ph_ob)
        flat_ob = flatten_two_dims(self.ph_ob)
        shared_features = self.dynamics.auxiliary_task.get_features(
            flat_ob, reuse=tf.AUTO_REUSE)

        flat_params = self.get_pdparam(shared_features, False)
        params = unflatten_first_dim(flat_params, ob_shape)
        self.pd = pd = self.ac_pdtype.pdfromflat(params)
        self.a_samp = pd.sample()
        self.entropy = pd.entropy()
        self.nlp_samp = pd.neglogp(self.a_samp)

        # Alternate action for the forward dynamics, built from the
        # externally fed `extracted_features`.
        flat_params_alt = self.get_pdparam(self.extracted_features, True)
        params_alt = unflatten_first_dim(flat_params_alt, ob_shape)
        self.a_samp_alt = self.ac_pdtype.pdfromflat(params_alt).sample()
def __init__(self, ob_space, ac_space, hidsize, ob_mean, ob_std, feat_dim, layernormalize, nl, scope="policy"):
    """Build the value head; the action distribution is created later.

    ``self.pd``, ``self.a_samp``, ``self.entropy``, ``self.nlp_samp`` and
    ``self.a_samp_alt`` are only initialised to ``None`` here.

    NOTE(review): the original indentation of this method was lost. The
    nesting below (in particular which statements sit inside the inner
    variable scope) is a reconstruction - confirm against the upstream file.

    Args:
        ob_space: Observation space; its ``shape`` sizes the observation
            placeholder.
        ac_space: Action space handed to ``make_pdtype``.
        hidsize: Number of units in the hidden dense layers.
        ob_mean: Stored as ``self.ob_mean``.
        ob_std: Stored as ``self.ob_std``.
        feat_dim: Stored as ``self.feat_dim``.
        layernormalize: Stored as ``self.layernormalize``; a warning is
            printed when it is truthy.
        nl: Stored as ``self.nl``.
        scope: Variable-scope name.
    """
    if layernormalize:
        print("Warning: policy is operating on top of layer-normed features. It might slow down the training.")
    self.layernormalize = layernormalize
    self.nl = nl
    self.ob_mean = ob_mean
    self.ob_std = ob_std
    ''' Defining variables that'll be initialized with dynamics '''
    self.dynamics = None
    self.a_samp = None
    self.entropy = None
    self.nlp_samp = None
    self.a_samp_alt = None
    with tf.variable_scope(scope):
        self.ob_space = ob_space
        self.ac_space = ac_space
        self.ac_pdtype = make_pdtype(ac_space)
        self.ph_ob = tf.placeholder(dtype=tf.int32, shape=(None, None) + ob_space.shape, name='ob')
        self.ph_ac = self.ac_pdtype.sample_placeholder([None, None], name='ac')
        self.pd = self.vpred = None
        self.hidsize = hidsize
        self.feat_dim = feat_dim
        self.scope = scope
        self.pdparamsize = self.ac_pdtype.param_shape()[0]
        sh = tf.shape(self.ph_ob)
        x = flatten_two_dims(self.ph_ob)
        self.flat_features = self.get_features(x, reuse=False)
        self.features = unflatten_first_dim(self.flat_features, sh)
        # Placeholder with the same static shape as the flat features, so
        # features computed elsewhere can be fed in.
        self.extracted_features = tf.placeholder(dtype=tf.float32, shape=self.flat_features.shape)
        with tf.variable_scope(scope, reuse=False):
            x = fc(self.flat_features, units=hidsize, activation=activ)
            x = fc(x, units=hidsize, activation=activ)
            vpred = fc(x, name='value_function_output', units=1, activation=None)
            # NOTE(review): `y` is never read in this method, but these two
            # calls still add layers to the graph. Do not remove them without
            # checking what else depends on the resulting variables.
            y = fc(vpred, units=hidsize, activation=activ)
            y = fc(y, units=hidsize, activation=activ)
        # Drop the trailing unit axis of the value head.
        self.vpred = unflatten_first_dim(vpred, sh)[:, :, 0]
def get_loss(self):
    """Build an action-free prediction error from ``self.features``.

    Two ReLU dense layers followed by a linear layer map ``self.features``
    to a prediction of ``self.out_features``.

    Returns:
        The squared difference between the prediction and the
        gradient-stopped ``self.out_features``, averaged over the last axis.
    """
    lead_shape = tf.shape(self.features)
    with tf.variable_scope(self.scope):
        hidden = flatten_two_dims(self.features)
        for _ in range(2):
            hidden = tf.layers.dense(hidden, self.hidsize, activation=tf.nn.relu)
        out_dim = self.out_features.get_shape()[-1].value
        pred = tf.layers.dense(hidden, out_dim, activation=None)
        pred = unflatten_first_dim(pred, lead_shape)
    target = tf.stop_gradient(self.out_features)
    return tf.reduce_mean((pred - target) ** 2, -1)
def get_loss(self):
    """Build a KL-based loss over stochastic features.

    ``self.features`` and ``self.next_features`` are each split in half along
    the last axis into a location and a raw scale (passed through softplus).
    A sample of the current-feature distribution, concatenated with the
    one-hot action, is mapped by two dense layers to the parameters of a
    predicted distribution.

    Returns:
        ``self.beta * KL(current || standard normal)
        + KL(next || predicted)``.
    """
    with tf.variable_scope(self.scope):
        cur_loc, cur_raw = tf.split(self.features, 2, -1)
        nxt_loc, nxt_raw = tf.split(self.next_features, 2, -1)

        cur_scale = tf.nn.softplus(cur_raw)
        cur_dist = tf.distributions.Normal(loc=cur_loc, scale=cur_scale)
        nxt_scale = tf.nn.softplus(nxt_raw)
        nxt_dist = tf.distributions.Normal(loc=nxt_loc, scale=nxt_scale)

        loc_shape = tf.shape(cur_loc)
        unit_normal = tf.distributions.Normal(loc=tf.zeros(loc_shape),
                                              scale=tf.ones(loc_shape))
        prior_kl = tf.distributions.kl_divergence(cur_dist, unit_normal)

        sampled = cur_dist.sample()
        onehot_ac = tf.one_hot(self.ac, self.ac_space.n, axis=2)
        net_in = tf.concat([sampled, onehot_ac], 2)
        lead_shape = tf.shape(net_in)
        hidden = flatten_two_dims(net_in)
        hidden = fc(hidden, units=self.policy.hidsize, activation=activ)
        hidden = fc(hidden, units=2 * self.feat_dim, activation=None)
        hidden = unflatten_first_dim(hidden, lead_shape)

        pred_loc, pred_raw = tf.split(hidden, 2, -1)
        pred_scale = tf.nn.softplus(pred_raw)
        pred_dist = tf.distributions.Normal(loc=pred_loc, scale=pred_scale)

        weighted_prior_kl = self.beta * prior_kl
        transition_kl = tf.distributions.kl_divergence(nxt_dist, pred_dist)
        return weighted_prior_kl + transition_kl
def __init__(self, ob_space, ac_space, hidsize, ob_mean, ob_std, feat_dim, layernormalize, nl, scope="policy"):
    """Build the policy and value heads, with action clipping.

    The distribution parameters come from a ``tanh`` layer, and sampled
    actions go through ``self.clip_action`` while ``self.bool_actionclip``
    is true.

    Args:
        ob_space: Observation space; its ``shape`` sizes the observation
            placeholder.
        ac_space: Action space handed to ``make_pdtype``. Author's note: a
            continuous-action environment should give a continuous
            distribution type.
        hidsize: Number of units in each of the two hidden dense layers.
        ob_mean: Stored as ``self.ob_mean``.
        ob_std: Stored as ``self.ob_std``.
        feat_dim: Stored as ``self.feat_dim``.
        layernormalize: Stored as ``self.layernormalize``; a warning is
            printed when it is truthy.
        nl: Stored as ``self.nl``.
        scope: Variable-scope name. The heads are built inside a second,
            nested scope of the same name.
    """
    if layernormalize:
        print(
            "Warning: policy is operating on top of layer-normed features. It might slow down the training."
        )

    # Plain attributes; these must be set before get_features() is called.
    self.layernormalize = layernormalize
    # TODO: make action clipping configurable.
    self.bool_actionclip = True
    self.nl = nl
    self.ob_mean = ob_mean
    self.ob_std = ob_std
    self.ob_space = ob_space
    self.ac_space = ac_space
    self.hidsize = hidsize
    self.feat_dim = feat_dim
    self.scope = scope
    self.pd = self.vpred = None

    with tf.variable_scope(scope):
        self.ac_pdtype = make_pdtype(ac_space)
        self.ph_ob = tf.placeholder(dtype=tf.int32,
                                    shape=(None, None) + ob_space.shape,
                                    name='ob')
        self.ph_ac = self.ac_pdtype.sample_placeholder([None, None], name='ac')
        n_pd_params = self.ac_pdtype.param_shape()[0]

        ob_shape = tf.shape(self.ph_ob)
        flat_ob = flatten_two_dims(self.ph_ob)
        self.flat_features = self.get_features(flat_ob, reuse=False)
        self.features = unflatten_first_dim(self.flat_features, ob_shape)

        with tf.variable_scope(scope, reuse=False):
            hidden = fc(self.flat_features, units=hidsize, activation=activ)
            hidden = fc(hidden, units=hidsize, activation=activ)
            flat_pdparam = fc(hidden, name='pd', units=n_pd_params, activation=tf.nn.tanh)
            flat_value = fc(hidden, name='value_function_output', units=1, activation=None)

        pdparam = unflatten_first_dim(flat_pdparam, ob_shape)
        # Drop the trailing unit axis of the value head.
        self.vpred = unflatten_first_dim(flat_value, ob_shape)[:, :, 0]
        self.pd = pd = self.ac_pdtype.pdfromflat(pdparam)

        self.a_samp = pd.sample()
        if self.bool_actionclip:
            self.a_samp = self.clip_action(self.a_samp)
        self.entropy = pd.entropy()
        self.nlp_samp = pd.neglogp(self.a_samp)

        # Expose the distribution's own statistics.
        self.pd_logstd = pd.logstd
        self.pd_std = pd.std
        self.pd_mean = pd.mean
def __init__(self, ob_space, ac_space, hidsize, ob_mean, ob_std, feat_dim, layernormalize, nl, scope="policy"):
    """Build the policy and value heads on top of ``self.get_features``.

    Example values from the original author's notes: ob_space (84, 84, 4);
    ac_space 4; ob_mean of shape (84, 84, 4); ob_std the scalar 1.7;
    hidsize 512; feat_dim 512; layernormalize False; nl tf.nn.leaky_relu.

    Args:
        ob_space: Observation space; its ``shape`` sizes the observation
            placeholder.
        ac_space: Action space handed to ``make_pdtype``.
        hidsize: Number of units in each of the two hidden dense layers.
        ob_mean: Stored as ``self.ob_mean``.
        ob_std: Stored as ``self.ob_std``.
        feat_dim: Stored as ``self.feat_dim``.
        layernormalize: Stored as ``self.layernormalize``; a warning is
            printed when it is truthy.
        nl: Stored as ``self.nl``.
        scope: Variable-scope name. The heads are built inside a second,
            nested scope of the same name.
    """
    if layernormalize:
        print(
            "Warning: policy is operating on top of layer-normed features. It might slow down the training."
        )

    # Plain attributes; these must be set before get_features() is called.
    self.layernormalize = layernormalize
    self.nl = nl
    self.ob_mean = ob_mean
    self.ob_std = ob_std
    self.ob_space = ob_space
    self.ac_space = ac_space
    self.hidsize = hidsize
    self.feat_dim = feat_dim
    self.scope = scope
    self.pd = self.vpred = None

    with tf.variable_scope(scope):
        # Author's note: softmax distribution for discrete action spaces,
        # Gaussian for continuous ones.
        self.ac_pdtype = make_pdtype(ac_space)
        self.ph_ob = tf.placeholder(dtype=tf.int32,
                                    shape=(None, None) + ob_space.shape,
                                    name='ob')
        self.ph_ac = self.ac_pdtype.sample_placeholder([None, None], name='ac')
        # Author's note: 4 in Breakout; for softmax this equals the number
        # of actions.
        n_pd_params = self.ac_pdtype.param_shape()[0]

        # Author's note: ph_ob is (None, None, 84, 84, 4); flattening merges
        # the two leading dimensions.
        ob_shape = tf.shape(self.ph_ob)
        flat_ob = flatten_two_dims(self.ph_ob)
        self.flat_features = self.get_features(flat_ob, reuse=False)
        self.features = unflatten_first_dim(self.flat_features, ob_shape)

        # Policy and value networks. Their input is the extracted features,
        # not the raw observation.
        with tf.variable_scope(scope, reuse=False):
            hidden = fc(self.flat_features, units=hidsize, activation=activ)
            hidden = fc(hidden, units=hidsize, activation=activ)
            # Two heads: distribution parameters and a linear value output.
            flat_pdparam = fc(hidden, name='pd', units=n_pd_params, activation=None)
            flat_value = fc(hidden, name='value_function_output', units=1, activation=None)

        pdparam = unflatten_first_dim(flat_pdparam, ob_shape)
        # The last axis of the value head has size 1 and is dropped.
        self.vpred = unflatten_first_dim(flat_value, ob_shape)[:, :, 0]
        # Author's note: the distribution offers mean, neglogp, kl, entropy,
        # sample, etc.
        self.pd = pd = self.ac_pdtype.pdfromflat(pdparam)
        self.a_samp = pd.sample()
        self.entropy = pd.entropy()
        # -log pi(a|s) of the sampled action.
        self.nlp_samp = pd.neglogp(self.a_samp)
def get_loss_t1(self):
    """Prediction error for the action stored in ``self.ac``.

    One-hot encodes ``self.ac`` and passes it to ``self.get_loss``. The
    result is stored in ``self.first_pred`` and its flattened form in
    ``self.first_pred_flat``.

    Returns:
        The squared difference between ``self.first_pred`` and the
        gradient-stopped ``self.out_features``, averaged over the last axis.
    """
    onehot_ac = tf.one_hot(self.ac, self.ac_space.n, axis=2)
    self.first_pred = self.get_loss(onehot_ac)
    self.first_pred_flat = flatten_two_dims(self.first_pred)
    error = self.first_pred - tf.stop_gradient(self.out_features)
    return tf.reduce_mean(error ** 2, -1)
def __init__(self, ob_space, ac_space, hidsize, ob_mean, ob_std, feat_dim, layernormalize, nl, scope="policy"):
    """Build the two-headed policy/value network on top of ``self.get_features``.

    Args:
        ob_space: Observation space; its ``shape`` sizes the observation
            placeholder.
        ac_space: Action space handed to ``make_pdtype``.
        hidsize: Number of units in every fully connected hidden layer.
        ob_mean: Stored as ``self.ob_mean``.
        ob_std: Stored as ``self.ob_std``.
        feat_dim: Feature dimension, stored as ``self.feat_dim``.
        layernormalize: Stored as ``self.layernormalize``; a warning is
            printed when it is truthy.
        nl: Non-linearity, stored as ``self.nl``.
        scope: Variable-scope name. The heads are built inside a second,
            nested scope of the same name.
    """
    if layernormalize:
        print(
            "Warning: policy is operating on top of layer-normed features. It might slow down the training."
        )

    # Plain attributes; these must be set before get_features() is called.
    self.layernormalize = layernormalize
    self.nl = nl
    self.ob_mean = ob_mean
    self.ob_std = ob_std
    self.ob_space = ob_space
    self.ac_space = ac_space
    self.hidsize = hidsize
    self.feat_dim = feat_dim
    self.scope = scope
    self.pd = self.vpred = None

    with tf.variable_scope(scope):
        # The distribution type is derived from the action space only.
        self.ac_pdtype = make_pdtype(ac_space)
        self.ph_ob = tf.placeholder(dtype=tf.int32,
                                    shape=(None, None) + ob_space.shape,
                                    name='ob')
        self.ph_ac = self.ac_pdtype.sample_placeholder([None, None], name='ac')
        # Number of parameters the action distribution needs.
        n_pd_params = self.ac_pdtype.param_shape()[0]

        # Author's note: observations are [None, None, h, w, c] and are
        # flattened to [None, h, w, c] for the feature extractor.
        ob_shape = tf.shape(self.ph_ob)
        flat_ob = flatten_two_dims(self.ph_ob)
        # Author's note: [None, feat_dim], restored to [None, None, feat_dim].
        self.flat_features = self.get_features(flat_ob, reuse=False)
        self.features = unflatten_first_dim(self.flat_features, ob_shape)

        # Two heads: `pd` gives the distribution parameters,
        # `value_function_output` the value estimate.
        with tf.variable_scope(scope, reuse=False):
            hidden = fc(self.flat_features, units=hidsize, activation=activ)
            hidden = fc(hidden, units=hidsize, activation=activ)
            flat_pdparam = fc(hidden, name='pd', units=n_pd_params, activation=None)
            flat_value = fc(hidden, name='value_function_output', units=1, activation=None)

        pdparam = unflatten_first_dim(flat_pdparam, ob_shape)
        # Drop the trailing unit axis of the value head.
        self.vpred = unflatten_first_dim(flat_value, ob_shape)[:, :, 0]
        self.pd = pd = self.ac_pdtype.pdfromflat(pdparam)
        self.a_samp = pd.sample()
        self.entropy = pd.entropy()
        self.nlp_samp = pd.neglogp(self.a_samp)