import numpy as np
import tensorflow as tf


def test_build_mlp():
    import utils
    input_layer = tf.placeholder(tf.float32, (None, 10))
    new_layer = utils.build_mlp(input_layer, 5, "test", n_layers=2,
                                hidden_dim=500, activation=tf.nn.relu,
                                output_activation=None, reuse=False)
    print(new_layer)
    print('---------------------')
    reuse_layer = utils.build_mlp(input_layer, 5, "test", n_layers=2,
                                  hidden_dim=500, activation=tf.nn.relu,
                                  output_activation=None, reuse=True)
    print(reuse_layer)
    # With reuse=True the second call shares the first call's variables,
    # but the two output tensors are distinct objects, so this prints False.
    print(new_layer == reuse_layer)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        org, reuse = sess.run(
            [new_layer, reuse_layer],
            feed_dict={input_layer: np.atleast_2d(np.arange(0, 10))})
        # Shared weights: both runs should produce identical outputs.
        print('org: ', org)
        print('reuse: ', reuse)

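# The test above assumes a utils.build_mlp with the signature
# build_mlp(input_layer, output_dim, scope, n_layers=..., hidden_dim=...,
#           activation=..., output_activation=..., reuse=...). A minimal
# TF1-style sketch consistent with that call; this is an assumption, not
# the course-provided implementation:
def build_mlp_sketch(input_layer, output_dim, scope, n_layers=1,
                     hidden_dim=500, activation=tf.nn.relu,
                     output_activation=None, reuse=False):
    layer = input_layer
    with tf.variable_scope(scope, reuse=reuse):
        # n_layers hidden layers, followed by an output head
        for _ in range(n_layers):
            layer = tf.layers.dense(layer, hidden_dim, activation=activation)
        layer = tf.layers.dense(layer, output_dim,
                                activation=output_activation)
    return layer
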
def _dynamics_func(self, state, action, reuse):
    """
    Takes as input a state and action, and predicts the next state

    returns:
        next_state_pred: predicted next state

    implementation details (in order):
        (a) Normalize both the state and action by using the statistics of
            self._init_dataset and the utils.normalize function
        (b) Concatenate the normalized state and action
        (c) Pass the concatenated, normalized state-action tensor through a
            neural network with self._nn_layers number of layers using the
            function utils.build_mlp. The resulting output is the normalized
            predicted difference between the next state and the current state
        (d) Unnormalize the delta state prediction, and add it to the current
            state in order to produce the predicted next state
    """
    ### PROBLEM 1
    ### YOUR CODE HERE
    # REMEMBER: assign the normalized tensors to new variables such as
    # 'normalized_state' instead of overwriting 'state'; the original,
    # unnormalized state is still needed when adding the delta in step (d).
    normalized_state = utils.normalize(state,
                                       self._init_dataset.state_mean,
                                       self._init_dataset.state_std)
    normalized_action = utils.normalize(action,
                                        self._init_dataset.action_mean,
                                        self._init_dataset.action_std)
    normalized_state_dif = utils.build_mlp(
        tf.concat([normalized_state, normalized_action], axis=1),
        self._state_dim,
        scope="dynamics_func",
        n_layers=self._nn_layers,
        reuse=reuse)
    next_state_pred = state + utils.unnormalize(
        normalized_state_dif,
        self._init_dataset.delta_state_mean,
        self._init_dataset.delta_state_std)
    return next_state_pred

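# The implementations in this section all rely on utils.normalize and
# utils.unnormalize. A minimal sketch of what they are assumed to compute:
# z-score normalization with an epsilon guard against zero std, matching
# the eps=1e-8 keyword seen in one caller below. The real utils module may
# differ in details such as where the epsilon is applied.
def normalize_sketch(x, mean, std, eps=1e-8):
    return (x - mean) / (std + eps)


def unnormalize_sketch(x, mean, std, eps=1e-8):
    return x * (std + eps) + mean
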
def _dynamics_func(self, state, action, reuse):
    """
    Takes as input a state and action, and predicts the next state

    returns:
        next_state_pred: predicted next state

    implementation details (in order):
        (a) Normalize both the state and action by using the statistics of
            self._init_dataset and the utils.normalize function
        (b) Concatenate the normalized state and action
        (c) Pass the concatenated, normalized state-action tensor through a
            neural network with self._nn_layers number of layers using the
            function utils.build_mlp. The resulting output is the normalized
            predicted difference between the next state and the current state
        (d) Unnormalize the delta state prediction, and add it to the current
            state in order to produce the predicted next state
    """
    ### PROBLEM 1
    ### YOUR CODE HERE
    state_norm = utils.normalize(state, self._init_dataset.state_mean,
                                 self._init_dataset.state_std)
    action_norm = utils.normalize(action, self._init_dataset.action_mean,
                                  self._init_dataset.action_std)
    x = tf.concat([state_norm, action_norm], axis=1)
    x = utils.build_mlp(x, self._state_dim, 'dynamic_net', self._nn_layers,
                        reuse=reuse)
    # Per step (d), unnormalize with the *delta*-state statistics; using
    # the state statistics here would be a bug.
    next_state_pred = state + utils.unnormalize(
        x, self._init_dataset.delta_state_mean,
        self._init_dataset.delta_state_std)
    return next_state_pred

def _dynamics_func(self, state, action, reuse=True):
    """
    Takes as input a state and action, and predicts the next state

    returns:
        next_state_pred: predicted next state

    implementation details (in order):
        (a) Normalize both the state and action by using the statistics of
            self._init_dataset and the utils.normalize function
        (b) Concatenate the normalized state and action
        (c) Pass the concatenated, normalized state-action tensor through a
            neural network with self._nn_layers number of layers using the
            function utils.build_mlp. The resulting output is the normalized
            predicted difference between the next state and the current state
        (d) Unnormalize the delta state prediction, and add it to the current
            state in order to produce the predicted next state
    """
    print("ModelBasedPolicy - before _dynamics_func")

    ### PROBLEM 1

    ##############
    ### part a
    ### Normalize state and action using the statistics of self._init_dataset
    ##############
    s_mean = self._init_dataset.state_mean
    a_mean = self._init_dataset.action_mean
    s_std = self._init_dataset.state_std
    a_std = self._init_dataset.action_std
    state_d_m = self._init_dataset.delta_state_mean
    state_d_s = self._init_dataset.delta_state_std
    normalize_state = utils.normalize(state, s_mean, s_std)
    normalize_action = utils.normalize(action, a_mean, a_std)

    ##############
    ### part b
    ### Concatenate state and action
    ##############
    s_a = tf.concat([normalize_state, normalize_action], axis=1)

    ##############
    ### part c
    ### Generate the NN; its output is the normalized *delta* between the
    ### next state and the current state
    ##############
    normalized_delta_state = utils.build_mlp(s_a, self._state_dim,
                                             "nn_prediction",
                                             n_layers=self._nn_layers,
                                             reuse=reuse)

    ##############
    ### part d
    ### Unnormalize the delta and add it to the current state
    ##############
    delta_state = utils.unnormalize(normalized_delta_state, state_d_m,
                                    state_d_s)
    next_state_pred = state + delta_state
    print("ModelBasedPolicy - after _dynamics_func")
    return next_state_pred

def dynamics_func(self, state, action, reuse):
    """
    Takes a state and action, returns the next state

    returns:
        prediction of next state
    """
    state_norm = utils.normalize(state, self.init_dataset.state_mean,
                                 self.init_dataset.state_std)
    action_norm = utils.normalize(action, self.init_dataset.action_mean,
                                  self.init_dataset.action_std)

    # network input is the concatenated state and action
    s_a = tf.concat([state_norm, action_norm], axis=1)
    d_next_state_norm = utils.build_mlp(s_a, self.state_dim, 'prediction',
                                        self.nn_layers, reuse=reuse)
    d_next_state = utils.unnormalize(d_next_state_norm,
                                     self.init_dataset.delta_state_mean,
                                     self.init_dataset.delta_state_std)
    next_state_pred = d_next_state + state
    return next_state_pred

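# Usage note (a hedged sketch; the placeholder names are illustrative, not
# from the original code): the reuse flag exists so the same dynamics
# network can be instantiated twice in the graph with shared weights, e.g.
# once on the training placeholders and once inside the action-selection
# rollout:
#
#     pred_train = self._dynamics_func(state_ph, action_ph, reuse=False)
#     pred_plan = self._dynamics_func(candidate_states, candidate_actions,
#                                     reuse=True)
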
def add_prediction_op(self):
    # Build one dynamics head per action type; all heads share the
    # hyperparameters in self.mlp_params but live in separate scopes.
    pred = [
        build_mlp(input_placeholder=self.train_inputs[action_id],
                  output_size=2,
                  scope='nndym-%s-action-%d' % (self.name, action_id),
                  **self.mlp_params)
        for action_id in range(self.env_conf['action_type_size'])
    ]
    return pred

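# A hedged sketch of the configuration add_prediction_op assumes; the real
# self.mlp_params and self.env_conf are not shown, so these names and
# values are illustrative only:
#
#     mlp_params = dict(n_layers=2, size=64, activation=tf.nn.relu)
#     env_conf = {'action_type_size': 4}  # builds four per-action MLP heads
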
def _dynamics_func(self, state, action, reuse):
    """
    Takes as input a state and action, and predicts the next state

    returns:
        next_state_pred: predicted next state

    implementation details (in order):
        (a) Normalize both the state and action by using the statistics of
            self._init_dataset and the utils.normalize function
        (b) Concatenate the normalized state and action
        (c) Pass the concatenated, normalized state-action tensor through a
            neural network with self._nn_layers number of layers using the
            function utils.build_mlp. The resulting output is the normalized
            predicted difference between the next state and the current state
        (d) Unnormalize the delta state prediction, and add it to the current
            state in order to produce the predicted next state
    """
    ### PROBLEM 1
    ### YOUR CODE HERE
    # This function operates on a batch: state has shape
    # [None, self._state_dim].
    state_mean, state_std = (self._init_dataset.state_mean,
                             self._init_dataset.state_std)
    action_mean, action_std = (self._init_dataset.action_mean,
                               self._init_dataset.action_std)
    normalized_state = utils.normalize(state, mean=state_mean, std=state_std)
    normalized_action = utils.normalize(action, mean=action_mean,
                                        std=action_std)
    input_nn = tf.concat(values=[normalized_state, normalized_action], axis=1)
    normalized_delta_state_pred = utils.build_mlp(
        input_layer=input_nn,
        output_dim=self._state_dim,
        scope="dynamics_model",
        n_layers=self._nn_layers,
        reuse=reuse)
    delta_state_pred = utils.unnormalize(
        normalized_delta_state_pred,
        self._init_dataset.delta_state_mean,
        self._init_dataset.delta_state_std)
    next_state_pred = tf.add(state, delta_state_pred)
    return next_state_pred

def _dynamics_func(self, state, action, reuse):
    """
    Takes as input a state and action, and predicts the next state

    returns:
        next_state_pred: predicted next state
    """
    ### PROBLEM 1
    ### YOUR CODE HERE
    ## (a) Normalize both the state and action by using the statistics of
    ##     self._init_dataset and the utils.normalize function.
    ##     Observed shapes: state (?, 20), action (?, 6).
    state_mean = self._init_dataset.state_mean
    state_std = self._init_dataset.state_std
    state_normalize = utils.normalize(state, state_mean, state_std, eps=1e-8)

    action_mean = self._init_dataset.action_mean
    action_std = self._init_dataset.action_std
    action_normalize = utils.normalize(action, action_mean, action_std,
                                       eps=1e-8)

    ## (b) Concatenate the normalized state and action
    concatenated = tf.concat([state_normalize, action_normalize], axis=1,
                             name='concatenated')

    ## (c) Pass the concatenated, normalized state-action tensor through a
    ##     neural network with self._nn_layers layers using utils.build_mlp.
    ##     The output is the normalized predicted difference between the
    ##     next state and the current state.
    normalized_delta_state = utils.build_mlp(input_layer=concatenated,
                                             output_dim=self._state_dim,
                                             scope='dynamics_func',
                                             n_layers=self._nn_layers,
                                             reuse=reuse)

    ## (d) Unnormalize the delta state prediction, and add it to the current
    ##     state to produce the predicted next state.
    delta_state_mean = self._init_dataset.delta_state_mean
    delta_state_std = self._init_dataset.delta_state_std
    next_state_pred = state + utils.unnormalize(normalized_delta_state,
                                                delta_state_mean,
                                                delta_state_std)
    return next_state_pred

def _dynamics_func(self, state, action, reuse):
    """
    Takes as input a state and action, and predicts the next state

    returns:
        next_state_pred: predicted next state

    implementation details (in order):
        (a) Normalize both the state and action by using the statistics of
            self._init_dataset and the utils.normalize function
        (b) Concatenate the normalized state and action
        (c) Pass the concatenated, normalized state-action tensor through a
            neural network with self._nn_layers number of layers using the
            function utils.build_mlp. The resulting output is the normalized
            predicted difference between the next state and the current state
        (d) Unnormalize the delta state prediction, and add it to the current
            state in order to produce the predicted next state
    """
    ### PROBLEM 1
    ### YOUR CODE HERE
    norm_state = utils.normalize(state, self._init_dataset.state_mean,
                                 self._init_dataset.state_std)
    norm_action = utils.normalize(action, self._init_dataset.action_mean,
                                  self._init_dataset.action_std)
    norm_all = tf.concat((norm_state, norm_action), axis=1)
    dy_func = utils.build_mlp(norm_all, self._state_dim, scope="DYN_FUNC",
                              n_layers=self._nn_layers, reuse=reuse)
    dy_out = utils.unnormalize(dy_func, self._init_dataset.delta_state_mean,
                               self._init_dataset.delta_state_std)
    next_state_pred = dy_out + state
    return next_state_pred

def _dynamics_func(self, state, action, reuse):
    """
    Takes as input a state and action, and predicts the next state

    returns:
        next_state_pred: predicted next state

    implementation details (in order):
        (a) Normalize both the state and action by using the statistics of
            self._init_dataset and the utils.normalize function
        (b) Concatenate the normalized state and action
        (c) Pass the concatenated, normalized state-action tensor through a
            neural network with self._nn_layers number of layers using the
            function utils.build_mlp. The resulting output is the normalized
            predicted difference between the next state and the current state
        (d) Unnormalize the delta state prediction, and add it to the current
            state in order to produce the predicted next state
    """
    normalized_state = utils.normalize(state,
                                       mean=self._init_dataset.state_mean,
                                       std=self._init_dataset.state_std)
    normalized_action = utils.normalize(action,
                                        mean=self._init_dataset.action_mean,
                                        std=self._init_dataset.action_std)
    normalized_state_action = tf.concat(
        [normalized_state, normalized_action], axis=1)
    normalized_state_delta = utils.build_mlp(
        input_layer=normalized_state_action,
        output_dim=self._state_dim,
        scope='dynamics_func',
        n_layers=self._nn_layers,
        reuse=reuse)
    state_delta = utils.unnormalize(
        normalized_state_delta,
        mean=self._init_dataset.delta_state_mean,
        std=self._init_dataset.delta_state_std)
    next_state_pred = state + state_delta
    return next_state_pred

def _dynamics_func(self, state, action, reuse):
    """
    Takes as input a state and action, and predicts the next state

    returns:
        next_state_pred: predicted next state

    implementation details (in order):
        (a) Normalize both the state and action by using the statistics of
            self._init_dataset and the utils.normalize function
        (b) Concatenate the normalized state and action
        (c) Pass the concatenated, normalized state-action tensor through a
            neural network with self._nn_layers number of layers using the
            function utils.build_mlp. The resulting output is the normalized
            predicted difference between the next state and the current state
        (d) Unnormalize the delta state prediction, and add it to the current
            state in order to produce the predicted next state
    """
    ### PROBLEM 1
    ### YOUR CODE HERE
    # cast inputs to float32 before normalizing
    state = tf.dtypes.cast(state, dtype=tf.float32)
    action = tf.dtypes.cast(action, dtype=tf.float32)

    mean_state = self._init_dataset.state_mean
    std_state = self._init_dataset.state_std
    state_norm = utils.normalize(state, mean_state, std_state)

    mean_action = self._init_dataset.action_mean
    std_action = self._init_dataset.action_std
    action_norm = utils.normalize(action, mean_action, std_action)

    state_action = tf.concat([state_norm, action_norm], axis=1)
    delta_state_prediction_norm = utils.build_mlp(state_action,
                                                  self._state_dim,
                                                  'state_prediction',
                                                  n_layers=self._nn_layers,
                                                  reuse=reuse)

    delta_mean_state = self._init_dataset.delta_state_mean
    delta_std_state = self._init_dataset.delta_state_std
    delta_state_prediction = utils.unnormalize(delta_state_prediction_norm,
                                               delta_mean_state,
                                               delta_std_state)
    next_state_pred = state + delta_state_prediction
    return next_state_pred

def _dynamics_func(self, state, action, reuse):
    """
    Takes as input a state and action, and predicts the next state

    returns:
        next_state_pred: predicted next state

    implementation details (in order):
        (a) Normalize both the state and action by using the statistics of
            self._init_dataset and the utils.normalize function
        (b) Concatenate the normalized state and action
        (c) Pass the concatenated, normalized state-action tensor through a
            neural network with self._nn_layers number of layers using the
            function utils.build_mlp. The resulting output is the normalized
            predicted difference between the next state and the current state
        (d) Unnormalize the delta state prediction, and add it to the current
            state in order to produce the predicted next state
    """
    ### PROBLEM 1
    ### YOUR CODE HERE
    # Normalize state and action
    n_state = utils.normalize(state, self._init_dataset.state_mean,
                              self._init_dataset.state_std)
    n_action = utils.normalize(action, self._init_dataset.action_mean,
                               self._init_dataset.action_std)

    # Predict the next state from the current state and action
    n_sa = tf.concat((n_state, n_action), axis=1)
    n_nsp = utils.build_mlp(n_sa, output_dim=self._state_dim,
                            scope="f_transition", n_layers=self._nn_layers,
                            reuse=reuse)
    next_state_pred = state + utils.unnormalize(
        n_nsp, self._init_dataset.delta_state_mean,
        self._init_dataset.delta_state_std)
    return next_state_pred

def _dynamics_func(self, state, action, reuse=False):
    """
    Takes as input a state and action, and predicts the next state

    returns:
        next_state_pred: predicted next state

    implementation details (in order):
        (a) Normalize both the state and action by using the statistics of
            self._init_dataset and the utils.normalize function
        (b) Concatenate the normalized state and action
        (c) Pass the concatenated, normalized state-action tensor through a
            neural network with self._nn_layers number of layers using the
            function utils.build_mlp. The resulting output is the normalized
            predicted difference between the next state and the current state
        (d) Unnormalize the delta state prediction, and add it to the current
            state in order to produce the predicted next state
    """
    ### PROBLEM 1
    ### YOUR CODE HERE
    norm_state = utils.normalize(state, self._init_dataset.state_mean,
                                 self._init_dataset.state_std)
    norm_action = utils.normalize(action, self._init_dataset.action_mean,
                                  self._init_dataset.action_std)
    norm_state_action = tf.concat([norm_state, norm_action], 1)
    norm_delta_state_pred = utils.build_mlp(norm_state_action,
                                            self._state_dim, "dynamic",
                                            n_layers=self._nn_layers,
                                            reuse=reuse)
    delta_state_pred = utils.unnormalize(
        norm_delta_state_pred,
        self._init_dataset.delta_state_mean,
        self._init_dataset.delta_state_std)
    next_state_pred = state + delta_state_pred
    return next_state_pred

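# The functions above only build the forward model. A hedged sketch of how
# such a dynamics model is commonly trained (hypothetical names; the
# original training setup is not shown here): minimize the mean squared
# error between the predicted and observed next states.
def dynamics_loss_sketch(next_state_ph, next_state_pred, learning_rate=1e-3):
    # MSE over the batch; regressing in normalized-delta space is an
    # equivalent, often better-conditioned alternative.
    loss = tf.reduce_mean(tf.square(next_state_pred - next_state_ph))
    train_op = tf.train.AdamOptimizer(learning_rate).minimize(loss)
    return loss, train_op
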
def _dynamics_func(self, state, action, reuse): """ Takes as input a state and action, and predicts the next state returns: next_state_pred: predicted next state implementation details (in order): (a) Normalize both the state and action by using the statistics of self._init_dataset and the utils.normalize function (b) Concatenate the normalized state and action (c) Pass the concatenated, normalized state-action tensor through a neural network with self._nn_layers number of layers using the function utils.build_mlp. The resulting output is the normalized predicted difference between the next state and the current state (d) Unnormalize the delta state prediction, and add it to the current state in order to produce the predicted next state """ ### PROBLEM 1 ### YOUR CODE HERE state_ = utils.normalize(x=state, mean=self._init_dataset.state_mean, std=self._init_dataset.state_std) action_ = utils.normalize(x=action, mean=self._init_dataset.action_mean, std=self._init_dataset.action_std) input_ = tf.concat(values=[state_, action_], axis=-1) residual = utils.build_mlp(input_layer=input_, output_dim=self._state_dim, scope="dynamics", n_layers=self._nn_layers, reuse=reuse) residual = utils.unnormalize(x=residual, mean=self._init_dataset.delta_state_mean, std=self._init_dataset.delta_state_std) next_state_pred = state + residual return next_state_pred
def _dynamics_func(self, state, action, reuse):
    # get dataset statistics
    state_std = self._init_dataset.state_std
    state_mean = self._init_dataset.state_mean
    action_std = self._init_dataset.action_std
    action_mean = self._init_dataset.action_mean
    delta_state_std = self._init_dataset.delta_state_std
    delta_state_mean = self._init_dataset.delta_state_mean

    # normalize input data
    state_norm = utils.normalize(state, state_mean, state_std)
    action_norm = utils.normalize(action, action_mean, action_std)

    # predict the normalized state delta
    inp = tf.concat([state_norm, action_norm], 1)
    out = utils.build_mlp(inp, self._state_dim, 'policy',
                          n_layers=self._nn_layers, reuse=reuse)

    # predict next state
    next_state_pred = state + utils.unnormalize(out, delta_state_mean,
                                                delta_state_std)
    return next_state_pred

def __init__(self, input_states, taken_actions, action_space, scope_name,
             initial_std=0.4, initial_mean_factor=0.1,
             pi_hidden_sizes=(500, 300), vf_hidden_sizes=(500, 300)):
    """
    input_states [batch_size, width, height, depth]: Input images to predict actions for
    taken_actions [batch_size, num_actions]: Placeholder of taken actions for training
    action_space (gym.spaces.Box): Continuous action space of our agent
    scope_name (string): Variable scope name for the policy graph
    initial_std (float): Initial value of the std used in the gaussian policy
    initial_mean_factor (float): Variance scaling factor for the action mean prediction layer
    pi_hidden_sizes (tuple): Layer sizes used to construct the action-predicting MLP
    vf_hidden_sizes (tuple): Layer sizes used to construct the value-predicting MLP
    """
    num_actions = action_space.shape[0]
    action_min, action_max = action_space.low, action_space.high

    with tf.variable_scope(scope_name):
        # Policy branch π(a_t | s_t; θ)
        self.pi = build_mlp(input_states,
                            hidden_sizes=pi_hidden_sizes,
                            activation=tf.nn.relu,
                            output_activation=tf.nn.relu)
        self.action_mean = tf.layers.dense(
            self.pi, num_actions, activation=tf.nn.tanh,
            kernel_initializer=tf.initializers.variance_scaling(
                scale=initial_mean_factor),
            name="action_mean")
        # Rescale the tanh output from [-1, 1] to [action_min, action_max]
        self.action_mean = action_min + (
            (self.action_mean + 1) / 2) * (action_max - action_min)
        self.action_logstd = tf.Variable(
            np.full((num_actions,), np.log(initial_std), dtype=np.float32),
            name="action_logstd")

        # Value branch V(s_t; θ)
        if vf_hidden_sizes is None:
            self.vf = self.pi  # Share features if None
        else:
            self.vf = build_mlp(input_states,
                                hidden_sizes=vf_hidden_sizes,
                                activation=tf.nn.relu,
                                output_activation=tf.nn.relu)
        self.value = tf.squeeze(
            tf.layers.dense(self.vf, 1, activation=None, name="value"),
            axis=-1)

        # Create graph for sampling actions
        self.action_normal = tfp.distributions.Normal(
            self.action_mean, tf.exp(self.action_logstd),
            validate_args=True)
        self.sampled_action = tf.squeeze(self.action_normal.sample(1),
                                         axis=0)

        # Clip sampled actions to [action_min, action_max]
        self.sampled_action = tf.clip_by_value(self.sampled_action,
                                               action_min, action_max)

        # Log probability of the taken actions: log π(a_t | s_t; θ)
        self.action_log_prob = tf.reduce_sum(
            self.action_normal.log_prob(taken_actions),
            axis=-1, keepdims=True)

def build_policy_network(self):
    sy_ob_no = tf.placeholder(shape=[None, self.ob_dim], name="ob",
                              dtype=tf.float32)
    if self.discrete:
        sy_ac_na = tf.placeholder(shape=[None], name="ac", dtype=tf.int32)
    else:
        sy_ac_na = tf.placeholder(shape=[None, self.ac_dim], name="ac",
                                  dtype=tf.float32)

    # Define a placeholder for advantages
    sy_adv_n = tf.placeholder(shape=[None], name="adv", dtype=tf.float32)

    if self.discrete:
        # YOUR_CODE_HERE
        sy_logits_na = build_mlp(sy_ob_no, self.ac_dim, 'policy_network',
                                 n_layers=self.n_layers, size=self.size)
        sy_prob_na = tf.nn.softmax(sy_logits_na)
        # Sample an action from the categorical distribution
        sy_sampled_ac = tf.multinomial(sy_logits_na, 1)
        sy_sampled_ac = tf.reshape(sy_sampled_ac, [-1])
        # Select the probability of the taken action via a one-hot mask
        # (reduce_sum, not reduce_mean: the mask has one nonzero entry)
        sy_ac_onehot_na = tf.one_hot(sy_ac_na, depth=self.ac_dim)
        sy_ac_responseprob_na = tf.reduce_sum(
            tf.multiply(sy_ac_onehot_na, sy_prob_na), axis=1)
        sy_logprob_n = tf.log(sy_ac_responseprob_na)
    else:
        # YOUR_CODE_HERE
        # Parameterize the gaussian policy by mean and std
        self.mpc_a = tf.placeholder(shape=[None, self.ac_dim], name="mpc_a",
                                    dtype=tf.float32)
        net = sy_ob_no
        net = tf.layers.dense(net, 64, activation=tf.tanh)
        net = tf.layers.dense(net, 64, activation=tf.tanh)
        net = tf.concat([net, self.mpc_a], axis=1)
        sy_mean_na = tf.layers.dense(net, self.ac_dim, activation=None)
        # Alternative (not used): gate the MPC action with a learned gate:
        # gate = tf.Variable(tf.ones([1, self.ac_dim]))
        # sy_mean_na = gate * self.mpc_a + (1 - gate) * sy_mean_na
        # logstd should just be a trainable variable, not a network output
        self.sy_logstd = tf.Variable(tf.zeros([1, self.ac_dim]),
                                     name='action/logstd', dtype=tf.float32)
        self.sy_std = tf.exp(self.sy_logstd)

        # Sample an action: a = mean + std * eps, with eps ~ N(0, I)
        sy_sampled_ac = sy_mean_na + self.sy_std * tf.random_normal(
            tf.shape(sy_mean_na))

        # Log likelihood of the taken action under the diagonal gaussian
        sy_z = (sy_ac_na - sy_mean_na) / self.sy_std
        sy_logprob_n = (-0.5 * tf.square(sy_z)
                        - 0.5 * tf.log(tf.constant(2 * np.pi))
                        - self.sy_logstd)
        sy_logprob_n = tf.reduce_sum(sy_logprob_n, axis=1)

    return sy_ob_no, sy_ac_na, sy_adv_n, sy_sampled_ac, sy_logprob_n

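# A standalone numerical check of the diagonal-gaussian log-likelihood used
# above: log N(a; mu, sigma) = -0.5 * z**2 - 0.5 * log(2*pi) - log(sigma),
# with z = (a - mu) / sigma, summed over action dimensions. Verified here
# against scipy.stats (scipy is assumed to be available):
import numpy as np
from scipy.stats import norm

mu = np.array([0.5, -1.0])
sigma = np.array([0.3, 2.0])
a = np.array([0.2, 0.1])
z = (a - mu) / sigma
manual = np.sum(-0.5 * z ** 2 - 0.5 * np.log(2 * np.pi) - np.log(sigma))
reference = np.sum(norm.logpdf(a, loc=mu, scale=sigma))
assert np.isclose(manual, reference)
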
def __init__(self, input_states, taken_actions, num_actions, action_min,
             action_max, scope_name, initial_std=0.4,
             initial_mean_factor=0.1, pi_hidden_sizes=(500, 300),
             vf_hidden_sizes=(500, 300)):
    """
    input_states [batch_size, width, height, depth]: Input images to predict actions for
    taken_actions [batch_size, num_actions]: Actions taken by the old policy (used for training)
    num_actions (int): Number of continuous actions to output
    action_min [num_actions]: Minimum possible value for the respective action
    action_max [num_actions]: Maximum possible value for the respective action
    scope_name (string): Variable scope name for the policy graph
    initial_mean_factor (float): Variance scaling factor for the action mean prediction layer
    """
    with tf.variable_scope(scope_name):
        # Policy branch π(a_t | s_t; θ)
        self.pi = build_mlp(input_states,
                            hidden_sizes=pi_hidden_sizes,
                            activation=tf.nn.relu,
                            output_activation=tf.nn.relu)
        self.action_mean = tf.layers.dense(
            self.pi, num_actions, activation=tf.nn.tanh,
            kernel_initializer=tf.initializers.variance_scaling(
                scale=initial_mean_factor),
            name="action_mean")
        # Rescale the tanh output from [-1, 1] to [action_min, action_max]
        self.action_mean = action_min + (
            (self.action_mean + 1) / 2) * (action_max - action_min)
        self.action_logstd = tf.Variable(
            np.full((num_actions,), np.log(initial_std), dtype=np.float32),
            name="action_logstd")

        # Value branch V(s_t; θ)
        if vf_hidden_sizes is None:
            self.vf = self.pi  # Share features if None
        else:
            self.vf = build_mlp(input_states,
                                hidden_sizes=vf_hidden_sizes,
                                activation=tf.nn.relu,
                                output_activation=tf.nn.relu)
        self.value = tf.squeeze(
            tf.layers.dense(self.vf, 1, activation=None, name="value"),
            axis=-1)

        # Create graph for sampling actions
        self.action_normal = tfp.distributions.Normal(
            self.action_mean, tf.exp(self.action_logstd),
            validate_args=True)
        self.sampled_action = tf.squeeze(self.action_normal.sample(1),
                                         axis=0)

        # Clip sampled actions to [action_min, action_max]
        self.sampled_action = tf.clip_by_value(self.sampled_action,
                                               action_min, action_max)

        # Log probability of the taken actions: log π(a_t | s_t; θ)
        self.action_log_prob = tf.reduce_sum(
            self.action_normal.log_prob(taken_actions),
            axis=-1, keepdims=True)

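# The two policy constructors above call a build_mlp variant that takes an
# explicit tuple of hidden sizes rather than (n_layers, hidden_dim). A
# minimal sketch consistent with those calls; this is an assumption, not
# the original helper:
def build_mlp_hidden_sizes_sketch(inputs, hidden_sizes=(500, 300),
                                  activation=tf.nn.relu,
                                  output_activation=None):
    x = inputs
    for size in hidden_sizes[:-1]:
        x = tf.layers.dense(x, size, activation=activation)
    # The callers above attach their own tf.layers.dense head afterwards,
    # so the last layer here is the final *hidden* layer of the trunk.
    x = tf.layers.dense(x, hidden_sizes[-1], activation=output_activation)
    return x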