def test_actor_critic_continuous_policy(self):
    batch_size = 100
    steps_per_episode = 13
    env = PolicyUnittestEnv(
        batch_size, steps_per_episode, action_type=ActionType.Continuous)
    # We need to wrap env using TFPyEnvironment because the methods of env
    # have side effects (e.g., env._current_time_step can be changed).
    env = TFPyEnvironment(env)
    action_spec = env.action_spec()
    observation_spec = env.observation_spec()

    algorithm = ActorCriticAlgorithm(
        observation_spec=observation_spec,
        action_spec=action_spec,
        actor_network=ActorDistributionNetwork(
            observation_spec, action_spec, fc_layer_params=()),
        value_network=ValueNetwork(observation_spec, fc_layer_params=()),
        optimizer=tf.optimizers.Adam(learning_rate=1e-2))

    driver = OnPolicyDriver(env, algorithm, train_interval=2)
    eval_driver = OnPolicyDriver(env, algorithm, training=False)
    driver.run = tf.function(driver.run)

    t0 = time.time()
    driver.run(max_num_steps=2600 * batch_size)
    print("time=%s" % (time.time() - t0))

    env.reset()
    time_step, _ = eval_driver.run(max_num_steps=4 * batch_size)
    print("reward=%s" % tf.reduce_mean(time_step.reward))
    self.assertAlmostEqual(
        1.0, float(tf.reduce_mean(time_step.reward)), delta=5e-2)
def create_algorithm(env, use_rnn=False, learning_rate=1e-1):
    observation_spec = env.observation_spec()
    action_spec = env.action_spec()

    if use_rnn:
        actor_net = ActorDistributionRnnNetwork(
            observation_spec,
            action_spec,
            input_fc_layer_params=(),
            output_fc_layer_params=(),
            lstm_size=(4, ))
        value_net = ValueRnnNetwork(
            observation_spec,
            input_fc_layer_params=(),
            output_fc_layer_params=(),
            lstm_size=(4, ))
    else:
        actor_net = ActorDistributionNetwork(
            observation_spec, action_spec, fc_layer_params=())
        value_net = ValueNetwork(observation_spec, fc_layer_params=())

    optimizer = tf.optimizers.Adam(learning_rate=learning_rate)

    ac_algorithm = ActorCriticAlgorithm(
        action_spec=action_spec,
        actor_network=actor_net,
        value_network=value_net,
        loss=PPOLoss(action_spec=action_spec, gamma=1.0),
        optimizer=optimizer)
    return PPOAlgorithm(ac_algorithm)
def create_algorithm(env, use_rnn=False, learning_rate=1e-1):
    observation_spec = env.observation_spec()
    action_spec = env.action_spec()

    if use_rnn:
        actor_net = ActorDistributionRnnNetwork(
            observation_spec,
            action_spec,
            input_fc_layer_params=(),
            output_fc_layer_params=(),
            lstm_size=(4, ))
        value_net = ValueRnnNetwork(
            observation_spec,
            input_fc_layer_params=(),
            output_fc_layer_params=(),
            lstm_size=(4, ))
    else:
        actor_net = ActorDistributionNetwork(
            observation_spec,
            action_spec,
            fc_layer_params=(),
            continuous_projection_net=StableNormalProjectionNetwork)
        value_net = ValueNetwork(observation_spec, fc_layer_params=())

    optimizer = tf.optimizers.Adam(learning_rate=learning_rate)

    return PPOAlgorithm(
        action_spec=action_spec,
        actor_network=actor_net,
        value_network=value_net,
        loss=PPOLoss(
            action_spec=action_spec, gamma=1.0, debug_summaries=DEBUGGING),
        optimizer=optimizer,
        debug_summaries=DEBUGGING)
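# A hedged usage sketch (not from the source): mirroring the ActorCriticAlgorithm
# test at the top of this collection, the PPO algorithm returned by
# create_algorithm() can be driven on-policy with the same PolicyUnittestEnv /
# OnPolicyDriver pattern. The batch size, step budget, and train_interval below
# are illustrative assumptions.
env = TFPyEnvironment(
    PolicyUnittestEnv(100, 13, action_type=ActionType.Continuous))
algorithm = create_algorithm(env)

driver = OnPolicyDriver(env, algorithm, train_interval=8)
eval_driver = OnPolicyDriver(env, algorithm, training=False)

driver.run(max_num_steps=1000 * env.batch_size)  # on-policy training
env.reset()
time_step, _ = eval_driver.run(max_num_steps=4 * env.batch_size)  # evaluation
print("reward=%s" % float(tf.reduce_mean(time_step.reward)))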
def build_value_net(observation_space):
    fc_layers = [7 * 7 * 64, 512]
    q_net = ValueNetwork(
        observation_space,
        fc_layer_params=fc_layers,
        dropout_layer_params=[1, 3])
    return q_net
def train_eval(
    # tensorboard files
    root_dir,
    # environment
    env_name="CartPole-v1",
    random_seed=0,
    # Params for collect
    num_environment_steps=100000,
    replay_buffer_capacity=1001,  # Per-environment
    # Params for eval
    num_eval_episodes=30,
    eval_interval=200,
    # Params for summaries
    summary_interval=50,
):
    tf.compat.v1.set_random_seed(random_seed)

    environment = TFPyEnvironment(suite_gym.load(env_name))
    evaluation_environment = TFPyEnvironment(suite_gym.load(env_name))

    actor_net = ActorDistributionNetwork(
        environment.observation_spec(),
        environment.action_spec(),
        fc_layer_params=(200, 100))
    value_net = ValueNetwork(
        environment.observation_spec(), fc_layer_params=(200, 100))

    global_step = tf.compat.v1.train.get_or_create_global_step()
    agent = PPOClipAgent(  # should be closer to the paper than PPOAgent...
        environment.time_step_spec(),
        environment.action_spec(),
        optimizer=tf.compat.v1.train.AdamOptimizer(),  # default None does not work
        actor_net=actor_net,
        value_net=value_net,
        importance_ratio_clipping=0.2,
        normalize_observations=False,
        normalize_rewards=False,
        use_gae=True,
        lambda_value=0.5,
        discount_factor=0.95,
        train_step_counter=global_step,
    )

    agent_trainer = OnPolicyModelFreeAgentTrainer(400)

    experiment_harness = ExperimentHarness(
        root_dir,
        environment,
        evaluation_environment,
        agent,
        agent_trainer,
        replay_buffer_capacity,
        num_environment_steps,
        summary_interval,
        eval_interval,
        num_eval_episodes,
        number_of_initial_random_policy_steps=0,
        use_tf_function=True,
    )
    experiment_harness.run()
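# Since every argument except root_dir has a default, a minimal invocation of
# this harness could look like the following (the directory is a placeholder):
train_eval(root_dir="/tmp/ppo_cartpole_experiment")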
def _create_ac_algorithm():
    observation_spec = common.get_observation_spec()
    action_spec = common.get_action_spec()
    optimizer = tf.optimizers.Adam(learning_rate=5e-5)
    actor_net = ActorDistributionNetwork(
        observation_spec, action_spec, fc_layer_params=(8, ))
    value_net = ValueNetwork(observation_spec, fc_layer_params=(8, ))
    return ActorCriticAlgorithm(
        action_spec=action_spec,
        actor_network=actor_net,
        value_network=value_net,
        loss_class=ActorCriticLoss,
        optimizer=optimizer,
        debug_summaries=True)
def __init__(
    self,
    time_step_spec: ts.TimeStep,
    action_spec: types.NestedTensorSpec,
    transition_model_type: TransitionModelType,
    num_hidden_layers_model: int,
    num_hidden_nodes_model: int,
    activation_function_model: Callable,
    ensemble_size: int,
    predict_state_difference: bool,
    epochs: int,
    training_batch_size: int,
    callbacks: List[tf.keras.callbacks.Callback],
    reward_model: RewardModel,
    initial_state_distribution_model: InitialStateDistributionModel,
    trajectory_sampler_type: TrajectorySamplerType,
    horizon: int,
    population_size: int,
    model_free_agent_type: ModelFreeAgentType,
    num_hidden_layers_agent: int,
    num_hidden_nodes_agent: int,
    activation_function_agent: Callable,
    model_free_training_iterations: int,
    debug_summaries: bool = False,
    train_step_counter: Optional[tf.Variable] = None,
):
    """
    Initializes the agent.

    :param time_step_spec: A nest of tf.TypeSpec representing the time_steps.
    :param action_spec: A nest of BoundedTensorSpec representing the actions.
    :param transition_model_type: An indicator of which of the available transition
        models should be used - the list can be found in `TransitionModelType`. A
        component of the environment model that describes the transition dynamics.
    :param num_hidden_layers_model: A transition model parameter, used for constructing
        a neural network. The number of hidden layers in the neural network.
    :param num_hidden_nodes_model: A transition model parameter, used for constructing
        a neural network. The number of nodes in each hidden layer. The parameter is
        shared across all layers.
    :param activation_function_model: A transition model parameter, used for
        constructing a neural network. The activation function of the hidden nodes.
    :param ensemble_size: A transition model parameter, used for constructing a neural
        network. The number of networks in the ensemble.
    :param predict_state_difference: A transition model parameter, used for
        constructing a neural network. A boolean indicating whether the transition
        model predicts the difference between the current and next state or the next
        state directly.
    :param epochs: A transition model parameter, used by the Keras fit method. The
        number of epochs used for training the neural network.
    :param training_batch_size: A transition model parameter, used by the Keras fit
        method. The batch size used for training the neural network.
    :param callbacks: A transition model parameter, used by the Keras fit method. A
        list of Keras callbacks used for training the neural network.
    :param reward_model: A component of the environment model that describes the
        rewards. At the moment only pre-specified reward models are allowed, i.e. the
        agent assumes the reward function is known.
    :param initial_state_distribution_model: A component of the environment model that
        describes the initial state distribution (can be either deterministic or
        probabilistic). At the moment only pre-specified initial state distribution
        models are allowed, i.e. the agent assumes the initial state distribution is
        known.
    :param trajectory_sampler_type: An indicator of which of the available trajectory
        samplers should be used - the list can be found in `TrajectorySamplerType`. The
        trajectory sampler determines how predictions from an ensemble of neural
        networks that model the transition dynamics are sampled. Works only with
        ensemble types of transition models.
    :param horizon: A trajectory optimiser parameter. The number of steps taken in the
        environment in each virtual rollout.
    :param population_size: A trajectory optimiser parameter. The number of virtual
        rollouts that are simulated in each iteration during trajectory optimization.
    :param model_free_agent_type: Type of the model-free agent, e.g. PPO or TRPO.
    :param num_hidden_layers_agent: A model-free agent parameter, used for constructing
        neural networks for the actor and critic. The number of hidden layers in the
        neural network.
    :param num_hidden_nodes_agent: A model-free agent parameter, used for constructing
        neural networks for the actor and critic. The number of nodes in each hidden
        layer. The parameter is shared across all layers.
    :param activation_function_agent: A model-free agent parameter, used for
        constructing a neural network. The activation function of the hidden nodes.
    :param model_free_training_iterations: Number of model-free training iterations per
        train call.
    :param debug_summaries: A bool; if true, subclasses should gather debug summaries.
    :param train_step_counter: An optional counter to increment every time the train op
        is run. Defaults to the global_step.
    """
    assert ensemble_size > 0, "ensemble_size must be an integer > 0"
    assert num_hidden_layers_agent >= 0
    if num_hidden_layers_agent > 0:
        assert num_hidden_nodes_agent > 0

    self._ensemble_size = ensemble_size
    observation_spec = time_step_spec.observation

    # trajectory sampler (meaningful only for ensemble models)
    trajectory_sampler: Optional[TrajectorySamplingStrategy] = None
    if transition_model_type in [
        TransitionModelType.DeterministicEnsemble,
        TransitionModelType.ProbabilisticEnsemble,
    ]:
        trajectory_sampler = build_trajectory_sampler_from_type(
            ensemble_size=ensemble_size,
            trajectory_sampler_type=trajectory_sampler_type,
            batch_size=population_size,
        )

    # transition dynamics model plus training spec
    transition_model, training_spec = build_transition_model_and_training_spec_from_type(
        observation_spec=observation_spec,
        action_spec=action_spec,
        transition_model_type=transition_model_type,
        num_hidden_layers=num_hidden_layers_model,
        num_hidden_nodes=num_hidden_nodes_model,
        activation_function=activation_function_model,
        ensemble_size=ensemble_size,
        predict_state_difference=predict_state_difference,
        epochs=epochs,
        training_batch_size=training_batch_size,
        callbacks=callbacks,
        trajectory_sampler=trajectory_sampler,
    )

    # model-free agent
    actor_net = ActorDistributionNetwork(
        input_tensor_spec=observation_spec,
        output_tensor_spec=action_spec,
        fc_layer_params=[num_hidden_nodes_agent] * num_hidden_layers_agent,
        activation_fn=activation_function_agent,
    )
    value_net = ValueNetwork(
        input_tensor_spec=observation_spec,
        fc_layer_params=[num_hidden_nodes_agent] * num_hidden_layers_agent,
        activation_fn=activation_function_agent,
    )

    if model_free_agent_type == ModelFreeAgentType.Ppo:
        model_free_agent = PPOClipAgent(  # the one normally used for experiments...
            time_step_spec=time_step_spec,
            action_spec=action_spec,
            actor_net=actor_net,
            value_net=value_net,
            optimizer=tf.compat.v1.train.AdamOptimizer(),  # default None does not work...
        )
    elif model_free_agent_type == ModelFreeAgentType.Trpo:
        model_free_agent = TRPOAgent(
            time_step_spec=time_step_spec,
            action_spec=action_spec,
            actor_net=actor_net,
            value_net=value_net,
        )
    else:
        raise RuntimeError("Unknown or unsupported agent type")

    super().__init__(
        (transition_model, training_spec),
        reward_model,
        initial_state_distribution_model,
        model_free_agent,
        population_size,
        horizon,
        model_free_training_iterations,
        debug_summaries,
        train_step_counter,
    )
import tensorflow as tf

from tf_agents.agents.ppo import ppo_agent
from tf_agents.networks import actor_distribution_network
from tf_agents.networks.value_network import ValueNetwork

# train_env, actor_fc_layer_params and actor_learning_rate are defined
# elsewhere in the surrounding script.
actor_net = actor_distribution_network.ActorDistributionNetwork(
    train_env.observation_spec(),
    train_env.action_spec(),
    fc_layer_params=actor_fc_layer_params)
value_net = ValueNetwork(train_env.observation_spec())

global_step = tf.compat.v2.Variable(0)
tf_agent = ppo_agent.PPOAgent(
    train_env.time_step_spec(),
    train_env.action_spec(),
    actor_net=actor_net,
    value_net=value_net,
    optimizer=tf.compat.v1.train.AdamOptimizer(
        learning_rate=actor_learning_rate),
    train_step_counter=global_step)
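# A hedged continuation sketch (not from the source): one way such an on-policy
# PPO agent is typically trained in TF-Agents is to collect whole episodes into
# a replay buffer with a DynamicEpisodeDriver, train on the gathered batch, and
# then clear the buffer. train_env comes from the surrounding script; the
# episode count and buffer length are illustrative.
from tf_agents.drivers.dynamic_episode_driver import DynamicEpisodeDriver
from tf_agents.replay_buffers.tf_uniform_replay_buffer import TFUniformReplayBuffer

tf_agent.initialize()

replay_buffer = TFUniformReplayBuffer(
    data_spec=tf_agent.collect_data_spec,
    batch_size=train_env.batch_size,
    max_length=10000)

collect_driver = DynamicEpisodeDriver(
    train_env,
    tf_agent.collect_policy,
    observers=[replay_buffer.add_batch],
    num_episodes=2)

# One on-policy training iteration: collect, train, discard.
collect_driver.run()
experience = replay_buffer.gather_all()
tf_agent.train(experience)
replay_buffer.clear()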
    # (tail of an ActorDistributionNetwork(...) call; the preceding arguments
    # are omitted in the source)
    continuous_projection_net=tanh_normal_projection_network.TanhNormalProjectionNetwork,
    name='ActorDistributionNetwork')
print('Actor Network Created.')

# Critic Network ("kr" is assumed to be an alias for tf.keras)
critic_net = ValueNetwork(
    observation_spec,
    preprocessing_layers={
        'observation_market': kr.models.Sequential([
            kr.layers.Conv2D(
                filters=int((observation_spec['observation_market'].shape[0]
                             * observation_spec['observation_market'].shape[1]) // 100),
                kernel_size=3,
                activation='relu',
                input_shape=(observation_spec['observation_market'].shape[0],
                             observation_spec['observation_market'].shape[1],
                             1)),
            kr.layers.Conv2D(
                filters=int((observation_spec['observation_market'].shape[0]
                             * observation_spec['observation_market'].shape[1]) // 100),
                kernel_size=3,
                activation='relu',
                input_shape=(observation_spec['observation_market'].shape[0],
                             observation_spec['observation_market'].shape[1],
                             1)),
            kr.layers.Flatten(),
            kr.layers.Dense(5, activation='tanh'),
            kr.layers.Flatten()
        ]),
        # 'observation_market': kr.layers.Conv2D(
        #     filters=int((observation_spec['observation_market'].shape[0]
        #                  * observation_spec['observation_market'].shape[1]) // 100),
        #     kernel_size=3, activation='relu',
        #     input_shape=(observation_spec['observation_market'].shape[0],
        #                  observation_spec['observation_market'].shape[1], 1)),
        'observation_holdingRate': kr.layers.Dense(2, activation='tanh')
    },
    preprocessing_combiner=kr.layers.Concatenate(axis=-1),
    conv_layer_params=None,
    fc_layer_params=critic_commonDenseLayerParams,
    dtype=tf.float32,
    name='CriticNetwork')  # renamed: names with spaces can break TF name scoping

# Agent
# https://www.tensorflow.org/agents/api_docs/python/tf_agents/agents/ReinforceAgent
global_step = tf.compat.v1.train.get_or_create_global_step()
if shouldContinueFromLastCheckpoint:
    global_step = tf.compat.v1.train.get_global_step()
def create_ac_algorithm(env,
                        actor_fc_layers=(200, 100),
                        value_fc_layers=(200, 100),
                        encoding_conv_layers=(),
                        encoding_fc_layers=(),
                        use_rnns=False,
                        use_icm=False,
                        learning_rate=5e-5,
                        algorithm_class=ActorCriticAlgorithm,
                        loss_class=ActorCriticLoss,
                        debug_summaries=False):
    """Create a simple ActorCriticAlgorithm.

    Args:
        env (TFEnvironment): A TFEnvironment.
        actor_fc_layers (list[int]): list of fc layer parameters for the actor
            network.
        value_fc_layers (list[int]): list of fc layer parameters for the value
            network.
        encoding_conv_layers (list[int]): list of convolution layer parameters
            for the encoding network.
        encoding_fc_layers (list[int]): list of fc layer parameters for the
            encoding network.
        use_rnns (bool): True if an RNN should be used.
        use_icm (bool): True if an intrinsic curiosity module should be used.
        learning_rate (float): learning rate.
        algorithm_class (type): class of the algorithm. Can be
            ActorCriticAlgorithm or PPOAlgorithm.
        loss_class (type): the class of the loss. The signature of its
            constructor: loss_class(action_spec, debug_summaries).
        debug_summaries (bool): True if debug summaries should be created.
    """
    optimizer = tf.optimizers.Adam(learning_rate=learning_rate)

    if use_rnns:
        actor_net = ActorDistributionRnnNetwork(
            env.observation_spec(),
            env.action_spec(),
            input_fc_layer_params=actor_fc_layers,
            output_fc_layer_params=None)
        value_net = ValueRnnNetwork(
            env.observation_spec(),
            input_fc_layer_params=value_fc_layers,
            output_fc_layer_params=None)
    else:
        actor_net = ActorDistributionNetwork(
            env.observation_spec(),
            env.action_spec(),
            fc_layer_params=actor_fc_layers)
        value_net = ValueNetwork(
            env.observation_spec(), fc_layer_params=value_fc_layers)

    encoding_net = None
    if encoding_fc_layers or encoding_conv_layers:
        encoding_net = EncodingNetwork(
            input_tensor_spec=env.observation_spec(),
            conv_layer_params=encoding_conv_layers,
            fc_layer_params=encoding_fc_layers)

    icm = None
    if use_icm:
        feature_spec = env.observation_spec()
        if encoding_net:
            feature_spec = tf.TensorSpec((encoding_fc_layers[-1], ),
                                         dtype=tf.float32)
        icm = ICMAlgorithm(
            env.action_spec(), feature_spec, encoding_net=encoding_net)

    algorithm = algorithm_class(
        action_spec=env.action_spec(),
        actor_network=actor_net,
        value_network=value_net,
        intrinsic_curiosity_module=icm,
        loss_class=loss_class,
        optimizer=optimizer,
        debug_summaries=debug_summaries)

    return algorithm
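# A hedged usage sketch (hypothetical values), reusing the unit-test environment
# and driver pattern from the first snippet in this collection. The docstring
# above also allows the PPO variant via algorithm_class=PPOAlgorithm together
# with loss_class=PPOLoss.
env = TFPyEnvironment(PolicyUnittestEnv(100, 13))
algorithm = create_ac_algorithm(
    env, actor_fc_layers=(8, ), value_fc_layers=(8, ), learning_rate=1e-3)
driver = OnPolicyDriver(env, algorithm, train_interval=8)
driver.run(max_num_steps=1000 * env.batch_size)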
def train_eval(
    # harness
    # tensorboard files
    root_dir,
    # Params for collect
    num_environment_steps,
    # Params for eval
    num_eval_episodes,
    eval_interval,
    # Params for summaries
    summary_interval,
    # environment
    env_name,
    gym_random_seed,
    # agent
    random_seed,
    num_hidden_layers_agent,
    num_hidden_nodes_agent,
    discount_factor,
    lambda_value,
    max_kl,
    backtrack_coefficient,
    backtrack_iters,
    cg_iters,
    reward_normalizer,
    reward_norm_clipping,
    log_prob_clipping,
    value_train_iters,
    value_optimizer,
    gradient_clipping,
    debug,
    # agent trainer
    steps_per_policy_update,
    # agent specific harness parameters
    replay_buffer_capacity,
    use_tf_function,
):
    """
    This function will train and evaluate a TRPO agent.

    :param root_dir: Root directory where all experiments are stored.
    :param num_environment_steps: The number of environment steps to run the
        experiment for.
    :param num_eval_episodes: Number of episodes at each evaluation point.
    :param eval_interval: Interval for evaluation points.
    :param summary_interval: Interval for summaries.
    :param env_name: Name for the environment to load.
    :param gym_random_seed: Value to use as seed for the environment.
    :param random_seed: Value to use as seed for TensorFlow (via
        `tf.compat.v1.set_random_seed`).
    :param num_hidden_layers_agent: A model-free agent parameter, used for
        constructing neural networks for the actor and critic. The number of
        hidden layers in the neural network.
    :param num_hidden_nodes_agent: A model-free agent parameter, used for
        constructing neural networks for the actor and critic. The number of
        nodes in each hidden layer. The parameter is shared across all layers.
    :param discount_factor: Discount factor in [0, 1].
    :param lambda_value: Trace decay used by the GAE critic in [0, 1].
    :param max_kl: Maximum KL distance between the updated and old policy.
    :param backtrack_coefficient: Coefficient used in the step size search.
    :param backtrack_iters: Number of iterations to perform in the line search.
    :param cg_iters: Number of conjugate gradient iterations used to
        approximate the natural gradient.
    :param reward_normalizer: TensorNormalizer applied to rewards.
    :param reward_norm_clipping: Value to clip rewards to.
    :param log_prob_clipping: Clip value for log probs in the policy gradient
        (None for no clipping).
    :param value_train_iters: Number of gradient steps to perform on the value
        estimator for every policy update.
    :param value_optimizer: Optimizer used to train the value function
        (default: Adam).
    :param gradient_clipping: Norm value to clip gradients to (None for no
        clipping).
    :param debug: Debug flag to check computations for NaNs.
    :param steps_per_policy_update: Steps between policy updates.
    :param replay_buffer_capacity: Capacity of the buffer collecting real
        samples.
    :param use_tf_function: If `True`, use a `tf.function` for data collection.
    """
    tf.compat.v1.set_random_seed(random_seed)

    environment = create_real_tf_environment(env_name, gym_random_seed)
    evaluation_environment = create_real_tf_environment(
        env_name, gym_random_seed)

    network_architecture = (num_hidden_nodes_agent, ) * num_hidden_layers_agent
    actor_net = ActorDistributionNetwork(
        environment.observation_spec(),
        environment.action_spec(),
        fc_layer_params=network_architecture,
    )
    value_net = ValueNetwork(
        environment.observation_spec(), fc_layer_params=network_architecture)

    global_step = tf.compat.v1.train.get_or_create_global_step()
    agent = TRPOAgent(
        environment.time_step_spec(),
        environment.action_spec(),
        actor_net,
        value_net,
        discount_factor,
        lambda_value,
        max_kl,
        backtrack_coefficient,
        backtrack_iters,
        cg_iters,
        reward_normalizer,
        reward_norm_clipping,
        log_prob_clipping,
        value_train_iters,
        value_optimizer,
        gradient_clipping,
        debug,
        train_step_counter=global_step,
    )

    agent_trainer = OnPolicyModelFreeAgentTrainer(steps_per_policy_update)

    experiment_harness = ExperimentHarness(
        root_dir,
        environment,
        evaluation_environment,
        agent,
        agent_trainer,
        replay_buffer_capacity,
        num_environment_steps,
        summary_interval,
        eval_interval,
        num_eval_episodes,
        number_of_initial_random_policy_steps=0,
        use_tf_function=use_tf_function,
    )
    experiment_harness.run()
def __init__(self,
             observation_spec,
             action_spec,
             memory: MemoryWithUsage,
             num_read_keys=1,
             lstm_size=(256, 256),
             latent_dim=200,
             loss=None,
             loss_weight=1.0,
             name="mba"):
    """Create the policy module of MERLIN.

    Args:
        observation_spec (nested TensorSpec): representing the observations.
        action_spec (nested BoundedTensorSpec): representing the actions.
        memory (MemoryWithUsage): the memory module from MemoryBasedPredictor.
        num_read_keys (int): number of keys for reading memory.
        lstm_size (list[int]): size of the lstm layers.
        latent_dim (int): the dimension of the hidden representation of the VAE.
        loss (None|ActorCriticLoss): an object for calculating the loss for
            reinforcement learning. If None, a default ActorCriticLoss will be
            used.
        loss_weight (float): weight applied to the loss.
        name (str): name of the algorithm.
    """
    # This is different from the Merlin LSTM. This rnn only uses the output
    # from the last LSTM layer, while Merlin uses the outputs from all LSTM
    # layers.
    rnn = make_lstm_cell(lstm_size, name=name + "/lstm")

    actor_input_dim = (
        latent_dim + lstm_size[-1] + num_read_keys * memory.dim)
    actor_net = ActorDistributionNetwork(
        input_tensor_spec=TensorSpec((actor_input_dim, ), dtype=tf.float32),
        output_tensor_spec=action_spec,
        fc_layer_params=(200, ),
        activation_fn=tf.keras.activations.tanh,
        name=name + "/actor_net")

    super(MemoryBasedActor, self).__init__(
        observation_spec=observation_spec,
        action_spec=action_spec,
        train_state_spec=get_rnn_cell_state_spec(rnn),
        name=name)

    self._loss = ActorCriticLoss(action_spec) if loss is None else loss
    self._loss_weight = loss_weight
    self._memory = memory

    self._key_net = tf.keras.layers.Dense(
        num_read_keys * (self._memory.dim + 1), name=name + "/key_net")

    # TODO: add log p(a_i) as input to the value net
    value_input_dim = latent_dim
    self._value_net = ValueNetwork(
        input_tensor_spec=TensorSpec((value_input_dim, ), dtype=tf.float32),
        fc_layer_params=(200, ),
        activation_fn=tf.keras.activations.tanh,
        name=name + "/value_net")

    self._rnn = rnn
    self._actor_net = actor_net
def make_networks(env,
                  size=(96, 96),
                  num_frames=1,
                  num_channels=3,
                  conv_params=[(16, 8, 4), (32, 3, 2)],
                  in_fc_params=(256, ),
                  out_fc_params=(128, ),
                  use_lstm=False,
                  lstm_size=(256, )):
    """Creates the actor and critic neural networks of the PPO agent.

    Function for creating the neural networks for the PPO agent, namely the
    actor and value networks. Source for network params:
    https://www.arconsis.com/unternehmen/blog/reinforcement-learning-doom-with-tf-agents-and-ppo

    Arguments:
        env (TfPyEnvironment): A TensorFlow environment the agent interacts
            with.
        size (tuple): The desired width and height of the observation space.
            Defaults to (96, 96). The input tuple should preserve the original
            observation aspect ratio.
        num_frames (int): Number of frames used in the agent's observation.
            Defaults to 1; num_frames > 1 indicates frame stacking.
        num_channels (int): Number of color channels to include for each frame.
            Defaults to 3 (RGB); 1 denotes grayscale.
        conv_params (list): A list of convolutional layer parameters for the
            PPO agent's actor and critic neural networks.
        in_fc_params (tuple): The number of neurons in the input fully
            connected layer of the actor and critic networks of the agent.
        out_fc_params (tuple): The number of neurons in the output fully
            connected layer of the actor and critic networks of the agent.
        use_lstm (bool): Whether to use LSTM-based actor and critic networks.
        lstm_size (tuple): The number of hidden states inside the LSTM for the
            actor and critic networks of the agent.

    Returns:
        actor_net (ActorDistributionNetwork): A tf-agents Actor Distribution
            Network used for PPO agent action selection.
        value_net (ValueNetwork): A tf-agents Value Network used for PPO agent
            value estimation.
    """
    # Restructure the time step spec to match the expected processed
    # observations.
    processed_shape = tuple(size + (num_channels * num_frames, ))
    obs_spec = env.observation_spec()  # Get the old observation spec
    obs_spec = tensor_spec.BoundedTensorSpec(
        processed_shape,
        obs_spec.dtype,
        minimum=obs_spec.minimum,
        maximum=obs_spec.maximum,
        name=obs_spec.name)

    if use_lstm:  # LSTM-based policies
        # Define actor network
        actor_net = ActorDistributionRnnNetwork(
            obs_spec,
            env.action_spec(),
            conv_layer_params=conv_params,
            input_fc_layer_params=in_fc_params,
            lstm_size=lstm_size,
            output_fc_layer_params=out_fc_params)
        # Define value network
        value_net = ValueRnnNetwork(
            obs_spec,
            conv_layer_params=conv_params,
            input_fc_layer_params=in_fc_params,
            lstm_size=lstm_size,
            output_fc_layer_params=out_fc_params)
        print("Created Actor and Value Networks with LSTM...")

    else:  # non-LSTM-based policies
        # Define actor network
        actor_net = ActorDistributionNetwork(
            obs_spec, env.action_spec(), conv_layer_params=conv_params)
        # Define value network
        value_net = ValueNetwork(obs_spec, conv_layer_params=conv_params)

    return actor_net, value_net
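# A minimal usage sketch (hypothetical values), assuming the environment's
# observations have already been resized and stacked to match size,
# num_channels and num_frames. The resulting networks are then passed as
# actor_net/value_net to a PPO agent, as in the next snippet.
actor_net, value_net = make_networks(
    env, size=(96, 96), num_frames=4, num_channels=1, use_lstm=True)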
eval_env = TFPyEnvironment(eval_py_env)
train_env = TFPyEnvironment(
    ParallelPyEnvironment([get_env] * 4, start_serially=False))

# Create a global step
global_step = tf.compat.v1.train.get_or_create_global_step()

# Create the actor network (with the normal distribution)
actor_net = ActorDistributionNetwork(
    input_tensor_spec=train_env.observation_spec(),
    output_tensor_spec=train_env.action_spec(),
    fc_layer_params=(128, 256, 512, 512, 256),
    continuous_projection_net=normal_net)  # normal_net is defined elsewhere in the source

# Create the value network
value_net = ValueNetwork(
    input_tensor_spec=train_env.observation_spec(),
    fc_layer_params=(256, 512, 512))

# Create the PPO agent
ppo_agent = PPOClipAgent(
    time_step_spec=train_env.time_step_spec(),
    action_spec=train_env.action_spec(),
    optimizer=Adam(learning_rate=5e-4),
    actor_net=actor_net,
    value_net=value_net,
    importance_ratio_clipping=0.2,
    discount_factor=0.95,
    entropy_regularization=0.0,
    num_epochs=16,
    use_gae=True,
    use_td_lambda_return=True,
    log_prob_clipping=3,
    gradient_clipping=0.5,
    # NOTE: the snippet is truncated here in the source; any further
    # arguments are omitted.
)