def actor(self, pre_processing):
  return actor_distribution_network.ActorDistributionNetwork(
      self.observation_spec,
      self.action_spec,
      preprocessing_layers=pre_processing,
      fc_layer_params=self.actor_fc_layer_params,
      continuous_projection_net=self.normal_projection_net)
def get_actor_and_value_network(action_spec, observation_spec):
  preprocessing_layers = tfk.Sequential([
      tfk.layers.Lambda(lambda x: x - 0.5),  # Normalization
      tfk.layers.MaxPooling2D((5, 5), strides=(5, 5)),
      tfk.layers.Conv2D(256, (11, 3), (1, 1), padding='valid',
                        activation='relu'),
      tfk.layers.Reshape((-1, 256)),
      tfk.layers.Conv1D(128, 1, activation='relu'),
      tfk.layers.Flatten()
  ])
  actor_net = actor_distribution_network.ActorDistributionNetwork(
      observation_spec,
      action_spec,
      preprocessing_layers=preprocessing_layers,
      fc_layer_params=(200, 100),
      activation_fn=tfk.activations.relu)
  value_net = value_network.ValueNetwork(
      observation_spec,
      preprocessing_layers=preprocessing_layers,
      fc_layer_params=(200, 100),
      activation_fn=tfk.activations.relu)
  return actor_net, value_net
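# A minimal usage sketch for get_actor_and_value_network above (not part of the
# original snippet): `tf_env` is a hypothetical tf_py_environment.TFPyEnvironment
# wrapping an image-observation task, and `tfk` is assumed to alias tf.keras.
actor_net, value_net = get_actor_and_value_network(
    tf_env.action_spec(), tf_env.observation_spec())
example_agent = ppo_agent.PPOAgent(
    tf_env.time_step_spec(),
    tf_env.action_spec(),
    optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=3e-4),
    actor_net=actor_net,
    value_net=value_net,
    num_epochs=10)
example_agent.initialize()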
def testUpdateAdaptiveKlBeta(self):
  actor_net = actor_distribution_network.ActorDistributionNetwork(
      self._time_step_spec.observation, self._action_spec, fc_layer_params=None)
  value_net = value_network.ValueNetwork(
      self._time_step_spec.observation, fc_layer_params=None)
  agent = ppo_agent.PPOAgent(
      self._time_step_spec,
      self._action_spec,
      tf.compat.v1.train.AdamOptimizer(),
      actor_net=actor_net,
      value_net=value_net,
      initial_adaptive_kl_beta=1.0,
      adaptive_kl_target=10.0,
      adaptive_kl_tolerance=0.5,
  )
  self.evaluate(tf.compat.v1.initialize_all_variables())

  # When KL equals the target KL, beta should not change.
  update_adaptive_kl_beta_fn = common.function(agent.update_adaptive_kl_beta)
  beta_0 = update_adaptive_kl_beta_fn([10.0])
  expected_beta_0 = 1.0
  self.assertEqual(expected_beta_0, self.evaluate(beta_0))

  # When KL is large, beta should increase.
  beta_1 = update_adaptive_kl_beta_fn([100.0])
  expected_beta_1 = 1.5
  self.assertEqual(expected_beta_1, self.evaluate(beta_1))

  # When KL is small, beta should decrease.
  beta_2 = update_adaptive_kl_beta_fn([1.0])
  expected_beta_2 = 1.0
  self.assertEqual(expected_beta_2, self.evaluate(beta_2))
def GetAgent(self, env, params):
  actor_net = actor_distribution_network.ActorDistributionNetwork(
      env.observation_spec(),
      env.action_spec(),
      fc_layer_params=tuple(
          self._ppo_params["ActorFcLayerParams", "", [512, 256, 256]]))
  value_net = value_network.ValueNetwork(
      env.observation_spec(),
      fc_layer_params=tuple(
          self._ppo_params["CriticFcLayerParams", "", [512, 256, 256]]))
  tf_agent = ppo_agent.PPOAgent(
      env.time_step_spec(),
      env.action_spec(),
      actor_net=actor_net,
      value_net=value_net,
      normalize_observations=self._ppo_params["NormalizeObservations", "",
                                              False],
      normalize_rewards=self._ppo_params["NormalizeRewards", "", False],
      optimizer=tf.compat.v1.train.AdamOptimizer(
          learning_rate=self._ppo_params["LearningRate", "", 3e-4]),
      train_step_counter=self._ckpt.step,
      num_epochs=self._ppo_params["NumEpochs", "", 30],
      name=self._ppo_params["AgentName", "", "ppo_agent"],
      debug_summaries=self._ppo_params["DebugSummaries", "", False])
  tf_agent.initialize()
  return tf_agent
def testHandlePreprocessingLayers(self):
  observation_spec = (tensor_spec.TensorSpec([1], tf.float32),
                      tensor_spec.TensorSpec([], tf.float32))
  time_step_spec = ts.time_step_spec(observation_spec)
  time_step = tensor_spec.sample_spec_nest(time_step_spec, outer_dims=(3,))
  action_spec = [
      tensor_spec.BoundedTensorSpec((2,), tf.float32, 2, 3),
      tensor_spec.BoundedTensorSpec((3,), tf.int32, 0, 3)
  ]
  preprocessing_layers = (tf.keras.layers.Dense(4),
                          tf.keras.Sequential([
                              tf.keras.layers.Reshape((1,)),
                              tf.keras.layers.Dense(4)
                          ]))
  net = actor_distribution_network.ActorDistributionNetwork(
      observation_spec,
      action_spec,
      preprocessing_layers=preprocessing_layers,
      preprocessing_combiner=tf.keras.layers.Add())
  action_distributions, _ = net(time_step.observation, time_step.step_type, ())
  self.evaluate(tf.compat.v1.global_variables_initializer())
  self.assertEqual([3, 2], action_distributions[0].mode().shape.as_list())
  self.assertEqual([3, 3], action_distributions[1].mode().shape.as_list())
  self.assertGreater(len(net.trainable_variables), 4)
def testUpdateAdaptiveKlBeta(self):
  if tf.executing_eagerly():
    self.skipTest('b/123777119')  # Secondary bug: ('b/123770194')
  actor_net = actor_distribution_network.ActorDistributionNetwork(
      self._time_step_spec.observation, self._action_spec, fc_layer_params=None)
  value_net = value_network.ValueNetwork(
      self._time_step_spec.observation, fc_layer_params=None)
  agent = ppo_agent.PPOAgent(
      self._time_step_spec,
      self._action_spec,
      tf.compat.v1.train.AdamOptimizer(),
      actor_net=actor_net,
      value_net=value_net,
      initial_adaptive_kl_beta=1.0,
      adaptive_kl_target=10.0,
      adaptive_kl_tolerance=0.5,
  )
  self.evaluate(tf.compat.v1.global_variables_initializer())

  # When KL equals the target KL, beta should not change.
  beta_0 = agent.update_adaptive_kl_beta(10.0)
  self.assertEqual(self.evaluate(beta_0), 1.0)

  # When KL is large, beta should increase.
  beta_1 = agent.update_adaptive_kl_beta(100.0)
  self.assertEqual(self.evaluate(beta_1), 1.5)

  # When KL is small, beta should decrease.
  beta_2 = agent.update_adaptive_kl_beta(1.0)
  self.assertEqual(self.evaluate(beta_2), 1.0)
def testDropoutFCLayersWithConv(self, training):
  tf.compat.v1.set_random_seed(0)
  observation_spec = tensor_spec.BoundedTensorSpec((8, 8, 3), tf.float32, 0, 1)
  time_step_spec = ts.time_step_spec(observation_spec)
  time_step = tensor_spec.sample_spec_nest(time_step_spec, outer_dims=(1,))
  action_spec = tensor_spec.BoundedTensorSpec((2,), tf.float32, 2, 3)
  net = actor_distribution_network.ActorDistributionNetwork(
      observation_spec,
      action_spec,
      conv_layer_params=[(4, 2, 2)],
      fc_layer_params=[5],
      dropout_layer_params=[0.5])
  action_distributions1, _ = net(
      time_step.observation, time_step.step_type, (), training=training)
  action_distributions2, _ = net(
      time_step.observation, time_step.step_type, (), training=training)
  mode1 = action_distributions1.mode()
  mode2 = action_distributions2.mode()
  self.evaluate(tf.compat.v1.global_variables_initializer())
  mode1, mode2 = self.evaluate([mode1, mode2])

  if training:
    self.assertGreater(np.linalg.norm(mode1 - mode2), 0)
  else:
    self.assertAllEqual(mode1, mode2)
def testDropoutFCLayersWithConv(self, training):
  observation_spec = tensor_spec.BoundedTensorSpec((8, 8, 3), tf.float32, 0, 1)
  time_step_spec = ts.time_step_spec(observation_spec)
  time_step = tensor_spec.sample_spec_nest(time_step_spec, outer_dims=(1,))
  action_spec = tensor_spec.BoundedTensorSpec((2,), tf.float32, 2, 3)
  net = actor_distribution_network.ActorDistributionNetwork(
      observation_spec,
      action_spec,
      conv_layer_params=[(4, 2, 2)],
      fc_layer_params=[5],
      dropout_layer_params=[0.5])
  modes = []
  num_modes = 10
  for _ in range(num_modes):
    action_distributions, _ = net(
        time_step.observation, time_step.step_type, (), training=training)
    modes.append(action_distributions.mode())
  self.evaluate(tf.compat.v1.global_variables_initializer())
  modes = self.evaluate(modes)

  # With dropout active (training=True) at least one pair of forward passes
  # should produce different modes; with dropout inactive they must all match.
  modes_differ = False
  for i in range(num_modes):
    for j in range(i + 1, num_modes):
      if np.linalg.norm(modes[i] - modes[j]) > 1e-6:
        modes_differ = True
        break
    if modes_differ:
      break
  self.assertEqual(training, modes_differ)
def testAgentTransitionTrain(self):
  actor_net = actor_distribution_network.ActorDistributionNetwork(
      self._obs_spec,
      self._action_spec,
      fc_layer_params=(10,),
      continuous_projection_net=(
          tanh_normal_projection_network.TanhNormalProjectionNetwork))
  agent = sac_agent.SacAgent(
      self._time_step_spec,
      self._action_spec,
      critic_network=DummyCriticNet(),
      actor_network=actor_net,
      actor_optimizer=tf.compat.v1.train.AdamOptimizer(0.001),
      critic_optimizer=tf.compat.v1.train.AdamOptimizer(0.001),
      alpha_optimizer=tf.compat.v1.train.AdamOptimizer(0.001))
  time_step_spec = self._time_step_spec._replace(
      reward=tensor_spec.BoundedTensorSpec(
          [], tf.float32, minimum=0.0, maximum=1.0, name='reward'))
  transition_spec = trajectory.Transition(
      time_step=time_step_spec,
      action_step=policy_step.PolicyStep(
          action=self._action_spec, state=(), info=()),
      next_time_step=time_step_spec)
  sample_trajectory_experience = tensor_spec.sample_spec_nest(
      transition_spec, outer_dims=(3,))
  agent.train(sample_trajectory_experience)
def testAgentTrajectoryTrain(self):
  actor_net = actor_distribution_network.ActorDistributionNetwork(
      self._obs_spec,
      self._action_spec,
      fc_layer_params=(10,),
      continuous_projection_net=(
          tanh_normal_projection_network.TanhNormalProjectionNetwork))
  agent = sac_agent.SacAgent(
      self._time_step_spec,
      self._action_spec,
      critic_network=DummyCriticNet(),
      actor_network=actor_net,
      actor_optimizer=tf.compat.v1.train.AdamOptimizer(0.001),
      critic_optimizer=tf.compat.v1.train.AdamOptimizer(0.001),
      alpha_optimizer=tf.compat.v1.train.AdamOptimizer(0.001))
  trajectory_spec = trajectory.Trajectory(
      step_type=self._time_step_spec.step_type,
      observation=self._time_step_spec.observation,
      action=self._action_spec,
      policy_info=(),
      next_step_type=self._time_step_spec.step_type,
      reward=tensor_spec.BoundedTensorSpec(
          [], tf.float32, minimum=0.0, maximum=1.0, name='reward'),
      discount=self._time_step_spec.discount)
  sample_trajectory_experience = tensor_spec.sample_spec_nest(
      trajectory_spec, outer_dims=(3, 2))
  agent.train(sample_trajectory_experience)
def load_reinforce_agent(train_env,
                         actor_fc_layers,
                         learning_rate,
                         num_epochs,
                         preprocessing_layers=None,
                         preprocessing_combiner=None):
  """Creates a REINFORCE agent, using the same inputs as load_ppo_agent.

  Note that REINFORCE is a pure policy-gradient algorithm and does not have a
  critic (i.e., value) network.
  """
  optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=learning_rate)
  train_step_counter = tf.compat.v2.Variable(0)
  actor_net = actor_distribution_network.ActorDistributionNetwork(
      train_env.observation_spec(),
      train_env.action_spec(),
      fc_layer_params=actor_fc_layers,
      preprocessing_layers=preprocessing_layers,
      preprocessing_combiner=preprocessing_combiner)
  tf_agent = reinforce_agent.ReinforceAgent(
      train_env.time_step_spec(),
      train_env.action_spec(),
      actor_network=actor_net,
      optimizer=optimizer,
      normalize_returns=False,
      train_step_counter=train_step_counter)
  tf_agent.initialize()
  return tf_agent
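# A minimal usage sketch for load_reinforce_agent above (hypothetical, not part
# of the original source): `train_env` is assumed to be a TFPyEnvironment for a
# discrete-action task such as CartPole. `num_epochs` is accepted for API
# symmetry with load_ppo_agent but is not used by the REINFORCE agent.
reinforce_agent_instance = load_reinforce_agent(
    train_env,
    actor_fc_layers=(100,),
    learning_rate=1e-3,
    num_epochs=1)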
def testKlCutoffLoss(self, not_zero):
  kl_cutoff_coef = 30.0 * not_zero
  actor_net = actor_distribution_network.ActorDistributionNetwork(
      self._time_step_spec.observation, self._action_spec, fc_layer_params=None)
  value_net = value_network.ValueNetwork(
      self._time_step_spec.observation, fc_layer_params=None)
  agent = ppo_agent.PPOAgent(
      self._time_step_spec,
      self._action_spec,
      tf.compat.v1.train.AdamOptimizer(),
      actor_net=actor_net,
      value_net=value_net,
      kl_cutoff_factor=5.0,
      adaptive_kl_target=0.1,
      kl_cutoff_coef=kl_cutoff_coef,
  )
  kl_divergence = tf.constant([[1.5, -0.5, 6.5, -1.5, -2.3]], dtype=tf.float32)
  expected_kl_cutoff_loss = kl_cutoff_coef * (.24**2)  # (0.74 - 0.5) ^ 2

  loss = agent.kl_cutoff_loss(kl_divergence)
  self.evaluate(tf.compat.v1.initialize_all_variables())
  loss_ = self.evaluate(loss)
  self.assertAllClose([loss_], [expected_kl_cutoff_loss])
def testMasking(self):
  batch_size = 1000
  num_state_dims = 5
  num_actions = 8
  observations = tf.random.uniform([batch_size, num_state_dims])
  time_step = ts.restart(observations, batch_size=batch_size)
  input_tensor_spec = tensor_spec.TensorSpec([num_state_dims], tf.float32)
  time_step_spec = ts.time_step_spec(input_tensor_spec)
  action_spec = tensor_spec.BoundedTensorSpec([1], tf.int32, 0, num_actions - 1)

  # We create a fixed mask here for testing purposes. Normally the mask would
  # be part of the observation.
  mask = [0, 1, 0, 1, 0, 0, 1, 0]
  np_mask = np.array(mask)
  tf_mask = tf.constant([mask for _ in range(batch_size)])

  actor_network = actor_distribution_network.ActorDistributionNetwork(
      input_tensor_spec, action_spec, fc_layer_params=(2, 1))
  policy = actor_policy.ActorPolicy(
      time_step_spec,
      action_spec,
      actor_network=actor_network,
      observation_and_action_constraint_splitter=(
          lambda observation: (observation, tf_mask)))

  # Force creation of variables before global_variables_initializer.
  policy.variables()
  self.evaluate(tf.compat.v1.global_variables_initializer())

  # Sample a batch of 1000 actions from the policy and ensure that actions
  # considered invalid according to the mask are never chosen.
  action_step = policy.action(time_step)
  action = self.evaluate(action_step.action)
  self.assertEqual(action.shape, (batch_size, 1))
  self.assertAllEqual(np_mask[action], np.ones([batch_size, 1]))
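# In practice the mask usually lives inside the observation instead of being
# captured as a constant. A hedged sketch of that pattern (the dict keys and the
# `masked_*` names below are illustrative assumptions, not part of the test
# above): the observation is a dict with 'state' and 'mask' entries and the
# splitter separates them before they reach the network.
def split_observation_and_mask(observation):
  return observation['state'], observation['mask']

masked_policy = actor_policy.ActorPolicy(
    masked_time_step_spec,  # time_step_spec whose observation is the dict above.
    action_spec,
    actor_network=masked_actor_network,  # Built from the 'state' spec only.
    observation_and_action_constraint_splitter=split_observation_and_mask)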
def __init__(self, strategy=None):
  self._strategy = strategy
  actor_net = actor_distribution_network.ActorDistributionNetwork(
      tensor_spec.TensorSpec(shape=[], dtype=tf.float32),
      tensor_spec.BoundedTensorSpec([1], tf.float32, -1, 1),
      fc_layer_params=(1,),
      activation_fn=tf.nn.tanh)
  value_net = value_network.ValueNetwork(
      tensor_spec.TensorSpec(shape=[], dtype=tf.float32),
      fc_layer_params=(1,))
  super(FakePPOAgent, self).__init__(
      time_step_spec=ts.time_step_spec(
          tensor_spec.TensorSpec(shape=[], dtype=tf.float32)),
      action_spec=tensor_spec.BoundedTensorSpec([1], tf.float32, -1, 1),
      actor_net=actor_net,
      value_net=value_net,
      # Ensures value_prediction, return and normalized_advantage are included
      # as part of the training_data_spec.
      compute_value_and_advantage_in_train=False,
      update_normalizers_in_train=False,
  )
  self.train_called_times = tf.Variable(0, dtype=tf.int32)
  self.experiences = []
def __init__(self):
  actor_net = actor_distribution_network.ActorDistributionNetwork(
      tensor_spec.TensorSpec(shape=[], dtype=tf.float32),
      tensor_spec.BoundedTensorSpec([1], tf.float32, -1, 1),
      fc_layer_params=(1,),
      activation_fn=tf.nn.tanh)
  value_net = value_network.ValueNetwork(
      tensor_spec.TensorSpec(shape=[], dtype=tf.float32),
      fc_layer_params=(1,))
  super(FakePPOAgent, self).__init__(
      time_step_spec=ts.time_step_spec(
          tensor_spec.TensorSpec(shape=[], dtype=tf.float32)),
      action_spec=tensor_spec.BoundedTensorSpec([1], tf.float32, -1, 1),
      actor_net=actor_net,
      value_net=value_net,
      # Ensures value_prediction, return and advantage are included as parts
      # of the training_data_spec.
      compute_value_and_advantage_in_train=False,
      update_normalizers_in_train=False,
  )
  # There is an artificial call to `_train` during initialization which ensures
  # that the variables of the optimizer are created. It is excluded from the
  # call count.
  self.train_called_times = -1
  self.experiences = []
def testHandlesExtraOuterDims(self):
  observation_spec = tensor_spec.BoundedTensorSpec((8, 8, 3), tf.float32, 0, 1)
  time_step_spec = ts.time_step_spec(observation_spec)
  time_step = tensor_spec.sample_spec_nest(time_step_spec, outer_dims=(3, 2, 2))
  action_spec = [
      tensor_spec.BoundedTensorSpec((2,), tf.float32, 2, 3),
      tensor_spec.BoundedTensorSpec((3,), tf.int32, 0, 3)
  ]
  net = actor_distribution_network.ActorDistributionNetwork(
      observation_spec,
      action_spec,
      conv_layer_params=[(4, 2, 2)],
      fc_layer_params=(5,))
  action_distributions, _ = net(time_step.observation, time_step.step_type, ())
  self.evaluate(tf.compat.v1.global_variables_initializer())
  self.assertEqual([3, 2, 2, 2],
                   action_distributions[0].mode().shape.as_list())
  self.assertEqual([3, 2, 2, 3],
                   action_distributions[1].mode().shape.as_list())
def create_actor_network(train_env):

  def projection_net_factory(action_spec):
    return normal_projection_network.NormalProjectionNetwork(
        action_spec,
        mean_transform=None,
        state_dependent_std=True,
        std_transform=sac_agent.std_clip_transform,
        scale_distribution=True)

  return actor_distribution_network.ActorDistributionNetwork(
      train_env.observation_spec(),
      train_env.action_spec(),
      conv_layer_params=[
          (4, (5, 1), 1),
          (4, (1, 5), 2),
          (8, (5, 1), 1),
          (8, (1, 5), 2),
          (16, (5, 1), 1),
          (16, (1, 5), 2),
          (32, (5, 1), 1),
          (32, (1, 5), 2),
      ],
      fc_layer_params=[128, 128],
      continuous_projection_net=projection_net_factory,
  )
def test_tf_agents_on_policy_agent(self):
  learning_rate = 1e-3
  actor_fc_layers = (200, 100)
  value_fc_layers = (200, 100)
  env_name = "CartPole-v0"
  gym_env = gym.make(env_name)
  model_name = "ppo_tf_agent"
  train_env = environment_converter.gym_to_tf(gym_env)

  actor_net = actor_distribution_network.ActorDistributionNetwork(
      train_env.observation_spec(),
      train_env.action_spec(),
      fc_layer_params=actor_fc_layers,
  )
  value_net = value_network.ValueNetwork(
      train_env.observation_spec(), fc_layer_params=value_fc_layers)
  optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=learning_rate)
  agent = ppo_agent.PPOAgent(
      train_env.time_step_spec(),
      train_env.action_spec(),
      optimizer,
      actor_net=actor_net,
      value_net=value_net,
  )
  agent.initialize()

  # Train
  train(agent, gym_env, 2000, 195, model_name, 200)
  trained_env = get_saved_environments()[0]
  trained_models = get_trained_model_names(trained_env)
  model_saved = model_name in trained_models
  shutil.rmtree(save_path)
  self.assertTrue(model_saved)
def __init__(self):
  observation_tensor_spec = tf.TensorSpec(shape=[1], dtype=tf.float32)
  action_tensor_spec = tensor_spec.BoundedTensorSpec([1], tf.float32, -1, 1)
  actor_net = actor_distribution_network.ActorDistributionNetwork(
      observation_tensor_spec,
      action_tensor_spec,
      fc_layer_params=(1,),
      activation_fn=tf.nn.tanh,
      kernel_initializer=tf.keras.initializers.Orthogonal(seed=1),
      seed_stream_class=DeterministicSeedStream,
      seed=1)
  value_net = value_network.ValueNetwork(
      observation_tensor_spec, fc_layer_params=(1,))
  super(PPOAgentActorDist, self).__init__(
      time_step_spec=ts.time_step_spec(observation_tensor_spec),
      action_spec=action_tensor_spec,
      actor_net=actor_net,
      value_net=value_net,
      # Ensures value_prediction, return and advantage are included as parts
      # of the training_data_spec.
      compute_value_and_advantage_in_train=True,
      update_normalizers_in_train=False,
      optimizer=tf.compat.v1.train.AdamOptimizer(),
  )
  # There is an artificial call to `_train` during initialization which ensures
  # that the variables of the optimizer are created. It is excluded from the
  # call count.
  self.train_called_times = -1
  self.experiences = []
def test_same_actor_net_output(self):
  if not tf.executing_eagerly():
    self.skipTest('Skipping test: sequential networks not supported in TF1')
  observation_tensor_spec = tf.TensorSpec(shape=[1], dtype=tf.float32)
  action_tensor_spec = tensor_spec.BoundedTensorSpec((8,), tf.float32, -1, 1)

  actor_net_lib = ppo_actor_network.PPOActorNetwork()
  actor_net_lib.seed_stream_class = DeterministicSeedStream
  actor_net_sequential = actor_net_lib.create_sequential_actor_net(
      fc_layer_units=(1,), action_tensor_spec=action_tensor_spec, seed=1)
  actor_net_actor_dist = actor_distribution_network.ActorDistributionNetwork(
      observation_tensor_spec,
      action_tensor_spec,
      fc_layer_params=(1,),
      activation_fn=tf.nn.tanh,
      kernel_initializer=tf.keras.initializers.Orthogonal(seed=1),
      seed_stream_class=DeterministicSeedStream,
      seed=1)

  sample_observation = tf.constant([[1], [2]], dtype=tf.float32)
  tf.random.set_seed(111)
  sequential_output_dist, _ = actor_net_sequential(
      sample_observation, step_type=ts.StepType.MID, network_state=())
  tf.random.set_seed(111)
  actor_dist_output_dist, _ = actor_net_actor_dist(
      sample_observation, step_type=ts.StepType.MID, network_state=())
  self.assertAllEqual(sequential_output_dist.mean(),
                      actor_dist_output_dist.mean())
  self.assertAllEqual(sequential_output_dist.stddev(),
                      actor_dist_output_dist.stddev())
def test_same_policy_same_output(self):
  if not tf.executing_eagerly():
    self.skipTest('Skipping test: sequential networks not supported in TF1')
  observation_tensor_spec = tf.TensorSpec(shape=[1], dtype=tf.float32)
  action_tensor_spec = tensor_spec.BoundedTensorSpec((8,), tf.float32, -1, 1)
  value_net = value_network.ValueNetwork(
      observation_tensor_spec, fc_layer_params=(1,))

  actor_net_lib = ppo_actor_network.PPOActorNetwork()
  actor_net_lib.seed_stream_class = DeterministicSeedStream
  actor_net_sequential = actor_net_lib.create_sequential_actor_net(
      fc_layer_units=(1,), action_tensor_spec=action_tensor_spec, seed=1)
  actor_net_actor_dist = actor_distribution_network.ActorDistributionNetwork(
      observation_tensor_spec,
      action_tensor_spec,
      fc_layer_params=(1,),
      activation_fn=tf.nn.tanh,
      kernel_initializer=tf.keras.initializers.Orthogonal(seed=1),
      seed_stream_class=DeterministicSeedStream,
      seed=1)

  tf.random.set_seed(111)
  seq_policy = ppo_policy.PPOPolicy(
      ts.time_step_spec(observation_tensor_spec),
      action_tensor_spec,
      actor_net_sequential,
      value_net,
      collect=True)
  tf.random.set_seed(111)
  actor_dist_policy = ppo_policy.PPOPolicy(
      ts.time_step_spec(observation_tensor_spec),
      action_tensor_spec,
      actor_net_actor_dist,
      value_net,
      collect=True)

  sample_timestep = ts.TimeStep(
      step_type=tf.constant([1, 1], dtype=tf.int32),
      reward=tf.constant([1, 1], dtype=tf.float32),
      discount=tf.constant([1, 1], dtype=tf.float32),
      observation=tf.constant([[1], [2]], dtype=tf.float32))
  seq_policy_step = seq_policy._distribution(sample_timestep, policy_state=())
  act_dist_policy_step = actor_dist_policy._distribution(
      sample_timestep, policy_state=())

  seq_scale = seq_policy_step.info['dist_params']['scale_diag']
  act_dist_scale = act_dist_policy_step.info['dist_params']['scale']
  self.assertAllEqual(seq_scale, act_dist_scale)
  self.assertAllEqual(seq_policy_step.info['dist_params']['loc'],
                      act_dist_policy_step.info['dist_params']['loc'])
def testBuild(self):
  actor_network = actor_distribution_network.ActorDistributionNetwork(
      self._obs_spec, self._action_spec, fc_layer_params=(2, 1))
  policy = actor_policy.ActorPolicy(
      self._time_step_spec, self._action_spec, actor_network=actor_network)

  self.assertEqual(policy.time_step_spec, self._time_step_spec)
  self.assertEqual(policy.action_spec, self._action_spec)
def get_networks(tf_env, actor_fc_layers, value_fc_layers):
  actor_net = actor_distribution_network.ActorDistributionNetwork(
      tf_env.observation_spec(),
      tf_env.action_spec(),
      fc_layer_params=actor_fc_layers)
  value_net = value_network.ValueNetwork(
      tf_env.observation_spec(), fc_layer_params=value_fc_layers)
  return actor_net, value_net
def load_ppo_agent(train_env,
                   actor_fc_layers,
                   value_fc_layers,
                   learning_rate,
                   num_epochs,
                   preprocessing_layers=None,
                   preprocessing_combiner=None):
  """Creates a TF agent for the given environment that trains with the
  proximal policy optimization (PPO) algorithm.

  actor_fc_layers: tuple of integers giving the number of units in the
    intermediate layers of the actor network. All layers are Keras Dense
    layers.
  value_fc_layers: the same, for the value network.
  preprocessing_layers: already-constructed layers of the preprocessing
    networks, which convert observations to tensors. Needed when the
    observation spec is a list or dictionary.
  preprocessing_combiner: combiner for the preprocessing networks, typically
    concatenation.
  learning_rate: learning rate; a value of 0.001 or less is recommended.
  num_epochs: number of training epochs the agent executes per batch of
    collected episodes.

  For more details on PPO, see the tf_agents documentation:
  https://github.com/tensorflow/agents/tree/master/tf_agents
  or the paper: https://arxiv.org/abs/1707.06347
  """
  # Adam uses only first-order gradients but incorporates momentum to become
  # approximately second-order.
  optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=learning_rate)
  # Counter that starts at 0 and tracks the number of training steps.
  train_step_counter = tf.compat.v2.Variable(0)
  actor_net = actor_distribution_network.ActorDistributionNetwork(
      train_env.observation_spec(),
      train_env.action_spec(),
      preprocessing_combiner=preprocessing_combiner,
      preprocessing_layers=preprocessing_layers,
      fc_layer_params=actor_fc_layers,
  )
  value_net = value_network.ValueNetwork(
      train_env.observation_spec(),
      preprocessing_combiner=preprocessing_combiner,
      preprocessing_layers=preprocessing_layers,
      fc_layer_params=value_fc_layers)
  tf_agent = ppo_agent.PPOAgent(
      train_env.time_step_spec(),
      train_env.action_spec(),
      optimizer=optimizer,
      actor_net=actor_net,
      value_net=value_net,
      num_epochs=num_epochs,
      train_step_counter=train_step_counter,
      # Disabling normalization is crucial to avoid the agent getting stuck.
      normalize_rewards=False,
      normalize_observations=False,
      discount_factor=1.0,
  )
  # initialize() is necessary to create variables for the networks.
  tf_agent.initialize()
  return tf_agent
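# A minimal usage sketch for load_ppo_agent above (hypothetical; the environment
# name, layer sizes, and the suite_gym / tf_py_environment imports from
# tf_agents.environments are assumptions, not part of the original source):
train_env = tf_py_environment.TFPyEnvironment(suite_gym.load('CartPole-v0'))
ppo_agent_instance = load_ppo_agent(
    train_env,
    actor_fc_layers=(200, 100),
    value_fc_layers=(200, 100),
    learning_rate=1e-3,
    num_epochs=25)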
def GetAgent(self, env, params):

  def _normal_projection_net(action_spec, init_means_output_factor=0.1):
    return normal_projection_network.NormalProjectionNetwork(
        action_spec,
        mean_transform=None,
        state_dependent_std=True,
        init_means_output_factor=init_means_output_factor,
        std_transform=sac_agent.std_clip_transform,
        scale_distribution=True)

  # Actor network.
  actor_net = actor_distribution_network.ActorDistributionNetwork(
      env.observation_spec(),
      env.action_spec(),
      fc_layer_params=tuple(
          self._params["ML"]["BehaviorSACAgent"]["ActorFcLayerParams", "",
                                                 [512, 256, 256]]),
      continuous_projection_net=_normal_projection_net)

  # Critic network.
  critic_net = critic_network.CriticNetwork(
      (env.observation_spec(), env.action_spec()),
      observation_fc_layer_params=None,
      action_fc_layer_params=None,
      joint_fc_layer_params=tuple(
          self._params["ML"]["BehaviorSACAgent"]["CriticJointFcLayerParams",
                                                 "", [512, 256, 256]]))

  # Agent.
  tf_agent = sac_agent.SacAgent(
      env.time_step_spec(),
      env.action_spec(),
      actor_network=actor_net,
      critic_network=critic_net,
      actor_optimizer=tf.compat.v1.train.AdamOptimizer(
          learning_rate=self._params["ML"]["BehaviorSACAgent"][
              "ActorLearningRate", "", 3e-4]),
      critic_optimizer=tf.compat.v1.train.AdamOptimizer(
          learning_rate=self._params["ML"]["BehaviorSACAgent"][
              "CriticLearningRate", "", 3e-4]),
      alpha_optimizer=tf.compat.v1.train.AdamOptimizer(
          learning_rate=self._params["ML"]["BehaviorSACAgent"][
              "AlphaLearningRate", "", 3e-4]),
      target_update_tau=self._params["ML"]["BehaviorSACAgent"][
          "TargetUpdateTau", "", 0.05],
      target_update_period=self._params["ML"]["BehaviorSACAgent"][
          "TargetUpdatePeriod", "", 3],
      td_errors_loss_fn=tf.compat.v1.losses.mean_squared_error,
      gamma=self._params["ML"]["BehaviorSACAgent"]["Gamma", "", 0.995],
      reward_scale_factor=self._params["ML"]["BehaviorSACAgent"][
          "RewardScaleFactor", "", 1.],
      train_step_counter=self._ckpt.step,
      name=self._params["ML"]["BehaviorSACAgent"]["AgentName", "",
                                                  "sac_agent"],
      debug_summaries=self._params["ML"]["BehaviorSACAgent"][
          "DebugSummaries", "", False])
  tf_agent.initialize()
  return tf_agent
def __init__(
    self,
    model: flexs.Model,
    rounds: int,
    sequences_batch_size: int,
    model_queries_per_batch: int,
    starting_sequence: str,
    alphabet: str,
    log_file: Optional[str] = None,
):
  """Create PPO explorer."""
  super().__init__(
      model,
      "PPO_Agent",
      rounds,
      sequences_batch_size,
      model_queries_per_batch,
      starting_sequence,
      log_file,
  )
  self.alphabet = alphabet

  # Initialize tf_environment
  env = PPOEnv(
      alphabet=self.alphabet,
      starting_seq=starting_sequence,
      model=self.model,
      max_num_steps=self.model_queries_per_batch,
  )
  self.tf_env = tf_py_environment.TFPyEnvironment(env)

  encoder_layer = tf.keras.layers.Lambda(lambda obs: obs["sequence"])
  actor_net = actor_distribution_network.ActorDistributionNetwork(
      self.tf_env.observation_spec(),
      self.tf_env.action_spec(),
      preprocessing_combiner=encoder_layer,
      fc_layer_params=[128],
  )
  value_net = value_network.ValueNetwork(
      self.tf_env.observation_spec(),
      preprocessing_combiner=encoder_layer,
      fc_layer_params=[128],
  )

  # Create the PPO agent
  self.agent = ppo_agent.PPOAgent(
      time_step_spec=self.tf_env.time_step_spec(),
      action_spec=self.tf_env.action_spec(),
      optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5),
      actor_net=actor_net,
      value_net=value_net,
      num_epochs=10,
      summarize_grads_and_vars=False,
  )
  self.agent.initialize()
def get_d4rl_policy(env, weights, is_dapg=False):
  """Creates a TF Agents policy from D4RL saved weights."""
  hidden_dims = []
  fc_idx = 0
  while 'fc%d/weight' % fc_idx in weights:
    hidden_dims.append(np.shape(weights['fc%d/weight' % fc_idx])[0])
    fc_idx += 1

  if is_dapg:
    activation_fn = tf.keras.activations.tanh
    continuous_projection_net = functools.partial(
        normal_projection_network.NormalProjectionNetwork,
        mean_transform=None,
        std_transform=tf.exp,
        state_dependent_std=True)
  else:
    activation_fn = tf.keras.activations.relu
    continuous_projection_net = functools.partial(
        tanh_normal_projection_network.TanhNormalProjectionNetwork,
        std_transform=lambda x: tf.exp(tf.clip_by_value(x, -5., 2.)))

  actor_net = actor_distribution_network.ActorDistributionNetwork(
      env.observation_spec(),
      env.action_spec(),
      fc_layer_params=hidden_dims,
      continuous_projection_net=continuous_projection_net,
      activation_fn=activation_fn)
  policy = actor_policy.ActorPolicy(
      time_step_spec=env.time_step_spec(),
      action_spec=env.action_spec(),
      actor_network=actor_net,
      training=False)

  # Set weights.
  # pylint: disable=protected-access
  for fc_idx in range(len(hidden_dims)):
    actor_net._encoder.layers[fc_idx + 1].set_weights(
        [weights['fc%d/weight' % fc_idx].T, weights['fc%d/bias' % fc_idx]])
  if is_dapg:
    actor_net._projection_networks.layers[0].set_weights(
        [weights['last_fc/weight'].T, weights['last_fc/bias']])
    actor_net._projection_networks.layers[1].set_weights([
        weights['last_fc_log_std/weight'].T, weights['last_fc_log_std/bias']
    ])
  else:
    actor_net._projection_networks.layers[0].set_weights([
        np.concatenate(
            (weights['last_fc/weight'], weights['last_fc_log_std/weight']),
            axis=0).T,
        np.concatenate(
            (weights['last_fc/bias'], weights['last_fc_log_std/bias']),
            axis=0)
    ])
  # pylint: enable=protected-access
  return policy
def __init__(self, env):
  self.env = env
  self.net = actor_distribution_network.ActorDistributionNetwork(
      self.env.observation_spec(),
      self.env.action_spec(),
      conv_layer_params=[(32, 5, 1), (64, 5, 2), (128, 5, 2), (256, 5, 2)],
      fc_layer_params=(64, 2))
  self.optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=1e-3)
  self.strategy = strategy_utils.get_strategy(tpu=False, use_gpu=True)
def get_env_and_policy_from_weights(env_name: str,
                                    weights: Mapping[str, np.ndarray],
                                    n_hidden: int = 300,
                                    min_log_std: float = -5,
                                    max_log_std: float = 2):
  """Returns a tf_env and policy built from a dictionary of weights.

  Assumes that the policy has 2 hidden layers with `n_hidden` units, ReLU
  activations, and outputs a normal distribution squashed by a tanh.

  Args:
    env_name: Name of the environment.
    weights: Dictionary of weights containing keys: fc0/weight, fc0/bias,
      fc1/weight, fc1/bias, last_fc/weight, last_fc_log_std/weight,
      last_fc/bias, last_fc_log_std/bias.
    n_hidden: Number of units in each hidden layer.
    min_log_std: Lower bound used to clip the log standard deviation.
    max_log_std: Upper bound used to clip the log standard deviation.

  Returns:
    tf_env: TF wrapped env.
    policy: TF Agents policy.
  """
  env = suites.load_mujoco(env_name)
  tf_env = tf_py_environment.TFPyEnvironment(env)
  std_transform = (
      lambda x: tf.exp(tf.clip_by_value(x, min_log_std, max_log_std)))
  actor_net = actor_distribution_network.ActorDistributionNetwork(
      tf_env.observation_spec(),
      tf_env.action_spec(),
      fc_layer_params=(n_hidden, n_hidden),
      continuous_projection_net=functools.partial(
          tanh_normal_projection_network.TanhNormalProjectionNetwork,
          std_transform=std_transform),
      activation_fn=tf.keras.activations.relu,
  )
  policy = actor_policy.ActorPolicy(
      time_step_spec=tf_env.time_step_spec(),
      action_spec=tf_env.action_spec(),
      actor_network=actor_net,
      training=False)

  # Set weights.
  # pylint: disable=protected-access
  actor_net._encoder.layers[1].set_weights(
      [weights['fc0/weight'].T, weights['fc0/bias']])
  actor_net._encoder.layers[2].set_weights(
      [weights['fc1/weight'].T, weights['fc1/bias']])
  actor_net._projection_networks.layers[0].set_weights([
      np.concatenate(
          (weights['last_fc/weight'], weights['last_fc_log_std/weight']),
          axis=0).T,
      np.concatenate(
          (weights['last_fc/bias'], weights['last_fc_log_std/bias']),
          axis=0)
  ])
  # pylint: enable=protected-access
  return tf_env, policy
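# A hedged sketch of the weights layout expected by get_env_and_policy_from_weights
# above; the environment name and the zero-valued arrays are illustrative
# assumptions (Hopper-v2 has an 11-dim observation and a 3-dim action), not
# values from the original source.
obs_dim, act_dim, n_hidden = 11, 3, 300
example_weights = {
    'fc0/weight': np.zeros((n_hidden, obs_dim)),  # Transposed when loaded.
    'fc0/bias': np.zeros(n_hidden),
    'fc1/weight': np.zeros((n_hidden, n_hidden)),
    'fc1/bias': np.zeros(n_hidden),
    'last_fc/weight': np.zeros((act_dim, n_hidden)),  # Mean head.
    'last_fc/bias': np.zeros(act_dim),
    'last_fc_log_std/weight': np.zeros((act_dim, n_hidden)),  # Log-std head.
    'last_fc_log_std/bias': np.zeros(act_dim),
}
example_tf_env, example_policy = get_env_and_policy_from_weights(
    'Hopper-v2', example_weights)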
def get_sac_policy(tf_env):
  actor_net = actor_distribution_network.ActorDistributionNetwork(
      tf_env.observation_spec(),
      tf_env.action_spec(),
      fc_layer_params=(256, 256),
      continuous_projection_net=(
          tanh_normal_projection_network.TanhNormalProjectionNetwork))
  policy = actor_policy.ActorPolicy(
      time_step_spec=tf_env.time_step_spec(),
      action_spec=tf_env.action_spec(),
      actor_network=actor_net,
      training=False)
  return policy