Example #1
 def actor(self, pre_processing):
     return actor_distribution_network.ActorDistributionNetwork(
         self.observation_spec,
         self.action_spec,
         preprocessing_layers=pre_processing,
         fc_layer_params=self.actor_fc_layer_params,
         continuous_projection_net=self.normal_projection_net)
Example #2
def get_actor_and_value_network(action_spec, observation_spec):
    preprocessing_layers = tfk.Sequential([
        tfk.layers.Lambda(lambda x: x - 0.5),  # Normalization
        tfk.layers.MaxPooling2D((5, 5), strides=(5, 5)),
        tfk.layers.Conv2D(256, (11, 3), (1, 1),
                          padding='valid',
                          activation='relu'),
        tfk.layers.Reshape((-1, 256)),
        tfk.layers.Conv1D(128, 1, activation='relu'),
        tfk.layers.Flatten()
    ])

    actor_net = actor_distribution_network.ActorDistributionNetwork(
        observation_spec,
        action_spec,
        preprocessing_layers=preprocessing_layers,
        fc_layer_params=(200, 100),
        activation_fn=tfk.activations.relu)
    value_net = value_network.ValueNetwork(
        observation_spec,
        preprocessing_layers=preprocessing_layers,
        fc_layer_params=(200, 100),
        activation_fn=tfk.activations.relu)

    return actor_net, value_net
Example #3
  def testUpdateAdaptiveKlBeta(self):
    actor_net = actor_distribution_network.ActorDistributionNetwork(
        self._time_step_spec.observation,
        self._action_spec,
        fc_layer_params=None)
    value_net = value_network.ValueNetwork(
        self._time_step_spec.observation, fc_layer_params=None)
    agent = ppo_agent.PPOAgent(
        self._time_step_spec,
        self._action_spec,
        tf.compat.v1.train.AdamOptimizer(),
        actor_net=actor_net,
        value_net=value_net,
        initial_adaptive_kl_beta=1.0,
        adaptive_kl_target=10.0,
        adaptive_kl_tolerance=0.5,
    )

    self.evaluate(tf.compat.v1.global_variables_initializer())

    # When KL equals the target KL, beta should not change.
    update_adaptive_kl_beta_fn = common.function(agent.update_adaptive_kl_beta)
    beta_0 = update_adaptive_kl_beta_fn([10.0])
    expected_beta_0 = 1.0
    self.assertEqual(expected_beta_0, self.evaluate(beta_0))

    # When KL is large, beta should increase.
    beta_1 = update_adaptive_kl_beta_fn([100.0])
    expected_beta_1 = 1.5
    self.assertEqual(expected_beta_1, self.evaluate(beta_1))

    # When KL is small, beta should decrease.
    beta_2 = update_adaptive_kl_beta_fn([1.0])
    expected_beta_2 = 1.0
    self.assertEqual(expected_beta_2, self.evaluate(beta_2))
Example #4
    def GetAgent(self, env, params):
        actor_net = actor_distribution_network.ActorDistributionNetwork(
            env.observation_spec(),
            env.action_spec(),
            fc_layer_params=tuple(self._ppo_params["ActorFcLayerParams", "",
                                                   [512, 256, 256]]))
        value_net = value_network.ValueNetwork(
            env.observation_spec(),
            fc_layer_params=tuple(self._ppo_params["CriticFcLayerParams", "",
                                                   [512, 256, 256]]))

        tf_agent = ppo_agent.PPOAgent(
            env.time_step_spec(),
            env.action_spec(),
            actor_net=actor_net,
            value_net=value_net,
            normalize_observations=self._ppo_params["NormalizeObservations",
                                                    "", False],
            normalize_rewards=self._ppo_params["NormalizeRewards", "", False],
            optimizer=tf.compat.v1.train.AdamOptimizer(
                learning_rate=self._ppo_params["LearningRate", "", 3e-4]),
            train_step_counter=self._ckpt.step,
            num_epochs=self._ppo_params["NumEpochs", "", 30],
            name=self._ppo_params["AgentName", "", "ppo_agent"],
            debug_summaries=self._ppo_params["DebugSummaries", "", False])
        tf_agent.initialize()
        return tf_agent
Example #5
    def testHandlePreprocessingLayers(self):
        observation_spec = (tensor_spec.TensorSpec([1], tf.float32),
                            tensor_spec.TensorSpec([], tf.float32))
        time_step_spec = ts.time_step_spec(observation_spec)
        time_step = tensor_spec.sample_spec_nest(time_step_spec,
                                                 outer_dims=(3, ))

        action_spec = [
            tensor_spec.BoundedTensorSpec((2, ), tf.float32, 2, 3),
            tensor_spec.BoundedTensorSpec((3, ), tf.int32, 0, 3)
        ]

        preprocessing_layers = (tf.keras.layers.Dense(4),
                                tf.keras.Sequential([
                                    tf.keras.layers.Reshape((1, )),
                                    tf.keras.layers.Dense(4)
                                ]))

        net = actor_distribution_network.ActorDistributionNetwork(
            observation_spec,
            action_spec,
            preprocessing_layers=preprocessing_layers,
            preprocessing_combiner=tf.keras.layers.Add())

        action_distributions, _ = net(time_step.observation,
                                      time_step.step_type, ())
        self.evaluate(tf.compat.v1.global_variables_initializer())
        self.assertEqual([3, 2],
                         action_distributions[0].mode().shape.as_list())
        self.assertEqual([3, 3],
                         action_distributions[1].mode().shape.as_list())
        self.assertGreater(len(net.trainable_variables), 4)
Example #6
    def testUpdateAdaptiveKlBeta(self):
        if tf.executing_eagerly():
            self.skipTest('b/123777119')  # Secondary bug: ('b/123770194')
        actor_net = actor_distribution_network.ActorDistributionNetwork(
            self._time_step_spec.observation,
            self._action_spec,
            fc_layer_params=None)
        value_net = value_network.ValueNetwork(
            self._time_step_spec.observation, fc_layer_params=None)
        agent = ppo_agent.PPOAgent(
            self._time_step_spec,
            self._action_spec,
            tf.compat.v1.train.AdamOptimizer(),
            actor_net=actor_net,
            value_net=value_net,
            initial_adaptive_kl_beta=1.0,
            adaptive_kl_target=10.0,
            adaptive_kl_tolerance=0.5,
        )

        self.evaluate(tf.compat.v1.global_variables_initializer())

        # When KL equals the target KL, beta should not change.
        beta_0 = agent.update_adaptive_kl_beta(10.0)
        self.assertEqual(self.evaluate(beta_0), 1.0)

        # When KL is large, beta should increase.
        beta_1 = agent.update_adaptive_kl_beta(100.0)
        self.assertEqual(self.evaluate(beta_1), 1.5)

        # When KL is small, beta should decrease.
        beta_2 = agent.update_adaptive_kl_beta(1.0)
        self.assertEqual(self.evaluate(beta_2), 1.0)
Example #7
  def testDropoutFCLayersWithConv(self, training):
    tf.compat.v1.set_random_seed(0)
    observation_spec = tensor_spec.BoundedTensorSpec((8, 8, 3), tf.float32, 0,
                                                     1)
    time_step_spec = ts.time_step_spec(observation_spec)
    time_step = tensor_spec.sample_spec_nest(time_step_spec, outer_dims=(1,))
    action_spec = tensor_spec.BoundedTensorSpec((2,), tf.float32, 2, 3)

    net = actor_distribution_network.ActorDistributionNetwork(
        observation_spec,
        action_spec,
        conv_layer_params=[(4, 2, 2)],
        fc_layer_params=[5],
        dropout_layer_params=[0.5])

    action_distributions1, _ = net(
        time_step.observation, time_step.step_type, (), training=training)
    action_distributions2, _ = net(
        time_step.observation, time_step.step_type, (), training=training)
    mode1 = action_distributions1.mode()
    mode2 = action_distributions2.mode()

    self.evaluate(tf.compat.v1.global_variables_initializer())
    mode1, mode2 = self.evaluate([mode1, mode2])

    if training:
      self.assertGreater(np.linalg.norm(mode1 - mode2), 0)
    else:
      self.assertAllEqual(mode1, mode2)
Example #8
    def testDropoutFCLayersWithConv(self, training):
        observation_spec = tensor_spec.BoundedTensorSpec((8, 8, 3), tf.float32,
                                                         0, 1)
        time_step_spec = ts.time_step_spec(observation_spec)
        time_step = tensor_spec.sample_spec_nest(time_step_spec,
                                                 outer_dims=(1, ))
        action_spec = tensor_spec.BoundedTensorSpec((2, ), tf.float32, 2, 3)

        net = actor_distribution_network.ActorDistributionNetwork(
            observation_spec,
            action_spec,
            conv_layer_params=[(4, 2, 2)],
            fc_layer_params=[5],
            dropout_layer_params=[0.5])

        modes = []
        num_modes = 10
        for _ in range(num_modes):
            action_distributions, _ = net(time_step.observation,
                                          time_step.step_type, (),
                                          training=training)
            modes.append(action_distributions.mode())

        self.evaluate(tf.compat.v1.global_variables_initializer())
        modes = self.evaluate(modes)

        # The modes should differ across forward passes iff dropout is active
        # (training=True). Stop comparing as soon as one differing pair is found.
        modes_differ = False
        for i in range(num_modes):
            for j in range(i + 1, num_modes):
                if np.linalg.norm(modes[i] - modes[j]) > 1e-6:
                    modes_differ = True
                    break
            if modes_differ:
                break

        self.assertEqual(training, modes_differ)
Example #9
    def testAgentTransitionTrain(self):
        actor_net = actor_distribution_network.ActorDistributionNetwork(
            self._obs_spec,
            self._action_spec,
            fc_layer_params=(10, ),
            continuous_projection_net=tanh_normal_projection_network.
            TanhNormalProjectionNetwork)

        agent = sac_agent.SacAgent(
            self._time_step_spec,
            self._action_spec,
            critic_network=DummyCriticNet(),
            actor_network=actor_net,
            actor_optimizer=tf.compat.v1.train.AdamOptimizer(0.001),
            critic_optimizer=tf.compat.v1.train.AdamOptimizer(0.001),
            alpha_optimizer=tf.compat.v1.train.AdamOptimizer(0.001))

        time_step_spec = self._time_step_spec._replace(
            reward=tensor_spec.BoundedTensorSpec(
                [], tf.float32, minimum=0.0, maximum=1.0, name='reward'))

        transition_spec = trajectory.Transition(
            time_step=time_step_spec,
            action_step=policy_step.PolicyStep(action=self._action_spec,
                                               state=(),
                                               info=()),
            next_time_step=time_step_spec)

        sample_trajectory_experience = tensor_spec.sample_spec_nest(
            transition_spec, outer_dims=(3, ))
        agent.train(sample_trajectory_experience)
Example #10
    def testAgentTrajectoryTrain(self):
        actor_net = actor_distribution_network.ActorDistributionNetwork(
            self._obs_spec,
            self._action_spec,
            fc_layer_params=(10, ),
            continuous_projection_net=tanh_normal_projection_network.
            TanhNormalProjectionNetwork)

        agent = sac_agent.SacAgent(
            self._time_step_spec,
            self._action_spec,
            critic_network=DummyCriticNet(),
            actor_network=actor_net,
            actor_optimizer=tf.compat.v1.train.AdamOptimizer(0.001),
            critic_optimizer=tf.compat.v1.train.AdamOptimizer(0.001),
            alpha_optimizer=tf.compat.v1.train.AdamOptimizer(0.001))

        trajectory_spec = trajectory.Trajectory(
            step_type=self._time_step_spec.step_type,
            observation=self._time_step_spec.observation,
            action=self._action_spec,
            policy_info=(),
            next_step_type=self._time_step_spec.step_type,
            reward=tensor_spec.BoundedTensorSpec([],
                                                 tf.float32,
                                                 minimum=0.0,
                                                 maximum=1.0,
                                                 name='reward'),
            discount=self._time_step_spec.discount)

        sample_trajectory_experience = tensor_spec.sample_spec_nest(
            trajectory_spec, outer_dims=(3, 2))
        agent.train(sample_trajectory_experience)
Example #11
def load_reinforce_agent(train_env,
                         actor_fc_layers,
                         learning_rate,
                         num_epochs,
                         preprocessing_layers=None,
                         preprocessing_combiner=None):
    """
	Creates a REINFORCE agent, using the same inputs as load_ppo_agent. Note that the reinforce algorithm is pure policy gradient and does not have an critic (i.e., value) network. 
	"""

    optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=learning_rate)
    train_step_counter = tf.compat.v2.Variable(0)

    actor_net = actor_distribution_network.ActorDistributionNetwork(
        train_env.observation_spec(),
        train_env.action_spec(),
        fc_layer_params=actor_fc_layers,
        preprocessing_layers=preprocessing_layers,
        preprocessing_combiner=preprocessing_combiner)

    tf_agent = reinforce_agent.ReinforceAgent(
        train_env.time_step_spec(),
        train_env.action_spec(),
        actor_network=actor_net,
        optimizer=optimizer,
        normalize_returns=False,
        train_step_counter=train_step_counter)

    tf_agent.initialize()
    return tf_agent
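As a quick usage sketch (not part of the original example), the helper above could be driven with a simple Gym environment wrapped by the standard tf_agents utilities; the environment name, layer sizes, and learning rate below are illustrative choices only.

from tf_agents.environments import suite_gym, tf_py_environment

# Wrap a simple Gym environment as a TF environment.
train_env = tf_py_environment.TFPyEnvironment(suite_gym.load('CartPole-v0'))

# Build the REINFORCE agent with the helper defined above.
reinforce = load_reinforce_agent(
    train_env,
    actor_fc_layers=(100, 50),
    learning_rate=1e-3,
    num_epochs=1)  # accepted by the helper but not forwarded to the agent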
Example #12
  def testKlCutoffLoss(self, not_zero):
    kl_cutoff_coef = 30.0 * not_zero
    actor_net = actor_distribution_network.ActorDistributionNetwork(
        self._time_step_spec.observation,
        self._action_spec,
        fc_layer_params=None)
    value_net = value_network.ValueNetwork(
        self._time_step_spec.observation, fc_layer_params=None)
    agent = ppo_agent.PPOAgent(
        self._time_step_spec,
        self._action_spec,
        tf.compat.v1.train.AdamOptimizer(),
        actor_net=actor_net,
        value_net=value_net,
        kl_cutoff_factor=5.0,
        adaptive_kl_target=0.1,
        kl_cutoff_coef=kl_cutoff_coef,
    )
    kl_divergence = tf.constant([[1.5, -0.5, 6.5, -1.5, -2.3]],
                                dtype=tf.float32)
    expected_kl_cutoff_loss = kl_cutoff_coef * (.24**2)  # (0.74 - 0.5) ^ 2

    loss = agent.kl_cutoff_loss(kl_divergence)
    self.evaluate(tf.compat.v1.global_variables_initializer())
    loss_ = self.evaluate(loss)
    self.assertAllClose([loss_], [expected_kl_cutoff_loss])
Example #13
  def testMasking(self):
    batch_size = 1000
    num_state_dims = 5
    num_actions = 8
    observations = tf.random.uniform([batch_size, num_state_dims])
    time_step = ts.restart(observations, batch_size=batch_size)
    input_tensor_spec = tensor_spec.TensorSpec([num_state_dims], tf.float32)
    time_step_spec = ts.time_step_spec(input_tensor_spec)
    action_spec = tensor_spec.BoundedTensorSpec(
        [1], tf.int32, 0, num_actions - 1)

    # We create a fixed mask here for testing purposes. Normally the mask would
    # be part of the observation.
    mask = [0, 1, 0, 1, 0, 0, 1, 0]
    np_mask = np.array(mask)
    tf_mask = tf.constant([mask for _ in range(batch_size)])
    actor_network = actor_distribution_network.ActorDistributionNetwork(
        input_tensor_spec, action_spec, fc_layer_params=(2, 1))
    policy = actor_policy.ActorPolicy(
        time_step_spec, action_spec, actor_network=actor_network,
        observation_and_action_constraint_splitter=(
            lambda observation: (observation, tf_mask)))

    # Force creation of variables before global_variables_initializer.
    policy.variables()
    self.evaluate(tf.compat.v1.global_variables_initializer())

    # Sample a batch of 1000 actions from the policy and ensure that actions
    # considered invalid according to the mask are never chosen.
    action_step = policy.action(time_step)
    action = self.evaluate(action_step.action)
    self.assertEqual(action.shape, (batch_size, 1))
    self.assertAllEqual(np_mask[action], np.ones([batch_size, 1]))
Example #14
    def __init__(self, strategy=None):

        self._strategy = strategy

        actor_net = actor_distribution_network.ActorDistributionNetwork(
            tensor_spec.TensorSpec(shape=[], dtype=tf.float32),
            tensor_spec.BoundedTensorSpec([1], tf.float32, -1, 1),
            fc_layer_params=(1, ),
            activation_fn=tf.nn.tanh)
        value_net = value_network.ValueNetwork(tensor_spec.TensorSpec(
            shape=[], dtype=tf.float32),
                                               fc_layer_params=(1, ))

        super(FakePPOAgent, self).__init__(
            time_step_spec=ts.time_step_spec(
                tensor_spec.TensorSpec(shape=[], dtype=tf.float32)),
            action_spec=tensor_spec.BoundedTensorSpec([1], tf.float32, -1, 1),
            actor_net=actor_net,
            value_net=value_net,
            # Ensures value_prediction, return and normalized_advantage are included
            # as part of the training_data_spec.
            compute_value_and_advantage_in_train=False,
            update_normalizers_in_train=False,
        )
        self.train_called_times = tf.Variable(0, dtype=tf.int32)
        self.experiences = []
Example #15
    def __init__(self):

        actor_net = actor_distribution_network.ActorDistributionNetwork(
            tensor_spec.TensorSpec(shape=[], dtype=tf.float32),
            tensor_spec.BoundedTensorSpec([1], tf.float32, -1, 1),
            fc_layer_params=(1, ),
            activation_fn=tf.nn.tanh)
        value_net = value_network.ValueNetwork(tensor_spec.TensorSpec(
            shape=[], dtype=tf.float32),
                                               fc_layer_params=(1, ))

        super(FakePPOAgent, self).__init__(
            time_step_spec=ts.time_step_spec(
                tensor_spec.TensorSpec(shape=[], dtype=tf.float32)),
            action_spec=tensor_spec.BoundedTensorSpec([1], tf.float32, -1, 1),
            actor_net=actor_net,
            value_net=value_net,
            # Ensures value_prediction, return and advantage are included as parts
            # of the training_data_spec.
            compute_value_and_advantage_in_train=False,
            update_normalizers_in_train=False,
        )
        # There is an artificial call on `_train` during the initialization which
        # ensures that the variables of the optimizer are initialized. This is
        # excluded from the call count.
        self.train_called_times = -1
        self.experiences = []
Example #16
    def testHandlesExtraOuterDims(self):
        observation_spec = tensor_spec.BoundedTensorSpec((8, 8, 3), tf.float32,
                                                         0, 1)
        time_step_spec = ts.time_step_spec(observation_spec)
        time_step = tensor_spec.sample_spec_nest(time_step_spec,
                                                 outer_dims=(3, 2, 2))

        action_spec = [
            tensor_spec.BoundedTensorSpec((2, ), tf.float32, 2, 3),
            tensor_spec.BoundedTensorSpec((3, ), tf.int32, 0, 3)
        ]

        net = actor_distribution_network.ActorDistributionNetwork(
            observation_spec,
            action_spec,
            conv_layer_params=[(4, 2, 2)],
            fc_layer_params=(5, ))

        action_distributions, _ = net(time_step.observation,
                                      time_step.step_type, ())
        self.evaluate(tf.compat.v1.global_variables_initializer())
        self.assertEqual([3, 2, 2, 2],
                         action_distributions[0].mode().shape.as_list())
        self.assertEqual([3, 2, 2, 3],
                         action_distributions[1].mode().shape.as_list())
Example #17
def create_actor_network(train_env):
    def projection_net_factory(action_spec):
        return normal_projection_network.NormalProjectionNetwork(
            action_spec,
            mean_transform=None,
            state_dependent_std=True,
            std_transform=sac_agent.std_clip_transform,
            scale_distribution=True)

    return actor_distribution_network.ActorDistributionNetwork(
        train_env.observation_spec(),
        train_env.action_spec(),
        conv_layer_params=[
            (4, (5, 1), 1),
            (4, (1, 5), 2),
            (8, (5, 1), 1),
            (8, (1, 5), 2),
            (16, (5, 1), 1),
            (16, (1, 5), 2),
            (32, (5, 1), 1),
            (32, (1, 5), 2),
        ],
        fc_layer_params=[128, 128],
        continuous_projection_net=projection_net_factory,
    )
Example #18
    def test_tf_agents_on_policy_agent(self):
        learning_rate = 1e-3
        actor_fc_layers = (200, 100)
        value_fc_layers = (200, 100)
        env_name = "CartPole-v0"
        gym_env = gym.make(env_name)
        model_name = "ppo_tf_agent"
        train_env = environment_converter.gym_to_tf(gym_env)
        actor_net = actor_distribution_network.ActorDistributionNetwork(
            train_env.observation_spec(),
            train_env.action_spec(),
            fc_layer_params=actor_fc_layers,
        )
        value_net = value_network.ValueNetwork(train_env.observation_spec(),
                                               fc_layer_params=value_fc_layers)

        optimizer = tf.compat.v1.train.AdamOptimizer(
            learning_rate=learning_rate)
        agent = ppo_agent.PPOAgent(
            train_env.time_step_spec(),
            train_env.action_spec(),
            optimizer,
            actor_net=actor_net,
            value_net=value_net,
        )
        agent.initialize()

        # Train
        train(agent, gym_env, 2000, 195, model_name, 200)
        trained_env = get_saved_environments()[0]
        trained_models = get_trained_model_names(trained_env)
        model_saved = model_name in trained_models
        shutil.rmtree(save_path)
        self.assertTrue(model_saved)
Example #19
    def __init__(self):

        observation_tensor_spec = tf.TensorSpec(shape=[1], dtype=tf.float32)
        action_tensor_spec = tensor_spec.BoundedTensorSpec([1], tf.float32, -1,
                                                           1)

        actor_net = actor_distribution_network.ActorDistributionNetwork(
            observation_tensor_spec,
            action_tensor_spec,
            fc_layer_params=(1, ),
            activation_fn=tf.nn.tanh,
            kernel_initializer=tf.keras.initializers.Orthogonal(seed=1),
            seed_stream_class=DeterministicSeedStream,
            seed=1)

        value_net = value_network.ValueNetwork(observation_tensor_spec,
                                               fc_layer_params=(1, ))

        super(PPOAgentActorDist, self).__init__(
            time_step_spec=ts.time_step_spec(observation_tensor_spec),
            action_spec=action_tensor_spec,
            actor_net=actor_net,
            value_net=value_net,
            # With compute_value_and_advantage_in_train=True, value predictions
            # and advantages are computed inside `train` rather than being part
            # of the training_data_spec.
            compute_value_and_advantage_in_train=True,
            update_normalizers_in_train=False,
            optimizer=tf.compat.v1.train.AdamOptimizer(),
        )
        # There is an artificial call on `_train` during the initialization which
        # ensures that the variables of the optimizer are initialized. This is
        # excluded from the call count.
        self.train_called_times = -1
        self.experiences = []
Example #20
    def test_same_actor_net_output(self):
        if not tf.executing_eagerly():
            self.skipTest(
                'Skipping test: sequential networks not supported in TF1')
        observation_tensor_spec = tf.TensorSpec(shape=[1], dtype=tf.float32)
        action_tensor_spec = tensor_spec.BoundedTensorSpec((8, ), tf.float32,
                                                           -1, 1)

        actor_net_lib = ppo_actor_network.PPOActorNetwork()
        actor_net_lib.seed_stream_class = DeterministicSeedStream
        actor_net_sequential = actor_net_lib.create_sequential_actor_net(
            fc_layer_units=(1, ),
            action_tensor_spec=action_tensor_spec,
            seed=1)

        actor_net_actor_dist = actor_distribution_network.ActorDistributionNetwork(
            observation_tensor_spec,
            action_tensor_spec,
            fc_layer_params=(1, ),
            activation_fn=tf.nn.tanh,
            kernel_initializer=tf.keras.initializers.Orthogonal(seed=1),
            seed_stream_class=DeterministicSeedStream,
            seed=1)

        sample_observation = tf.constant([[1], [2]], dtype=tf.float32)
        tf.random.set_seed(111)
        sequential_output_dist, _ = actor_net_sequential(
            sample_observation, step_type=ts.StepType.MID, network_state=())
        tf.random.set_seed(111)
        actor_dist_output_dist, _ = actor_net_actor_dist(
            sample_observation, step_type=ts.StepType.MID, network_state=())
        self.assertAllEqual(sequential_output_dist.mean(),
                            actor_dist_output_dist.mean())
        self.assertAllEqual(sequential_output_dist.stddev(),
                            actor_dist_output_dist.stddev())
Example #21
    def test_same_policy_same_output(self):
        if not tf.executing_eagerly():
            self.skipTest(
                'Skipping test: sequential networks not supported in TF1')
        observation_tensor_spec = tf.TensorSpec(shape=[1], dtype=tf.float32)
        action_tensor_spec = tensor_spec.BoundedTensorSpec((8, ), tf.float32,
                                                           -1, 1)

        value_net = value_network.ValueNetwork(observation_tensor_spec,
                                               fc_layer_params=(1, ))

        actor_net_lib = ppo_actor_network.PPOActorNetwork()
        actor_net_lib.seed_stream_class = DeterministicSeedStream
        actor_net_sequential = actor_net_lib.create_sequential_actor_net(
            fc_layer_units=(1, ),
            action_tensor_spec=action_tensor_spec,
            seed=1)
        actor_net_actor_dist = actor_distribution_network.ActorDistributionNetwork(
            observation_tensor_spec,
            action_tensor_spec,
            fc_layer_params=(1, ),
            activation_fn=tf.nn.tanh,
            kernel_initializer=tf.keras.initializers.Orthogonal(seed=1),
            seed_stream_class=DeterministicSeedStream,
            seed=1)

        tf.random.set_seed(111)
        seq_policy = ppo_policy.PPOPolicy(
            ts.time_step_spec(observation_tensor_spec),
            action_tensor_spec,
            actor_net_sequential,
            value_net,
            collect=True)
        tf.random.set_seed(111)
        actor_dist_policy = ppo_policy.PPOPolicy(
            ts.time_step_spec(observation_tensor_spec),
            action_tensor_spec,
            actor_net_actor_dist,
            value_net,
            collect=True)

        sample_timestep = ts.TimeStep(step_type=tf.constant([1, 1],
                                                            dtype=tf.int32),
                                      reward=tf.constant([1, 1],
                                                         dtype=tf.float32),
                                      discount=tf.constant([1, 1],
                                                           dtype=tf.float32),
                                      observation=tf.constant(
                                          [[1], [2]], dtype=tf.float32))
        seq_policy_step = seq_policy._distribution(sample_timestep,
                                                   policy_state=())
        act_dist_policy_step = actor_dist_policy._distribution(sample_timestep,
                                                               policy_state=())

        seq_scale = seq_policy_step.info['dist_params']['scale_diag']
        act_dist_scale = act_dist_policy_step.info['dist_params']['scale']
        self.assertAllEqual(seq_scale, act_dist_scale)
        self.assertAllEqual(seq_policy_step.info['dist_params']['loc'],
                            act_dist_policy_step.info['dist_params']['loc'])
Example #22
  def testBuild(self):
    actor_network = actor_distribution_network.ActorDistributionNetwork(
        self._obs_spec, self._action_spec, fc_layer_params=(2, 1))
    policy = actor_policy.ActorPolicy(
        self._time_step_spec, self._action_spec, actor_network=actor_network)

    self.assertEqual(policy.time_step_spec, self._time_step_spec)
    self.assertEqual(policy.action_spec, self._action_spec)
Example #23
def get_networks(tf_env, actor_fc_layers, value_fc_layers):
    actor_net = actor_distribution_network.ActorDistributionNetwork(
        tf_env.observation_spec(),
        tf_env.action_spec(),
        fc_layer_params=actor_fc_layers)
    value_net = value_network.ValueNetwork(tf_env.observation_spec(),
                                           fc_layer_params=value_fc_layers)
    return actor_net, value_net
Example #24
def load_ppo_agent(train_env,
                   actor_fc_layers,
                   value_fc_layers,
                   learning_rate,
                   num_epochs,
                   preprocessing_layers=None,
                   preprocessing_combiner=None):
    """
	Function which creates a tensorflow agent for a given environment with specified parameters, which uses the 
	proximal policy optimization (PPO) algorithm for training. 
	actor_fc_layers: tuple of integers, indicating the number of units in intermediate layers of the actor network. All layers are Keras Dense layers
	actor_fc_layers: same for value network
	preprocessing_layers: already-contructed layers of the preprocessing networks, which converts observations to tensors. Needed when the observation spec is either a list or dictionary
	preprocessing_combiner: combiner for the preprocessing networks, typically by concatenation. 
	learning_rate: learning rate, recommended value 0.001 or less
	num_epochs: number of training epochs which the agent executes per batch of collected episodes. 
	
	For more details on PPO, see the documentation of tf_agents: https://github.com/tensorflow/agents/tree/master/tf_agents
	or the paper: https://arxiv.org/abs/1707.06347
	"""

    # Adam: a first-order gradient method with momentum and per-parameter
    # adaptive step sizes.
    optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=learning_rate)

    # Step counter starting at 0, incremented on every training step.
    train_step_counter = tf.compat.v2.Variable(0)

    actor_net = actor_distribution_network.ActorDistributionNetwork(
        train_env.observation_spec(),
        train_env.action_spec(),
        preprocessing_combiner=preprocessing_combiner,
        preprocessing_layers=preprocessing_layers,
        fc_layer_params=actor_fc_layers,
    )
    value_net = value_network.ValueNetwork(
        train_env.observation_spec(),
        preprocessing_combiner=preprocessing_combiner,
        preprocessing_layers=preprocessing_layers,
        fc_layer_params=value_fc_layers)

    tf_agent = ppo_agent.PPOAgent(
        train_env.time_step_spec(),
        train_env.action_spec(),
        optimizer=optimizer,
        actor_net=actor_net,
        value_net=value_net,
        num_epochs=num_epochs,
        train_step_counter=train_step_counter,
        normalize_rewards=False,  # Crucial to avoid the agent getting stuck.
        normalize_observations=False,  # Same.
        discount_factor=1.0,
    )

    tf_agent.initialize()  # Necessary to create the networks' variables.
    return tf_agent
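A minimal usage sketch for the helper above, assuming the CartPole-v0 environment already used in Example #18 and the standard tf_agents wrappers; the layer sizes, learning rate, and epoch count are arbitrary illustrative values, and no preprocessing layers are needed because the observation spec is a single tensor.

from tf_agents.environments import suite_gym, tf_py_environment

train_env = tf_py_environment.TFPyEnvironment(suite_gym.load('CartPole-v0'))
ppo = load_ppo_agent(
    train_env,
    actor_fc_layers=(200, 100),
    value_fc_layers=(200, 100),
    learning_rate=1e-4,
    num_epochs=10)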
Example #25
    def GetAgent(self, env, params):
        def _normal_projection_net(action_spec, init_means_output_factor=0.1):
            return normal_projection_network.NormalProjectionNetwork(
                action_spec,
                mean_transform=None,
                state_dependent_std=True,
                init_means_output_factor=init_means_output_factor,
                std_transform=sac_agent.std_clip_transform,
                scale_distribution=True)

        # actor network
        actor_net = actor_distribution_network.ActorDistributionNetwork(
            env.observation_spec(),
            env.action_spec(),
            fc_layer_params=tuple(
                self._params["ML"]["BehaviorSACAgent"]["ActorFcLayerParams",
                                                       "", [512, 256, 256]]),
            continuous_projection_net=_normal_projection_net)

        # critic network
        critic_net = critic_network.CriticNetwork(
            (env.observation_spec(), env.action_spec()),
            observation_fc_layer_params=None,
            action_fc_layer_params=None,
            joint_fc_layer_params=tuple(self._params["ML"]["BehaviorSACAgent"][
                "CriticJointFcLayerParams", "", [512, 256, 256]]))

        # agent
        tf_agent = sac_agent.SacAgent(
            env.time_step_spec(),
            env.action_spec(),
            actor_network=actor_net,
            critic_network=critic_net,
            actor_optimizer=tf.compat.v1.train.AdamOptimizer(
                learning_rate=self._params["ML"]["BehaviorSACAgent"][
                    "ActorLearningRate", "", 3e-4]),
            critic_optimizer=tf.compat.v1.train.AdamOptimizer(
                learning_rate=self._params["ML"]["BehaviorSACAgent"][
                    "CriticLearningRate", "", 3e-4]),
            alpha_optimizer=tf.compat.v1.train.AdamOptimizer(
                learning_rate=self._params["ML"]["BehaviorSACAgent"][
                    "AlphaLearningRate", "", 3e-4]),
            target_update_tau=self._params["ML"]["BehaviorSACAgent"][
                "TargetUpdateTau", "", 0.05],
            target_update_period=self._params["ML"]["BehaviorSACAgent"][
                "TargetUpdatePeriod", "", 3],
            td_errors_loss_fn=tf.compat.v1.losses.mean_squared_error,
            gamma=self._params["ML"]["BehaviorSACAgent"]["Gamma", "", 0.995],
            reward_scale_factor=self._params["ML"]["BehaviorSACAgent"][
                "RewardScaleFactor", "", 1.],
            train_step_counter=self._ckpt.step,
            name=self._params["ML"]["BehaviorSACAgent"]["AgentName", "",
                                                        "sac_agent"],
            debug_summaries=self._params["ML"]["BehaviorSACAgent"][
                "DebugSummaries", "", False])

        tf_agent.initialize()
        return tf_agent
Example #26
File: ppo.py Project: samsinai/FLEXS
    def __init__(
        self,
        model: flexs.Model,
        rounds: int,
        sequences_batch_size: int,
        model_queries_per_batch: int,
        starting_sequence: str,
        alphabet: str,
        log_file: Optional[str] = None,
    ):
        """Create PPO explorer."""
        super().__init__(
            model,
            "PPO_Agent",
            rounds,
            sequences_batch_size,
            model_queries_per_batch,
            starting_sequence,
            log_file,
        )

        self.alphabet = alphabet

        # Initialize tf_environment
        env = PPOEnv(
            alphabet=self.alphabet,
            starting_seq=starting_sequence,
            model=self.model,
            max_num_steps=self.model_queries_per_batch,
        )
        self.tf_env = tf_py_environment.TFPyEnvironment(env)

        encoder_layer = tf.keras.layers.Lambda(lambda obs: obs["sequence"])
        actor_net = actor_distribution_network.ActorDistributionNetwork(
            self.tf_env.observation_spec(),
            self.tf_env.action_spec(),
            preprocessing_combiner=encoder_layer,
            fc_layer_params=[128],
        )
        value_net = value_network.ValueNetwork(
            self.tf_env.observation_spec(),
            preprocessing_combiner=encoder_layer,
            fc_layer_params=[128],
        )

        # Create the PPO agent
        self.agent = ppo_agent.PPOAgent(
            time_step_spec=self.tf_env.time_step_spec(),
            action_spec=self.tf_env.action_spec(),
            optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5),
            actor_net=actor_net,
            value_net=value_net,
            num_epochs=10,
            summarize_grads_and_vars=False,
        )
        self.agent.initialize()
Example #27
def get_d4rl_policy(env, weights, is_dapg=False):
    """Creates TF Agents policy based from D4RL saved weights."""
    hidden_dims = []
    fc_idx = 0
    while 'fc%d/weight' % fc_idx in weights:
        # Each hidden layer's width is the output dimension of its weight matrix.
        hidden_dims.append(np.shape(weights['fc%d/weight' % fc_idx])[0])
        fc_idx += 1

    if is_dapg:
        activation_fn = tf.keras.activations.tanh
        continuous_projection_net = functools.partial(
            normal_projection_network.NormalProjectionNetwork,
            mean_transform=None,
            std_transform=tf.exp,
            state_dependent_std=True)
    else:
        activation_fn = tf.keras.activations.relu
        continuous_projection_net = functools.partial(
            tanh_normal_projection_network.TanhNormalProjectionNetwork,
            std_transform=lambda x: tf.exp(tf.clip_by_value(x, -5., 2.)))

    actor_net = actor_distribution_network.ActorDistributionNetwork(
        env.observation_spec(),
        env.action_spec(),
        fc_layer_params=hidden_dims,
        continuous_projection_net=continuous_projection_net,
        activation_fn=activation_fn)
    policy = actor_policy.ActorPolicy(time_step_spec=env.time_step_spec(),
                                      action_spec=env.action_spec(),
                                      actor_network=actor_net,
                                      training=False)

    # Set weights
    # pylint: disable=protected-access
    for fc_idx in range(len(hidden_dims)):
        actor_net._encoder.layers[fc_idx + 1].set_weights(
            [weights['fc%d/weight' % fc_idx].T, weights['fc%d/bias' % fc_idx]])

    if is_dapg:
        actor_net._projection_networks.layers[0].set_weights(
            [weights['last_fc/weight'].T, weights['last_fc/bias']])
        actor_net._projection_networks.layers[1].set_weights([
            weights['last_fc_log_std/weight'].T,
            weights['last_fc_log_std/bias']
        ])
    else:
        actor_net._projection_networks.layers[0].set_weights([
            np.concatenate(
                (weights['last_fc/weight'], weights['last_fc_log_std/weight']),
                axis=0).T,
            np.concatenate(
                (weights['last_fc/bias'], weights['last_fc_log_std/bias']),
                axis=0)
        ])
    # pylint: enable=protected-access
    return policy
Example #28
 def __init__(self, env):
     self.env = env
     self.net = actor_distribution_network.ActorDistributionNetwork(
         self.env.observation_spec(),
         self.env.action_spec(),
         conv_layer_params=[(32, 5, 1), (64, 5, 2),
                            (128, 5, 2), (256, 5, 2)],
         fc_layer_params=(64, 2))
     self.optimizer = tf.compat.v1.train.AdamOptimizer(
         learning_rate=1e-3)
     self.strategy = strategy_utils.get_strategy(tpu=False, use_gpu=True)
Example #29
def get_env_and_policy_from_weights(env_name: str,
                                    weights: Mapping[str, np.ndarray],
                                    n_hidden: int = 300,
                                    min_log_std: float = -5,
                                    max_log_std: float = 2):
  """Return tf_env and policy from dictionary of weights.

  Assumes that the policy has 2 hidden layers of n_hidden units each, ReLU
  activations, and outputs a normal distribution squashed by a tanh.

  Args:
    env_name: Name of the environment.
    weights: Dictionary of weights containing keys: fc0/weight, fc0/bias,
      fc1/weight, fc1/bias, last_fc/weight, last_fc_log_std/weight,
      last_fc/bias, last_fc_log_std/bias.
    n_hidden: Number of units in each of the two hidden layers.
    min_log_std: Lower bound used to clip the log standard deviation.
    max_log_std: Upper bound used to clip the log standard deviation.

  Returns:
    tf_env: TF wrapped env.
    policy: TF Agents policy.
  """
  env = suites.load_mujoco(env_name)
  tf_env = tf_py_environment.TFPyEnvironment(env)
  std_transform = (
      lambda x: tf.exp(tf.clip_by_value(x, min_log_std, max_log_std)))
  actor_net = actor_distribution_network.ActorDistributionNetwork(
      tf_env.observation_spec(),
      tf_env.action_spec(),
      fc_layer_params=(n_hidden, n_hidden),
      continuous_projection_net=functools.partial(
          tanh_normal_projection_network.TanhNormalProjectionNetwork,
          std_transform=std_transform),
      activation_fn=tf.keras.activations.relu,
  )
  policy = actor_policy.ActorPolicy(
      time_step_spec=tf_env.time_step_spec(),
      action_spec=tf_env.action_spec(),
      actor_network=actor_net,
      training=False)

  # Set weights
  actor_net._encoder.layers[1].set_weights(  # pylint: disable=protected-access
      [weights['fc0/weight'].T, weights['fc0/bias']])
  actor_net._encoder.layers[2].set_weights(  # pylint: disable=protected-access
      [weights['fc1/weight'].T, weights['fc1/bias']])
  actor_net._projection_networks.layers[0].set_weights(  # pylint: disable=protected-access
      [
          np.concatenate(
              (weights['last_fc/weight'], weights['last_fc_log_std/weight']),
              axis=0).T,
          np.concatenate(
              (weights['last_fc/bias'], weights['last_fc_log_std/bias']),
              axis=0)
      ])
  return tf_env, policy
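For illustration only: the expected layout of the weights dictionary can be inferred from the transposes above, with kernels stored PyTorch-style as (out_features, in_features). The sketch below uses zero-filled placeholders and HalfCheetah-v2 sizes (17-dimensional observations, 6-dimensional actions), which are assumptions rather than values from the original code.

import numpy as np

obs_dim, act_dim, n_hidden = 17, 6, 300  # HalfCheetah-v2 sizes (assumed)
dummy_weights = {
    'fc0/weight': np.zeros((n_hidden, obs_dim), np.float32),
    'fc0/bias': np.zeros(n_hidden, np.float32),
    'fc1/weight': np.zeros((n_hidden, n_hidden), np.float32),
    'fc1/bias': np.zeros(n_hidden, np.float32),
    'last_fc/weight': np.zeros((act_dim, n_hidden), np.float32),
    'last_fc/bias': np.zeros(act_dim, np.float32),
    'last_fc_log_std/weight': np.zeros((act_dim, n_hidden), np.float32),
    'last_fc_log_std/bias': np.zeros(act_dim, np.float32),
}
tf_env, policy = get_env_and_policy_from_weights('HalfCheetah-v2', dummy_weights)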
Example #30
def get_sac_policy(tf_env):
    actor_net = actor_distribution_network.ActorDistributionNetwork(
        tf_env.observation_spec(),
        tf_env.action_spec(),
        fc_layer_params=(256, 256),
        continuous_projection_net=tanh_normal_projection_network.
        TanhNormalProjectionNetwork)
    policy = actor_policy.ActorPolicy(time_step_spec=tf_env.time_step_spec(),
                                      action_spec=tf_env.action_spec(),
                                      actor_network=actor_net,
                                      training=False)
    return policy