Example #1
  def __init__(self, time_step_spec, action_spec, seed=None, outer_dims=None):
    """Initializes the RandomPyPolicy.

    Args:
      time_step_spec: Reference `time_step_spec`. If not None and outer_dims
        is not provided, this is used to infer the outer_dims required for the
        given time_step when action is called.
      action_spec: A nest of BoundedArraySpec representing the actions to sample
        from.
      seed: Optional seed used to instantiate a random number generator.
      outer_dims: An optional list/tuple specifying outer dimensions to add to
        the spec shape before sampling. If unspecified, the outer_dims are
        derived from the outer_dims in the given observation when `action` is
        called.
    """

    self._seed = seed
    self._outer_dims = outer_dims
    self._rng = np.random.RandomState(seed)
    if time_step_spec is None:
      time_step_spec = ts.time_step_spec()

    super(RandomPyPolicy, self).__init__(
        time_step_spec=time_step_spec, action_spec=action_spec)
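For context, a minimal usage sketch of this policy (adapted from memory of the TF-Agents policies tutorial, so treat the exact calls as an assumption rather than canonical API): with `time_step_spec=None` the policy can sample actions without a real time step, drawing uniformly from the bounded action spec.

import numpy as np
from tf_agents.policies import random_py_policy
from tf_agents.specs import array_spec

# Sample 2-dimensional integer actions uniformly from [-10, 10].
action_spec = array_spec.BoundedArraySpec((2,), np.int32, minimum=-10, maximum=10)
policy = random_py_policy.RandomPyPolicy(
    time_step_spec=None, action_spec=action_spec)

# No observation is needed to draw a random action.
action_step = policy.action(time_step=None)
print(action_step.action)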
Example #2
 def testTrainPerArmAgent(self):
     obs_spec = bandit_spec_utils.create_per_arm_observation_spec(
         2, 3, 4, add_num_actions_feature=True)
     time_step_spec = ts.time_step_spec(obs_spec)
     reward_net = (global_and_arm_feature_network.
                   create_feed_forward_common_tower_network(
                       obs_spec, (4, 3), (3, 4), (4, 2)))
     optimizer = tf.compat.v1.train.GradientDescentOptimizer(
         learning_rate=0.1)
     agent = greedy_agent.GreedyRewardPredictionAgent(
         time_step_spec,
         self._action_spec,
         reward_network=reward_net,
         accepts_per_arm_features=True,
         optimizer=optimizer)
     observations = {
         bandit_spec_utils.GLOBAL_FEATURE_KEY:
         tf.constant([[1, 2], [3, 4]], dtype=tf.float32),
         bandit_spec_utils.PER_ARM_FEATURE_KEY:
         tf.cast(tf.reshape(tf.range(24), shape=[2, 4, 3]),
                 dtype=tf.float32),
         bandit_spec_utils.NUM_ACTIONS_FEATURE_KEY:
         tf.ones([2], dtype=tf.int32)
     }
     actions = np.array([0, 3], dtype=np.int32)
     rewards = np.array([0.5, 3.0], dtype=np.float32)
     initial_step, final_step = _get_initial_and_final_steps(
         observations, rewards)
     action_step = policy_step.PolicyStep(
         action=tf.convert_to_tensor(actions),
         info=policy_utilities.PerArmPolicyInfo(
             chosen_arm_features=np.array([[1, 2, 3], [3, 2, 1]],
                                          dtype=np.float32)))
     experience = _get_experience(initial_step, action_step, final_step)
     agent.train(experience, None)
     self.evaluate(tf.compat.v1.initialize_all_variables())
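Here `bandit_spec_utils.create_per_arm_observation_spec(2, 3, 4, add_num_actions_feature=True)` builds the nested observation spec the agent expects: a 2-dimensional global feature, 3-dimensional per-arm features for up to 4 arms, and an integer num-actions entry (the argument order is inferred from the tensor shapes used above); the `observations` dict is populated accordingly for a batch of two time steps.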
Example #3
    def testDropoutFCLayersWithConv(self, training):
        observation_spec = tensor_spec.BoundedTensorSpec((8, 8, 3), tf.float32,
                                                         0, 1)
        time_step_spec = ts.time_step_spec(observation_spec)
        time_step = tensor_spec.sample_spec_nest(time_step_spec,
                                                 outer_dims=(1, ))
        action_spec = tensor_spec.BoundedTensorSpec((2, ), tf.float32, 2, 3)

        net = actor_distribution_network.ActorDistributionNetwork(
            observation_spec,
            action_spec,
            conv_layer_params=[(4, 2, 2)],
            fc_layer_params=[5],
            dropout_layer_params=[0.5])

        modes = []
        num_modes = 10
        for _ in range(num_modes):
            action_distributions, _ = net(time_step.observation,
                                          time_step.step_type, (),
                                          training=training)
            modes.append(action_distributions.mode())

        self.evaluate(tf.compat.v1.global_variables_initializer())
        modes = self.evaluate(modes)

        # Verify that the modes from action_distributions are not all the same.
        any_modes_differ = False
        for i in range(num_modes):
            for j in range(i + 1, num_modes):
                any_modes_differ = np.linalg.norm(modes[i] - modes[j]) > 1e-6
                if any_modes_differ:
                    self.assertEqual(training, any_modes_differ)
                    return

        self.assertEqual(training, any_modes_differ)
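The assertion logic hinges on dropout being active only in training mode: `dropout_layer_params=[0.5]` attaches a dropout layer to the single fully connected layer, so repeated forward passes over the same observation should yield differing modes when `training=True` and identical modes when `training=False`.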
Example #4
    def test_handle_preprocessing_layers(self, outer_dims):
        observation_spec = (tensor_spec.TensorSpec([1], tf.float32),
                            tensor_spec.TensorSpec([], tf.float32))
        time_step_spec = ts.time_step_spec(observation_spec)
        time_step = tensor_spec.sample_spec_nest(
            time_step_spec, outer_dims=outer_dims)

        action_spec = tensor_spec.BoundedTensorSpec((2, ), tf.float32, 2, 3)

        preprocessing_layers = (tf.keras.layers.Dense(4),
                                sequential_layer.SequentialLayer([
                                    tf.keras.layers.Reshape((1, )),
                                    tf.keras.layers.Dense(4)
                                ]))

        net = actor_network.ActorNetwork(
            observation_spec,
            action_spec,
            preprocessing_layers=preprocessing_layers,
            preprocessing_combiner=tf.keras.layers.Add())

        action, _ = net(time_step.observation, time_step.step_type, ())
        self.assertEqual(list(outer_dims) + [2], action.shape.as_list())
        self.assertGreater(len(net.trainable_variables), 4)
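Each entry of `preprocessing_layers` is applied to the matching entry of the nested observation: the `[1]`-shaped input goes through `Dense(4)`, while the scalar input is reshaped to `(1,)` and then projected to 4 units, so `tf.keras.layers.Add()` can combine the two branches before the actor trunk.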
Example #5
    def testExp3Update(self, observation_shape, num_actions, action, log_prob,
                       reward, learning_rate):
        """Check EXP3 updates for specified actions and rewards."""

        # Compute expected update for each action.
        expected_update_value = exp3_agent.exp3_update_value(reward, log_prob)
        expected_update = np.zeros(num_actions)
        for a, u in zip(action, self.evaluate(expected_update_value)):
            expected_update[a] += u

        # Construct a `Trajectory` for the given action, log prob and reward.
        time_step_spec = time_step.time_step_spec(
            tensor_spec.TensorSpec(observation_shape, tf.float32))
        action_spec = tensor_spec.BoundedTensorSpec(dtype=tf.int32,
                                                    shape=(),
                                                    minimum=0,
                                                    maximum=num_actions - 1)
        initial_step, final_step = _get_initial_and_final_steps(
            observation_shape, reward)
        action_step = _get_action_step(action, log_prob)
        experience = _get_experience(initial_step, action_step, final_step)

        # Construct an agent and perform the update. Record initial and final
        # weights.
        agent = exp3_agent.Exp3Agent(time_step_spec=time_step_spec,
                                     action_spec=action_spec,
                                     learning_rate=learning_rate)
        self.evaluate(agent.initialize())
        initial_weights = self.evaluate(agent.weights)
        loss_info = agent.train(experience)
        self.evaluate(loss_info)
        final_weights = self.evaluate(agent.weights)
        update = final_weights - initial_weights

        # Check that the actual update matches expectations.
        self.assertAllClose(expected_update, update)
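For reference, a minimal NumPy sketch of the importance-weighted reward estimate this test accumulates per arm. The form `reward * exp(-log_prob)` (the reward divided by the probability of the chosen action, as in standard EXP3) is an assumption about what `exp3_agent.exp3_update_value` computes, not a copy of its source.

import numpy as np

def exp3_update_value_sketch(reward, log_prob):
  # Importance-weighted reward: r / p, where p = exp(log_prob) is the
  # probability of the action that was actually chosen.
  reward = np.asarray(reward, dtype=np.float64)
  log_prob = np.asarray(log_prob, dtype=np.float64)
  return reward * np.exp(-log_prob)

# The expected-update loop above then adds this value to the chosen arm's
# entry and leaves every other arm's entry at zero.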
Example #6
 def testTrainAgentWithMask(self):
   reward_net = DummyNet(self._observation_spec, self._action_spec)
   optimizer = tf.compat.v1.train.GradientDescentOptimizer(learning_rate=0.1)
   time_step_spec = ts.time_step_spec((tensor_spec.TensorSpec([2], tf.float32),
                                       tensor_spec.TensorSpec([3], tf.int32)))
   agent = greedy_agent.GreedyRewardPredictionAgent(
       time_step_spec,
       self._action_spec,
       reward_network=reward_net,
       optimizer=optimizer,
       observation_and_action_constraint_splitter=lambda x: (x[0], x[1]))
   observations = (np.array([[1, 2], [3, 4]], dtype=np.float32),
                   np.array([[1, 0, 0], [1, 1, 0]], dtype=np.int32))
   actions = np.array([0, 1], dtype=np.int32)
   rewards = np.array([0.5, 3.0], dtype=np.float32)
   initial_step, final_step = _get_initial_and_final_steps_with_action_mask(
       observations, rewards)
   action_step = _get_action_step(actions)
   experience = _get_experience(initial_step, action_step, final_step)
   loss_before, _ = agent.train(experience, None)
   loss_after, _ = agent.train(experience, None)
   self.evaluate(tf.compat.v1.initialize_all_variables())
   self.assertAllClose(self.evaluate(loss_before), 42.25)
   self.assertAllClose(self.evaluate(loss_after), 93.46)
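`observation_and_action_constraint_splitter` receives the full observation and must return an `(observation, mask)` pair; here the mask `[[1, 0, 0], [1, 1, 0]]` allows only action 0 for the first batch element and actions 0 and 1 for the second, which the trained actions `[0, 1]` respect.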