Example #1
    def testCriticLossWithMaskedActions(self):
        # Observations are now a tuple of the usual observation and an action mask.
        observation_spec_with_mask = (self._obs_spec,
                                      tensor_spec.BoundedTensorSpec([2],
                                                                    tf.int32,
                                                                    0, 1))
        time_step_spec = ts.time_step_spec(observation_spec_with_mask)
        dummy_categorical_net = DummyCategoricalNet(self._obs_spec)
        agent = categorical_dqn_agent.CategoricalDqnAgent(
            time_step_spec,
            self._action_spec,
            dummy_categorical_net,
            self._optimizer,
            observation_and_action_constraint_splitter=lambda x: (x[0], x[1]))

        # For `observations`, the masks are set up so that only one action is valid
        # for each element in the batch.
        observations = (tf.constant([[1, 2], [3, 4]], dtype=tf.float32),
                        tf.constant([[1, 0], [0, 1]], dtype=tf.int32))
        time_steps = ts.restart(observations, batch_size=2)

        actions = tf.constant([0, 1], dtype=tf.int32)
        action_steps = policy_step.PolicyStep(actions)

        rewards = tf.constant([10, 20], dtype=tf.float32)
        discounts = tf.constant([0.9, 0.9], dtype=tf.float32)

        # For `next_observations`, the masks are set up so that the opposite
        # actions (relative to the ones above) are valid.
        next_observations = (tf.constant([[5, 6], [7, 8]], dtype=tf.float32),
                             tf.constant([[0, 1], [1, 0]], dtype=tf.int32))
        next_time_steps = ts.transition(next_observations, rewards, discounts)

        experience = test_utils.stacked_trajectory_from_transition(
            time_steps, action_steps, next_time_steps)

        # Due to the constant initialization of the DummyCategoricalNet, we can
        # expect the same loss every time. Note this is different from the loss in
        # testCriticLoss above due to previously optimal actions being masked out.
        expected_loss = 5.062895
        loss_info = agent._loss(experience)

        self.evaluate(tf.compat.v1.global_variables_initializer())
        evaluated_loss = self.evaluate(loss_info).loss
        self.assertAllClose(evaluated_loss, expected_loss, atol=1e-4)
Example #2
    def testTrainPerArmAgentWithConstraint(self):
        obs_spec = bandit_spec_utils.create_per_arm_observation_spec(2, 3, 4)
        reward_spec = tensor_spec.TensorSpec(shape=(2, ),
                                             dtype=tf.float32,
                                             name='reward')
        time_step_spec = ts.time_step_spec(obs_spec, reward_spec)
        reward_net = (global_and_arm_feature_network.
                      create_feed_forward_common_tower_network(
                          obs_spec, (4, 3), (3, 4), (4, 2)))
        optimizer = tf.compat.v1.train.GradientDescentOptimizer(
            learning_rate=0.1)
        constraint_net = (global_and_arm_feature_network.
                          create_feed_forward_common_tower_network(
                              obs_spec, (4, 3), (3, 4), (4, 2)))
        neural_constraint = constraints.NeuralConstraint(
            time_step_spec,
            self._action_spec,
            constraint_network=constraint_net)

        agent = greedy_agent.GreedyRewardPredictionAgent(
            time_step_spec,
            self._action_spec,
            reward_network=reward_net,
            accepts_per_arm_features=True,
            optimizer=optimizer,
            constraints=[neural_constraint])
        observations = {
            bandit_spec_utils.GLOBAL_FEATURE_KEY:
            tf.constant([[1, 2], [3, 4]], dtype=tf.float32),
            bandit_spec_utils.PER_ARM_FEATURE_KEY:
            tf.cast(tf.reshape(tf.range(24), shape=[2, 4, 3]),
                    dtype=tf.float32)
        }
        actions = np.array([0, 3], dtype=np.int32)
        rewards = np.array([[0.5, 6.0], [3.0, 4.0]], dtype=np.float32)
        initial_step, final_step = _get_initial_and_final_steps(
            observations, rewards)
        action_step = policy_step.PolicyStep(
            action=tf.convert_to_tensor(actions),
            info=policy_utilities.PerArmPolicyInfo(
                chosen_arm_features=np.array([[1, 2, 3], [3, 2, 1]],
                                             dtype=np.float32)))
        experience = _get_experience(initial_step, action_step, final_step)
        agent.train(experience, None)
        self.evaluate(tf.compat.v1.initialize_all_variables())
Example #3
  def testLossWithChangedOptimalActions(self, agent_class):
    q_net = DummyNet(self._observation_spec, self._action_spec)
    agent = agent_class(
        self._time_step_spec,
        self._action_spec,
        q_network=q_net,
        optimizer=None)

    observations = tf.constant([[1, 2], [3, 4]], dtype=tf.float32)
    time_steps = ts.restart(observations, batch_size=2)

    actions = tf.constant([0, 1], dtype=tf.int32)
    action_steps = policy_step.PolicyStep(actions)

    rewards = tf.constant([10, 20], dtype=tf.float32)
    discounts = tf.constant([0.9, 0.9], dtype=tf.float32)

    # Note that instead of [[5, 6], [7, 8]] as before, we now have -5 and -7.
    next_observations = tf.constant([[-5, 6], [-7, 8]], dtype=tf.float32)
    next_time_steps = ts.transition(next_observations, rewards, discounts)

    experience = trajectories_test_utils.stacked_trajectory_from_transition(
        time_steps, action_steps, next_time_steps)

    # Using the kernel initializer [[2, 1], [1, 1]] and bias initializer
    # [[1], [1]] from DummyNet above, we can calculate the following values:
    # Q-value for first observation/action pair: 2 * 1 + 1 * 2 + 1 = 5
    # Q-value for second observation/action pair: 1 * 3 + 1 * 4 + 1 = 8
    # (Here we use the second row of the kernel initializer above, since the
    # chosen action is now 1 instead of 0.)
    #
    # For the target Q-values here, note that since we've replaced 5 and 7 with
    # -5 and -7, it is better to use action 1 with a kernel of [1, 1] instead of
    # action 0 with a kernel of [2, 1].
    # Target Q-value for first next_observation: 1 * -5 + 1 * 6 + 1 = 2
    # Target Q-value for second next_observation: 1 * -7 + 1 * 8 + 1 = 2
    # TD targets: 10 + 0.9 * 2 = 11.8 and 20 + 0.9 * 2 = 21.8
    # TD errors: 11.8 - 5 = 6.8 and 21.8 - 8 = 13.8
    # TD loss: 6.3 and 13.3 (Huber loss subtracts 0.5)
    # Overall loss: (6.3 + 13.3) / 2 = 9.8
    expected_loss = 9.8
    loss, _ = agent._loss(experience)

    self.evaluate(tf.compat.v1.global_variables_initializer())
    self.assertAllClose(self.evaluate(loss), expected_loss)
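The arithmetic in the comment above can be checked with a few lines of plain NumPy. This is only an illustrative sketch of the TD-target and Huber-loss computation (with delta = 1); the constants are copied from the comment, not produced by the agent.

import numpy as np

# Hand-computed values from the comment above.
q_values = np.array([5.0, 8.0])      # Q(s, a) for the chosen actions
next_q_max = np.array([2.0, 2.0])    # best target Q-value at the next observations
rewards = np.array([10.0, 20.0])
discounts = np.array([0.9, 0.9])

td_targets = rewards + discounts * next_q_max   # [11.8, 21.8]
td_errors = td_targets - q_values               # [ 6.8, 13.8]

# Element-wise Huber loss with delta = 1: quadratic for |x| <= 1,
# |x| - 0.5 otherwise, which is where "subtracts 0.5" comes from.
huber = np.where(np.abs(td_errors) <= 1.0,
                 0.5 * td_errors ** 2,
                 np.abs(td_errors) - 0.5)        # [6.3, 13.3]
print(huber.mean())                              # -> 9.8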
Example #4
    def _action(self, time_step, policy_state, seed):
        seed_stream = tfd.SeedStream(seed=seed, salt='epsilon_greedy')
        greedy_action = self._greedy_policy.action(time_step, policy_state)
        random_action = self._random_policy.action(time_step, (),
                                                   seed_stream())

        outer_shape = nest_utils.get_outer_shape(time_step,
                                                 self._time_step_spec)
        rng = tf.random.uniform(outer_shape,
                                maxval=1.0,
                                seed=seed_stream(),
                                name='epsilon_rng')
        cond = tf.greater(rng, self._get_epsilon())

        # Selects the action/info from the random policy with probability epsilon.
        # TODO(b/133175894): tf.compat.v1.where only supports a condition which is
        # either a scalar or a vector. Use tf.compat.v2 so that it can support any
        # condition whose leading dimensions are the same as the other operands of
        # tf.where.
        outer_ndims = int(outer_shape.shape[0])
        if outer_ndims >= 2:
            raise ValueError(
                'Only supports batched time steps with a single batch dimension'
            )
        action = tf.compat.v1.where(cond, greedy_action.action,
                                    random_action.action)

        if greedy_action.info:
            if not random_action.info:
                raise ValueError('Incompatible info field')
            info = tf.compat.v1.where(cond, greedy_action.info,
                                      random_action.info)
        else:
            if random_action.info:
                raise ValueError('Incompatible info field')
            info = ()

        # The state of the epsilon greedy policy is the state of the underlying
        # greedy policy (the random policy carries no state).
        # It is commonly assumed that the new policy state depends only on the
        # previous state and `time_step`; the action (be it the greedy one or
        # the random one) does not influence the new policy state.
        state = greedy_action.state

        return policy_step.PolicyStep(action, state, info)
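A minimal, self-contained sketch of the selection step above, using made-up action tensors: one uniform sample is drawn per batch element, and the greedy action is kept wherever that sample exceeds epsilon.

import tensorflow as tf

epsilon = 0.1
greedy_actions = tf.constant([0, 1, 2, 3])
random_actions = tf.constant([3, 2, 1, 0])

# One uniform draw per batch element; the greedy action wins whenever the
# draw is greater than epsilon, i.e. with probability 1 - epsilon.
rng = tf.random.uniform(tf.shape(greedy_actions), maxval=1.0)
cond = tf.greater(rng, epsilon)
actions = tf.where(cond, greedy_actions, random_actions)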
Example #5
    def _action(self, time_step, policy_state, seed):
        i = policy_state[0] % self.period # position within the policy period
        out_shape = nest_utils.get_outer_shape(time_step, self._time_step_spec)
        action = {}
        for a in self.script.keys():
            A = common.replicate(self.script[a][i], out_shape)
            if a == 'alpha':
                m = time_step.observation['msmt']
                if i not in [2*self.K-1, 2*self.K]:
                    # feedback after trimming rounds is Markovian, and after
                    # intermediate sharpening rounds is simply zero.
                    A *= m[:,-1,:]
                    if policy_state[0] == 0: A *= 0
                else:  # after K sharpening rounds, apply the Bayesian feedback
                    A = self.Bayesian_feedback(i, m)
            action[a] = A

        return policy_step.PolicyStep(action, policy_state+1, self._policy_info)
Example #6
    def _create_replay_buffer(self, rb_cls):
        self._stack_count = 4
        self._single_shape = (15, 15, 1)
        shape = (15, 15, self._stack_count)
        observation_spec = array_spec.ArraySpec(shape, np.int32, 'obs')
        time_step_spec = ts.time_step_spec(observation_spec)
        action_spec = policy_step.PolicyStep(
            array_spec.BoundedArraySpec(shape=(),
                                        dtype=np.int32,
                                        minimum=0,
                                        maximum=1,
                                        name='action'))
        self._trajectory_spec = trajectory.from_transition(
            time_step_spec, action_spec, time_step_spec)

        self._capacity = 32
        self._replay_buffer = rb_cls(data_spec=self._trajectory_spec,
                                     capacity=self._capacity)
Example #7
def create_transition(state: types.Array, action: types.Array,
                      next_state: types.Array, discount: types.Array,
                      reward: types.Array, step_type: types.Array,
                      next_step_type: types.Array) -> trajectory.Transition:
    """Creates a Transition from current and next state information."""
    tfagents_time_step = ts.TimeStep(
        step_type=step_type,
        reward=np.zeros_like(reward),  # unknown
        discount=np.zeros_like(discount),  # unknown
        observation=state)
    action_step = policy_step.PolicyStep(action=action, state=(), info=())
    tfagents_next_time_step = ts.TimeStep(step_type=next_step_type,
                                          reward=reward,
                                          discount=discount,
                                          observation=next_state)
    return trajectory.Transition(time_step=tfagents_time_step,
                                 action_step=action_step,
                                 next_time_step=tfagents_next_time_step)
Example #8
    def _distribution(
            self, time_step: TimeStep,
            policy_state: types.NestedTensorSpec) -> policy_step.PolicyStep:
        """
        This method returns a distribution over actions for a particular step
        within the planner horizon. The cross-entropy method attempts to find
        the optimal set of actions up to the planning horizon by maintaining a
        distribution over actions for each time step within that horizon.

        This policy uses a batched `step_index` counter to track which time step index has been
        reached in each of the batched trajectories.

        If a trajectory in the batch terminates within the planning horizon,
        the policy still has to choose an action for the final time step, even
        though that action is never used. In this case the step counter is not
        advanced before returning the action distribution. As such, the action
        distribution for the final time step in a trajectory will be identical
        to the action distribution of the previous time step. This is done
        because each termination time step along a trajectory increases the
        total number of time steps in that element of the batch by one, which
        is a problem for the cross-entropy policy, which optimises "planning
        horizon" action distributions.
        """
        # mean, var, low and high shapes = (horizon + 1,) + action_space.shape
        mean, var, low, high, batch_actions, step_index = policy_state

        assert tf.reduce_all(
            step_index <= self._horizon
        ), f"Max step index {max(step_index)} is out of range (> {self._horizon})"

        actions = tf.gather_nd(batch_actions,
                               step_index[:, None],
                               batch_dims=1)

        distribution = tfp.distributions.Deterministic(actions)

        step_index_increment = tf.where(time_step.is_last(), 0, 1)

        policy_state = tf.nest.pack_sequence_as(
            self._policy_state_spec,
            [
                mean, var, low, high, batch_actions,
                step_index + step_index_increment
            ],
        )
        return policy_step.PolicyStep(distribution, policy_state)
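A toy sketch of the indexing described in the docstring, with made-up shapes: `tf.gather_nd` with `batch_dims=1` picks each trajectory's action for its own step index, and the counter is only advanced for trajectories that have not just terminated.

import tensorflow as tf

# One planned action sequence per batch element:
# shape (batch, horizon + 1, action_dim) = (2, 3, 2) in this toy example.
batch_actions = tf.reshape(tf.range(12, dtype=tf.float32), (2, 3, 2))
step_index = tf.constant([0, 2])       # each trajectory is at its own step
is_last = tf.constant([False, True])   # the second trajectory just terminated

# actions[b] == batch_actions[b, step_index[b]]
actions = tf.gather_nd(batch_actions, step_index[:, None], batch_dims=1)

# Do not advance the counter for trajectories on their final time step.
step_index += tf.where(is_last, 0, 1)  # -> [1, 2]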
Example #9
    def testLoss(self, agent_class, run_mode):
        if tf.executing_eagerly() and run_mode == context.graph_mode:
            self.skipTest('b/123778560')
        with run_mode(), tf.compat.v2.summary.record_if(False):
            q_net = DummyNet(self._observation_spec, self._action_spec)
            agent = agent_class(self._time_step_spec,
                                self._action_spec,
                                q_network=q_net,
                                optimizer=None)

            observations = [tf.constant([[1, 2], [3, 4]], dtype=tf.float32)]
            time_steps = ts.restart(observations, batch_size=2)

            actions = [tf.constant([[0], [1]], dtype=tf.int32)]
            action_steps = policy_step.PolicyStep(actions)

            rewards = tf.constant([10, 20], dtype=tf.float32)
            discounts = tf.constant([0.9, 0.9], dtype=tf.float32)
            next_observations = [
                tf.constant([[5, 6], [7, 8]], dtype=tf.float32)
            ]
            next_time_steps = ts.transition(next_observations, rewards,
                                            discounts)

            experience = test_utils.stacked_trajectory_from_transition(
                time_steps, action_steps, next_time_steps)

            # Using the kernel initializer [[2, 1], [1, 1]] and bias initializer
            # [[1], [1]] from DummyNet above, we can calculate the following values:
            # Q-value for first observation/action pair: 2 * 1 + 1 * 2 + 1 = 5
            # Q-value for second observation/action pair: 1 * 3 + 1 * 4 + 1 = 8
            # (Here we use the second row of the kernel initializer above, since the
            # chosen action is now 1 instead of 0.)
            # Q-value for first next_observation: 2 * 5 + 1 * 6 + 1 = 17
            # Q-value for second next_observation: 2 * 7 + 1 * 8 + 1 = 23
            # TD targets: 10 + 0.9 * 17 = 25.3 and 20 + 0.9 * 23 = 40.7
            # TD errors: 25.3 - 5 = 20.3 and 40.7 - 8 = 32.7
            # TD loss: 19.8 and 32.2 (Huber loss subtracts 0.5)
            # Overall loss: (19.8 + 32.2) / 2 = 26
            expected_loss = 26.0
            loss, _ = agent._loss(experience)

            self.evaluate(tf.compat.v1.initialize_all_variables())
            self.assertAllClose(self.evaluate(loss), expected_loss)
Example #10
  def _action(self, time_step, policy_state, seed):
    observation_and_action_constraint_splitter = (
        self.observation_and_action_constraint_splitter)

    if observation_and_action_constraint_splitter is not None:
      _, mask = observation_and_action_constraint_splitter(
          time_step.observation)

      zero_logits = tf.cast(tf.zeros_like(mask), tf.float32)
      masked_categorical = masked.MaskedCategorical(zero_logits, mask)
      action_ = tf.cast(masked_categorical.sample() + self.action_spec.minimum,
                        self.action_spec.dtype)

      # If the action spec says each action should be shaped (1,), add another
      # dimension so the final shape is (B, 1) rather than (B,).
      if self.action_spec.shape.rank == 1:
        action_ = tf.expand_dims(action_, axis=-1)
      policy_info = tensor_spec.sample_spec_nest(self._info_spec)
    else:
      outer_dims = nest_utils.get_outer_shape(time_step, self._time_step_spec)

      action_ = tensor_spec.sample_spec_nest(
          self._action_spec, seed=seed, outer_dims=outer_dims)
      policy_info = tensor_spec.sample_spec_nest(
          self._info_spec, outer_dims=outer_dims)

    # TODO(b/78181147): Investigate why this control dependency is required.
    if time_step is not None:
      with tf.control_dependencies(tf.nest.flatten(time_step)):
        action_ = tf.nest.map_structure(tf.identity, action_)

    if self.emit_log_probability:
      if observation_and_action_constraint_splitter is not None:
        log_probability = masked_categorical.log_prob(action_ -
                                                      self.action_spec.minimum)
      else:
        action_probability = tf.nest.map_structure(_uniform_probability,
                                                   self._action_spec)
        log_probability = tf.nest.map_structure(tf.math.log, action_probability)
      policy_info = policy_step.set_log_probability(policy_info,
                                                    log_probability)

    step = policy_step.PolicyStep(action_, policy_state, policy_info)
    return step
Example #11
    def _distribution(self, time_step, policy_state):
        # In DQN, we always either take a uniformly random action, or the action
        # with the highest Q-value. However, to support more complicated policies,
        # we expose all Q-values as a categorical distribution with Q-values as
        # logits, and apply the GreedyPolicy wrapper in dqn_agent.py to select the
        # action with the highest Q-value.
        q_values, policy_state = self._q_network(time_step.observation,
                                                 time_step.step_type,
                                                 policy_state)

        # TODO(b/122314058): Validate and enforce that sampling distributions
        # created with the q_network logits generate the right action shapes. This
        # is currently patching the problem.

        # If the action spec says each action should be shaped (1,), add another
        # dimension so the final shape is (B, 1, A), where A is the number of
        # actions. This will make Categorical emit events shaped (B, 1) rather than
        # (B,). Using axis -2 to allow for (B, T, 1, A) shaped q_values.
        if self._flat_action_spec.shape.ndims == 1:
            q_values = tf.expand_dims(q_values, -2)

        logits = q_values
        mask_split_fn = self._q_network.mask_split_fn

        if mask_split_fn:
            _, mask = mask_split_fn(time_step.observation)

            # Expand the mask as needed in the same way as q_values above.
            if self._flat_action_spec.shape.ndims == 1:
                mask = tf.expand_dims(mask, -2)

            # Overwrite the logits for invalid actions to -inf.
            neg_inf = tf.constant(-np.inf, dtype=logits.dtype)
            logits = tf.compat.v2.where(tf.cast(mask, tf.bool), logits,
                                        neg_inf)

        # TODO(kbanoop): Handle distributions over nests.
        distribution = shifted_categorical.ShiftedCategorical(
            logits=logits,
            dtype=self._flat_action_spec.dtype,
            shift=self._flat_action_spec.minimum)
        distribution = tf.nest.pack_sequence_as(self._action_spec,
                                                [distribution])
        return policy_step.PolicyStep(distribution, policy_state)
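The masking step above can be illustrated in isolation with toy Q-values: writing -inf into the logits of disallowed actions gives them probability zero under the resulting categorical distribution (the shift to the action spec's minimum is omitted here).

import numpy as np
import tensorflow as tf
import tensorflow_probability as tfp

q_values = tf.constant([[1.0, 5.0, 3.0]])   # batch of 1, three actions
mask = tf.constant([[1, 0, 1]])             # action 1 is invalid

neg_inf = tf.constant(-np.inf, dtype=q_values.dtype)
logits = tf.where(tf.cast(mask, tf.bool), q_values, neg_inf)

# The invalid action can never be sampled (nor picked by a greedy wrapper).
dist = tfp.distributions.Categorical(logits=logits)
print(dist.probs_parameter().numpy())       # approx. [[0.12, 0.0, 0.88]]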
Example #12
 def sample(self, batch_size):
     dummy_action_step = policy_step.PolicyStep(
         action=tf.constant([tf.int32.min]))
     dummy_time_step = ts.TimeStep(step_type=tf.constant([tf.int32.min]),
                                   reward=(np.nan * tf.ones(1)),
                                   discount=(np.nan * tf.ones(1)),
                                   observation=None)
     trajs = []
     for transition in random.sample(self.buffer, batch_size):
         traj1 = trajectory.from_transition(transition.time_step,
                                            transition.action_step,
                                            transition.next_time_step)
         traj2 = trajectory.from_transition(transition.next_time_step,
                                            dummy_action_step,
                                            dummy_time_step)
         trajs.append(
             nest_utils.unbatch_nested_tensors(
                 nest_utils.stack_nested_tensors([traj1, traj2], axis=1)))
     return nest_utils.stack_nested_tensors(trajs)
Example #13
    def _action(self, time_step, policy_state, seed):
        sign = tf.cast(tf.sign(time_step.observation[0, 0]), dtype=tf.int32)

        def case_unknown_fn():
            # Choose 1 so that we get information on the sign.
            return tf.constant(1, shape=(1, ))

        # Choose 0 or 2, depending on the situation and the sign of the observation.
        def case_normal_fn():
            return tf.constant(sign + 1, shape=(1, ))

        def case_flipped_fn():
            return tf.constant(1 - sign, shape=(1, ))

        cases = [(tf.equal(self._situation, 0), case_unknown_fn),
                 (tf.equal(self._situation, 1), case_normal_fn),
                 (tf.equal(self._situation, 2), case_flipped_fn)]
        action = tf.case(cases, exclusive=True)
        return policy_step.PolicyStep(action, policy_state)
Example #14
 def testTrainPerArmAgent(self):
     obs_spec = bandit_spec_utils.create_per_arm_observation_spec(
         2, 3, 4, add_num_actions_feature=True)
     time_step_spec = ts.time_step_spec(observation_spec=obs_spec,
                                        reward_spec=tensor_spec.TensorSpec(
                                            [3], tf.float32))
     action_spec = tensor_spec.BoundedTensorSpec((), tf.int32, 0, 3)
     networks_and_loss_fns = [
         (global_and_arm_feature_network.
          create_feed_forward_common_tower_network(obs_spec, (4, 3), (3, 4),
                                                   (4, 2)),
          tf.compat.v1.losses.mean_squared_error) for _ in range(3)
     ]
     optimizer = tf.compat.v1.train.GradientDescentOptimizer(
         learning_rate=0.01)
     agent = greedy_multi_objective_agent.GreedyMultiObjectiveNeuralAgent(
         time_step_spec,
         action_spec,
         self._scalarizer,
         objective_network_and_loss_fn_sequence=networks_and_loss_fns,
         accepts_per_arm_features=True,
         optimizer=optimizer)
     observations = {
         bandit_spec_utils.GLOBAL_FEATURE_KEY:
         tf.constant([[1, 2], [3, 4]], dtype=tf.float32),
         bandit_spec_utils.PER_ARM_FEATURE_KEY:
         tf.cast(tf.reshape(tf.range(24), shape=[2, 4, 3]),
                 dtype=tf.float32),
         bandit_spec_utils.NUM_ACTIONS_FEATURE_KEY:
         tf.ones([2], dtype=tf.int32)
     }
     actions = np.array([0, 3], dtype=np.int32)
     objectives = np.array([[1, 2, 3], [6, 5, 4]], dtype=np.float32)
     initial_step, final_step = _get_initial_and_final_steps(
         observations, objectives)
     action_step = policy_step.PolicyStep(
         action=tf.convert_to_tensor(actions),
         info=policy_utilities.PerArmPolicyInfo(
             chosen_arm_features=np.array([[1, 2, 3], [3, 2, 1]],
                                          dtype=np.float32)))
     experience = _get_experience(initial_step, action_step, final_step)
     agent.train(experience, None)
     self.evaluate(tf.compat.v1.initialize_all_variables())
Example #15
  def _action(self, time_step, policy_state, seed):
    outer_dims = nest_utils.get_outer_shape(time_step, self._time_step_spec)

    action_ = tensor_spec.sample_spec_nest(
        self._action_spec, seed=seed, outer_dims=outer_dims)
    # TODO(b/78181147): Investigate why this control dependency is required.
    if time_step is not None:
      with tf.control_dependencies(tf.nest.flatten(time_step)):
        action_ = tf.nest.map_structure(tf.identity, action_)
    step = policy_step.PolicyStep(action_, policy_state)

    if self.emit_log_probability:
      action_probability = tf.nest.map_structure(_uniform_probability,
                                                 self._action_spec)
      log_probability = tf.nest.map_structure(tf.math.log, action_probability)
      info = policy_step.PolicyInfo(log_probability=log_probability)
      return step._replace(info=info)

    return step
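The `_uniform_probability` helper is not shown in this snippet; for a bounded integer action spec the emitted value reduces to the log of one over the number of valid actions, as in this small sketch (the spec bounds here are arbitrary).

import numpy as np
import tensorflow as tf
from tf_agents.specs import tensor_spec

# Four valid actions {0, 1, 2, 3} -> uniform probability 0.25 for each.
spec = tensor_spec.BoundedTensorSpec((), tf.int32, minimum=0, maximum=3)
uniform_probability = 1.0 / (int(spec.maximum) - int(spec.minimum) + 1)
log_probability = np.log(uniform_probability)   # approx. -1.386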
Example #16
    def _action(self, time_step, policy_state, seed):
        # Get action from the wrapped policy.
        wrapped_policy_state, moving_average = policy_state
        wrapped_policy_step = self._wrapped_policy.action(
            time_step, wrapped_policy_state, seed)

        # Compute smoothed action & updated action tensor.
        def _smooth_action_tensor(smoothing_state_tensor, action_tensor):
            return (smoothing_state_tensor * self._smoothing_coefficient +
                    action_tensor * (1.0 - self._smoothing_coefficient))

        smoothed_action = tf.nest.map_structure(_smooth_action_tensor,
                                                moving_average,
                                                wrapped_policy_step.action)

        # Package results in PolicyStep.
        return policy_step.PolicyStep(
            smoothed_action, (wrapped_policy_step.state, smoothed_action),
            wrapped_policy_step.info)
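A toy illustration of the smoothing rule used above, with arbitrary numbers: the new moving average is a convex combination of the previous average and the freshly produced action, weighted by the smoothing coefficient.

import tensorflow as tf

smoothing_coefficient = 0.8
previous_average = tf.constant([0.0, 1.0])
new_action = tf.constant([1.0, 1.0])

# Exponential moving average of actions: the larger the coefficient,
# the more the emitted action is dominated by its own history.
smoothed = (previous_average * smoothing_coefficient +
            new_action * (1.0 - smoothing_coefficient))   # [0.2, 1.0]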
Example #17
 def _action(self,
             time_step,
             policy_state=(),
             seed: Optional[types.Seed] = None):
   if seed is not None:
     raise NotImplementedError(
         'seed is not supported; but saw seed: {}'.format(seed))
   self._count += 1
   # _random_function()'s range should be [0, 1), so if epsilon is 1 we
   # always use the random policy, and if epsilon is 0 we always use the
   # greedy policy, since the `if` condition is never met.
   if self._random_function() < self._get_epsilon():
     # Avoid mixing policy_state from greedy_policy and random_policy,
     # always return policy_state from greedy_policy.
     action_step = self._random_policy.action(time_step)
     return policy_step.PolicyStep(action_step.action, policy_state)
   else:
     return self._greedy_policy.action(time_step, policy_state=policy_state)
Example #18
def to_transition_spec(
    trajectory_spec: Trajectory
) -> Tuple[ts.TimeStep, policy_step.PolicyStep, ts.TimeStep]:
    """Create a transition spec from a trajectory spec.

  Args:
    trajectory_spec: An instance of `Trajectory` representing trajectory specs.

  Returns:
    A tuple `(time_steps, policy_steps, next_time_steps)` specs.
  """
    policy_step_spec = policy_step.PolicyStep(action=trajectory_spec.action,
                                              state=(),
                                              info=trajectory_spec.policy_info)
    time_step_spec = ts.TimeStep(trajectory_spec.step_type,
                                 reward=trajectory_spec.reward,
                                 discount=trajectory_spec.discount,
                                 observation=trajectory_spec.observation)
    return (time_step_spec, policy_step_spec, time_step_spec)
Example #19
    def testUpdateTarget(self):
        agent = categorical_dqn_agent.CategoricalDqnAgent(
            self._time_step_spec, self._action_spec, self._categorical_net,
            self._optimizer)

        observations = tf.constant([[1, 2], [3, 4]], dtype=tf.float32)
        time_steps = ts.restart(observations, batch_size=2)
        actions = tf.constant([0, 1], dtype=tf.int32)
        action_steps = policy_step.PolicyStep(actions)
        experience = test_utils.stacked_trajectory_from_transition(
            time_steps, action_steps, time_steps)

        loss_info = agent._loss(experience)
        update_targets = agent._update_target()

        self.evaluate(tf.compat.v1.global_variables_initializer())
        losses = self.evaluate(loss_info).loss
        self.assertGreater(losses, 0.0)
        self.evaluate(update_targets)
Example #20
  def testLoss(self, agent_class):
    q_net = DummyNet(self._observation_spec, self._action_spec)
    agent = agent_class(
        self._time_step_spec,
        self._action_spec,
        q_network=q_net,
        optimizer=None)

    observations = tf.constant([[1, 2], [3, 4]], dtype=tf.float32)
    time_steps = ts.restart(observations, batch_size=2)

    actions = tf.constant([0, 1], dtype=tf.int32)
    action_steps = policy_step.PolicyStep(actions)

    rewards = tf.constant([10, 20], dtype=tf.float32)
    discounts = tf.constant([0.9, 0.9], dtype=tf.float32)
    next_observations = tf.constant([[5, 6], [7, 8]], dtype=tf.float32)
    next_time_steps = ts.transition(next_observations, rewards, discounts)

    experience = trajectories_test_utils.stacked_trajectory_from_transition(
        time_steps, action_steps, next_time_steps)

    # Using the kernel initializer [[2, 1], [1, 1]] and bias initializer
    # [[1], [1]] from DummyNet above, we can calculate the following values:
    # Q-value for first observation/action pair: 2 * 1 + 1 * 2 + 1 = 5
    # Q-value for second observation/action pair: 1 * 3 + 1 * 4 + 1 = 8
    # (Here we use the second row of the kernel initializer above, since the
    # chosen action is now 1 instead of 0.)
    #
    # For target Q-values, action 0 produces a greater Q-value with a kernel of
    # [2, 1] instead of [1, 1] for action 1.
    # Target Q-value for first next_observation: 2 * 5 + 1 * 6 + 1 = 17
    # Target Q-value for second next_observation: 2 * 7 + 1 * 8 + 1 = 23
    # TD targets: 10 + 0.9 * 17 = 25.3 and 20 + 0.9 * 23 = 40.7
    # TD errors: 25.3 - 5 = 20.3 and 40.7 - 8 = 32.7
    # TD loss: 19.8 and 32.2 (Huber loss subtracts 0.5)
    # Overall loss: (19.8 + 32.2) / 2 = 26
    expected_loss = 26.0
    loss, _ = agent._loss(experience)

    self.evaluate(tf.compat.v1.global_variables_initializer())
    self.assertAllClose(self.evaluate(loss), expected_loss)
Example #21
    def _distribution(self, time_step, policy_state):
        """Generates the distribution over next actions given the time_step.

    Args:
      time_step: A `TimeStep` tuple corresponding to `time_step_spec()`.
      policy_state: A Tensor, or a nested dict, list or tuple of
        Tensors representing the previous policy_state.

    Returns:
      A tfp.distributions.Categorical capturing the distribution of next
        actions.
      A policy_state Tensor, or a nested dict, list or tuple of Tensors,
        representing the new policy state.
    """
        network_observation = time_step.observation
        observation_and_action_constraint_splitter = (
            self.observation_and_action_constraint_splitter)

        if observation_and_action_constraint_splitter is not None:
            network_observation, mask = (
                observation_and_action_constraint_splitter(network_observation)
            )

        q_logits, policy_state = self._q_network(network_observation,
                                                 step_type=time_step.step_type,
                                                 network_state=policy_state)
        q_logits.shape.assert_has_rank(3)
        q_values = common.convert_q_logits_to_values(q_logits, self._support)

        logits = q_values

        if observation_and_action_constraint_splitter is not None:
            # Overwrite the logits for invalid actions to -inf.
            neg_inf = tf.constant(-np.inf, dtype=logits.dtype)
            logits = tf.compat.v2.where(tf.cast(mask, tf.bool), logits,
                                        neg_inf)

        action_spec = cast(tf.TensorSpec, self.action_spec)
        dist = tfp.distributions.Categorical(logits=logits,
                                             dtype=action_spec.dtype)

        return policy_step.PolicyStep(dist, policy_state)
Example #22
    def _distribution(self, time_step, policy_state):
        observation = time_step.observation
        observation_and_action_constraint_splitter = (
            self.observation_and_action_constraint_splitter)
        if observation_and_action_constraint_splitter is not None:
            observation, mask = observation_and_action_constraint_splitter(
                observation)

        predictions, policy_state = self._reward_network(
            observation, time_step.step_type, policy_state)

        if isinstance(self._reward_network,
                      heteroscedastic_q_network.HeteroscedasticQNetwork):
            predicted_reward_values = predictions.q_value_logits
        else:
            predicted_reward_values = predictions

        predicted_reward_values.shape.with_rank_at_least(2)
        predicted_reward_values.shape.with_rank_at_most(3)
        if predicted_reward_values.shape[-1] != self._expected_num_actions:
            raise ValueError(
                'The number of actions ({}) does not match the reward_network '
                'output size ({}).'.format(self._expected_num_actions,
                                           predicted_reward_values.shape[-1]))
        if observation_and_action_constraint_splitter is not None:
            actions = policy_utilities.masked_argmax(
                predicted_reward_values,
                mask,
                output_type=self.action_spec.dtype)
        else:
            actions = tf.argmax(predicted_reward_values,
                                axis=-1,
                                output_type=self.action_spec.dtype)
        actions += self._action_offset

        policy_info = policy_utilities.PolicyInfo(
            predicted_rewards_mean=(
                predicted_reward_values
                if policy_utilities.InfoFields.PREDICTED_REWARDS_MEAN
                in self._emit_policy_info else ()))

        return policy_step.PolicyStep(
            tfp.distributions.Deterministic(loc=actions), policy_state,
            policy_info)
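`policy_utilities.masked_argmax` is a library helper; conceptually it restricts the argmax to the actions allowed by the mask, which can be sketched with toy rewards as follows (this mirrors the intent, not the exact implementation).

import numpy as np
import tensorflow as tf

predicted_rewards = tf.constant([[0.4, 0.9, 0.1],
                                 [0.7, 0.2, 0.6]])
mask = tf.constant([[1, 0, 1],
                    [0, 1, 1]])

# Push the rewards of disallowed actions to -inf before taking the argmax.
masked = tf.where(tf.cast(mask, tf.bool), predicted_rewards,
                  tf.constant(-np.inf, dtype=predicted_rewards.dtype))
actions = tf.argmax(masked, axis=-1, output_type=tf.int32)   # -> [0, 2]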
Example #23
 def testTrainPerArmAgentWithMask(self):
     num_actions = 4
     obs_spec = bandit_spec_utils.create_per_arm_observation_spec(
         2, 3, num_actions)
     mask_obs_spec = (obs_spec,
                      tensor_spec.BoundedTensorSpec(shape=[num_actions],
                                                    minimum=0,
                                                    maximum=1,
                                                    dtype=tf.float32))
     time_step_spec = ts.time_step_spec(mask_obs_spec)
     reward_net = (global_and_arm_feature_network.
                   create_feed_forward_common_tower_network(
                       obs_spec, (4, 3), (3, 4), (4, 2)))
     optimizer = tf.compat.v1.train.GradientDescentOptimizer(
         learning_rate=0.1)
     agent = greedy_agent.GreedyRewardPredictionAgent(
         time_step_spec,
         self._action_spec,
         reward_network=reward_net,
         observation_and_action_constraint_splitter=lambda x: [x[0], x[1]],
         accepts_per_arm_features=True,
         optimizer=optimizer)
     observations = ({
         bandit_spec_utils.GLOBAL_FEATURE_KEY:
         tf.constant([[1, 2], [3, 4]], dtype=tf.float32),
         bandit_spec_utils.PER_ARM_FEATURE_KEY:
         tf.cast(tf.reshape(tf.range(24), shape=[2, 4, 3]),
                 dtype=tf.float32)
     }, tf.ones([2, num_actions]))
     actions = np.array([0, 3], dtype=np.int32)
     rewards = np.array([0.5, 3.0], dtype=np.float32)
     initial_step, final_step = _get_initial_and_final_steps_with_action_mask(
         observations, rewards)
     action_step = policy_step.PolicyStep(
         action=tf.convert_to_tensor(actions),
         info=policy_utilities.PerArmPolicyInfo(
             chosen_arm_features=np.array([[1, 2, 3], [3, 2, 1]],
                                          dtype=np.float32)))
     experience = _get_experience(initial_step, action_step, final_step)
     agent.train(experience, None)
     self.evaluate(tf.compat.v1.initialize_all_variables())
Example #24
  def _distribution(self, time_step, policy_state):
    # Actor network outputs nested structure of distributions or actions.
    actions_or_distributions, policy_state = self._apply_actor_network(
        time_step, policy_state)

    def _to_distribution(action_or_distribution):
      if isinstance(action_or_distribution, tf.Tensor):
        # This is an action tensor, so wrap it in a deterministic distribution.
        return tfp.distributions.Deterministic(loc=action_or_distribution)
      return action_or_distribution

    distributions = tf.nest.map_structure(_to_distribution,
                                          actions_or_distributions)

    # Prepare policy_info.
    if self._collect:
      policy_info = ppo_utils.get_distribution_params(distributions)
    else:
      policy_info = ()

    return policy_step.PolicyStep(distributions, policy_state, policy_info)
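A small sketch, with an arbitrary action tensor, of why raw actions are wrapped in `tfp.distributions.Deterministic`: the rest of the policy can treat actions and distributions uniformly, since sampling (or taking the mode of) a deterministic distribution just returns the original tensor.

import tensorflow as tf
import tensorflow_probability as tfp

action = tf.constant([0.3, -0.7])

dist = tfp.distributions.Deterministic(loc=action)
print(dist.sample().numpy())   # [ 0.3 -0.7]
print(dist.mode().numpy())     # [ 0.3 -0.7]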
Example #25
def to_transition_spec(trajectory_spec: Trajectory) -> Transition:
  """Create a transition spec from a trajectory spec.

  Note: since trajectories do not include the policy step's state (except
  in special cases where the policy chooses to store this in the info field),
  the returned `transition.action_spec.state` field will be an empty tuple.

  Args:
    trajectory_spec: An instance of `Trajectory` representing trajectory specs.

  Returns:
    A tuple `(time_steps, policy_steps, next_time_steps)` specs.
  """
  policy_step_spec = policy_step.PolicyStep(
      action=trajectory_spec.action, state=(), info=trajectory_spec.policy_info)
  time_step_spec = ts.TimeStep(
      trajectory_spec.step_type,
      reward=trajectory_spec.reward,
      discount=trajectory_spec.discount,
      observation=trajectory_spec.observation)
  return Transition(time_step_spec, policy_step_spec, time_step_spec)
Example #26
    def _action(self, time_step, policy_state, seed):
        observation = time_step.observation
        # Check the shape of the observation matrix.
        if not observation.shape.is_compatible_with(
            [None, self._observation_dim]):
            raise ValueError(
                'Observation shape is expected to be {}. Got {}.'.format(
                    [None, self._observation_dim],
                    observation.shape.as_list()))

        observation = tf.cast(observation, dtype=self._dtype)

        # Pass the observations through the encoding network.
        encoded_observation, _ = self._encoding_network(observation)

        chosen_actions = tf.cond(
            self._actions_from_reward_layer,
            lambda: self._get_actions_from_reward_layer(encoded_observation),
            lambda: self._get_actions_from_linucb(encoded_observation))

        return policy_step.PolicyStep(chosen_actions, policy_state)
Example #27
 def _add_batch(self, time_steps, policy_steps, next_time_steps):
     for i in range(self.batch_size):
         ts = time_step.TimeStep(
             time_steps.step_type[i],
             time_steps.reward[i],
             time_steps.discount[i],
             time_steps.observation[i],
         )
         ps = policy_step.PolicyStep(
             policy_steps.action[i],
             policy_steps.state[i],
             (),
         )
         nts = time_step.TimeStep(
             next_time_steps.step_type[i],
             next_time_steps.reward[i],
             next_time_steps.discount[i],
             next_time_steps.observation[i],
         )
         traj = trajectory.from_transition(ts, ps, nts)
         self.writer(traj)
Example #28
    def _create_replay_buffer(self, capacity=32):
        self._stack_count = 2
        self._single_shape = (1, )
        shape = (1, self._stack_count)
        observation_spec = array_spec.ArraySpec(shape, np.int32, 'obs')
        time_step_spec = ts.time_step_spec(observation_spec)
        action_spec = policy_step.PolicyStep(
            array_spec.BoundedArraySpec(shape=(),
                                        dtype=np.int32,
                                        minimum=0,
                                        maximum=1,
                                        name='action'))
        self._trajectory_spec = trajectory.from_transition(
            time_step_spec, action_spec, time_step_spec)

        self._capacity = capacity
        self._alpha = 0.6
        self._replay_buffer = PyPrioritizedReplayBuffer(
            data_spec=self._trajectory_spec,
            capacity=self._capacity,
            alpha=self._alpha)
Example #29
    def _step(self, action):
        if self._done:
            return self.reset()

        a = action
        action = [[None for _ in g] for g in self._ready_state]
        for i, g in enumerate(self._ready_state):
            for j, s in enumerate(g):
                if s is not None:
                    action[i][j] = a
                    self._prev_actions[i][j] = policy_step.PolicyStep(action=tf.constant([a]))
        self._ready_state, reward, self._done, self._info = self._env.step(action)

        if self._done:
            # Arbitrarily return the first robot's reward
            return ts.termination(self._empty_observation, reward[0][0])

        for i, g in enumerate(self._ready_state):
            for j, s in enumerate(g):
                if s is not None:
                    return ts.transition(s, reward[i][j])
Example #30
    def _action(self, time_step, policy_state, seed):
        observation = time_step.observation
        if self.observation_and_action_constraint_splitter is not None:
            observation, _ = self.observation_and_action_constraint_splitter(
                observation)
        mask = constraints.construct_mask_from_multiple_sources(
            time_step.observation,
            self._observation_and_action_constraint_splitter, (),
            self._num_actions)
        # Pass the observations through the encoding network.
        encoded_observation, _ = self._encoding_network(observation)
        encoded_observation = tf.cast(encoded_observation, dtype=self._dtype)

        if tf.distribute.has_strategy():
            if self._distributed_use_reward_layer:
                chosen_actions, est_mean_rewards, est_rewards_optimistic = (
                    self._get_actions_from_reward_layer(
                        encoded_observation, mask))
            else:
                chosen_actions, est_mean_rewards, est_rewards_optimistic = (
                    self._get_actions_from_linucb(encoded_observation, mask))
        else:
            chosen_actions, est_mean_rewards, est_rewards_optimistic = tf.cond(
                self._actions_from_reward_layer,
                # pylint: disable=g-long-lambda
                lambda: self._get_actions_from_reward_layer(
                    encoded_observation, mask),
                lambda: self._get_actions_from_linucb(encoded_observation, mask
                                                      ))

        arm_observations = ()
        if self._accepts_per_arm_features:
            arm_observations = observation[
                bandit_spec_utils.PER_ARM_FEATURE_KEY]
        policy_info = policy_utilities.populate_policy_info(
            arm_observations, chosen_actions, est_rewards_optimistic,
            est_mean_rewards, self._emit_policy_info,
            self._accepts_per_arm_features)
        return policy_step.PolicyStep(chosen_actions, policy_state,
                                      policy_info)