Example #1
    def _relabel_given_goal(relabel_goal):
        obs_dim = relabel_goal.shape[0]
        all_trajectories = nest_utils.unstack_nested_tensors(
            all_data, full_buffer.data_spec)
        last_traj_idx = len(all_trajectories)
        for traj_idx, cur_trajectory in enumerate(all_trajectories):
            if cur_trajectory.step_type.numpy() != 2:
                new_obs = tf.concat(
                    [cur_trajectory.observation[:obs_dim], relabel_goal],
                    axis=0)

                if traj_idx == len(all_trajectories) - 1:
                    next_obs = tf.concat(
                        [last_step.observation[0, :obs_dim], relabel_goal],
                        axis=0)
                else:
                    next_obs = tf.concat([
                        all_trajectories[traj_idx + 1].observation[:obs_dim],
                        relabel_goal
                    ],
                                         axis=0)

                new_reward = tf.constant(reward_fn(obs=next_obs))

                # terminate episode
                if new_reward.numpy() > 0.0:
                    new_traj = cur_trajectory._replace(
                        observation=new_obs,
                        next_step_type=tf.constant(2),
                        reward=new_reward,
                        discount=tf.constant(0., dtype=tf.float32))
                    last_traj_idx = traj_idx + 1
                    full_buffer.add_batch(
                        nest_utils.batch_nested_tensors(new_traj))
                    break
                else:
                    new_traj = cur_trajectory._replace(
                        observation=new_obs,
                        reward=new_reward,
                    )
                    full_buffer.add_batch(
                        nest_utils.batch_nested_tensors(new_traj))

        if last_traj_idx == len(all_trajectories):
            last_observation = tf.concat(
                [last_step.observation[0, :obs_dim], relabel_goal], axis=0)
        else:
            last_observation = tf.concat([
                all_trajectories[last_traj_idx].observation[:obs_dim],
                relabel_goal
            ],
                                         axis=0)

        last_traj = cur_trajectory._replace(  # pylint: disable=undefined-loop-variable
            step_type=tf.constant(2),
            observation=last_observation,
            next_step_type=tf.constant(0),
            reward=tf.constant(0.0),
            discount=tf.constant(1., dtype=tf.float32))
        full_buffer.add_batch(nest_utils.batch_nested_tensors(last_traj))
Example #2
    def _train(self, experience, weights=None):
        del weights  # unused
        experience = self._as_trajectory(experience)

        reward, _ = nest_utils.flatten_multi_batched_nested_tensors(
            experience.reward, self._time_step_spec.reward)
        action, _ = nest_utils.flatten_multi_batched_nested_tensors(
            experience.action, self._action_spec)
        observation, _ = nest_utils.flatten_multi_batched_nested_tensors(
            experience.observation, self._time_step_spec.observation)
        policy_choice, _ = nest_utils.flatten_multi_batched_nested_tensors(
            experience.policy_info[mixture_policy.MIXTURE_AGENT_ID],
            self._time_step_spec.reward)
        original_infos, _ = nest_utils.flatten_multi_batched_nested_tensors(
            experience.policy_info[mixture_policy.SUBPOLICY_INFO],
            self._original_info_spec)

        partitioned_nested_infos = nest_utils.batch_nested_tensors(
            _dynamic_partition_of_nested_tensors(original_infos, policy_choice,
                                                 self._num_agents))

        partitioned_nested_rewards = [
            nest_utils.batch_nested_tensors(t)
            for t in _dynamic_partition_of_nested_tensors(
                reward, policy_choice, self._num_agents)
        ]
        partitioned_nested_actions = [
            nest_utils.batch_nested_tensors(t)
            for t in _dynamic_partition_of_nested_tensors(
                action, policy_choice, self._num_agents)
        ]
        partitioned_nested_observations = [
            nest_utils.batch_nested_tensors(t)
            for t in _dynamic_partition_of_nested_tensors(
                observation, policy_choice, self._num_agents)
        ]
        loss = 0
        for k in range(self._num_agents):
            per_policy_experience = trajectory.single_step(
                observation=partitioned_nested_observations[k],
                action=partitioned_nested_actions[k],
                policy_info=partitioned_nested_infos[k],
                reward=partitioned_nested_rewards[k],
                discount=tf.zeros_like(partitioned_nested_rewards[k]))
            loss_info = self._agents[k].train(per_policy_experience)
            loss += loss_info.loss
        common.function_in_tf1()(self._update_mixture_distribution)(experience)
        return tf_agent.LossInfo(loss=(loss), extra=())
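The _dynamic_partition_of_nested_tensors helper used above is not shown in this example. Below is a minimal sketch of what such a helper could look like, assuming it simply applies tf.dynamic_partition to every leaf of a nest; the name and the exact behavior of the real helper may differ.

import tensorflow as tf

def dynamic_partition_of_nested_tensors(nested, partitions, num_partitions):
    """Sketch: splits every leaf of `nested` row-wise by the ids in `partitions`."""
    partitions = tf.cast(tf.reshape(partitions, [-1]), tf.int32)
    # One list of `num_partitions` tensors per flattened leaf.
    flat_parts = [tf.dynamic_partition(leaf, partitions, num_partitions)
                  for leaf in tf.nest.flatten(nested)]
    # Re-pack one nest per partition id.
    return [tf.nest.pack_sequence_as(nested, [parts[k] for parts in flat_parts])
            for k in range(num_partitions)]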
Example #3
    def testTrainMaskingRewardMultipleEpisodesRewardOnLast(self):
        # Test that train reacts correctly to experience when there are:
        #   * Multiple MDP episodes
        #   * Rewards on the tf.StepType.LAST transitions
        #
        # F, L, M = ts.StepType.{FIRST, MID, LAST} in the chart below.
        #
        # Experience looks like this:
        # Trajectories: (F, L) -> (L, F) -> (F, L) -> (L, F)
        # observation : [1, 2]    [1, 2]    [1, 2]    [1, 2]
        # action      :   [0]       [1]       [2]       [3]
        # reward      :    0         3         0         4
        # ~is_boundary:    1         0         1         0
        # is_last     :    1         0         1         0
        # valid reward:   0*1       3*0       0*1       4*0
        #
        # The second & fourth action & reward should be masked out due to being on a
        # boundary (step_type=(L, F)) transition.
        #
        # The expected_loss is 0.0 in this case.
        agent = reinforce_agent.ReinforceAgent(
            self._time_step_spec,
            self._action_spec,
            actor_network=DummyActorNet(self._obs_spec,
                                        self._action_spec,
                                        unbounded_actions=True),
            optimizer=tf.compat.v1.train.AdamOptimizer(0.001),
            use_advantage_loss=False,
            normalize_returns=False,
        )

        step_type = tf.constant([
            ts.StepType.FIRST, ts.StepType.LAST, ts.StepType.FIRST,
            ts.StepType.LAST
        ])
        next_step_type = tf.constant([
            ts.StepType.LAST, ts.StepType.FIRST, ts.StepType.LAST,
            ts.StepType.FIRST
        ])
        reward = tf.constant([0, 3, 0, 4], dtype=tf.float32)
        discount = tf.constant([1, 0, 1, 0], dtype=tf.float32)
        observations = tf.constant([[1, 2], [1, 2], [1, 2], [1, 2]],
                                   dtype=tf.float32)
        actions = tf.constant([[0], [1], [2], [3]], dtype=tf.float32)

        experience = nest_utils.batch_nested_tensors(
            trajectory.Trajectory(step_type, observations, actions, (),
                                  next_step_type, reward, discount))

        # Rewards on the StepType.LAST rows land on (L, F) boundary
        # transitions, so they are masked out and contribute nothing.
        expected_loss = 0.0

        if tf.executing_eagerly():
            loss = lambda: agent.train(experience)
        else:
            loss = agent.train(experience)

        self.evaluate(tf.compat.v1.global_variables_initializer())
        loss_info = self.evaluate(loss)
        self.assertAllClose(loss_info.loss, expected_loss)
Example #4
    def _distribution(self, time_step, policy_state):
        batched = nest_utils.is_batched_nested_tensors(time_step,
                                                       self._time_step_spec)
        if not batched:
            time_step = nest_utils.batch_nested_tensors(time_step)

        policy_dist_step = self._wrapped_policy.distribution(
            time_step, policy_state)
        policy_state = policy_dist_step.state
        policy_info = policy_dist_step.info
        policy_logits = policy_dist_step.action.logits_parameter()
        action_size = tf.shape(policy_logits)[-1]

        greedy_probs = tf.one_hot(tf.argmax(policy_logits, -1), action_size)
        uniform_probs = (tf.ones(tf.shape(policy_logits)) /
                         tf.cast(action_size, tf.float32))
        epsilon = self._get_epsilon()
        mixed_probs = (1 - epsilon) * greedy_probs + epsilon * uniform_probs
        if not batched:
            mixed_probs = tf.squeeze(mixed_probs, 0)
            policy_state = nest_utils.unbatch_nested_tensors(policy_state)
            policy_info = nest_utils.unbatch_nested_tensors(policy_info)
        mixed_dist = tfp.distributions.Categorical(
            probs=mixed_probs, dtype=policy_dist_step.action.dtype)

        return policy_step.PolicyStep(mixed_dist, policy_state, policy_info)
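A small worked sketch of the probability mixing above, with assumed values (three actions, epsilon = 0.1): the greedy action keeps 1 - epsilon of the mass plus its share of the uniform part, and the result still sums to 1.

import tensorflow as tf

epsilon = 0.1
greedy_probs = tf.constant([1.0, 0.0, 0.0])  # one-hot argmax of the logits
uniform_probs = tf.fill([3], 1.0 / 3.0)      # 1 / num_actions for every action
mixed_probs = (1 - epsilon) * greedy_probs + epsilon * uniform_probs
print(mixed_probs.numpy())  # approximately [0.9333, 0.0333, 0.0333]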
Example #5
    def _action(self, time_step, policy_state, seed):
        del seed

        def _mode(dist, spec):
            action = dist.mode()
            return tf.reshape(action, [
                -1,
            ] + spec.shape.as_list())

        # TODO(oars): Remove batched data checks when tf_env is batched.
        time_step_batched = nest_utils.is_batched_nested_tensors(
            time_step, self._time_step_spec)
        if not time_step_batched:
            time_step = nest_utils.batch_nested_tensors(
                time_step, self._time_step_spec)

        distribution_step = self._wrapped_policy.distribution(
            time_step, policy_state)
        actions = nest.map_structure(_mode, distribution_step.action,
                                     self._action_spec)

        if not time_step_batched:
            actions = nest_utils.unbatch_nested_tensors(
                actions, self._action_spec)
        return policy_step.PolicyStep(actions, distribution_step.state,
                                      distribution_step.info)
Example #6
    def testBatchedSingleTensor(self):
        tensor = tf.zeros([5, 2, 3], dtype=tf.float32)
        spec = tensor_spec.TensorSpec([2, 3], dtype=tf.float32)

        batched_tensor = nest_utils.batch_nested_tensors(tensor, spec)

        self.assertEqual(batched_tensor.shape.as_list(), [5, 2, 3])
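For contrast with the already-batched tensor above, here is a minimal sketch of the unbatched case, assuming the same spec: when the tensor shape matches the spec exactly, batch_nested_tensors prepends a batch dimension of 1 (Example #15 below exercises the same behavior for a nest of tensors).

import tensorflow as tf
from tf_agents.specs import tensor_spec
from tf_agents.utils import nest_utils

spec = tensor_spec.TensorSpec([2, 3], dtype=tf.float32)
unbatched = tf.zeros([2, 3], dtype=tf.float32)

batched = nest_utils.batch_nested_tensors(unbatched, spec)
print(batched.shape)  # (1, 2, 3): a leading batch dimension of 1 was added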
Example #7
 def get_passable(self):
     if self._num_envs == 1:
         return nest_utils.batch_nested_tensors(
             tf.cast(self._envs[0].passable, tf.float32))
     else:
         return tf.stack(
             [tf.cast(env.passable, tf.float32) for env in self._envs])
Example #8
    def _action(self, time_step, policy_state, seed):
        if seed is not None:
            raise NotImplementedError(
                'seed is not supported; but saw seed: {}'.format(seed))

        def _action_fn(*flattened_time_step_and_policy_state):
            packed_py_time_step, packed_py_policy_state = tf.nest.pack_sequence_as(
                structure=(self._py_policy.time_step_spec,
                           self._py_policy.policy_state_spec),
                flat_sequence=flattened_time_step_and_policy_state)
            py_action_step = self._py_policy.action(
                time_step=packed_py_time_step,
                policy_state=packed_py_policy_state)
            return tf.nest.flatten(py_action_step)

        with tf.name_scope('action'):
            flattened_input_tensors = tf.nest.flatten(
                (nest_utils.unbatch_nested_tensors(time_step), policy_state))

            flat_action_step = tf.compat.v1.py_func(_action_fn,
                                                    flattened_input_tensors,
                                                    self._policy_step_dtypes,
                                                    stateful=True,
                                                    name='action_py_func')
            action_step = tf.nest.pack_sequence_as(
                structure=self.policy_step_spec,
                flat_sequence=flat_action_step)
            return action_step._replace(
                action=nest_utils.batch_nested_tensors(action_step.action))
Example #9
 def get_distance_to_goal(self):
   if self._num_envs == 1:
     return nest_utils.batch_nested_tensors(
         tf.cast(self._envs[0].distance_to_goal, tf.float32))
   else:
      return tf.stack(
          [tf.cast(env.distance_to_goal, tf.float32) for env in self._envs])
Example #10
 def get_num_blocks(self):
   if self._num_envs == 1:
     return nest_utils.batch_nested_tensors(
         tf.cast(self._envs[0].n_clutter_placed, tf.float32))
   else:
      return tf.stack(
          [tf.cast(env.n_clutter_placed, tf.float32) for env in self._envs])
Example #11
  def testTrainMaskingPartialEpisodeMultipleEpisodesRewardOnFirst(self):
    # Test that train reacts correctly to experience when there are:
    #   * Multiple MDP episodes
    #   * Rewards on the tf.StepType.FIRST transitions
    #   * Partial episode at end of experience
    #
    # F, L, M = ts.StepType.{FIRST, MID, LAST} in the chart below.
    #
    # Experience looks like this:
    # Trajectories: (F, L) -> (L, F) -> (F, M) -> (M, M)
    # observation : [1, 2]    [1, 2]    [1, 2]    [1, 2]
    # action      :   [0]       [1]       [2]       [3]
    # reward      :    3         0         4         0
    # ~is_boundary:    1         0         1         1
    # is_last     :    1         0         0         0
    # valid reward:   3*1       0*0       4*0       0*0
    #
    # The second action & reward should be masked out due to being on a
    # boundary (step_type=(L, F)) transition.  The third & fourth transitions
    # should get masked out for everything due to it being an incomplete episode
    # (notice there is no trailing step_type=(F,L)).
    #
    # The expected_loss is > 0.0 in this case, matching the expected_loss of the
    # testMaskingRewardSingleEpisodeRewardOnFirst policy_gradient_loss test,
    # because the partial second episode should be masked out.
    agent = reinforce_agent.ReinforceAgent(
        self._time_step_spec,
        self._action_spec,
        actor_network=DummyActorNet(
            self._obs_spec, self._action_spec, unbounded_actions=True),
        optimizer=tf.compat.v1.train.AdamOptimizer(0.001),
        use_advantage_loss=False,
        normalize_returns=False,
    )

    step_type = tf.constant([ts.StepType.FIRST, ts.StepType.LAST,
                             ts.StepType.FIRST, ts.StepType.MID])
    next_step_type = tf.constant([ts.StepType.LAST, ts.StepType.FIRST,
                                  ts.StepType.MID, ts.StepType.MID])
    reward = tf.constant([3, 0, 4, 0], dtype=tf.float32)
    discount = tf.constant([1, 0, 1, 0], dtype=tf.float32)
    observations = tf.constant(
        [[1, 2], [1, 2], [1, 2], [1, 2]], dtype=tf.float32)
    actions = tf.constant([[0], [1], [2], [3]], dtype=tf.float32)

    experience = nest_utils.batch_nested_tensors(trajectory.Trajectory(
        step_type, observations, actions, (), next_step_type, reward, discount))

    # Rewards on the StepType.FIRST should be counted.
    expected_loss = 10.8935775757

    if tf.executing_eagerly():
      loss = lambda: agent.train(experience)
    else:
      loss = agent.train(experience)

    self.evaluate(tf.compat.v1.global_variables_initializer())
    loss_info = self.evaluate(loss)
    self.assertAllClose(loss_info.loss, expected_loss)
Example #12
 def get_deliberate_placement(self):
     if self._num_envs == 1:
         return nest_utils.batch_nested_tensors(
             tf.cast(self._envs[0].deliberate_agent_placement, tf.float32))
     else:
         return tf.stack(
             [tf.cast(env.deliberate_agent_placement, tf.float32)
              for env in self._envs])
Example #13
 def get_shortest_path_length(self):
     if self._num_envs == 1:
         return nest_utils.batch_nested_tensors(
             tf.cast(self._envs[0].shortest_path_length, tf.float32))
     else:
         return tf.stack(
             [tf.cast(env.shortest_path_length, tf.float32)
              for env in self._envs])
Example #14
    def testTrainMaskingRewardMultipleBanditEpisodes(self):
        # Test that train reacts correctly to experience when there are multiple
        # Bandit episodes.  Bandit episodes are encoded differently than
        # MDP episodes.  They (each) have only a single transition with
        # step_type=StepType.FIRST and next_step_type=StepType.LAST.  This test
        # helps ensure that LAST->FIRST->LAST transitions are handled correctly.
        #
        # F, L, M = ts.StepType.{FIRST, MID, LAST} in the chart below.
        #
        # Experience looks like this:
        # Trajectories: (F, L) -> (F, L)
        # observation : [1, 2]    [1, 2]
        # action      :   [0]       [2]
        # reward      :    3         4
        # ~is_boundary:    0         0
        # is_last     :    1         1
        # valid reward:   3*1       4*1
        #
        # All bandit transitions are valid and none are masked.
        #
        # The expected_loss is > 0.0 in this case, matching the expected_loss of the
        # testMaskingRewardMultipleEpisodesRewardOnFirst policy_gradient_loss test.
        agent = reinforce_agent.ReinforceAgent(
            self._time_step_spec,
            self._action_spec,
            actor_network=DummyActorNet(self._obs_spec,
                                        self._action_spec,
                                        unbounded_actions=True),
            optimizer=tf.compat.v1.train.AdamOptimizer(0.001),
            use_advantage_loss=False,
            normalize_returns=False,
        )

        step_type = tf.constant([ts.StepType.FIRST, ts.StepType.FIRST])
        next_step_type = tf.constant([ts.StepType.LAST, ts.StepType.LAST])
        reward = tf.constant([3, 4], dtype=tf.float32)
        discount = tf.constant([0, 0], dtype=tf.float32)
        observations = tf.constant([[1, 2], [1, 2]], dtype=tf.float32)
        actions = tf.constant([[0], [2]], dtype=tf.float32)

        experience = nest_utils.batch_nested_tensors(
            trajectory.Trajectory(step_type, observations, actions, (),
                                  next_step_type, reward, discount))

        # Rewards on the StepType.FIRST should be counted.
        expected_loss = 12.2091741562

        if tf.executing_eagerly():
            loss = lambda: agent.train(experience)
        else:
            loss = agent.train(experience)

        self.evaluate(tf.compat.v1.global_variables_initializer())
        loss_info = self.evaluate(loss)
        self.assertAllClose(loss_info.loss, expected_loss)
Example #15
  def testBatchNestedTensors(self):
    shape = [2, 3]
    batch_shape = [1] + shape
    specs = self.nest_spec(shape)
    tensors = self.zeros_from_spec(specs)
    tf.nest.assert_same_structure(tensors, specs)

    batched_tensors = nest_utils.batch_nested_tensors(tensors, specs)

    tf.nest.assert_same_structure(specs, batched_tensors)
    assert_shapes = lambda t: self.assertEqual(t.shape.as_list(), batch_shape)
    tf.nest.map_structure(assert_shapes, batched_tensors)
Example #16
  def testRandomPyPolicyGeneratesActionTensors(self):
    array_action_spec = array_spec.BoundedArraySpec((7,), np.int32, -10, 10)
    observation = tf.ones([3], tf.float32)
    time_step = ts.restart(observation)

    observation_spec = tensor_spec.TensorSpec.from_tensor(observation)
    time_step_spec = ts.time_step_spec(observation_spec)

    tf_py_random_policy = tf_py_policy.TFPyPolicy(
        random_py_policy.RandomPyPolicy(time_step_spec=time_step_spec,
                                        action_spec=array_action_spec))

    batched_time_step = nest_utils.batch_nested_tensors(time_step)
    action_step = tf_py_random_policy.action(time_step=batched_time_step)
    action, new_policy_state = self.evaluate(
        [action_step.action, action_step.state])

    self.assertEqual((1,) + array_action_spec.shape, action.shape)
    self.assertTrue(np.all(action >= array_action_spec.minimum))
    self.assertTrue(np.all(action <= array_action_spec.maximum))
    self.assertEqual(new_policy_state, ())
Example #17
    def _apply_actor_network(self,
                             time_step,
                             step_type,
                             policy_state,
                             mask=None):
        observation = time_step

        if self._observation_normalizer:
            observation = self._observation_normalizer.normalize(observation)
        if tf.is_tensor(observation):
            if not nest_utils.is_batched_nested_tensors(
                    observation, self.time_step_spec.observation):
                observation = nest_utils.batch_nested_tensors(observation)
        else:
            if not nest_utils.get_outer_array_shape(
                    observation, self.time_step_spec.observation):
                observation = nest_utils.batch_nested_array(observation)

        alpha = np.array([self.alpha])[None]
        return self._actor_network((observation, alpha),
                                   step_type,
                                   policy_state,
                                   training=self._training)
Example #18
    def _distribution(self, time_step, policy_state):
        batched = nest_utils.is_batched_nested_tensors(time_step,
                                                       self._time_step_spec)
        if not batched:
            time_step = nest_utils.batch_nested_tensors(time_step)

        policy_dist_step = self._wrapped_policy.distribution(
            time_step, policy_state)
        policy_state = policy_dist_step.state
        policy_mean_action = policy_dist_step.action.mean()
        policy_info = policy_dist_step.info

        if not batched:
            policy_state = nest_utils.unbatch_nested_tensors(policy_state)
            policy_mean_action = nest_utils.unbatch_nested_tensors(
                policy_mean_action)
            policy_info = nest_utils.unbatch_nested_tensors(policy_info)

        gaussian_dist = tfp.distributions.MultivariateNormalDiag(
            loc=policy_mean_action,
            scale_diag=tf.ones_like(policy_mean_action) * self._scale)

        return policy_step.PolicyStep(gaussian_dist, policy_state, policy_info)
Example #19
def relabel_function(cur_episode, last_step, reward_fn, full_buffer):
    all_data = cur_episode.gather_all()

    # add all actual interaction to the replay buffer
    all_data = nest_utils.unbatch_nested_tensors(all_data)
    for cur_trajectory in nest_utils.unstack_nested_tensors(
            all_data, full_buffer.data_spec):
        # was already added by previous iteration
        if cur_trajectory.step_type.numpy() != 2:
            full_buffer.add_batch(
                nest_utils.batch_nested_tensors(cur_trajectory))

    last_traj = cur_trajectory._replace(  # pylint: disable=undefined-loop-variable
        step_type=tf.constant(2),
        observation=last_step.observation[0],
        next_step_type=tf.constant(0),
        reward=tf.constant(0.0),
        discount=tf.constant(1., dtype=tf.float32))
    full_buffer.add_batch(nest_utils.batch_nested_tensors(last_traj))

    def _relabel_given_goal(relabel_goal):
        obs_dim = relabel_goal.shape[0]
        all_trajectories = nest_utils.unstack_nested_tensors(
            all_data, full_buffer.data_spec)
        last_traj_idx = len(all_trajectories)
        for traj_idx, cur_trajectory in enumerate(all_trajectories):
            if cur_trajectory.step_type.numpy() != 2:
                new_obs = tf.concat(
                    [cur_trajectory.observation[:obs_dim], relabel_goal],
                    axis=0)

                if traj_idx == len(all_trajectories) - 1:
                    next_obs = tf.concat(
                        [last_step.observation[0, :obs_dim], relabel_goal],
                        axis=0)
                else:
                    next_obs = tf.concat([
                        all_trajectories[traj_idx + 1].observation[:obs_dim],
                        relabel_goal
                    ],
                                         axis=0)

                new_reward = tf.constant(reward_fn(obs=next_obs))

                # terminate episode
                if new_reward.numpy() > 0.0:
                    new_traj = cur_trajectory._replace(
                        observation=new_obs,
                        next_step_type=tf.constant(2),
                        reward=new_reward,
                        discount=tf.constant(0., dtype=tf.float32))
                    last_traj_idx = traj_idx + 1
                    full_buffer.add_batch(
                        nest_utils.batch_nested_tensors(new_traj))
                    break
                else:
                    new_traj = cur_trajectory._replace(
                        observation=new_obs,
                        reward=new_reward,
                    )
                    full_buffer.add_batch(
                        nest_utils.batch_nested_tensors(new_traj))

        if last_traj_idx == len(all_trajectories):
            last_observation = tf.concat(
                [last_step.observation[0, :obs_dim], relabel_goal], axis=0)
        else:
            last_observation = tf.concat([
                all_trajectories[last_traj_idx].observation[:obs_dim],
                relabel_goal
            ],
                                         axis=0)

        last_traj = cur_trajectory._replace(  # pylint: disable=undefined-loop-variable
            step_type=tf.constant(2),
            observation=last_observation,
            next_step_type=tf.constant(0),
            reward=tf.constant(0.0),
            discount=tf.constant(1., dtype=tf.float32))
        full_buffer.add_batch(nest_utils.batch_nested_tensors(last_traj))

    # relabel with last time step achieved in the episode
    if FLAGS.goal_relabel_type == 0 or (FLAGS.goal_relabel_type == 1
                                        and last_step.reward.numpy()[0] <= 0.):
        obs_dim = last_step.observation.shape[1] // 2
        _relabel_given_goal(last_step.observation[0, :obs_dim])

    elif FLAGS.goal_relabel_type == 2 and last_step.reward.numpy()[0] <= 0.:
        goals = [
            [1.2, 0., 2.5, 0., -1., -1.],
            [2., 0., 2.4, 0., 0., 0.],
            [0.8, 0., 1.2, 0., 0., 0.],
            [-0.1, -0.3, 0.3, -0.3, 0., 0.],
            [-0.6, -1., -0.2, -1., 0., 0.],
            [-1.8, -1., -1.4, -1., 0., 0.],
            [-2.8, -0.8, -2.4, -1., -1., -1.],
            [-2.4, 0., -2.4, -1., -1., -1.],
            [-1.2, 0., -2.4, -1., -1., -1.],
            [0.0, 0.0, -2.5, -1, -1., -1.],
        ]
        goals = np.stack(goals).astype('float32')
        print('unrelabelled goal:', last_step.observation[0, 6:].numpy())
        relabel_goal_idxs = np.arange(goals.shape[0])
        np.random.shuffle(relabel_goal_idxs)
        obs_dim = last_step.observation.shape[1] // 2

        relabel_count = 0
        for goal_idx in relabel_goal_idxs:
            chosen_goal = goals[goal_idx]
            if (chosen_goal == last_step.observation[0,
                                                     obs_dim:].numpy()).all():
                continue
            print('goal for relabelling:', chosen_goal)
            _relabel_given_goal(relabel_goal=tf.constant(chosen_goal))

            relabel_count += 1
            if relabel_count >= FLAGS.num_relabelled_goals:
                break

    else:
        print('not adding relabelled trajectories')
Example #20
 def decode_and_batch_fn(proto):
     """Decodes a proto object, and batch output tensors."""
     sample = decoder(proto)
     return nest_utils.batch_nested_tensors(sample)
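A hypothetical usage sketch of this helper in a tf.data input pipeline; tfrecord_files and the decoder are assumptions, and the point is only that every decoded sample comes back from the map with a leading batch dimension of 1.

import tensorflow as tf

# Hypothetical: tfrecord_files holds serialized protos matching the decoder's spec.
dataset = tf.data.TFRecordDataset(tfrecord_files)
dataset = dataset.map(decode_and_batch_fn)
for sample in dataset.take(1):
    print(tf.nest.map_structure(lambda t: t.shape, sample))  # outer dim of 1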
Example #21
    def testWrongShapeRaisesValueError(self):
        tensor = tf.zeros([3, 3], dtype=tf.float32)
        spec = tensor_spec.TensorSpec([2, 3], dtype=tf.float32)

        with self.assertRaises(ValueError):
            nest_utils.batch_nested_tensors(tensor, spec)
Example #22
def data_multiplier(offline_data, reward_fn):
    def _custom_print(some_traj):  # pylint: disable=unused-variable
        np.set_printoptions(precision=2, suppress=True)
        print('step', some_traj.step_type.numpy(), 'obs',
              some_traj.observation.numpy(),
              'action', some_traj.action.numpy(), 'reward',
              some_traj.reward.numpy(), 'next_step',
              some_traj.next_step_type.numpy(), 'discount',
              some_traj.discount.numpy())

    all_data = nest_utils.unbatch_nested_tensors(offline_data.gather_all())
    all_trajs = nest_utils.unstack_nested_tensors(all_data,
                                                  offline_data.data_spec)

    for idx, traj in enumerate(all_trajs):
        # print('index:', idx)
        if traj.step_type.numpy() == 0:
            ep_start_idx = idx
            # print('new start index:', ep_start_idx)
        # TODO(architsh): remove this and change to else:
        # elif idx in [12, 24, 36, 48, 60, 72, 84, 96, 108]:
        else:
            # print('adding new trajectory')
            obs_dim = traj.observation.shape[0] // 2
            relabel_goal = traj.observation[:obs_dim]
            # print('new goal:', relabel_goal)

            last_traj_idx = len(all_trajs[ep_start_idx:idx + 1])
            for traj_idx, cur_trajectory in enumerate(
                    all_trajs[ep_start_idx:idx + 1]):
                if cur_trajectory.step_type.numpy() != 2:
                    new_obs = tf.concat(
                        [cur_trajectory.observation[:obs_dim], relabel_goal],
                        axis=0)

                    next_obs = tf.concat([
                        all_trajs[ep_start_idx + traj_idx +
                                  1].observation[:obs_dim], relabel_goal
                    ],
                                         axis=0)

                    new_reward = tf.constant(reward_fn(obs=next_obs))
                    # terminate episode
                    if new_reward.numpy() > 0.0:
                        new_traj = cur_trajectory._replace(
                            observation=new_obs,
                            next_step_type=tf.constant(2),
                            reward=new_reward,
                            discount=tf.constant(0., dtype=tf.float32))
                        last_traj_idx = ep_start_idx + traj_idx + 1
                        # _custom_print(new_traj)
                        offline_data.add_batch(
                            nest_utils.batch_nested_tensors(new_traj))
                        break
                    else:
                        new_traj = cur_trajectory._replace(
                            observation=new_obs,
                            reward=new_reward,
                        )
                        # _custom_print(new_traj)
                        offline_data.add_batch(
                            nest_utils.batch_nested_tensors(new_traj))

            last_observation = tf.concat(
                [all_trajs[last_traj_idx].observation[:obs_dim], relabel_goal],
                axis=0)
            last_traj = cur_trajectory._replace(  # pylint: disable=undefined-loop-variable
                step_type=tf.constant(2),
                observation=last_observation,
                next_step_type=tf.constant(0),
                reward=tf.constant(0.0),
                discount=tf.constant(1., dtype=tf.float32))
            # _custom_print(last_traj)
            offline_data.add_batch(nest_utils.batch_nested_tensors(last_traj))
Example #23
def copy_replay_buffer(small_buffer, big_buffer):
    """Copy small buffer into the big buffer."""
    all_data = nest_utils.unbatch_nested_tensors(small_buffer.gather_all())
    for trajectory in nest_utils.unstack_nested_tensors(
            all_data, big_buffer.data_spec):
        big_buffer.add_batch(nest_utils.batch_nested_tensors(trajectory))
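A hypothetical usage sketch for the helper above; the buffer constructor arguments are assumptions. The key constraints are that both buffers share the same data_spec and use batch_size=1, so that each unstacked trajectory can be re-batched by batch_nested_tensors and added back one at a time.

from tf_agents.replay_buffers import tf_uniform_replay_buffer

# collect_data_spec is assumed to be the agent's trajectory spec.
small_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
    data_spec=collect_data_spec, batch_size=1, max_length=1000)
big_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
    data_spec=collect_data_spec, batch_size=1, max_length=100000)
# ... fill small_buffer with collected trajectories ...
copy_replay_buffer(small_buffer, big_buffer)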
Example #24
  def testTrainMaskingRewardSingleEpisodeRewardOnFirst(self):
    # Test that train reacts correctly to experience when there are:
    #   * A single MDP episode
    #   * Rewards on the tf.StepType.FIRST transitions
    #
    # F, L, M = ts.StepType.{FIRST, MID, LAST} in the chart below.
    #
    # Experience looks like this:
    # Trajectories: (F, L) -> (L, F)
    # observation : [1, 2]    [1, 2]
    # action      :   [0]       [1]
    # reward      :    3         4
    # ~is_boundary:    1         0
    # is_last     :    1         0
    # valid reward:   3*1       4*0
    #
    # The second action & reward should be masked out due to being on a
    # boundary (step_type=(L, F)) transition.
    #
    # The expected_loss is > 0.0 in this case, matching the expected_loss of the
    # testMaskingRewardSingleEpisodeRewardOnFirst policy_gradient_loss test.
    agent = reinforce_agent.ReinforceAgent(
        self._time_step_spec,
        self._action_spec,
        actor_network=DummyActorNet(
            self._obs_spec, self._action_spec, unbounded_actions=True),
        optimizer=tf.compat.v1.train.AdamOptimizer(0.001),
        use_advantage_loss=False,
        normalize_returns=False,
    )

    step_type = tf.constant([ts.StepType.FIRST, ts.StepType.LAST])
    next_step_type = tf.constant([ts.StepType.LAST, ts.StepType.FIRST])
    reward = tf.constant([3, 4], dtype=tf.float32)
    discount = tf.constant([1, 0], dtype=tf.float32)
    observations = tf.constant([[1, 2], [1, 2]], dtype=tf.float32)
    actions = tf.constant([[0], [1]], dtype=tf.float32)

    experience = nest_utils.batch_nested_tensors(trajectory.Trajectory(
        step_type, observations, actions, (), next_step_type, reward, discount))

    # Rewards on the StepType.FIRST should be counted.
    expected_loss = 10.8935775757
    expected_policy_gradient_loss = 10.8935775757
    expected_policy_network_regularization_loss = 0
    expected_entropy_regularization_loss = 0
    expected_value_estimation_loss = 0
    expected_value_network_regularization_loss = 0

    if tf.executing_eagerly():
      loss = lambda: agent.train(experience)
    else:
      loss = agent.train(experience)

    self.evaluate(tf.compat.v1.global_variables_initializer())
    loss_info = self.evaluate(loss)
    self.assertAllClose(loss_info.loss, expected_loss)
    self.assertAllClose(loss_info.extra.policy_gradient_loss,
                        expected_policy_gradient_loss)
    self.assertAllClose(loss_info.extra.policy_network_regularization_loss,
                        expected_policy_network_regularization_loss)
    self.assertAllClose(loss_info.extra.entropy_regularization_loss,
                        expected_entropy_regularization_loss)
    self.assertAllClose(loss_info.extra.value_estimation_loss,
                        expected_value_estimation_loss)
    self.assertAllClose(loss_info.extra.value_network_regularization_loss,
                        expected_value_network_regularization_loss)
Example #25
    def _get_step(self) -> EnvStep:
        if self._start_on_next_step:
            self._start_new_episode()

        if StepType.is_last(self._step_type):
            # This is the last (terminating) observation of the environment.
            self._start_on_next_step = True
            self._num_total_steps += 1
            self._num_episodes += 1
            # The policy is not run on the terminal step, so we just carry over the
            # reward, action, and policy_info from the previous step.
            return EnvStep(self._step_type,
                           tf.cast(self._cur_step_num, dtype=tf.int64),
                           self._time_step.observation, self._action,
                           self._time_step.reward, self._time_step.discount,
                           self._policy_info, {}, {})

        self._action, self._policy_state, self._policy_info = self._policy.action(
            self._time_step, self._policy_state)

        # Update type of log-probs to tf.float32... a bit of a bug in TF-Agents.
        if hasattr(self._policy_info, 'log_probability'):
            self._policy_info = policy_step.set_log_probability(
                self._policy_info,
                tf.cast(self._policy_info.log_probability, tf.float32))

        # Sample action from policy.
        env_action = self._action
        if self._env.batch_size is not None:
            env_action = nest_utils.batch_nested_tensors(env_action)

        # Sample next step from environment.
        self._next_time_step = self._env.step(env_action)
        if self._env.batch_size is not None:
            self._next_time_step = nest_utils.unbatch_nested_tensors(
                self._next_time_step)
        self._next_step_type = self._next_time_step.step_type
        self._cur_step_num += 1
        if (self._episode_step_limit
                and self._cur_step_num >= self._episode_step_limit):
            self._next_step_type = tf.convert_to_tensor(  # Overwrite step type.
                value=StepType.LAST,
                dtype=self._first_step_type.dtype)
            self._next_step_type = tf.reshape(self._next_step_type,
                                              tf.shape(self._first_step_type))

        step = EnvStep(
            self._step_type,
            tf.cast(self._cur_step_num - 1, tf.int64),
            self._time_step.observation,
            self._action,
            # Immediate reward given by next time step.
            self._next_time_step.reward,
            self._time_step.discount,
            self._policy_info,
            {},
            {})

        self._num_steps += 1
        self._num_total_steps += 1
        if StepType.is_first(self._step_type):
            self._num_total_episodes += 1

        self._time_step = self._next_time_step
        self._step_type = self._next_step_type

        return step