Example #1
    def step(self, action):
        """Updates the environment using the action and returns a `TimeStep`."""

        if self._reset_next_step:
            return self.reset()

        self._task.before_step(action, self._physics)
        for _ in range(self._n_sub_steps):
            self._physics.step()
        self._task.after_step(self._physics)

        reward = self._task.get_reward(self._physics)
        observation = self._task.get_observation(self._physics)
        if self._flat_observation:
            observation = flatten_observation(observation)

        self._step_count += 1
        if self._step_count >= self._step_limit:
            discount = 1.0
        else:
            discount = self._task.get_termination(self._physics)

        episode_over = discount is not None

        if episode_over:
            self._reset_next_step = True
            return dm_env.TimeStep(dm_env.StepType.LAST, reward, discount,
                                   observation)
        else:
            return dm_env.TimeStep(dm_env.StepType.MID, reward, 1.0,
                                   observation)
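
All of these `step` implementations return `dm_env.TimeStep` namedtuples whose `step_type` marks episode boundaries. For reference, here is a minimal, hypothetical driver loop for any `dm_env.Environment`; `environment` and `policy` are placeholders, not objects from the example above.

import dm_env

def run_episode(environment: dm_env.Environment, policy) -> float:
    """Runs a single episode and returns the undiscounted episode return."""
    timestep = environment.reset()           # StepType.FIRST; reward/discount may be None.
    episode_return = 0.0
    while not timestep.last():
        action = policy(timestep.observation)
        timestep = environment.step(action)  # StepType.MID, or LAST at episode end.
        episode_return += timestep.reward
    return episode_return
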
Example #2
    def initial_step(self, action):
        if self._reset_next_step:
            return self.reset()
        for _ in range(self._n_sub_steps * self._n_frame_skip):
            self._physics.step()
        self._task.after_step(self._physics)
        reward = self._task.get_reward(self._physics)
        observation = self._task.get_observation(self._physics)
        if self._flat_observation:
            observation = flatten_observation(observation)

        if self._step_count >= self._step_limit:
            discount = 1.0
        else:
            discount = self._task.get_termination(self._physics)
        episode_over = discount is not None

        if episode_over:
            self._reset_next_step = True
            return dm_env.TimeStep(
                dm_env.StepType.LAST, reward, discount, observation)
        else:
            return dm_env.TimeStep(
                dm_env.StepType.MID, reward, 1.0, observation)
    def step(self, action):
        """Updates the environment using the action and returns a `TimeStep`."""

        if self._reset_next_step:
            return self.reset()

        self._task.before_step(action, self._physics)
        for _ in range(self._n_sub_steps):
            self._physics.step()
        self._task.after_step(self._physics)

        reward = self._task.get_reward(self._physics)
        observation = self._task.get_observation(self._physics)
        if self._flat_observation:
            observation = flatten_observation(observation)

        # Added code for Elias viewing: overwrite the joint positions with the
        # pre-recorded data for the current step index.
        k = self._step_count

        for i in range(1, 8):
            self._physics.named.data.qpos[f'slider_{i}'] = self.data[k][i - 1][0]
            self._physics.named.data.qpos[f'hinge_1_{i}'] = self.data[k][i - 1][1]

        # slider_8 and hinge_1_8 take the same recorded entry as joint 7 (index 6).
        self._physics.named.data.qpos['slider_8'] = self.data[k][6][0]
        self._physics.named.data.qpos['hinge_1_8'] = self.data[k][6][1]

        self._step_count += 1
        if self._step_count >= self._step_limit:
            discount = 1.0
        else:
            discount = self._task.get_termination(self._physics)

        episode_over = discount is not None

        if episode_over:
            self._reset_next_step = True
            return dm_env.TimeStep(dm_env.StepType.LAST, reward, discount,
                                   observation)
        else:
            return dm_env.TimeStep(dm_env.StepType.MID, reward, 1.0,
                                   observation)
Example #4
def _prefill_with_demonstrations(adder: adders.Adder,
                                 demonstrations: Sequence[types.Transition],
                                 reward: Optional[float],
                                 min_num_transitions: int = 0) -> None:
    """Fill the adder's replay buffer with expert transitions.

  Assumes that the demonstrations dataset stores transitions in order.

  Args:
    adder: the agent which adds the demonstrations.
    demonstrations: the expert demonstrations to iterate over.
    reward: if non-None, populates the environment reward entry of transitions.
    min_num_transitions: the lower bound on transitions processed, the dataset
      will be iterated over multiple times if needed. Once at least
      min_num_transitions are added, the processing is interrupted at the
      nearest episode end.
  """
    if not demonstrations:
        return

    reward = np.float32(reward) if reward is not None else reward
    remaining_transitions = min_num_transitions
    step_type = None
    action = None
    ts = dm_env.TimeStep(None, None, None, None)  # Unused.
    while remaining_transitions > 0:
        # In case we share the adder or demonstrations don't end with
        # end-of-episode, reset the adder prior to add_first.
        adder.reset()
        for transition_num, transition in enumerate(demonstrations):
            remaining_transitions -= 1
            discount = np.float32(1.0)
            ts_reward = reward if reward is not None else transition.reward
            if step_type == dm_env.StepType.LAST or transition_num == 0:
                ts = dm_env.TimeStep(dm_env.StepType.FIRST, ts_reward,
                                     discount, transition.observation)
                adder.add_first(ts)

            observation = transition.next_observation
            action = transition.action
            if transition.discount == 0. or transition_num == len(
                    demonstrations) - 1:
                step_type = dm_env.StepType.LAST
                discount = np.float32(0.0)
            else:
                step_type = dm_env.StepType.MID
            ts = dm_env.TimeStep(step_type, ts_reward, discount, observation)
            adder.add(action, ts)
            if remaining_transitions <= 0:
                # Note: we could check `step_type == dm_env.StepType.LAST` to stop at an
                # episode end if possible.
                break

    # Explicitly finalize the Reverb client writes.
    adder.reset()
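
For intuition about the calls an adder receives, here is a self-contained sketch that drives the function above with a stub adder and a hand-built two-transition episode. `FakeAdder` and the simplified `Transition` namedtuple are illustrative stand-ins for Acme's classes, and the function itself is assumed to be in scope together with its imports.

import collections

import dm_env

# Illustrative stand-in for acme.types.Transition.
Transition = collections.namedtuple(
    'Transition',
    ['observation', 'action', 'reward', 'discount', 'next_observation'])


class FakeAdder:
    """Records the calls an adder would receive instead of writing to Reverb."""

    def __init__(self):
        self.calls = []

    def add_first(self, timestep: dm_env.TimeStep):
        self.calls.append(('add_first', timestep.step_type))

    def add(self, action, next_timestep: dm_env.TimeStep):
        self.calls.append(('add', action, next_timestep.step_type))

    def reset(self):
        self.calls.append(('reset',))


demos = [
    Transition(observation=0, action=1, reward=0.5, discount=1.0,
               next_observation=1),
    Transition(observation=1, action=0, reward=1.0, discount=0.0,
               next_observation=2),
]
adder = FakeAdder()
_prefill_with_demonstrations(adder, demos, reward=None, min_num_transitions=2)
# adder.calls is now: reset, add_first(FIRST), add(1, MID), add(0, LAST), reset.
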
Example #5
    def step(self, action):
        """Updates the environment using the action and returns a `TimeStep`."""
        if self._reset_next_step:
            self._reset_next_step = False
            return self.reset()

        self._hooks.before_step(self._physics_proxy, action,
                                self._random_state)
        self._observation_updater.prepare_for_next_control_step()

        try:
            for i in range(self._n_sub_steps):
                self._hooks.before_substep(self._physics_proxy, action,
                                           self._random_state)
                self._physics.step()
                self._hooks.after_substep(self._physics_proxy,
                                          self._random_state)
                # The final observation update must happen after all the hooks in
                # `self._hooks.after_step` are called. Otherwise, if any of these hooks
                # modify the physics state then we might capture an observation that is
                # inconsistent with the final physics state.
                if i < self._n_sub_steps - 1:
                    self._observation_updater.update()
            physics_is_divergent = False
        except control.PhysicsError as e:
            if not self._raise_exception_on_physics_error:
                logging.warning(e)
                physics_is_divergent = True
            else:
                raise

        self._hooks.after_step(self._physics_proxy, self._random_state)
        self._observation_updater.update()

        if not physics_is_divergent:
            reward = self._task.get_reward(self._physics_proxy)
            discount = self._task.get_discount(self._physics_proxy)
            terminating = (self._task.should_terminate_episode(
                self._physics_proxy)
                           or self._physics.time() >= self._time_limit)
        else:
            reward = 0.0
            discount = 0.0
            terminating = True

        obs = self._observation_updater.get_observation()

        if not terminating:
            return dm_env.TimeStep(dm_env.StepType.MID, reward, discount, obs)
        else:
            self._reset_next_step = True
            return dm_env.TimeStep(dm_env.StepType.LAST, reward, discount, obs)
Example #6
def test_dqn():
    n_actions = 10
    in_shape = (4, 84, 84)
    hparams = HParams()
    agent = DQN(n_actions, in_shape, hparams)
    r = 0
    x = jax.random.normal(agent.rng, (84, 84, 3))
    timestep = dm_env.TimeStep(dm_env.StepType.FIRST, r,
                               agent.hparams.discount, x)
    action = agent.select_action(timestep)
    y = jax.random.normal(agent.rng, (84, 84, 3))
    new_timestep = dm_env.TimeStep(dm_env.StepType.MID, 1.0,
                                   agent.hparams.discount, y)
    agent.update(timestep, action, new_timestep)
    return agent
Example #7
    def step(self, action: np.int32) -> dm_env.TimeStep:
        """Updates the environment given an action and returns a timestep."""
        # If the previous timestep was LAST then we call reset() on the
        # underlying environment, otherwise step().
        if self._start_of_episode:
            step_type = dm_env.StepType.FIRST
            observation = self._key_door_env.reset_environment()
            discount = None
            reward = None
            done = False
        else:
            reward, observation = self._key_door_env.step(action)
            done = not self._key_door_env.active
            info = ""
            if done:
                assert "TimeLimit.truncated" not in info, "Should never truncate."
                step_type = dm_env.StepType.LAST
                discount = 0.0
            else:
                step_type = dm_env.StepType.MID
                discount = 1.0

        lives = np.int32(1)
        timestep = dm_env.TimeStep(
            step_type=step_type,
            observation=(observation, lives),
            reward=reward,
            discount=discount,
        )
        self._start_of_episode = done
        return timestep
Example #8
    def step(self, action):
        """Apply action, step the world forward, and return observations."""
        # If needed, reset and start new episode.
        if self._state == dm_env.StepType.LAST:
            self._clear_state()
        if self._current_game is None:
            return self.reset()

        # Execute the action in pycolab.
        observation, reward, discount = self._current_game.play(action)

        self._game_over = self._is_game_over()
        reward = reward if reward is not None else 0.
        observation = self._render_observation(observation)

        # Check the current status of the game.
        if self._game_over:
            self._state = dm_env.StepType.LAST
        else:
            self._state = dm_env.StepType.MID

        return dm_env.TimeStep(step_type=self._state,
                               reward=reward,
                               discount=discount,
                               observation=observation)
Example #9
    def step(self, action: t.Tuple[np.int8, np.int8]) \
            -> dm_env.TimeStep:
        """
        Update the environment given an action.

        Returns
        -------
        dm_env.TimeStep
            Timestep resulting from the action.
        """
        # If the previous timestep was LAST then call reset() on the Simulator
        # environment to launch a new episode, otherwise step().
        if self._start_of_episode:
            observation = self._sim.reset(self._sim_input, False)

            discount = None
            finished = False
            reward = None
            step_type = dm_env.StepType.FIRST
        else:
            observation, reward, finished, _ = self._sim.step(action)

            step_type = dm_env.StepType.LAST \
                if finished                  \
                else dm_env.StepType.MID

            # TODO: Choose discount value
            discount = tuple([float(step_type == dm_env.StepType.MID)] * 2)

        self._start_of_episode = finished

        return dm_env.TimeStep(discount=discount,
                               observation=observation,
                               reward=reward,
                               step_type=step_type)
Example #10
def convert_seq_timestep_and_actions_to_parallel(
        timesteps: Dict[str, SeqTimestepDict],
        possible_agents: list) -> Tuple[dict, dm_env.TimeStep]:

    step_types = [
        timesteps[agent]["timestep"].step_type for agent in possible_agents
    ]
    assert all(
        x == step_types[0]
        for x in step_types), f"Step types should be identical - {step_types} "
    parallel_timestep = dm_env.TimeStep(
        observation={
            agent: timesteps[agent]["timestep"].observation
            for agent in possible_agents
        },
        reward={
            agent: timesteps[agent]["timestep"].reward
            for agent in possible_agents
        },
        discount={
            agent: timesteps[agent]["timestep"].discount
            for agent in possible_agents
        },
        step_type=step_types[0],
    )

    parallel_actions = {
        agent: timesteps[agent]["action"]
        for agent in possible_agents
    }

    return parallel_actions, parallel_timestep
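
A usage sketch, assuming the function above is importable; the agent names and observation values are made up for illustration.

import dm_env
import numpy as np

timesteps = {
    'agent_0': {
        'timestep': dm_env.TimeStep(dm_env.StepType.MID, 1.0, 1.0, np.zeros(3)),
        'action': 2,
    },
    'agent_1': {
        'timestep': dm_env.TimeStep(dm_env.StepType.MID, 0.0, 1.0, np.ones(3)),
        'action': 0,
    },
}
actions, parallel_timestep = convert_seq_timestep_and_actions_to_parallel(
    timesteps, possible_agents=['agent_0', 'agent_1'])
# actions == {'agent_0': 2, 'agent_1': 0}; parallel_timestep.reward is a per-agent dict.
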
Example #11
  def step(self, actions):
    """Implements dm_env.Environment.step."""
    step_response = self._connection.send(
        dm_env_rpc_pb2.StepRequest(
            requested_observations=self._requested_observation_uids,
            actions=self._action_specs.pack(actions)))

    observations = self._observation_specs.unpack(step_response.observations)

    if (step_response.state == dm_env_rpc_pb2.EnvironmentStateType.RUNNING and
        self._last_state == dm_env_rpc_pb2.EnvironmentStateType.RUNNING):
      step_type = dm_env.StepType.MID
    elif step_response.state == dm_env_rpc_pb2.EnvironmentStateType.RUNNING:
      step_type = dm_env.StepType.FIRST
    elif self._last_state == dm_env_rpc_pb2.EnvironmentStateType.RUNNING:
      step_type = dm_env.StepType.LAST
    else:
      raise RuntimeError('Environment transitioned from {} to {}'.format(
          self._last_state, step_response.state))

    self._last_state = step_response.state

    reward = self.reward(
        state=step_response.state,
        step_type=step_type,
        observations=observations)
    discount = self.discount(
        state=step_response.state,
        step_type=step_type,
        observations=observations)
    if not self._is_reward_requested:
      observations.pop(DEFAULT_REWARD_KEY, None)
    if not self._is_discount_requested:
      observations.pop(DEFAULT_DISCOUNT_KEY, None)
    return dm_env.TimeStep(step_type, reward, discount, observations)
Example #12
    def step(
            self,
            nn_actions: types.NestedArray) -> Tuple[dm_env.TimeStep, np.array]:
        """Steps the environment."""
        if self._reset_next_step:
            return self.reset()

        actions = self._proc_robocup_actions(nn_actions)
        raw_obs, rewards, state, done = self._environment.step(actions)
        self._reset_next_step = done

        proc_obs = self._proc_robocup_obs(observations=raw_obs,
                                          done=done,
                                          nn_actions=nn_actions)
        processed_state = self._proc_robocup_state(state, proc_obs)

        if done:
            self._step_type = dm_env.StepType.LAST
        else:
            self._step_type = dm_env.StepType.MID

        return (
            dm_env.TimeStep(
                observation=proc_obs,
                reward=rewards,
                discount=self._discount,
                step_type=self._step_type,
            ),
            {
                "env_state": proccessed_state
            },
        )
Example #13
    def step(self, action):
        """Implementation of dm_env.step that supports repeated actions."""

        discount = None
        reward = None
        self._events = []
        for _ in range(self._num_action_repeats):
            next_timestep = super().step(action)

            # Accumulate reward per timestep.
            if next_timestep.reward is not None:
                reward = (reward or 0.) + next_timestep.reward

            # Calculate the product for discount.
            if next_timestep.discount is not None:
                discount = discount if discount else []
                discount.append(next_timestep.discount)

            timestep = dm_env.TimeStep(
                next_timestep.step_type,
                reward,
                # Note: np.product(None) returns None.
                np.product(discount),
                next_timestep.observation)
            self._events.extend([
                _unpack_world_event(event)
                for event in timestep.observation['events']
            ])

            if timestep.last():
                return timestep

        return timestep
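
To make the accumulation rule explicit, here is a small standalone sketch of the same bookkeeping over hypothetical per-substep (reward, discount) pairs: rewards are summed, discounts are multiplied.

import numpy as np

substeps = [(1.0, 1.0), (0.5, 1.0), (0.0, 0.0)]   # hypothetical (reward, discount) pairs

reward = sum(r for r, _ in substeps)              # 1.5
discount = np.prod([d for _, d in substeps])      # 0.0: any zero discount zeroes the product
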
Example #14
    def test_transitions_returned_if_episode_length_less_than_n(self):
        f = dm_env.StepType.FIRST
        m = dm_env.StepType.MID
        l = dm_env.StepType.LAST

        n = 4
        accumulator = replay_lib.NStepTransitionAccumulator(n)
        step_types = [f, m, l]
        num_timesteps = len(step_types)
        states = list(range(num_timesteps))
        discounts = np.ones(num_timesteps)
        rewards = np.ones(num_timesteps)
        actions = np.ones(num_timesteps)

        accumulator_output = []
        for i in range(num_timesteps):
            timestep = dm_env.TimeStep(step_type=step_types[i],
                                       observation=states[i],
                                       discount=discounts[i],
                                       reward=rewards[i])
            accumulator_output.append(
                list(accumulator.step(timestep, actions[i])))
        output_lengths = [len(output) for output in accumulator_output]
        output_states = [[(tr.s_tm1, tr.s_t) for tr in output]
                         for output in accumulator_output]

        # Expect a 1-step transition and a 2-step transition after LAST timestep.
        expected_output_lengths = [0, 0, 2]
        expected_output_states = [[], [], [(0, 2), (1, 2)]]
        self.assertEqual(expected_output_lengths, output_lengths)
        self.assertEqual(expected_output_states, output_states)
Example #15
    def step(self, action: types.NestedArray) -> dm_env.TimeStep:
        """Steps the environment."""
        if self._reset_next_step:
            return self.reset()

        open_spiel_timestep = self._environment.step(action)

        if open_spiel_timestep.step_type == rl_environment.StepType.LAST:
            self._reset_next_step = True

        observations = self._convert_observation(open_spiel_timestep)
        rewards = np.asarray(open_spiel_timestep.rewards)
        discounts = np.asarray(open_spiel_timestep.discounts)
        step_type = open_spiel_timestep.step_type

        if step_type == rl_environment.StepType.FIRST:
            step_type = dm_env.StepType.FIRST
        elif step_type == rl_environment.StepType.MID:
            step_type = dm_env.StepType.MID
        elif step_type == rl_environment.StepType.LAST:
            step_type = dm_env.StepType.LAST
        else:
            raise ValueError(
                "Did not recognize OpenSpiel StepType: {}".format(step_type))

        return dm_env.TimeStep(observation=observations,
                               reward=rewards,
                               discount=discounts,
                               step_type=step_type)
Example #16
    def step(self, action: Union[float, int,
                                 types.NestedArray]) -> dm_env.TimeStep:
        # Return a reset timestep if we haven't touched the environment yet.
        if not self._step:
            return self.reset()

        _validate_spec(self._spec.actions, action)

        observation = self._generate_fake_observation()
        reward = self._generate_fake_reward()
        discount = self._generate_fake_discount()

        self.agent_step_counter += 1

        if self._episode_length and (self._step == self._episode_length):
            # Only reset the step counter once all agents have taken their turn.
            if self.agent_step_counter == len(self.agents):
                self._step = 0
                self.agent_step_counter = 0

            # We can't use dm_env.termination directly because then the discount
            # wouldn't necessarily conform to the spec (e.g. if we want float32).
            return dm_env.TimeStep(dm_env.StepType.LAST, reward, discount,
                                   observation)
        else:
            # Only update step counter once all agents have taken their turn.
            if self.agent_step_counter == len(self.agents):
                self._step += 1
                self.agent_step_counter = 0

            return dm_env.transition(reward=reward,
                                     observation=observation,
                                     discount=discount)
Example #17
    def step(
        self, actions: Dict[str, Union[float, int, types.NestedArray]]
    ) -> dm_env.TimeStep:

        # Return a reset timestep if we haven't touched the environment yet.
        if not self._step:
            return self.reset()

        for agent, action in actions.items():
            _validate_spec(self._specs[agent].actions, action)

        observation = {
            agent: self._generate_fake_observation()
            for agent in self.agents
        }
        reward = {agent: self._generate_fake_reward() for agent in self.agents}
        discount = {
            agent: self._generate_fake_discount()
            for agent in self.agents
        }

        if self._episode_length and (self._step == self._episode_length):
            self._step = 0
            # We can't use dm_env.termination directly because then the discount
            # wouldn't necessarily conform to the spec (e.g. if we want float32).
            return dm_env.TimeStep(dm_env.StepType.LAST, reward, discount,
                                   observation)
        else:
            self._step += 1
            return dm_env.transition(reward=reward,
                                     observation=observation,
                                     discount=discount)
Example #18
  def _send_observation(self, timestep: dm_env.TimeStep, player: int):
    # If terminal all actors must update
    if player == pyspiel.PlayerId.TERMINAL:
      for player_id in range(len(self._actors)):
        # Note: we must account for situations where the first observation
        # is a terminal state, e.g. if an opponent folds in poker before we get
        # to act.
        if self._observed_first[player_id]:
          player_timestep = self._get_player_timestep(timestep, player_id)
          self._actors[player_id].observe(self._prev_actions[player_id],
                                          player_timestep)
          if self._should_update:
            self._actors[player_id].update()
      self._observed_first = [False] * len(self._actors)
      self._prev_actions = [pyspiel.INVALID_ACTION] * len(self._actors)
    else:
      if not self._observed_first[player]:
        player_timestep = dm_env.TimeStep(
            observation=timestep.observation[player],
            reward=None,
            discount=None,
            step_type=dm_env.StepType.FIRST)
        self._actors[player].observe_first(player_timestep)
        self._observed_first[player] = True
      else:
        player_timestep = self._get_player_timestep(timestep, player)
        self._actors[player].observe(self._prev_actions[player],
                                     player_timestep)
        if self._should_update:
          self._actors[player].update()
Example #19
    def step(self, actions: Dict[str, np.array]) -> Tuple[dm_env.TimeStep, np.array]:
        """Steps the environment."""
        if self._reset_next_step:
            self._reset_next_step = False
            self.reset()

        observations, rewards, dones, state_infos = self._environment.step(actions)
        if observations:
            observations = self._convert_observations(observations, dones)

        if self._environment.env_done:
            self._step_type = dm_env.StepType.LAST
            self._reset_next_step = True
        else:
            self._step_type = dm_env.StepType.MID

        return (
            dm_env.TimeStep(
                observation=observations,
                reward=rewards,
                discount=self._discounts,
                step_type=self._step_type,
            ),
            state_infos,
        )
Example #20
    def step(self, action):
        """Implementation of dm_env.step that supports repeated actions."""

        timestep = None
        discount = None
        reward = None
        for _ in range(self._num_action_repeats):
            next_timestep = super(_HardEightTasksEnv, self).step(action)

            # Accumulate reward per timestep.
            if next_timestep.reward is not None:
                reward = (reward or 0.) + next_timestep.reward

            # Calculate the product for discount.
            if next_timestep.discount is not None:
                discount = discount if discount else []
                discount.append(next_timestep.discount)

            timestep = dm_env.TimeStep(
                next_timestep.step_type,
                reward,
                # Note: np.product(None) returns None.
                np.product(discount),
                next_timestep.observation)

            if timestep.last():
                return timestep

        return timestep
Example #21
    def step(self, _):
        self.time_step += 1
        if self.time_step == self.trial_start:
            self.configure_trial()
        self.tick()
        return dm_env.TimeStep(dm_env.StepType.MID, self.cumulant(),
                               self.gamma, self.observation())
Example #22
    def _add_reward_noise(self, timestep: dm_env.TimeStep):
        if timestep.first():
            return timestep
        reward = timestep.reward + self._noise_scale * self._rng.randn()
        return dm_env.TimeStep(step_type=timestep.step_type,
                               reward=reward,
                               discount=timestep.discount,
                               observation=timestep.observation)
Example #23
    def _rescale_rewards(self, timestep: dm_env.TimeStep):
        if timestep.first():
            return timestep
        reward = timestep.reward * self._reward_scale
        return dm_env.TimeStep(step_type=timestep.step_type,
                               reward=reward,
                               discount=timestep.discount,
                               observation=timestep.observation)
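
Because `dm_env.TimeStep` is a `NamedTuple`, the same kind of reward transformation can also be written with `_replace`, as in this minimal sketch (the scale factor is illustrative):

import dm_env

def rescale_reward(timestep: dm_env.TimeStep, scale: float = 0.1) -> dm_env.TimeStep:
    """Returns a copy of `timestep` with the reward scaled; FIRST steps pass through."""
    if timestep.first():
        return timestep
    return timestep._replace(reward=timestep.reward * scale)
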
Example #24
    def step(self, action: int) -> dm_env.TimeStep:
        """Steps up to action_repeat times and returns a post-processed step."""
        if self._reset_next_step:
            return self.reset()

        timestep_stack = []

        # Step on environment multiple times for each selected action.
        for _ in range(self._action_repeats):
            timestep = self._environment.step([np.array([action])])

            self._episode_len += 1
            if self._episode_len == self._max_episode_len:
                timestep = timestep._replace(step_type=dm_env.StepType.LAST)

            timestep_stack.append(timestep)

            if timestep.last():
                # Action repeat frames should not span episode boundaries. Also, no need
                # to pad with zero-valued observations as all the reductions in
                # _postprocess_observation work gracefully for any non-zero size of
                # timestep_stack.
                self._reset_next_step = True
                break

        # Determine a single step type. We let FIRST take priority over LAST, since
        # we think it's more likely algorithm code will be set up to deal with that,
        # due to environments supporting reset() (which emits a FIRST).
        # Note we'll never have LAST then FIRST in timestep_stack here.
        step_type = dm_env.StepType.MID
        for timestep in timestep_stack:
            if timestep.first():
                step_type = dm_env.StepType.FIRST
                break
            elif timestep.last():
                step_type = dm_env.StepType.LAST
                break

        if timestep_stack[0].first():
            # Update first timestep to have identity effect on reward and discount.
            timestep_stack[0] = timestep_stack[0]._replace(reward=0.,
                                                           discount=1.)

        # Sum reward over stack.
        reward = sum(timestep_t.reward for timestep_t in timestep_stack)

        # Multiply discount over stack (will either be 0. or 1.).
        discount = np.product(
            [timestep_t.discount for timestep_t in timestep_stack])

        observation = self._observation_from_timestep_stack(timestep_stack)

        timestep = dm_env.TimeStep(step_type=step_type,
                                   reward=reward,
                                   observation=observation,
                                   discount=discount)

        return self._postprocess_observation(timestep)
Example #25
    def observe_first(self, timestep: dm_env.TimeStep):
        # Create a new timestep with the latent variable in the observation
        new_timestep = dm_env.TimeStep(
            step_type=timestep.step_type,
            reward=timestep.reward,
            discount=timestep.discount,
            observation=self._concat_latent_variable(timestep.observation),
        )
        return self._agent._actor.observe_first(new_timestep)
Example #26
    def test_reset(self):
        self.accumulator.reset()
        transitions = self.accumulator.step(
            timestep_t=dm_env.TimeStep(step_type=dm_env.StepType.FIRST,
                                       observation=-1,
                                       discount=1.,
                                       reward=3),
            a_t=1)
        self.assertEqual([], list(transitions))
Example #27
    def _set_step_type(
        self, timestep: dm_env.TimeStep, step_type: dm_env.StepType
    ) -> dm_env.TimeStep:
        return dm_env.TimeStep(
            observation=timestep.observation,
            reward=timestep.reward,
            discount=timestep.discount,
            step_type=step_type,
        )
Example #28
    def run(self, num_steps):
        """Perform the run loop.

    Args:
      num_steps: number of steps to run the loop for.
    """
        current_steps = 0
        while current_steps < num_steps:

            # Reset any counts and start the environment.
            start_time = time.time()
            self._rewarder.reset()

            episode_steps = 0
            episode_return = 0
            episode_imitation_return = 0
            timestep = self._environment.reset()

            self._actor.observe_first(timestep)

            # Run an episode.
            while not timestep.last():
                action = self._actor.select_action(timestep.observation)
                obs_act = {
                    'observation': timestep.observation,
                    'action': action
                }
                imitation_reward = self._rewarder.compute_reward(obs_act)
                timestep = self._environment.step(action)
                imitation_timestep = dm_env.TimeStep(
                    step_type=timestep.step_type,
                    reward=imitation_reward,
                    discount=timestep.discount,
                    observation=timestep.observation)

                self._actor.observe(action, next_timestep=imitation_timestep)
                self._actor.update()

                # Book-keeping.
                episode_steps += 1
                episode_return += timestep.reward
                episode_imitation_return += imitation_reward

            # Collect the results and combine with counts.
            counts = self._counter.increment(episodes=1, steps=episode_steps)
            steps_per_second = episode_steps / (time.time() - start_time)
            result = {
                'episode_length': episode_steps,
                'episode_return': episode_return,
                'episode_return_imitation': episode_imitation_return,
                'steps_per_second': steps_per_second,
            }
            result.update(counts)

            self._logger.write(result)
            current_steps += episode_steps
Example #29
def parameterized_restart(
    reward: types.Reward,
    discount: types.Discount,
    observation: types.Observation,
) -> dm_env.TimeStep:
    """Returns a `TimeStep` with `step_type` set to `StepType.FIRST`.
    Differs from dm_env.restart, since reward and discount can be set to
    initial types."""
    return dm_env.TimeStep(dm_env.StepType.FIRST, reward, discount,
                           observation)
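
For comparison with the stock helper, a minimal sketch assuming the function above is in scope; the zero reward and unit discount are illustrative initial values.

import dm_env
import numpy as np

observation = np.zeros(3, dtype=np.float32)

ts_default = dm_env.restart(observation)          # reward and discount are None.
ts_custom = parameterized_restart(
    reward=np.float32(0.0), discount=np.float32(1.0), observation=observation)

assert ts_default.reward is None and ts_custom.reward == 0.0
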
Example #30
    def observe(self, action: types.NestedArray,
                next_timestep: dm_env.TimeStep):
        # Create a new timestep with the latent variable in the observation
        new_timestep = dm_env.TimeStep(
            step_type=next_timestep.step_type,
            reward=next_timestep.reward,
            discount=next_timestep.discount,
            observation=self._concat_latent_variable(next_timestep.observation),
        )
        return self._agent._actor.observe(action, new_timestep)