Example #1
    def step(self, action):
        if self._reset_next_step:
            return self.reset()

        observation, reward, done, info = self._environment.step(action)
        self._goals_achieved.append(info['goal_achieved'])
        success = self._environment.evaluate_success([{
            'env_infos': {
                'goal_achieved': self._goals_achieved
            }
        }])
        info['success'] = bool(success)
        if self._end_on_success:
            done = done or success
        for k in info:
            assert k in self._info_defaults.keys() | {'TimeLimit.truncated'}
        observation = self._wrap_observation(observation, info)
        self._reset_next_step = done

        if done:
            truncated = info.get('TimeLimit.truncated', False)
            if truncated:
                return dm_env.truncation(reward, observation)
            return dm_env.termination(reward, observation)
        return dm_env.transition(reward, observation)
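For reference across these examples, the dm_env helper functions construct TimeStep tuples with fixed step types and default discounts. A minimal sketch of their behavior (based on the public dm_env API; exact defaults can vary slightly between versions):

import dm_env
import numpy as np

obs = np.zeros(3)
dm_env.restart(obs)                              # StepType.FIRST, reward=None, discount=None
dm_env.transition(reward=1., observation=obs)    # StepType.MID, discount defaults to 1.0
dm_env.termination(reward=1., observation=obs)   # StepType.LAST, discount fixed at 0.0
dm_env.truncation(reward=1., observation=obs)    # StepType.LAST, discount defaults to 1.0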
Example #2
File: cartpole.py Project: ziyibaby/bsuite
    def step(self, action):
        if self._reset_next_step:
            return self.reset()

        self._state = step_cartpole(
            action=action,
            timescale=self._timescale,
            state=self._state,
            config=self._cartpole_config,
        )

        # Rewards only when the pole is central and balanced
        is_reward = (np.cos(self._state.theta) > self._height_threshold
                     and np.abs(self._state.x) < self._x_threshold)
        reward = 1. if is_reward else 0.
        self._raw_return += reward
        self._episode_return += reward
        self._best_episode = max(self._episode_return, self._best_episode)

        if self._state.time_elapsed > self._max_time or not is_reward:
            self._reset_next_step = True
            return dm_env.termination(reward=reward,
                                      observation=self.observation)
        else:  # continuing transition.
            return dm_env.transition(reward=reward,
                                     observation=self.observation)
Example #3
  def test_transition(self, observation, reward, discount):
    time_step = dm_env.transition(
        reward=reward, observation=observation, discount=discount)
    self.assertIs(dm_env.StepType.MID, time_step.step_type)
    self.assertEqual(observation, time_step.observation)
    self.assertEqual(reward, time_step.reward)
    self.assertEqual(discount, time_step.discount)
Example #4
    def step(self, action: int) -> dm_env.TimeStep:
        dm_env_step = self.dm_env.step(action)

        # Hack: treat the reward as 0 when dm_env_step.reward is None, which happens after restart().
        self._raw_return += 0. if dm_env_step.reward is None else dm_env_step.reward
        self._episode_return += 0. if dm_env_step.reward is None else dm_env_step.reward

        if self.gym_env.total_transitions_episode > self.max_episode_len:
            self._best_episode = max(self._episode_return, self._best_episode)
            dm_env_step = dm_env.truncation(dm_env_step.reward,
                                            dm_env_step.observation)

        ohe_obs = np.zeros(
            shape=(self.gym_env.observation_space.n, ), dtype=np.float32
        )  # Hack/TODO: the bsuite/baselines/tf/dqn agent doesn't accept discrete observations, so one-hot encode them.
        ohe_obs[dm_env_step.observation] = 1
        # dm_env_step.observation = ohe_obs

        # return corresponding TimeStep object based on step_type
        if dm_env_step.step_type == StepType.FIRST:
            return dm_env.restart(ohe_obs)
        elif dm_env_step.step_type == StepType.LAST:
            return dm_env.termination(dm_env_step.reward, ohe_obs)
        else:
            return dm_env.transition(dm_env_step.reward, ohe_obs)
Example #5
File: deep_sea.py Project: tbz233/bsuite
    def _step(self, action):
        reward = 0.
        action_right = action == self._action_mapping[self._row, self._column]

        # Reward calculation
        if self._column == self._size - 1 and action_right:
            reward += 1.
            self._denoised_return += 1.
        if not self._deterministic:  # Noisy rewards on the 'end' of chain.
            if self._row == self._size - 1 and self._column in [
                    0, self._size - 1
            ]:
                reward += self._rng.randn()

        # Transition dynamics
        if action_right:
            if self._rng.rand() > 1 / self._size or self._deterministic:
                self._column = np.clip(self._column + 1, 0, self._size - 1)
            reward -= self._unscaled_move_cost / self._size
        else:
            if self._row == self._column:  # You were on the right path and went wrong
                self._bad_episode = True
            self._column = np.clip(self._column - 1, 0, self._size - 1)
        self._row += 1

        observation = self._get_observation()
        if self._row == self._size:
            if self._bad_episode:
                self._total_bad_episodes += 1
            return dm_env.termination(reward=reward, observation=observation)
        else:
            return dm_env.transition(reward=reward, observation=observation)
Example #6
    def step(self, action):
        if self._reset_next_step:
            return self.reset()

        self._state = cartpole.step_cartpole(
            action=action,
            timescale=self._timescale,
            state=self._state,
            config=self._cartpole_config,
        )

        # Rewards only when the pole is central and balanced
        is_upright = (np.cos(self._state.theta) > self._height_threshold and
                      np.abs(self._state.theta_dot) < self._theta_dot_threshold
                      and np.abs(self._state.x) < self._x_reward_threshold)
        reward = -1. * np.abs(action - 1) * self._move_cost
        self._steps_elapsed += 1
        if is_upright:
            reward += 1.
        self._raw_return += reward
        self._episode_return += reward
        self._best_episode = max(self._episode_return, self._best_episode)

        #is_end_of_episode = (self._state.time_elapsed > self._max_time
        is_end_of_episode = (self._steps_elapsed > self._max_steps
                             or np.abs(self._state.x) > self._x_threshold)
        if is_end_of_episode:
            self._reset_next_step = True
            return dm_env.termination(reward=reward,
                                      observation=self.observation)
        else:  # continuing transition.
            return dm_env.transition(reward=reward,
                                     observation=self.observation)
Example #7
    def _step(self, action: int) -> dm_env.TimeStep:
        self._timestep += 1

        ## update agent
        agent = self.locate("P")
        reward = 0.0
        vector = Actions(action).vector()
        location = (
            max(0, min(agent[0] + vector[0], self.shape[0])),
            max(0, min(agent[1] + vector[1], self.shape[1])),
        )
        # hit a wall, go back (diagonal moves are never done partially)
        if self.art[location] == "#":
            location = agent
        # stepped on object, compute reward
        if self.art[location] in [obj.symbol for obj in self.objects]:
            obj = [x for x in self.objects if x.symbol == self.art[location]]
            reward = obj[0].reward if len(obj) > 0 else 0.0
        # set new agent position
        self.art[agent] = " "
        self.art[location] = "P"

        ## update environment, let it be ❤
        for obj in self.objects:
            missing = obj.n - len(self.locate(obj.symbol))
            for _ in range(missing):
                #  termination probability
                if self._rng.random() < obj.eps_term:
                    return dm_env.termination(reward, self._get_observation())
                #  respawning probability
                if self._rng.random() < obj.eps_respawn:
                    self.spawn(obj.symbol)
        return dm_env.transition(reward, self._get_observation())
Example #8
    def step(self, action):
        """Updates the environment according to the action."""

        if self._reset_next_step:
            return self.reset()

        # Insert the token if the column isn't full; otherwise the move is illegal.
        if self._col_heights[action] < N_HEIGHT:
            target_cell = action * N_HEIGHT + self._col_heights[action]
            target_player = 0 if self._player_one_turn else 1
            self._board[target_player] |= 1 << target_cell
            self._col_heights[action] += 1
        else:
            print("Illegal move!")

        self._player_one_turn = not self._player_one_turn

        # Check for termination.
        if self.is_terminal():
            reward = 1.0 if self._winner == 0 else -1.0 if self._winner == 1 else 0.0
            self._reset_next_step = True
            return dm_env.termination(reward=reward,
                                      observation=self._observation())
        else:
            return dm_env.transition(reward=0.0,
                                     observation=self._observation())
Example #9
    def step(self, action):
        """Step the environment with an action."""
        if self._reset_next_step:
            return self.reset()

        # Apply the game_rules
        for rule in self.game_rules:
            rule.step(self._state, self._meta_state)

        # Apply the action
        self.action_space.step(self._state, action)

        # Step the physics
        self.physics.step(self._state)

        # Compute reward
        self.step_count += 1
        reward, should_reset = self.task.reward(self._state, self._meta_state,
                                                self.step_count)

        # Take observation
        observation = self.observation()

        # Return transition
        if should_reset:
            self._reset_next_step = True
            return dm_env.termination(reward=reward, observation=observation)
        else:
            return dm_env.transition(reward=reward, observation=observation)
Example #10
File: mocks.py Project: NetColby/DNRL
    def step(
        self, actions: Dict[str, Union[float, int, types.NestedArray]]
    ) -> dm_env.TimeStep:

        # Return a reset timestep if we haven't touched the environment yet.
        if not self._step:
            return self.reset()

        for agent, action in actions.items():
            _validate_spec(self._specs[agent].actions, action)

        observation = {
            agent: self._generate_fake_observation()
            for agent in self.agents
        }
        reward = {agent: self._generate_fake_reward() for agent in self.agents}
        discount = {
            agent: self._generate_fake_discount()
            for agent in self.agents
        }

        if self._episode_length and (self._step == self._episode_length):
            self._step = 0
            # We can't use dm_env.termination directly because then the discount
            # wouldn't necessarily conform to the spec (if, e.g., we want float32).
            return dm_env.TimeStep(dm_env.StepType.LAST, reward, discount,
                                   observation)
        else:
            self._step += 1
            return dm_env.transition(reward=reward,
                                     observation=observation,
                                     discount=discount)
Example #11
File: mocks.py Project: NetColby/DNRL
    def step(self, action: Union[float, int,
                                 types.NestedArray]) -> dm_env.TimeStep:
        # Return a reset timestep if we haven't touched the environment yet.
        if not self._step:
            return self.reset()

        _validate_spec(self._spec.actions, action)

        observation = self._generate_fake_observation()
        reward = self._generate_fake_reward()
        discount = self._generate_fake_discount()

        self.agent_step_counter += 1

        if self._episode_length and (self._step == self._episode_length):
            # Only reset the step counter once all agents have taken their turn.
            if self.agent_step_counter == len(self.agents):
                self._step = 0
                self.agent_step_counter = 0

            # We can't use dm_env.termination directly because then the discount
            # wouldn't necessarily conform to the spec (if, e.g., we want float32).
            return dm_env.TimeStep(dm_env.StepType.LAST, reward, discount,
                                   observation)
        else:
            # Only update step counter once all agents have taken their turn.
            if self.agent_step_counter == len(self.agents):
                self._step += 1
                self.agent_step_counter = 0

            return dm_env.transition(reward=reward,
                                     observation=observation,
                                     discount=discount)
Example #12
def make_timestep_from_step_type_string(step_type_str, observation):
    if step_type_str == 'f':
        return dm_env.restart(observation=observation)
    elif step_type_str == 'm':
        return dm_env.transition(reward=0, observation=observation)
    elif step_type_str == 'l':
        return dm_env.termination(reward=0, observation=observation)
    else:
        raise ValueError('Unknown step type string %s.' % step_type_str)
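A short usage sketch for the helper above (the observation value here is arbitrary):

ts = make_timestep_from_step_type_string('m', observation=0)
assert ts.step_type is dm_env.StepType.MID and ts.reward == 0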
Example #13
File: gym_wrapper.py Project: neale/bsuite
    def step(self, action: int) -> dm_env.TimeStep:
        if self._reset_next_step:
            return self.reset()

        # Convert the gym step result to a dm_env TimeStep.
        obs, reward, done, _ = self.gym_env.step(action)

        if done:
            self._reset_next_step = True
            return dm_env.termination(reward, obs)
        else:
            return dm_env.transition(reward, obs)
Example #14
  def step(self, action: np.ndarray) -> dm_env.TimeStep:
    # Reset if previous timestep was LAST.
    if self._reset_next_step:
      return self.reset()

    # Take an environment step.
    observation, reward, done = self._environment.step(action)
    self._reset_next_step = done

    if done:
      return dm_env.termination(reward=reward, observation=observation)  # After this, it's always LAST
    return dm_env.transition(reward=reward, observation=observation)
Example #15
    def test_wrapper(self):
        """Tests that the wrapper computes and logs the correct data."""
        mock_logger = mock.MagicMock()
        mock_logger.write = mock.MagicMock()

        # Make a fake environment that cycles through these time steps.
        timesteps = [
            dm_env.restart([]),
            dm_env.transition(1, []),
            dm_env.transition(2, []),
            dm_env.termination(3, []),
        ]
        expected_episode_return = 6
        fake_env = FakeEnvironment(timesteps)
        env = wrappers.Logging(env=fake_env,
                               logger=mock_logger,
                               log_every=True)

        num_episodes = 5

        for _ in range(num_episodes):
            timestep = env.reset()
            while not timestep.last():
                timestep = env.step(action=0)

        # We count the number of transitions, hence the -1.
        expected_episode_length = len(timesteps) - 1

        expected_calls = []
        for i in range(1, num_episodes + 1):
            expected_calls.append(
                mock.call(
                    dict(
                        steps=expected_episode_length * i,
                        episode=i,
                        total_return=expected_episode_return * i,
                        episode_len=expected_episode_length,
                        episode_return=expected_episode_return,
                    )))
        mock_logger.write.assert_has_calls(expected_calls)
Example #16
    def step(self, action: List[np.ndarray]) -> dm_env.TimeStep:
        """Steps the environment."""
        if self._reset_next_step:
            return self.reset()

        observation, reward, done, _ = self._environment.step(action[0].item())
        self._reset_next_step = done

        observation = self._wrap_observation(observation)

        if done:
            return dm_env.termination(reward, observation)
        return dm_env.transition(reward, observation)
Example #17
    def step(self, action: types.NestedArray) -> dm_env.TimeStep:
        if self._reset_next_step:
            return self.reset()

        observation, reward, done, info = self._environment.step(action)
        self._reset_next_step = all(done.values()) if isinstance(done, dict) else bool(done)

        if self._reset_next_step:
            truncated = info.get('TimeLimit.truncated', False)
            if truncated:
                return dm_env.truncation(reward, observation)
            return dm_env.termination(reward, observation)
        return dm_env.transition(reward, observation)
Example #18
    def step(self, action):
        """Performs an environment step."""
        # If the environment has just been created or finished an episode
        # we should reset it (ignoring the action).
        if self._prev_step_type in {None, environment.StepType.LAST}:
            return self.reset()

        for k in action.keys():
            self._action_spec[k].validate(action[k])

        locations, flag, pressure, log_size, red, green, blue = (
            self._process_action(action))
        loc_control, loc_end = locations

        # Perform action.
        self._surface.BeginAtomic()

        if flag == 1:  # The agent produces a visible stroke.
            self._action_mask = self._action_masks["paint"]
            y_c, x_c = loc_control
            y_e, x_e = loc_end
            self._bezier_to(y_c, x_c, y_e, x_e, pressure, log_size, red, green,
                            blue)

            # Update episode statistics.
            self.stats["total_strokes"] += 1
            if not self._prev_brush_params["is_painting"]:
                self.stats["total_disjoint"] += 1
        elif flag == 0:  # The agent moves to a new location.
            self._action_mask = self._action_masks["move"]
            y_e, x_e = loc_end
            self._move_to(y_e, x_e)
        else:
            raise ValueError("Invalid flag value")

        self._surface.EndAtomic()

        # Handle termination of the episode.
        reward = 0
        self._episode_step += 1
        if self._episode_step == self._episode_length:
            time_step = environment.termination(reward=reward,
                                                observation=self.observation())
        else:
            time_step = environment.transition(reward=reward,
                                               observation=self.observation(),
                                               discount=self._discount)

        self._prev_step_type = time_step.step_type

        return time_step
Example #19
    def step(self, action: types.NestedArray) -> dm_env.TimeStep:
        """Steps the environment."""
        if self._reset_next_step:
            return self.reset()

        observation, reward, done, info = self._environment.step(action)
        self._reset_next_step = done

        if done:
            truncated = info.get('TimeLimit.truncated', False)
            if truncated:
                return dm_env.truncation(reward, observation)
            return dm_env.termination(reward, observation)
        return dm_env.transition(reward, observation)
Example #20
  def test_buffer(self):
    # Given a buffer and some dummy data...
    max_sequence_length = 10
    obs_shape = (3, 3)
    buffer = sequence.Buffer(
        obs_spec=specs.Array(obs_shape, dtype=float),  # np.float / np.int were removed from NumPy
        action_spec=specs.Array((), dtype=int),
        max_sequence_length=max_sequence_length)
    dummy_step = dm_env.transition(observation=np.zeros(obs_shape), reward=0.)

    # If we add `max_sequence_length` items to the buffer...
    for _ in range(max_sequence_length):
      buffer.append(dummy_step, 0, dummy_step)

    # Then the buffer should now be full.
    self.assertTrue(buffer.full())

    # Any further appends should throw an error.
    with self.assertRaises(ValueError):
      buffer.append(dummy_step, 0, dummy_step)

    # If we now drain this trajectory from the buffer...
    trajectory = buffer.drain()

    # The `observations` sequence should have length `T + 1`.
    self.assertLen(trajectory.observations, max_sequence_length + 1)

    # All other sequences should have length `T`.
    self.assertLen(trajectory.actions, max_sequence_length)
    self.assertLen(trajectory.rewards, max_sequence_length)
    self.assertLen(trajectory.discounts, max_sequence_length)

    # The buffer should now be empty.
    self.assertTrue(buffer.empty())

    # A second call to drain() should throw an error, since the buffer is empty.
    with self.assertRaises(ValueError):
      buffer.drain()

    # If we now append another transition...
    buffer.append(dummy_step, 0, dummy_step)

    # And immediately drain the buffer...
    trajectory = buffer.drain()

    # We should have a valid partial trajectory of length T=1.
    self.assertLen(trajectory.observations, 2)
    self.assertLen(trajectory.actions, 1)
    self.assertLen(trajectory.rewards, 1)
    self.assertLen(trajectory.discounts, 1)
Example #21
    def _step(self, action: int) -> dm_env.TimeStep:
        if self._timestep == 0:
            self._context = action

        self._timestep += 1
        if self._timestep == self._reward_timestep[self._context]:
            reward = self._rewards[self._context]
        else:
            reward = 0.

        observation = self._get_observation()
        if self._timestep == self._episode_len:
            return dm_env.termination(reward=reward, observation=observation)
        return dm_env.transition(reward=reward, observation=observation)
Example #22
    def step(self, action: np.ndarray):
        """Updates the environment according to the action."""
        if self._reset_next_step:
            return self.reset()

        self.defended = np.logical_or(self.defended, action)

        self.burn_vertices()

        if self._reset_next_step:
            return dm_env.termination(reward=0.0,
                                      observation=self._observation())

        return dm_env.transition(reward=-1.0, observation=self._observation())
Example #23
def fake_demonstration_iterator():
  k = 0
  while True:
    action = np.random.uniform(low=0., high=1., size=3).astype(np.float32)
    obs = np.random.uniform(low=0., high=1., size=5).astype(np.float32)
    reward = np.float32(0.)
    discount = np.float32(0.)
    if k % 10 == 0:
      ts = dm_env.restart(obs)
    elif k % 10 == 9:
      ts = dm_env.TimeStep(dm_env.StepType.LAST, reward, discount, obs)
    else:
      ts = dm_env.transition(reward=reward, observation=obs, discount=discount)
    k += 1
    yield action, ts
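A brief usage sketch for the generator above, pulling the first few (action, timestep) pairs; itertools is from the standard library and the count of 3 is arbitrary:

import itertools

for action, ts in itertools.islice(fake_demonstration_iterator(), 3):
    print(ts.step_type, action.shape)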
Example #24
  def step(self, action):
    """Step the environment with an action."""
    if self._reset_next_step:
      return self.reset()

    self._read_action(self._action_spec, action)
    self._env.act_discrete(self._act_discrete)
    self._env.act_continuous(self._act_continuous)
    self._env.act_text(self._act_text)
    self._status, reward = self._env.advance()
    if self._status != dmlab2d.RUNNING:
      self._reset_next_step = True
      return dm_env.termination(reward=reward, observation=self.observation())
    else:
      return dm_env.transition(reward=reward, observation=self.observation())
Example #25
File: memory_len.py Project: ybwsfl/bsuite
    def _step(self, action):
        observation = self._get_observation()
        self._timestep += 1

        # on all but the last step provide a reward of 0.
        if self._timestep - 1 < self._memory_length:
            return dm_env.transition(reward=0., observation=observation)

        elif self._timestep - 1 == self._memory_length:
            if action == self._context[self._query]:
                reward = 1.
                self._total_perfect += 1
            else:
                reward = -1.
                self._total_regret += 2.
            return dm_env.termination(reward=reward, observation=observation)
Example #26
    def step(self, action: int) -> dm_env.TimeStep:
        if self._reset_next_step:
            return self.reset()

        # Convert the gym step result to a dm_env TimeStep.
        observation, reward, done, info = self.gym_env.step(action)
        self._reset_next_step = done

        if done:
            is_truncated = info.get('TimeLimit.truncated', False)
            if is_truncated:
                return dm_env.truncation(reward, observation)
            else:
                return dm_env.termination(reward, observation)
        else:
            return dm_env.transition(reward, observation)
Example #27
    def step(self, action):
        if self._reset_next_step:
            return self.reset()

        observation, reward, terminal, _ = self._env.step(action.item())
        observation = observation.squeeze(-1)
        discount = 1 - float(terminal)
        self._episode_steps += 1
        if terminal:
            self._reset_next_step = True
            return dm_env.termination(reward, observation)
        elif self._episode_steps == self._max_episode_steps:
            self._reset_next_step = True
            return dm_env.truncation(reward, observation, discount)
        else:
            return dm_env.transition(reward, observation, discount)
Example #28
File: test_utils.py Project: weileze/acme
def make_trajectory(observations):
  """Make a simple trajectory from a sequence of observations.

  Arguments:
    observations: a sequence of observations.

  Returns:
    a tuple (first, steps) where first contains the initial dm_env.TimeStep
    object and steps contains a list of (action, step) tuples. The length of
    steps is len(observations) - 1.
  """
  first = dm_env.restart(observations[0])
  middle = [(0, dm_env.transition(reward=0.0, observation=observation))
            for observation in observations[1:-1]]
  last = (0, dm_env.termination(reward=0.0, observation=observations[-1]))
  return first, middle + [last]
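A minimal usage sketch for make_trajectory (the integer observations here are arbitrary):

first, steps = make_trajectory([10, 11, 12, 13])
assert first.first()          # dm_env.restart yields a FIRST step
assert steps[-1][1].last()    # the final entry is a termination step
assert len(steps) == 3        # one (action, step) pair per remaining observation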
Example #29
    def step(self, action):
        if self._reset_next_step:
            return self.reset()

        observation, reward, done, info = self._environment.step(action)
        for k in info:
            assert k in self._info_defaults.keys() | {'TimeLimit.truncated'}
        observation = self._wrap_observation(observation, info)
        self._reset_next_step = done

        if done:
            truncated = info.get('TimeLimit.truncated', False)
            if truncated:
                return dm_env.truncation(reward, observation)
            return dm_env.termination(reward, observation)
        return dm_env.transition(reward, observation)
Example #30
    def _step(self, action: int) -> dm_env.TimeStep:
        observation = self._get_observation()
        self._timestep += 1

        if self._timestep - 1 < self._memory_length:
            # On all but the last step provide a reward of 0.
            return dm_env.transition(reward=0., observation=observation)
        if self._timestep - 1 > self._memory_length:
            raise RuntimeError('Invalid state.')  # We shouldn't get here.

        if action == self._context[self._query]:
            reward = 1.
            self._total_perfect += 1
        else:
            reward = -1.
            self._total_regret += 2.
        return dm_env.termination(reward=reward, observation=observation)