Example #1
    def step(self, action):
        if self._reset_next_step:
            return self.reset()

        observation, reward, done, info = self._environment.step(action)
        self._goals_achieved.append(info['goal_achieved'])
        success = self._environment.evaluate_success([{
            'env_infos': {
                'goal_achieved': self._goals_achieved
            }
        }])
        info['success'] = bool(success)
        if self._end_on_success:
            done = done or success
        for k in info:
            assert k in self._info_defaults.keys() | {'TimeLimit.truncated'}
        observation = self._wrap_observation(observation, info)
        self._reset_next_step = done

        if done:
            truncated = info.get('TimeLimit.truncated', False)
            if truncated:
                return dm_env.truncation(reward, observation)
            return dm_env.termination(reward, observation)
        return dm_env.transition(reward, observation)
Example #2
    def _step(self, action: int) -> dm_env.TimeStep:
        self._timestep += 1

        ## update agent
        agent = self.locate("P")
        reward = 0.0
        vector = Actions(action).vector()
        location = (
            max(0, min(agent[0] + vector[0], self.shape[0])),
            max(0, min(agent[1] + vector[1], self.shape[1])),
        )
        # hit a wall, go back (diagonal moves are never done partially)
        if self.art[location] == "#":
            location = agent
        # stepped on object, compute reward
        if self.art[location] in [obj.symbol for obj in self.objects]:
            obj = [x for x in self.objects if x.symbol == self.art[location]]
            reward = obj[0].reward if len(obj) > 0 else 0.0
        # set new agent position
        self.art[agent] = " "
        self.art[location] = "P"

        ## update environment, let it be ❤
        for obj in self.objects:
            missing = obj.n - len(self.locate(obj.symbol))
            for _ in range(missing):
                #  termination probability
                if self._rng.random() < obj.eps_term:
                    return dm_env.termination(reward, self._get_observation())
                #  respawning probability
                if self._rng.random() < obj.eps_respawn:
                    self.spawn(obj.symbol)
        return dm_env.transition(reward, self._get_observation())
Example #3
    def step(self, action):
        if self._reset_next_step:
            return self.reset()

        self._state = cartpole.step_cartpole(
            action=action,
            timescale=self._timescale,
            state=self._state,
            config=self._cartpole_config,
        )

        # Rewards only when the pole is central and balanced
        is_upright = (np.cos(self._state.theta) > self._height_threshold and
                      np.abs(self._state.theta_dot) < self._theta_dot_threshold
                      and np.abs(self._state.x) < self._x_reward_threshold)
        reward = -1. * np.abs(action - 1) * self._move_cost
        self._steps_elapsed += 1
        if is_upright:
            reward += 1.
        self._raw_return += reward
        self._episode_return += reward
        self._best_episode = max(self._episode_return, self._best_episode)

        #is_end_of_episode = (self._state.time_elapsed > self._max_time
        is_end_of_episode = (self._steps_elapsed > self._max_steps
                             or np.abs(self._state.x) > self._x_threshold)
        if is_end_of_episode:
            self._reset_next_step = True
            return dm_env.termination(reward=reward,
                                      observation=self.observation)
        else:  # continuing transition.
            return dm_env.transition(reward=reward,
                                     observation=self.observation)
Example #4
    def _step(self, action):
        reward = 0.
        action_right = action == self._action_mapping[self._row, self._column]

        # Reward calculation
        if self._column == self._size - 1 and action_right:
            reward += 1.
            self._denoised_return += 1.
        if not self._deterministic:  # Noisy rewards on the 'end' of chain.
            if self._row == self._size - 1 and self._column in [
                    0, self._size - 1
            ]:
                reward += self._rng.randn()

        # Transition dynamics
        if action_right:
            if self._rng.rand() > 1 / self._size or self._deterministic:
                self._column = np.clip(self._column + 1, 0, self._size - 1)
            reward -= self._unscaled_move_cost / self._size
        else:
            if self._row == self._column:  # You were on the right path and went wrong
                self._bad_episode = True
            self._column = np.clip(self._column - 1, 0, self._size - 1)
        self._row += 1

        observation = self._get_observation()
        if self._row == self._size:
            if self._bad_episode:
                self._total_bad_episodes += 1
            return dm_env.termination(reward=reward, observation=observation)
        else:
            return dm_env.transition(reward=reward, observation=observation)
Example #5
 def _step(self, action: int) -> dm_env.TimeStep:
     """+1/-1 for correct/incorrect guesses. This also terminates the episode."""
     correct = action == self._correct_label
     reward = 1. if correct else -1.
     self._total_regret += self._optimal_return - reward
     observation = np.zeros(shape=self._image_shape, dtype=np.float32)
     return dm_env.termination(reward=reward, observation=observation)
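For context, a minimal run loop consuming a one-shot episode like the one above could look as follows; env and agent are hypothetical stand-ins, not part of the example:

# Hypothetical usage sketch: a single-guess episode ends after exactly one step.
timestep = env.reset()                    # StepType.FIRST
action = agent.select_action(timestep.observation)
timestep = env.step(action)               # StepType.LAST, reward is +1 or -1
assert timestep.last()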
Example #6
    def step(self, action):
        if self._reset_next_step:
            return self.reset()

        self._state = step_cartpole(
            action=action,
            timescale=self._timescale,
            state=self._state,
            config=self._cartpole_config,
        )

        # Rewards only when the pole is central and balanced
        is_reward = (np.cos(self._state.theta) > self._height_threshold
                     and np.abs(self._state.x) < self._x_threshold)
        reward = 1. if is_reward else 0.
        self._raw_return += reward
        self._episode_return += reward
        self._best_episode = max(self._episode_return, self._best_episode)

        if self._state.time_elapsed > self._max_time or not is_reward:
            self._reset_next_step = True
            return dm_env.termination(reward=reward,
                                      observation=self.observation)
        else:  # continuing transition.
            return dm_env.transition(reward=reward,
                                     observation=self.observation)
Example #7
    def step(self, action):
        """Updates the environment according to the action."""

        if self._reset_next_step:
            return self.reset()

        # Insert a token if the column isn't full; otherwise the move is illegal.
        if self._col_heights[action] < N_HEIGHT:
            target_cell = action * N_HEIGHT + self._col_heights[action]
            target_player = 0 if self._player_one_turn else 1
            self._board[target_player] |= 1 << target_cell
            self._col_heights[action] += 1
        else:
            print("Illegal move!")

        self._player_one_turn = not self._player_one_turn

        # Check for termination.
        if self.is_terminal():
            reward = 1.0 if self._winner == 0 else -1.0 if self._winner == 1 else 0.0
            self._reset_next_step = True
            return dm_env.termination(reward=reward,
                                      observation=self._observation())
        else:
            return dm_env.transition(reward=0.0,
                                     observation=self._observation())
Example #8
    def step(self, action):
        """Step the environment with an action."""
        if self._reset_next_step:
            return self.reset()

        # Apply the game_rules
        for rule in self.game_rules:
            rule.step(self._state, self._meta_state)

        # Apply the action
        self.action_space.step(self._state, action)

        # Step the physics
        self.physics.step(self._state)

        # Compute reward
        self.step_count += 1
        reward, should_reset = self.task.reward(self._state, self._meta_state,
                                                self.step_count)

        # Take observation
        observation = self.observation()

        # Return transition
        if should_reset:
            self._reset_next_step = True
            return dm_env.termination(reward=reward, observation=observation)
        else:
            return dm_env.transition(reward=reward, observation=observation)
Example #9
    def step(self, action: int) -> dm_env.TimeStep:
        dm_env_step = self.dm_env.step(action)

        # Hack: treat the reward as 0 when dm_env_step.reward is None, which happens after restart().
        self._raw_return += 0. if dm_env_step.reward is None else dm_env_step.reward
        self._episode_return += 0. if dm_env_step.reward is None else dm_env_step.reward

        if self.gym_env.total_transitions_episode > self.max_episode_len:
            self._best_episode = max(self._episode_return, self._best_episode)
            dm_env_step = dm_env.truncation(dm_env_step.reward,
                                            dm_env_step.observation)

        ohe_obs = np.zeros(
            shape=(self.gym_env.observation_space.n, ), dtype=np.float32
        )  # Hack/TODO: the bsuite/baselines/tf/dqn agent doesn't accept discrete states, hence the one-hot encoding.
        ohe_obs[dm_env_step.observation] = 1
        # dm_env_step.observation = ohe_obs

        # return corresponding TimeStep object based on step_type
        if dm_env_step.step_type == StepType.FIRST:
            return dm_env.restart(ohe_obs)
        elif dm_env_step.step_type == StepType.LAST:
            return dm_env.termination(dm_env_step.reward, ohe_obs)
        else:
            return dm_env.transition(dm_env_step.reward, ohe_obs)
Example #10
def make_timestep_from_step_type_string(step_type_str, observation):
    if step_type_str == 'f':
        return dm_env.restart(observation=observation)
    elif step_type_str == 'm':
        return dm_env.transition(reward=0, observation=observation)
    elif step_type_str == 'l':
        return dm_env.termination(reward=0, observation=observation)
    else:
        raise ValueError('Unknown step type string %s.' % step_type_str)
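A minimal usage sketch for the helper above, using a placeholder observation and a made-up step-type string:

# Hypothetical usage: expand the string 'fml' into FIRST, MID and LAST TimeSteps.
observation = (0.0, 0.0)
trajectory = [make_timestep_from_step_type_string(c, observation) for c in 'fml']
assert trajectory[0].first() and trajectory[1].mid() and trajectory[2].last()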
Example #11
    def _step(self, action: int) -> dm_env.TimeStep:
        self._timestep += 1

        ## update agent
        reward = 0.0
        vector = Actions(action).vector()
        location = (
            max(0, min(self._agent_location[0] + vector[0], self.shape[0])),
            max(0, min(self._agent_location[1] + vector[1], self.shape[1])),
        )
        # hit a wall, go back (diagonal moves are never done partially)
        if self.art[location] == "#":
            location = self._agent_location

        # stepped on object, compute reward
        if self.art[location] in [obj.symbol for obj in self.objects]:
            obj = [x for x in self.objects if x.symbol == self.art[location]]
            if len(obj) > 0:
                reward = obj[0].reward
                #  termination probability
                if self._rng.random() < obj[0].eps_term:
                    return dm_env.termination(reward, self._get_observation())

        # set new agent position
        self.art[self._agent_location] = " "
        self.art[location] = "P"
        self._agent_location = location

        ## update environment, let it be ❤
        for obj in self.objects:
            for i, location in enumerate(self._object_locations[obj.symbol]):
                # object no longer at this location: it was consumed, maybe respawn it
                if self.art[location] != obj.symbol:
                    #  respawning probability
                    if self._rng.random() < obj.eps_respawn:
                        self._object_locations[obj.symbol][i] = self.spawn(
                            obj.symbol, location)

        # End the episode once the maximum number of steps is reached.
        if self._timestep == self.max_steps:
            return dm_env.termination(reward, self._get_observation())

        return dm_env.transition(reward, self._get_observation())
Example #12
 def _step(self, action: np.ndarray) -> dm_env.TimeStep:
     """Does one step within TCV."""
     voltages = self._noise.add_action_noise(action)
     voltage_simulator = self._simulator_voltages_from_voltages(voltages)
     try:
         state = self._simulator.step(voltage_simulator)
     except (fge_state.InvalidSolutionError, fge_state.StopSignalException):
         return dm_env.termination(self._reward.terminal_reward(),
                                   self._last_observation)
     references = self._reference_generator.step()
     self._last_observation = self._extract_observation(
         state, references, action)
     term = self._termination.terminate(state)
     if term:
         return dm_env.termination(self._reward.terminal_reward(),
                                   self._last_observation)
     reward, _ = self._reward.reward(voltages, state, references)
     self._step_counter += 1
     if self._step_counter >= self._max_episode_length:
         return dm_env.truncation(reward, self._last_observation)
     return dm_env.transition(reward, self._last_observation)
Example #13
    def step(self, action: int) -> dm_env.TimeStep:
        if self._reset_next_step:
            return self.reset()

        # Convert the gym step result to a dm_env TimeStep.
        obs, reward, done, _ = self.gym_env.step(action)

        if done:
            self._reset_next_step = True
            return dm_env.termination(reward, obs)
        else:
            return dm_env.transition(reward, obs)
Example #14
  def step(self, action: np.ndarray) -> dm_env.TimeStep:
    # Reset if previous timestep was LAST.
    if self._reset_next_step:
      return self.reset()

    # Take an environment step.
    observation, reward, done = self._environment.step(action)
    self._reset_next_step = done

    if done:
      return dm_env.termination(reward=reward, observation=observation)  # After this, it's always LAST
    return dm_env.transition(reward=reward, observation=observation)
Example #15
    def step(self, action: types.NestedArray) -> dm_env.TimeStep:
        if self._reset_next_step:
            return self.reset()

        observation, reward, done, info = self._environment.step(action)
        self._reset_next_step = all(done.values()) if isinstance(done, dict) else bool(done)

        if self._reset_next_step:
            truncated = info.get('TimeLimit.truncated', False)
            if truncated:
                return dm_env.truncation(reward, observation)
            return dm_env.termination(reward, observation)
        return dm_env.transition(reward, observation)
Example #16
    def step(self, action: List[np.ndarray]) -> dm_env.TimeStep:
        """Steps the environment."""
        if self._reset_next_step:
            return self.reset()

        observation, reward, done, _ = self._environment.step(action[0].item())
        self._reset_next_step = done

        observation = self._wrap_observation(observation)

        if done:
            return dm_env.termination(reward, observation)
        return dm_env.transition(reward, observation)
Example #17
    def step(self, action):
        """Performs an environment step."""
        # If the environment has just been created or finished an episode
        # we should reset it (ignoring the action).
        if self._prev_step_type in {None, environment.StepType.LAST}:
            return self.reset()

        for k in action.keys():
            self._action_spec[k].validate(action[k])

        locations, flag, pressure, log_size, red, green, blue = (
            self._process_action(action))
        loc_control, loc_end = locations

        # Perform action.
        self._surface.BeginAtomic()

        if flag == 1:  # The agent produces a visible stroke.
            self._action_mask = self._action_masks["paint"]
            y_c, x_c = loc_control
            y_e, x_e = loc_end
            self._bezier_to(y_c, x_c, y_e, x_e, pressure, log_size, red, green,
                            blue)

            # Update episode statistics.
            self.stats["total_strokes"] += 1
            if not self._prev_brush_params["is_painting"]:
                self.stats["total_disjoint"] += 1
        elif flag == 0:  # The agent moves to a new location.
            self._action_mask = self._action_masks["move"]
            y_e, x_e = loc_end
            self._move_to(y_e, x_e)
        else:
            raise ValueError("Invalid flag value")

        self._surface.EndAtomic()

        # Handle termination of the episode.
        reward = 0
        self._episode_step += 1
        if self._episode_step == self._episode_length:
            time_step = environment.termination(reward=reward,
                                                observation=self.observation())
        else:
            time_step = environment.transition(reward=reward,
                                               observation=self.observation(),
                                               discount=self._discount)

        self._prev_step_type = time_step.step_type

        return time_step
Example #18
    def _step(self, action: int) -> dm_env.TimeStep:
        if self._timestep == 0:
            self._context = action

        self._timestep += 1
        if self._timestep == self._reward_timestep[self._context]:
            reward = self._rewards[self._context]
        else:
            reward = 0.

        observation = self._get_observation()
        if self._timestep == self._episode_len:
            return dm_env.termination(reward=reward, observation=observation)
        return dm_env.transition(reward=reward, observation=observation)
Example #19
    def reset(self) -> dm_env.TimeStep:
        """
        Reset the environment and start a new episode.
        """
        observation = self._sim.reset(self._sim_input, False)

        # Necessary when willingness is set to 0
        # and Rainbow agents are disabled
        if self.sim_finished():
            self._start_of_episode = True
            return dm_env.termination(None, observation)

        self._start_of_episode = False
        return dm_env.restart(observation)
Example #20
    def step(self, action: np.ndarray):
        """Updates the environment according to the action."""
        if self._reset_next_step:
            return self.reset()

        self.defended = np.logical_or(self.defended, action)

        self.burn_vertices()

        if self._reset_next_step:
            return dm_env.termination(reward=0.0,
                                      observation=self._observation())

        return dm_env.transition(reward=-1.0, observation=self._observation())
Example #21
    def step(self, action: types.NestedArray) -> dm_env.TimeStep:
        """Steps the environment."""
        if self._reset_next_step:
            return self.reset()

        observation, reward, done, info = self._environment.step(action)
        self._reset_next_step = done

        if done:
            truncated = info.get('TimeLimit.truncated', False)
            if truncated:
                return dm_env.truncation(reward, observation)
            return dm_env.termination(reward, observation)
        return dm_env.transition(reward, observation)
Example #22
  def step(self, action):
    """Step the environment with an action."""
    if self._reset_next_step:
      return self.reset()

    self._read_action(self._action_spec, action)
    self._env.act_discrete(self._act_discrete)
    self._env.act_continuous(self._act_continuous)
    self._env.act_text(self._act_text)
    self._status, reward = self._env.advance()
    if self._status != dmlab2d.RUNNING:
      self._reset_next_step = True
      return dm_env.termination(reward=reward, observation=self.observation())
    else:
      return dm_env.transition(reward=reward, observation=self.observation())
Example #23
    def step(self, action):
        if self._reset_next_step:
            return self.reset()

        observation, reward, terminal, _ = self._env.step(action.item())
        observation = observation.squeeze(-1)
        discount = 1 - float(terminal)
        self._episode_steps += 1
        if terminal:
            self._reset_next_step = True
            return dm_env.termination(reward, observation)
        elif self._episode_steps == self._max_episode_steps:
            self._reset_next_step = True
            return dm_env.truncation(reward, observation, discount)
        else:
            return dm_env.transition(reward, observation, discount)
Example #24
    def step(self, action):
        if self._reset_next_step:
            return self.reset()

        observation, reward, done, info = self._environment.step(action)
        for k in info:
            assert k in self._info_defaults.keys() | {'TimeLimit.truncated'}
        observation = self._wrap_observation(observation, info)
        self._reset_next_step = done

        if done:
            truncated = info.get('TimeLimit.truncated', False)
            if truncated:
                return dm_env.truncation(reward, observation)
            return dm_env.termination(reward, observation)
        return dm_env.transition(reward, observation)
Example #25
def make_trajectory(observations):
  """Make a simple trajectory from a sequence of observations.

  Arguments:
    observations: a sequence of observations.

  Returns:
    a tuple (first, steps) where first contains the initial dm_env.TimeStep
    object and steps contains a list of (action, step) tuples. The length of
    steps is given by episode_length.
  """
  first = dm_env.restart(observations[0])
  middle = [(0, dm_env.transition(reward=0.0, observation=observation))
            for observation in observations[1:-1]]
  last = (0, dm_env.termination(reward=0.0, observation=observations[-1]))
  return first, middle + [last]
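A brief usage sketch for make_trajectory, with a made-up list of scalar observations:

# Hypothetical usage: build a trajectory from three observations.
first, steps = make_trajectory([0.0, 1.0, 2.0])
# first is the initial (FIRST) TimeStep; steps holds one MID and one LAST step,
# each paired with a dummy action of 0.
assert first.first() and steps[-1][1].last()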
Example #26
    def _step(self, action):
        observation = self._get_observation()
        self._timestep += 1

        # on all but the last step provide a reward of 0.
        if self._timestep - 1 < self._memory_length:
            return dm_env.transition(reward=0., observation=observation)

        elif self._timestep - 1 == self._memory_length:
            if action == self._context[self._query]:
                reward = 1.
                self._total_perfect += 1
            else:
                reward = -1.
                self._total_regret += 2.
            return dm_env.termination(reward=reward, observation=observation)
Example #27
    def step(self, action: int) -> dm_env.TimeStep:
        if self._reset_next_step:
            return self.reset()

        # Convert the gym step result to a dm_env TimeStep.
        observation, reward, done, info = self.gym_env.step(action)
        self._reset_next_step = done

        if done:
            is_truncated = info.get('TimeLimit.truncated', False)
            if is_truncated:
                return dm_env.truncation(reward, observation)
            else:
                return dm_env.termination(reward, observation)
        else:
            return dm_env.transition(reward, observation)
Example #28
    def _step(self, action: int) -> dm_env.TimeStep:
        observation = self._get_observation()
        self._timestep += 1

        if self._timestep - 1 < self._memory_length:
            # On all but the last step provide a reward of 0.
            return dm_env.transition(reward=0., observation=observation)
        if self._timestep - 1 > self._memory_length:
            raise RuntimeError('Invalid state.')  # We shouldn't get here.

        if action == self._context[self._query]:
            reward = 1.
            self._total_perfect += 1
        else:
            reward = -1.
            self._total_regret += 2.
        return dm_env.termination(reward=reward, observation=observation)
Example #29
    def step(self):
        """Step the environment, returning an observation."""
        if self._reset_next_step:
            return self.reset()

        self._step_count += 1

        for _ in range(self._physics_steps_per_env_step):
            self.physics_step()

        observation = self.observation()

        if self.should_terminate():
            self._reset_next_step = True
            return dm_env.termination(reward=0, observation=observation)
        else:
            return dm_env.transition(reward=0, observation=observation)
Example #30
    def step(self, action):
        if self._needs_reset:
            return self.reset()

        lab_action = np.empty(self._action_count, dtype=np.dtype("int32"))
        for name, value in six.iteritems(action):
            lab_action[self._action_map[name]] = value

        reward = self._lab.step(lab_action)

        if self._lab.is_running():
            return dm_env.transition(reward=reward,
                                     observation=self._observation())
        else:
            self._needs_reset = True
            return dm_env.termination(reward=reward,
                                      observation=self._observation())