def step(self, action):
  if self._reset_next_step:
    return self.reset()
  observation, reward, done, info = self._environment.step(action)
  self._goals_achieved.append(info['goal_achieved'])
  success = self._environment.evaluate_success(
      [{'env_infos': {'goal_achieved': self._goals_achieved}}])
  info['success'] = bool(success)
  if self._end_on_success:
    done = done or success
  for k in info:
    assert k in self._info_defaults.keys() | {'TimeLimit.truncated'}
  observation = self._wrap_observation(observation, info)
  self._reset_next_step = done
  if done:
    truncated = info.get('TimeLimit.truncated', False)
    if truncated:
      return dm_env.truncation(reward, observation)
    return dm_env.termination(reward, observation)
  return dm_env.transition(reward, observation)
def step(self, action):
  if self._reset_next_step:
    return self.reset()
  self._state = step_cartpole(
      action=action,
      timescale=self._timescale,
      state=self._state,
      config=self._cartpole_config,
  )

  # Rewards only when the pole is central and balanced
  is_reward = (np.cos(self._state.theta) > self._height_threshold
               and np.abs(self._state.x) < self._x_threshold)
  reward = 1. if is_reward else 0.
  self._raw_return += reward
  self._episode_return += reward
  self._best_episode = max(self._episode_return, self._best_episode)

  if self._state.time_elapsed > self._max_time or not is_reward:
    self._reset_next_step = True
    return dm_env.termination(reward=reward, observation=self.observation)
  else:  # continuing transition.
    return dm_env.transition(reward=reward, observation=self.observation)
def test_transition(self, observation, reward, discount):
  time_step = dm_env.transition(
      reward=reward, observation=observation, discount=discount)
  self.assertIs(dm_env.StepType.MID, time_step.step_type)
  self.assertEqual(observation, time_step.observation)
  self.assertEqual(reward, time_step.reward)
  self.assertEqual(discount, time_step.discount)
def step(self, action: int) -> dm_env.TimeStep:
  dm_env_step = self.dm_env.step(action)
  # Hack: treat the reward as 0 when dm_env_step.reward is None, which
  # happens in the case of restart().
  self._raw_return += 0. if dm_env_step.reward is None else dm_env_step.reward
  self._episode_return += 0. if dm_env_step.reward is None else dm_env_step.reward
  if self.gym_env.total_transitions_episode > self.max_episode_len:
    self._best_episode = max(self._episode_return, self._best_episode)
    dm_env_step = dm_env.truncation(dm_env_step.reward, dm_env_step.observation)

  # Hack / TODO: the bsuite/baselines/tf/dqn agent doesn't allow discrete
  # states, so one-hot encode the observation.
  ohe_obs = np.zeros(shape=(self.gym_env.observation_space.n,), dtype=np.float32)
  ohe_obs[dm_env_step.observation] = 1
  # dm_env_step.observation = ohe_obs

  # Return the corresponding TimeStep object based on step_type.
  if dm_env_step.step_type == StepType.FIRST:
    return dm_env.restart(ohe_obs)
  elif dm_env_step.step_type == StepType.LAST:
    return dm_env.termination(dm_env_step.reward, ohe_obs)
  else:
    return dm_env.transition(dm_env_step.reward, ohe_obs)
def _step(self, action):
  reward = 0.
  action_right = action == self._action_mapping[self._row, self._column]

  # Reward calculation
  if self._column == self._size - 1 and action_right:
    reward += 1.
    self._denoised_return += 1.
  if not self._deterministic:  # Noisy rewards on the 'end' of chain.
    if self._row == self._size - 1 and self._column in [0, self._size - 1]:
      reward += self._rng.randn()

  # Transition dynamics
  if action_right:
    if self._rng.rand() > 1 / self._size or self._deterministic:
      self._column = np.clip(self._column + 1, 0, self._size - 1)
    reward -= self._unscaled_move_cost / self._size
  else:
    if self._row == self._column:  # You were on the right path and went wrong
      self._bad_episode = True
    self._column = np.clip(self._column - 1, 0, self._size - 1)
  self._row += 1

  observation = self._get_observation()
  if self._row == self._size:
    if self._bad_episode:
      self._total_bad_episodes += 1
    return dm_env.termination(reward=reward, observation=observation)
  else:
    return dm_env.transition(reward=reward, observation=observation)
def step(self, action):
  if self._reset_next_step:
    return self.reset()
  self._state = cartpole.step_cartpole(
      action=action,
      timescale=self._timescale,
      state=self._state,
      config=self._cartpole_config,
  )

  # Rewards only when the pole is central and balanced
  is_upright = (np.cos(self._state.theta) > self._height_threshold
                and np.abs(self._state.theta_dot) < self._theta_dot_threshold
                and np.abs(self._state.x) < self._x_reward_threshold)
  reward = -1. * np.abs(action - 1) * self._move_cost
  self._steps_elapsed += 1
  if is_upright:
    reward += 1.
  self._raw_return += reward
  self._episode_return += reward
  self._best_episode = max(self._episode_return, self._best_episode)

  #is_end_of_episode = (self._state.time_elapsed > self._max_time
  is_end_of_episode = (self._steps_elapsed > self._max_steps
                       or np.abs(self._state.x) > self._x_threshold)
  if is_end_of_episode:
    self._reset_next_step = True
    return dm_env.termination(reward=reward, observation=self.observation)
  else:  # continuing transition.
    return dm_env.transition(reward=reward, observation=self.observation)
def _step(self, action: int) -> dm_env.TimeStep:
  self._timestep += 1

  ## update agent
  agent = self.locate("P")
  reward = 0.0
  vector = Actions(action).vector()
  location = (
      max(0, min(agent[0] + vector[0], self.shape[0])),
      max(0, min(agent[1] + vector[1], self.shape[1])),
  )
  # hit a wall, go back (diagonal moves are never done partially)
  if self.art[location] == "#":
    location = agent
  # stepped on object, compute reward
  if self.art[location] in [obj.symbol for obj in self.objects]:
    obj = [x for x in self.objects if x.symbol == self.art[location]]
    reward = obj[0].reward if len(obj) > 0 else 0.0
  # set new agent position
  self.art[agent] = " "
  self.art[location] = "P"

  ## update environment, let it be ❤
  for obj in self.objects:
    missing = obj.n - len(self.locate(obj.symbol))
    for _ in range(missing):
      # termination probability
      if self._rng.random() < obj.eps_term:
        return dm_env.termination(reward, self._get_observation())
      # respawning probability
      if self._rng.random() < obj.eps_respawn:
        self.spawn(obj.symbol)
  return dm_env.transition(reward, self._get_observation())
def step(self, action): """Updates the environment according to the action.""" if self._reset_next_step: return self.reset() # Insert token if column isn't full if column is full if self._col_heights[action] < N_HEIGHT: target_cell = action * N_HEIGHT + self._col_heights[action] target_player = 0 if self._player_one_turn else 1 self._board[target_player] |= 1 << target_cell self._col_heights[action] += 1 else: print("Illegal move!") self._player_one_turn = not self._player_one_turn # Check for termination. if self.is_terminal(): reward = 1.0 if self._winner == 0 else -1.0 if self._winner == 1 else 0.0 self._reset_next_step = True return dm_env.termination(reward=reward, observation=self._observation()) else: return dm_env.transition(reward=0.0, observation=self._observation())
def step(self, action): """Step the environment with an action.""" if self._reset_next_step: return self.reset() # Apply the game_rules for rule in self.game_rules: rule.step(self._state, self._meta_state) # Apply the action self.action_space.step(self._state, action) # Step the physics self.physics.step(self._state) # Compute reward self.step_count += 1 reward, should_reset = self.task.reward(self._state, self._meta_state, self.step_count) # Take observation observation = self.observation() # Return transition if should_reset: self._reset_next_step = True return dm_env.termination(reward=reward, observation=observation) else: return dm_env.transition(reward=reward, observation=observation)
def step(
    self, actions: Dict[str, Union[float, int, types.NestedArray]]
) -> dm_env.TimeStep:
  # Return a reset timestep if we haven't touched the environment yet.
  if not self._step:
    return self.reset()

  for agent, action in actions.items():
    _validate_spec(self._specs[agent].actions, action)

  observation = {
      agent: self._generate_fake_observation() for agent in self.agents
  }
  reward = {agent: self._generate_fake_reward() for agent in self.agents}
  discount = {agent: self._generate_fake_discount() for agent in self.agents}

  if self._episode_length and (self._step == self._episode_length):
    self._step = 0
    # We can't use dm_env.termination directly because then the discount
    # wouldn't necessarily conform to the spec (if e.g. we want float32).
    return dm_env.TimeStep(dm_env.StepType.LAST, reward, discount, observation)
  else:
    self._step += 1
    return dm_env.transition(
        reward=reward, observation=observation, discount=discount)
def step(self,
         action: Union[float, int, types.NestedArray]) -> dm_env.TimeStep:
  # Return a reset timestep if we haven't touched the environment yet.
  if not self._step:
    return self.reset()

  _validate_spec(self._spec.actions, action)

  observation = self._generate_fake_observation()
  reward = self._generate_fake_reward()
  discount = self._generate_fake_discount()
  self.agent_step_counter += 1

  if self._episode_length and (self._step == self._episode_length):
    # Only reset the step counter once all agents have taken their turn.
    if self.agent_step_counter == len(self.agents):
      self._step = 0
      self.agent_step_counter = 0
    # We can't use dm_env.termination directly because then the discount
    # wouldn't necessarily conform to the spec (if e.g. we want float32).
    return dm_env.TimeStep(dm_env.StepType.LAST, reward, discount, observation)
  else:
    # Only update the step counter once all agents have taken their turn.
    if self.agent_step_counter == len(self.agents):
      self._step += 1
      self.agent_step_counter = 0
    return dm_env.transition(
        reward=reward, observation=observation, discount=discount)
def make_timestep_from_step_type_string(step_type_str, observation):
  if step_type_str == 'f':
    return dm_env.restart(observation=observation)
  elif step_type_str == 'm':
    return dm_env.transition(reward=0, observation=observation)
  elif step_type_str == 'l':
    return dm_env.termination(reward=0, observation=observation)
  else:
    raise ValueError('Unknown step type string %s.' % step_type_str)
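
# A minimal usage sketch of the helper above. The 'fmml' sequence, the zero
# observations, and the _example_* function name are illustrative only; it
# assumes numpy is imported as np alongside dm_env.
def _example_episode_from_string():
  observations = [np.zeros(3) for _ in range(4)]
  episode = [
      make_timestep_from_step_type_string(c, obs)
      for c, obs in zip('fmml', observations)
  ]
  assert episode[0].first()
  assert all(ts.mid() for ts in episode[1:-1])
  assert episode[-1].last()
  return episode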
def step(self, action: int) -> dm_env.TimeStep:
  if self._reset_next_step:
    return self.reset()
  # Convert the gym step result to a dm_env TimeStep.
  obs, reward, done, _ = self.gym_env.step(action)
  self._reset_next_step = done
  if done:
    return dm_env.termination(reward, obs)
  else:
    return dm_env.transition(reward, obs)
def step(self, action: np.ndarray) -> dm_env.TimeStep:
  # Reset if previous timestep was LAST.
  if self._reset_next_step:
    return self.reset()

  # Take an environment step.
  observation, reward, done = self._environment.step(action)
  self._reset_next_step = done

  if done:
    # After this, it's always LAST.
    return dm_env.termination(reward=reward, observation=observation)
  return dm_env.transition(reward=reward, observation=observation)
def test_wrapper(self):
  """Tests that the wrapper computes and logs the correct data."""
  mock_logger = mock.MagicMock()
  mock_logger.write = mock.MagicMock()

  # Make a fake environment that cycles through these time steps.
  timesteps = [
      dm_env.restart([]),
      dm_env.transition(1, []),
      dm_env.transition(2, []),
      dm_env.termination(3, []),
  ]
  expected_episode_return = 6

  fake_env = FakeEnvironment(timesteps)
  env = wrappers.Logging(env=fake_env, logger=mock_logger, log_every=True)

  num_episodes = 5
  for _ in range(num_episodes):
    timestep = env.reset()
    while not timestep.last():
      timestep = env.step(action=0)

  # We count the number of transitions, hence the -1.
  expected_episode_length = len(timesteps) - 1

  expected_calls = []
  for i in range(1, num_episodes + 1):
    expected_calls.append(
        mock.call(
            dict(
                steps=expected_episode_length * i,
                episode=i,
                total_return=expected_episode_return * i,
                episode_len=expected_episode_length,
                episode_return=expected_episode_return,
            )))
  mock_logger.write.assert_has_calls(expected_calls)
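
# The FakeEnvironment used in the test above is not shown in this section.
# A minimal sketch of one possible implementation, assuming it only needs to
# cycle through the provided timesteps and ignore actions (the spec shapes
# below are placeholders, not taken from the original test module):
import dm_env
import numpy as np
from dm_env import specs


class FakeEnvironment(dm_env.Environment):
  """Cycles through a fixed list of timesteps, ignoring actions."""

  def __init__(self, timesteps):
    self._timesteps = list(timesteps)
    self._index = 0

  def reset(self):
    self._index = 0
    return self._timesteps[self._index]

  def step(self, action):
    del action  # This fake environment ignores actions.
    self._index = (self._index + 1) % len(self._timesteps)
    return self._timesteps[self._index]

  def observation_spec(self):
    return specs.Array(shape=(0,), dtype=np.float32)

  def action_spec(self):
    return specs.DiscreteArray(num_values=1)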
def step(self, action: List[np.ndarray]) -> dm_env.TimeStep:
  """Steps the environment."""
  if self._reset_next_step:
    return self.reset()

  observation, reward, done, _ = self._environment.step(action[0].item())
  self._reset_next_step = done
  observation = self._wrap_observation(observation)

  if done:
    return dm_env.termination(reward, observation)
  return dm_env.transition(reward, observation)
def step(self, action: types.NestedArray) -> dm_env.TimeStep:
  if self._reset_next_step:
    return self.reset()

  observation, reward, done, info = self._environment.step(action)
  self._reset_next_step = (
      all(done.values()) if isinstance(done, dict) else bool(done))

  if self._reset_next_step:
    truncated = info.get('TimeLimit.truncated', False)
    if truncated:
      return dm_env.truncation(reward, observation)
    return dm_env.termination(reward, observation)
  return dm_env.transition(reward, observation)
def step(self, action): """Performs an environment step.""" # If the environment has just been created or finished an episode # we should reset it (ignoring the action). if self._prev_step_type in {None, environment.StepType.LAST}: return self.reset() for k in action.keys(): self._action_spec[k].validate(action[k]) locations, flag, pressure, log_size, red, green, blue = ( self._process_action(action)) loc_control, loc_end = locations # Perform action. self._surface.BeginAtomic() if flag == 1: # The agent produces a visible stroke. self._action_mask = self._action_masks["paint"] y_c, x_c = loc_control y_e, x_e = loc_end self._bezier_to(y_c, x_c, y_e, x_e, pressure, log_size, red, green, blue) # Update episode statistics. self.stats["total_strokes"] += 1 if not self._prev_brush_params["is_painting"]: self.stats["total_disjoint"] += 1 elif flag == 0: # The agent moves to a new location. self._action_mask = self._action_masks["move"] y_e, x_e = loc_end self._move_to(y_e, x_e) else: raise ValueError("Invalid flag value") self._surface.EndAtomic() # Handle termination of the episode. reward = 0 self._episode_step += 1 if self._episode_step == self._episode_length: time_step = environment.termination(reward=reward, observation=self.observation()) else: time_step = environment.transition(reward=reward, observation=self.observation(), discount=self._discount) self._prev_step_type = time_step.step_type return time_step
def step(self, action: types.NestedArray) -> dm_env.TimeStep:
  """Steps the environment."""
  if self._reset_next_step:
    return self.reset()

  observation, reward, done, info = self._environment.step(action)
  self._reset_next_step = done

  if done:
    truncated = info.get('TimeLimit.truncated', False)
    if truncated:
      return dm_env.truncation(reward, observation)
    return dm_env.termination(reward, observation)
  return dm_env.transition(reward, observation)
def test_buffer(self):
  # Given a buffer and some dummy data...
  max_sequence_length = 10
  obs_shape = (3, 3)
  buffer = sequence.Buffer(
      obs_spec=specs.Array(obs_shape, dtype=float),
      action_spec=specs.Array((), dtype=int),
      max_sequence_length=max_sequence_length)
  dummy_step = dm_env.transition(observation=np.zeros(obs_shape), reward=0.)

  # If we add `max_sequence_length` items to the buffer...
  for _ in range(max_sequence_length):
    buffer.append(dummy_step, 0, dummy_step)

  # Then the buffer should now be full.
  self.assertTrue(buffer.full())

  # Any further appends should throw an error.
  with self.assertRaises(ValueError):
    buffer.append(dummy_step, 0, dummy_step)

  # If we now drain this trajectory from the buffer...
  trajectory = buffer.drain()

  # The `observations` sequence should have length `T + 1`.
  self.assertLen(trajectory.observations, max_sequence_length + 1)

  # All other sequences should have length `T`.
  self.assertLen(trajectory.actions, max_sequence_length)
  self.assertLen(trajectory.rewards, max_sequence_length)
  self.assertLen(trajectory.discounts, max_sequence_length)

  # The buffer should now be empty.
  self.assertTrue(buffer.empty())

  # A second call to drain() should throw an error, since the buffer is empty.
  with self.assertRaises(ValueError):
    buffer.drain()

  # If we now append another transition...
  buffer.append(dummy_step, 0, dummy_step)

  # And immediately drain the buffer...
  trajectory = buffer.drain()

  # We should have a valid partial trajectory of length T=1.
  self.assertLen(trajectory.observations, 2)
  self.assertLen(trajectory.actions, 1)
  self.assertLen(trajectory.rewards, 1)
  self.assertLen(trajectory.discounts, 1)
def _step(self, action: int) -> dm_env.TimeStep:
  if self._timestep == 0:
    self._context = action
  self._timestep += 1

  if self._timestep == self._reward_timestep[self._context]:
    reward = self._rewards[self._context]
  else:
    reward = 0.

  observation = self._get_observation()
  if self._timestep == self._episode_len:
    return dm_env.termination(reward=reward, observation=observation)
  return dm_env.transition(reward=reward, observation=observation)
def step(self, action: np.ndarray):
  """Updates the environment according to the action."""
  if self._reset_next_step:
    return self.reset()
  self.defended = np.logical_or(self.defended, action)
  self.burn_vertices()
  if self._reset_next_step:
    return dm_env.termination(reward=0.0, observation=self._observation())
  return dm_env.transition(reward=-1.0, observation=self._observation())
def fake_demonstration_iterator():
  k = 0
  while True:
    action = np.random.uniform(low=0., high=1., size=3).astype(np.float32)
    obs = np.random.uniform(low=0., high=1., size=5).astype(np.float32)
    reward = np.float32(0.)
    discount = np.float32(0.)
    if k % 10 == 0:
      ts = dm_env.restart(obs)
    elif k % 10 == 9:
      ts = dm_env.TimeStep(dm_env.StepType.LAST, reward, discount, obs)
    else:
      ts = dm_env.transition(reward=reward, observation=obs, discount=discount)
    k += 1
    yield action, ts
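
# A small sketch of how the iterator above might be consumed. itertools.islice,
# the count of 12, and the _example_* function name are illustrative only.
import itertools


def _example_consume_demonstrations():
  for action, ts in itertools.islice(fake_demonstration_iterator(), 12):
    assert action.shape == (3,) and ts.observation.shape == (5,)
    if ts.first():
      assert ts.reward is None  # dm_env.restart() carries no reward.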
def step(self, action): """Step the environment with an action.""" if self._reset_next_step: return self.reset() self._read_action(self._action_spec, action) self._env.act_discrete(self._act_discrete) self._env.act_continuous(self._act_continuous) self._env.act_text(self._act_text) self._status, reward = self._env.advance() if self._status != dmlab2d.RUNNING: self._reset_next_step = True return dm_env.termination(reward=reward, observation=self.observation()) else: return dm_env.transition(reward=reward, observation=self.observation())
def _step(self, action):
  observation = self._get_observation()
  self._timestep += 1

  # on all but the last step provide a reward of 0.
  if self._timestep - 1 < self._memory_length:
    return dm_env.transition(reward=0., observation=observation)
  elif self._timestep - 1 == self._memory_length:
    if action == self._context[self._query]:
      reward = 1.
      self._total_perfect += 1
    else:
      reward = -1.
      self._total_regret += 2.
    return dm_env.termination(reward=reward, observation=observation)
def step(self, action: int) -> dm_env.TimeStep:
  if self._reset_next_step:
    return self.reset()

  # Convert the gym step result to a dm_env TimeStep.
  observation, reward, done, info = self.gym_env.step(action)
  self._reset_next_step = done

  if done:
    is_truncated = info.get('TimeLimit.truncated', False)
    if is_truncated:
      return dm_env.truncation(reward, observation)
    else:
      return dm_env.termination(reward, observation)
  else:
    return dm_env.transition(reward, observation)
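
# For reference, the truncation/termination branch above matters because
# dm_env.termination() sets the discount to 0.0 while dm_env.truncation()
# leaves it at 1.0 by default, so value bootstrapping remains valid after a
# pure time-limit cut-off. A quick check (the observation value is arbitrary):
def _example_termination_vs_truncation():
  term = dm_env.termination(reward=1.0, observation=0)
  trunc = dm_env.truncation(reward=1.0, observation=0)
  assert term.last() and trunc.last()
  assert term.discount == 0.0   # True terminal state: no bootstrapping.
  assert trunc.discount == 1.0  # Time-limit truncation: bootstrapping allowed.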
def step(self, action):
  if self._reset_next_step:
    return self.reset()

  observation, reward, terminal, _ = self._env.step(action.item())
  observation = observation.squeeze(-1)
  discount = 1 - float(terminal)
  self._episode_steps += 1

  if terminal:
    self._reset_next_step = True
    return dm_env.termination(reward, observation)
  elif self._episode_steps == self._max_episode_steps:
    self._reset_next_step = True
    return dm_env.truncation(reward, observation, discount)
  else:
    return dm_env.transition(reward, observation, discount)
def make_trajectory(observations):
  """Make a simple trajectory from a sequence of observations.

  Arguments:
    observations: a sequence of observations.

  Returns:
    a tuple (first, steps) where first contains the initial dm_env.TimeStep
    object and steps contains a list of (action, step) tuples. The length of
    steps is len(observations) - 1.
  """
  first = dm_env.restart(observations[0])
  middle = [(0, dm_env.transition(reward=0.0, observation=observation))
            for observation in observations[1:-1]]
  last = (0, dm_env.termination(reward=0.0, observation=observations[-1]))
  return first, middle + [last]
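
# A minimal usage sketch of make_trajectory with four arbitrary scalar
# observations (the values and the _example_* name are illustrative only).
def _example_make_trajectory():
  first, steps = make_trajectory([10, 11, 12, 13])
  assert first.first()
  assert len(steps) == 3      # One (action, step) pair per transition.
  assert steps[-1][1].last()  # The final pair terminates the episode.
  return first, steps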
def step(self, action):
  if self._reset_next_step:
    return self.reset()

  observation, reward, done, info = self._environment.step(action)
  for k in info:
    assert k in self._info_defaults.keys() | {'TimeLimit.truncated'}
  observation = self._wrap_observation(observation, info)
  self._reset_next_step = done

  if done:
    truncated = info.get('TimeLimit.truncated', False)
    if truncated:
      return dm_env.truncation(reward, observation)
    return dm_env.termination(reward, observation)
  return dm_env.transition(reward, observation)
def _step(self, action: int) -> dm_env.TimeStep:
  observation = self._get_observation()
  self._timestep += 1

  if self._timestep - 1 < self._memory_length:
    # On all but the last step provide a reward of 0.
    return dm_env.transition(reward=0., observation=observation)
  if self._timestep - 1 > self._memory_length:
    raise RuntimeError('Invalid state.')  # We shouldn't get here.

  if action == self._context[self._query]:
    reward = 1.
    self._total_perfect += 1
  else:
    reward = -1.
    self._total_regret += 2.
  return dm_env.termination(reward=reward, observation=observation)