def step(self, action):
  """Updates the environment using the action and returns a `TimeStep`."""
  if self._reset_next_step:
    return self.reset()

  self._task.before_step(action, self._physics)
  for _ in range(self._n_sub_steps):
    self._physics.step()
  self._task.after_step(self._physics)

  reward = self._task.get_reward(self._physics)
  observation = self._task.get_observation(self._physics)
  if self._flat_observation:
    observation = flatten_observation(observation)

  self._step_count += 1
  if self._step_count >= self._step_limit:
    discount = 1.0
  else:
    discount = self._task.get_termination(self._physics)

  episode_over = discount is not None
  if episode_over:
    self._reset_next_step = True
    return dm_env.TimeStep(
        dm_env.StepType.LAST, reward, discount, observation)
  else:
    return dm_env.TimeStep(dm_env.StepType.MID, reward, 1.0, observation)
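# Illustrative sketch (not from the source): the canonical way a caller
# consumes the `TimeStep`s produced by a `step` method like the one above.
# `env` is assumed to be any dm_env.Environment and `policy` any callable
# mapping an observation to an action.
def run_episode(env, policy):
  """Runs one episode and returns the undiscounted return."""
  timestep = env.reset()
  episode_return = 0.0
  while not timestep.last():
    action = policy(timestep.observation)
    timestep = env.step(action)
    episode_return += timestep.reward
  return episode_return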
def initial_step(self, action):
  if self._reset_next_step:
    return self.reset()

  for _ in range(self._n_sub_steps * self._n_frame_skip):
    self._physics.step()
  self._task.after_step(self._physics)

  reward = self._task.get_reward(self._physics)
  observation = self._task.get_observation(self._physics)
  if self._flat_observation:
    observation = flatten_observation(observation)

  if self._step_count >= self._step_limit:
    discount = 1.0
  else:
    discount = self._task.get_termination(self._physics)

  episode_over = discount is not None
  if episode_over:
    self._reset_next_step = True
    # Note: this passes a fifth positional argument; the standard
    # dm_env.TimeStep namedtuple has only four fields, so this assumes a
    # custom/extended TimeStep type.
    return dm_env.TimeStep(
        dm_env.StepType.LAST, reward, discount, observation, dict())
  else:
    return dm_env.TimeStep(
        dm_env.StepType.MID, reward, 1.0, observation, dict())
def step(self, action):
  """Updates the environment using the action and returns a `TimeStep`."""
  if self._reset_next_step:
    return self.reset()

  self._task.before_step(action, self._physics)
  for _ in range(self._n_sub_steps):
    self._physics.step()
  self._task.after_step(self._physics)

  reward = self._task.get_reward(self._physics)
  observation = self._task.get_observation(self._physics)
  if self._flat_observation:
    observation = flatten_observation(observation)

  # Added Code for Elias Viewing: overwrite the joint positions from the k-th
  # entry of `self.data`.
  k = self._step_count
  self._physics.named.data.qpos['slider_1'] = self.data[k][0][0]
  self._physics.named.data.qpos['hinge_1_1'] = self.data[k][0][1]
  self._physics.named.data.qpos['slider_2'] = self.data[k][1][0]
  self._physics.named.data.qpos['hinge_1_2'] = self.data[k][1][1]
  self._physics.named.data.qpos['slider_3'] = self.data[k][2][0]
  self._physics.named.data.qpos['hinge_1_3'] = self.data[k][2][1]
  self._physics.named.data.qpos['slider_4'] = self.data[k][3][0]
  self._physics.named.data.qpos['hinge_1_4'] = self.data[k][3][1]
  self._physics.named.data.qpos['slider_5'] = self.data[k][4][0]
  self._physics.named.data.qpos['hinge_1_5'] = self.data[k][4][1]
  self._physics.named.data.qpos['slider_6'] = self.data[k][5][0]
  self._physics.named.data.qpos['hinge_1_6'] = self.data[k][5][1]
  self._physics.named.data.qpos['slider_7'] = self.data[k][6][0]
  self._physics.named.data.qpos['hinge_1_7'] = self.data[k][6][1]
  # Note: slider_8/hinge_1_8 reuse index 6 in the original; index 7 may be
  # intended if `self.data` holds eight pendulum entries.
  self._physics.named.data.qpos['slider_8'] = self.data[k][6][0]
  self._physics.named.data.qpos['hinge_1_8'] = self.data[k][6][1]

  self._step_count += 1
  if self._step_count >= self._step_limit:
    discount = 1.0
  else:
    discount = self._task.get_termination(self._physics)

  episode_over = discount is not None
  if episode_over:
    self._reset_next_step = True
    return dm_env.TimeStep(
        dm_env.StepType.LAST, reward, discount, observation)
  else:
    return dm_env.TimeStep(dm_env.StepType.MID, reward, 1.0, observation)
def _prefill_with_demonstrations(adder: adders.Adder,
                                 demonstrations: Sequence[types.Transition],
                                 reward: Optional[float],
                                 min_num_transitions: int = 0) -> None:
  """Fill the adder's replay buffer with expert transitions.

  Assumes that the demonstrations dataset stores transitions in order.

  Args:
    adder: the agent which adds the demonstrations.
    demonstrations: the expert demonstrations to iterate over.
    reward: if non-None, populates the environment reward entry of transitions.
    min_num_transitions: the lower bound on transitions processed, the dataset
      will be iterated over multiple times if needed. Once at least
      min_num_transitions are added, the processing is interrupted at the
      nearest episode end.
  """
  if not demonstrations:
    return

  reward = np.float32(reward) if reward is not None else reward
  remaining_transitions = min_num_transitions
  step_type = None
  action = None
  ts = dm_env.TimeStep(None, None, None, None)  # Unused.
  while remaining_transitions > 0:
    # In case we share the adder or demonstrations don't end with
    # end-of-episode, reset the adder prior to add_first.
    adder.reset()
    for transition_num, transition in enumerate(demonstrations):
      remaining_transitions -= 1
      discount = np.float32(1.0)
      ts_reward = reward if reward is not None else transition.reward
      if step_type == dm_env.StepType.LAST or transition_num == 0:
        ts = dm_env.TimeStep(dm_env.StepType.FIRST, ts_reward, discount,
                             transition.observation)
        adder.add_first(ts)

      observation = transition.next_observation
      action = transition.action
      if transition.discount == 0. or transition_num == len(
          demonstrations) - 1:
        step_type = dm_env.StepType.LAST
        discount = np.float32(0.0)
      else:
        step_type = dm_env.StepType.MID
      ts = dm_env.TimeStep(step_type, ts_reward, discount, observation)
      adder.add(action, ts)

      if remaining_transitions <= 0:
        # Note: we could check `step_type == dm_env.StepType.LAST` to stop at
        # an episode end if possible.
        break

  # Explicitly finalize the Reverb client writes.
  adder.reset()
def step(self, action):
  """Updates the environment using the action and returns a `TimeStep`."""
  if self._reset_next_step:
    self._reset_next_step = False
    return self.reset()

  self._hooks.before_step(self._physics_proxy, action, self._random_state)
  self._observation_updater.prepare_for_next_control_step()

  try:
    for i in range(self._n_sub_steps):
      self._hooks.before_substep(self._physics_proxy, action,
                                 self._random_state)
      self._physics.step()
      self._hooks.after_substep(self._physics_proxy, self._random_state)
      # The final observation update must happen after all the hooks in
      # `self._hooks.after_step` are called. Otherwise, if any of these hooks
      # modify the physics state then we might capture an observation that is
      # inconsistent with the final physics state.
      if i < self._n_sub_steps - 1:
        self._observation_updater.update()
    physics_is_divergent = False
  except control.PhysicsError as e:
    if not self._raise_exception_on_physics_error:
      logging.warning(e)
      physics_is_divergent = True
    else:
      raise

  self._hooks.after_step(self._physics_proxy, self._random_state)
  self._observation_updater.update()

  if not physics_is_divergent:
    reward = self._task.get_reward(self._physics_proxy)
    discount = self._task.get_discount(self._physics_proxy)
    terminating = (
        self._task.should_terminate_episode(self._physics_proxy)
        or self._physics.time() >= self._time_limit)
  else:
    reward = 0.0
    discount = 0.0
    terminating = True

  obs = self._observation_updater.get_observation()

  if not terminating:
    return dm_env.TimeStep(dm_env.StepType.MID, reward, discount, obs)
  else:
    self._reset_next_step = True
    return dm_env.TimeStep(dm_env.StepType.LAST, reward, discount, obs)
def test_dqn():
    n_actions = 10
    in_shape = (4, 84, 84)
    hparams = HParams()
    agent = DQN(n_actions, in_shape, hparams)

    r = 0
    x = jax.random.normal(agent.rng, (84, 84, 3))
    timestep = dm_env.TimeStep(
        dm_env.StepType.FIRST, r, agent.hparams.discount, x)
    action = agent.select_action(timestep)

    y = jax.random.normal(agent.rng, (84, 84, 3))
    new_timestep = dm_env.TimeStep(
        dm_env.StepType.MID, 1.0, agent.hparams.discount, y)
    agent.update(timestep, action, new_timestep)
    return agent
def step(self, action: np.int32) -> dm_env.TimeStep:
    """Updates the environment given an action and returns a timestep."""
    # If the previous timestep was LAST then we call reset() on the Gym
    # environment, otherwise step(). Although Gym environments allow you to
    # step through episode boundaries (similar to dm_env) they emit a warning.
    if self._start_of_episode:
        step_type = dm_env.StepType.FIRST
        observation = self._key_door_env.reset_environment()
        discount = None
        reward = None
        done = False
    else:
        reward, observation = self._key_door_env.step(action)
        done = not self._key_door_env.active
        info = ""
        if done:
            assert "TimeLimit.truncated" not in info, "Should never truncate."
            step_type = dm_env.StepType.LAST
            discount = 0.0
        else:
            step_type = dm_env.StepType.MID
            discount = 1.0

    lives = np.int32(1)

    timestep = dm_env.TimeStep(
        step_type=step_type,
        observation=(observation, lives),
        reward=reward,
        discount=discount,
    )
    self._start_of_episode = done
    return timestep
def step(self, action):
  """Apply action, step the world forward, and return observations."""
  # If needed, reset and start new episode.
  if self._state == dm_env.StepType.LAST:
    self._clear_state()
  if self._current_game is None:
    return self.reset()

  # Execute the action in pycolab.
  observation, reward, discount = self._current_game.play(action)

  self._game_over = self._is_game_over()
  reward = reward if reward is not None else 0.
  observation = self._render_observation(observation)

  # Check the current status of the game.
  if self._game_over:
    self._state = dm_env.StepType.LAST
  else:
    self._state = dm_env.StepType.MID

  return dm_env.TimeStep(
      step_type=self._state,
      reward=reward,
      discount=discount,
      observation=observation)
def step(self, action: t.Tuple[np.int8, np.int8]) -> dm_env.TimeStep:
    """
    Update the environment given an action.

    Returns
    -------
    dm_env.TimeStep
        Timestep resulting from the action.
    """
    # If the previous timestep was LAST then call reset() on the Simulator
    # environment to launch a new episode, otherwise step().
    if self._start_of_episode:
        observation = self._sim.reset(self._sim_input, False)
        discount = None
        finished = False
        reward = None
        step_type = dm_env.StepType.FIRST
    else:
        observation, reward, finished, _ = self._sim.step(action)
        step_type = dm_env.StepType.LAST if finished else dm_env.StepType.MID
        # TODO: Choose discount value
        discount = tuple([float(step_type == dm_env.StepType.MID)] * 2)

    self._start_of_episode = finished

    return dm_env.TimeStep(
        discount=discount,
        observation=observation,
        reward=reward,
        step_type=step_type,
    )
def convert_seq_timestep_and_actions_to_parallel(
    timesteps: Dict[str, SeqTimestepDict], possible_agents: list
) -> Tuple[dict, dm_env.TimeStep]:
    step_types = [
        timesteps[agent]["timestep"].step_type for agent in possible_agents
    ]
    assert all(
        x == step_types[0] for x in step_types
    ), f"Step types should be identical - {step_types}"

    parallel_timestep = dm_env.TimeStep(
        observation={
            agent: timesteps[agent]["timestep"].observation
            for agent in possible_agents
        },
        reward={
            agent: timesteps[agent]["timestep"].reward
            for agent in possible_agents
        },
        discount={
            agent: timesteps[agent]["timestep"].discount
            for agent in possible_agents
        },
        step_type=step_types[0],
    )

    parallel_actions = {
        agent: timesteps[agent]["action"] for agent in possible_agents
    }

    return parallel_actions, parallel_timestep
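# Hypothetical usage sketch for the converter above (not from the source);
# the agent ids, observations and dm_env.restart-produced timesteps are
# made up for illustration.
import dm_env
import numpy as np

seq_timesteps = {
    "agent_0": {"timestep": dm_env.restart(np.zeros(3)), "action": 0},
    "agent_1": {"timestep": dm_env.restart(np.ones(3)), "action": 1},
}
parallel_actions, parallel_timestep = convert_seq_timestep_and_actions_to_parallel(
    seq_timesteps, possible_agents=["agent_0", "agent_1"]
)
# parallel_timestep.step_type is StepType.FIRST; observation, reward and
# discount are per-agent dicts keyed by "agent_0" and "agent_1".
assert parallel_actions == {"agent_0": 0, "agent_1": 1}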
def step(self, actions):
  """Implements dm_env.Environment.step."""
  step_response = self._connection.send(
      dm_env_rpc_pb2.StepRequest(
          requested_observations=self._requested_observation_uids,
          actions=self._action_specs.pack(actions)))
  observations = self._observation_specs.unpack(step_response.observations)

  if (step_response.state == dm_env_rpc_pb2.EnvironmentStateType.RUNNING and
      self._last_state == dm_env_rpc_pb2.EnvironmentStateType.RUNNING):
    step_type = dm_env.StepType.MID
  elif step_response.state == dm_env_rpc_pb2.EnvironmentStateType.RUNNING:
    step_type = dm_env.StepType.FIRST
  elif self._last_state == dm_env_rpc_pb2.EnvironmentStateType.RUNNING:
    step_type = dm_env.StepType.LAST
  else:
    raise RuntimeError('Environment transitioned from {} to {}'.format(
        self._last_state, step_response.state))

  self._last_state = step_response.state

  reward = self.reward(
      state=step_response.state,
      step_type=step_type,
      observations=observations)
  discount = self.discount(
      state=step_response.state,
      step_type=step_type,
      observations=observations)

  if not self._is_reward_requested:
    observations.pop(DEFAULT_REWARD_KEY, None)
  if not self._is_discount_requested:
    observations.pop(DEFAULT_DISCOUNT_KEY, None)
  return dm_env.TimeStep(step_type, reward, discount, observations)
def step(
    self, nn_actions: types.NestedArray
) -> Tuple[dm_env.TimeStep, np.array]:
    """Steps the environment."""
    if self._reset_next_step:
        return self.reset()

    actions = self._proc_robocup_actions(nn_actions)
    raw_obs, rewards, state, done = self._environment.step(actions)
    self._reset_next_step = done

    proc_obs = self._proc_robocup_obs(
        observations=raw_obs, done=done, nn_actions=nn_actions
    )
    processed_state = self._proc_robocup_state(state, proc_obs)

    if done:
        self._step_type = dm_env.StepType.LAST
    else:
        self._step_type = dm_env.StepType.MID

    return (
        dm_env.TimeStep(
            observation=proc_obs,
            reward=rewards,
            discount=self._discount,
            step_type=self._step_type,
        ),
        {"env_state": processed_state},
    )
def step(self, action):
  """Implementation of dm_env.step that supports repeated actions."""
  discount = None
  reward = None
  self._events = []
  for _ in range(self._num_action_repeats):
    next_timestep = super().step(action)

    # Accumulate reward per timestep.
    if next_timestep.reward is not None:
      reward = (reward or 0.) + next_timestep.reward

    # Calculate the product for discount.
    if next_timestep.discount is not None:
      discount = discount if discount else []
      discount.append(next_timestep.discount)

    timestep = dm_env.TimeStep(
        next_timestep.step_type,
        reward,
        # Note: np.product(None) returns None.
        np.product(discount),
        next_timestep.observation)

    self._events.extend([
        _unpack_world_event(event)
        for event in timestep.observation['events']
    ])

    if timestep.last():
      return timestep

  return timestep
def test_transitions_returned_if_episode_length_less_than_n(self):
  f = dm_env.StepType.FIRST
  m = dm_env.StepType.MID
  l = dm_env.StepType.LAST

  n = 4
  accumulator = replay_lib.NStepTransitionAccumulator(n)
  step_types = [f, m, l]
  num_timesteps = len(step_types)
  states = list(range(num_timesteps))
  discounts = np.ones(num_timesteps)
  rewards = np.ones(num_timesteps)
  actions = np.ones(num_timesteps)

  accumulator_output = []
  for i in range(num_timesteps):
    timestep = dm_env.TimeStep(
        step_type=step_types[i],
        observation=states[i],
        discount=discounts[i],
        reward=rewards[i])
    accumulator_output.append(list(accumulator.step(timestep, actions[i])))

  output_lengths = [len(output) for output in accumulator_output]
  output_states = [[(tr.s_tm1, tr.s_t) for tr in output]
                   for output in accumulator_output]

  # Expect a 1-step transition and a 2-step transition after LAST timestep.
  expected_output_lengths = [0, 0, 2]
  expected_output_states = [[], [], [(0, 2), (1, 2)]]

  self.assertEqual(expected_output_lengths, output_lengths)
  self.assertEqual(expected_output_states, output_states)
def step(self, action: types.NestedArray) -> dm_env.TimeStep:
    """Steps the environment."""
    if self._reset_next_step:
        return self.reset()

    open_spiel_timestep = self._environment.step(action)

    if open_spiel_timestep.step_type == rl_environment.StepType.LAST:
        self._reset_next_step = True

    observations = self._convert_observation(open_spiel_timestep)
    rewards = np.asarray(open_spiel_timestep.rewards)
    discounts = np.asarray(open_spiel_timestep.discounts)
    step_type = open_spiel_timestep.step_type

    if step_type == rl_environment.StepType.FIRST:
        step_type = dm_env.StepType.FIRST
    elif step_type == rl_environment.StepType.MID:
        step_type = dm_env.StepType.MID
    elif step_type == rl_environment.StepType.LAST:
        step_type = dm_env.StepType.LAST
    else:
        raise ValueError(
            "Did not recognize OpenSpiel StepType: {}".format(step_type))

    return dm_env.TimeStep(
        observation=observations,
        reward=rewards,
        discount=discounts,
        step_type=step_type,
    )
def step(
    self, action: Union[float, int, types.NestedArray]
) -> dm_env.TimeStep:
    # Return a reset timestep if we haven't touched the environment yet.
    if not self._step:
        return self.reset()

    _validate_spec(self._spec.actions, action)

    observation = self._generate_fake_observation()
    reward = self._generate_fake_reward()
    discount = self._generate_fake_discount()
    self.agent_step_counter += 1

    if self._episode_length and (self._step == self._episode_length):
        # Only reset step once all agents have taken their turn.
        if self.agent_step_counter == len(self.agents):
            self._step = 0
            self.agent_step_counter = 0

        # We can't use dm_env.termination directly because then the discount
        # wouldn't necessarily conform to the spec (if e.g. we want float32).
        return dm_env.TimeStep(
            dm_env.StepType.LAST, reward, discount, observation)
    else:
        # Only update step counter once all agents have taken their turn.
        if self.agent_step_counter == len(self.agents):
            self._step += 1
            self.agent_step_counter = 0

        return dm_env.transition(
            reward=reward, observation=observation, discount=discount
        )
def step(
    self, actions: Dict[str, Union[float, int, types.NestedArray]]
) -> dm_env.TimeStep:
    # Return a reset timestep if we haven't touched the environment yet.
    if not self._step:
        return self.reset()

    for agent, action in actions.items():
        _validate_spec(self._specs[agent].actions, action)

    observation = {
        agent: self._generate_fake_observation() for agent in self.agents
    }
    reward = {agent: self._generate_fake_reward() for agent in self.agents}
    discount = {agent: self._generate_fake_discount() for agent in self.agents}

    if self._episode_length and (self._step == self._episode_length):
        self._step = 0

        # We can't use dm_env.termination directly because then the discount
        # wouldn't necessarily conform to the spec (if e.g. we want float32).
        return dm_env.TimeStep(
            dm_env.StepType.LAST, reward, discount, observation)
    else:
        self._step += 1
        return dm_env.transition(
            reward=reward, observation=observation, discount=discount
        )
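# Side note (illustrative, not from the source): the reason the two mocks
# above build the terminal TimeStep manually is that dm_env.termination
# hard-codes a Python-float discount of 0.0, which may not match a discount
# spec that requires e.g. np.float32. A minimal comparison:
import dm_env
import numpy as np

obs = np.zeros(2, dtype=np.float32)
reward = np.float32(1.0)
auto = dm_env.termination(reward=reward, observation=obs)
manual = dm_env.TimeStep(dm_env.StepType.LAST, reward, np.float32(0.0), obs)
print(type(auto.discount))    # <class 'float'>
print(type(manual.discount))  # <class 'numpy.float32'>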
def _send_observation(self, timestep: dm_env.TimeStep, player: int):
  # If terminal, all actors must update.
  if player == pyspiel.PlayerId.TERMINAL:
    for player_id in range(len(self._actors)):
      # Note: we must account for situations where the first observation
      # is a terminal state, e.g. if an opponent folds in poker before we get
      # to act.
      if self._observed_first[player_id]:
        player_timestep = self._get_player_timestep(timestep, player_id)
        self._actors[player_id].observe(self._prev_actions[player_id],
                                        player_timestep)
        if self._should_update:
          self._actors[player_id].update()
    self._observed_first = [False] * len(self._actors)
    self._prev_actions = [pyspiel.INVALID_ACTION] * len(self._actors)
  else:
    if not self._observed_first[player]:
      player_timestep = dm_env.TimeStep(
          observation=timestep.observation[player],
          reward=None,
          discount=None,
          step_type=dm_env.StepType.FIRST)
      self._actors[player].observe_first(player_timestep)
      self._observed_first[player] = True
    else:
      player_timestep = self._get_player_timestep(timestep, player)
      self._actors[player].observe(self._prev_actions[player],
                                   player_timestep)
      if self._should_update:
        self._actors[player].update()
def step(
    self, actions: Dict[str, np.array]
) -> Tuple[dm_env.TimeStep, np.array]:
    """Steps the environment."""
    if self._reset_next_step:
        self._reset_next_step = False
        self.reset()

    observations, rewards, dones, state_infos = self._environment.step(actions)

    if observations:
        observations = self._convert_observations(observations, dones)

    if self._environment.env_done:
        self._step_type = dm_env.StepType.LAST
        self._reset_next_step = True
    else:
        self._step_type = dm_env.StepType.MID

    return (
        dm_env.TimeStep(
            observation=observations,
            reward=rewards,
            discount=self._discounts,
            step_type=self._step_type,
        ),
        state_infos,
    )
def step(self, action):
  """Implementation of dm_env.step that supports repeated actions."""
  timestep = None
  discount = None
  reward = None
  for _ in range(self._num_action_repeats):
    next_timestep = super(_HardEightTasksEnv, self).step(action)

    # Accumulate reward per timestep.
    if next_timestep.reward is not None:
      reward = (reward or 0.) + next_timestep.reward

    # Calculate the product for discount.
    if next_timestep.discount is not None:
      discount = discount if discount else []
      discount.append(next_timestep.discount)

    timestep = dm_env.TimeStep(
        next_timestep.step_type,
        reward,
        # Note: np.product(None) returns None.
        np.product(discount),
        next_timestep.observation)

    if timestep.last():
      return timestep

  return timestep
def step(self, _):
  self.time_step += 1
  if self.time_step == self.trial_start:
    self.configure_trial()
  self.tick()
  return dm_env.TimeStep(
      dm_env.StepType.MID, self.cumulant(), self.gamma, self.observation())
def _add_reward_noise(self, timestep: dm_env.TimeStep):
  if timestep.first():
    return timestep
  reward = timestep.reward + self._noise_scale * self._rng.randn()
  return dm_env.TimeStep(
      step_type=timestep.step_type,
      reward=reward,
      discount=timestep.discount,
      observation=timestep.observation)
def _rescale_rewards(self, timestep: dm_env.TimeStep):
  if timestep.first():
    return timestep
  reward = timestep.reward * self._reward_scale
  return dm_env.TimeStep(
      step_type=timestep.step_type,
      reward=reward,
      discount=timestep.discount,
      observation=timestep.observation)
def step(self, action: int) -> dm_env.TimeStep:
  """Steps up to action_repeat times and returns a post-processed step."""
  if self._reset_next_step:
    return self.reset()

  timestep_stack = []

  # Step on environment multiple times for each selected action.
  for _ in range(self._action_repeats):
    timestep = self._environment.step([np.array([action])])

    self._episode_len += 1
    if self._episode_len == self._max_episode_len:
      timestep = timestep._replace(step_type=dm_env.StepType.LAST)

    timestep_stack.append(timestep)

    if timestep.last():
      # Action repeat frames should not span episode boundaries. Also, no need
      # to pad with zero-valued observations as all the reductions in
      # _postprocess_observation work gracefully for any non-zero size of
      # timestep_stack.
      self._reset_next_step = True
      break

  # Determine a single step type. We let FIRST take priority over LAST, since
  # we think it's more likely algorithm code will be set up to deal with that,
  # due to environments supporting reset() (which emits a FIRST).
  # Note we'll never have LAST then FIRST in timestep_stack here.
  step_type = dm_env.StepType.MID
  for timestep in timestep_stack:
    if timestep.first():
      step_type = dm_env.StepType.FIRST
      break
    elif timestep.last():
      step_type = dm_env.StepType.LAST
      break

  if timestep_stack[0].first():
    # Update first timestep to have identity effect on reward and discount.
    timestep_stack[0] = timestep_stack[0]._replace(reward=0., discount=1.)

  # Sum reward over stack.
  reward = sum(timestep_t.reward for timestep_t in timestep_stack)

  # Multiply discount over stack (will either be 0. or 1.).
  discount = np.product(
      [timestep_t.discount for timestep_t in timestep_stack])

  observation = self._observation_from_timestep_stack(timestep_stack)

  timestep = dm_env.TimeStep(
      step_type=step_type,
      reward=reward,
      observation=observation,
      discount=discount)

  return self._postprocess_observation(timestep)
def observe_first(self, timestep: dm_env.TimeStep):
    # Create a new timestep with the latent variable in the observation.
    new_timestep = dm_env.TimeStep(
        step_type=timestep.step_type,
        reward=timestep.reward,
        discount=timestep.discount,
        observation=self._concat_latent_variable(timestep.observation),
    )
    return self._agent._actor.observe_first(new_timestep)
def test_reset(self):
  self.accumulator.reset()
  transitions = self.accumulator.step(
      timestep_t=dm_env.TimeStep(
          step_type=dm_env.StepType.FIRST,
          observation=-1,
          discount=1.,
          reward=3),
      a_t=1)
  self.assertEqual([], list(transitions))
def _set_step_type(
    self, timestep: dm_env.TimeStep, step_type: dm_env.StepType
) -> dm_env.TimeStep:
    return dm_env.TimeStep(
        observation=timestep.observation,
        reward=timestep.reward,
        discount=timestep.discount,
        step_type=step_type,
    )
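# Equivalent sketch (illustrative, not from the source): dm_env.TimeStep is a
# NamedTuple, so the same substitution can be written with `_replace`; the
# explicit constructor above simply spells out every field.
def _set_step_type_via_replace(
    timestep: dm_env.TimeStep, step_type: dm_env.StepType
) -> dm_env.TimeStep:
    return timestep._replace(step_type=step_type)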
def run(self, num_steps):
  """Perform the run loop.

  Args:
    num_steps: number of steps to run the loop for.
  """
  current_steps = 0
  while current_steps < num_steps:

    # Reset any counts and start the environment.
    start_time = time.time()
    self._rewarder.reset()
    episode_steps = 0
    episode_return = 0
    episode_imitation_return = 0
    timestep = self._environment.reset()

    self._actor.observe_first(timestep)

    # Run an episode.
    while not timestep.last():
      action = self._actor.select_action(timestep.observation)
      obs_act = {'observation': timestep.observation, 'action': action}
      imitation_reward = self._rewarder.compute_reward(obs_act)

      timestep = self._environment.step(action)
      imitation_timestep = dm_env.TimeStep(
          step_type=timestep.step_type,
          reward=imitation_reward,
          discount=timestep.discount,
          observation=timestep.observation)

      self._actor.observe(action, next_timestep=imitation_timestep)
      self._actor.update()

      # Book-keeping.
      episode_steps += 1
      episode_return += timestep.reward
      episode_imitation_return += imitation_reward

    # Collect the results and combine with counts.
    counts = self._counter.increment(episodes=1, steps=episode_steps)
    steps_per_second = episode_steps / (time.time() - start_time)
    result = {
        'episode_length': episode_steps,
        'episode_return': episode_return,
        'episode_return_imitation': episode_imitation_return,
        'steps_per_second': steps_per_second,
    }
    result.update(counts)
    self._logger.write(result)

    current_steps += episode_steps
def parameterized_restart(
    reward: types.Reward,
    discount: types.Discount,
    observation: types.Observation,
) -> dm_env.TimeStep:
    """Returns a `TimeStep` with `step_type` set to `StepType.FIRST`.

    Differs from dm_env.restart, which fixes reward and discount to None:
    here they can be given initial values.
    """
    return dm_env.TimeStep(dm_env.StepType.FIRST, reward, discount, observation)
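# Usage sketch (illustrative, not from the source): comparing the helper
# above with dm_env.restart on a made-up observation.
import dm_env
import numpy as np

obs = np.zeros(3, dtype=np.float32)
ts_default = dm_env.restart(obs)  # reward=None, discount=None
ts_custom = parameterized_restart(
    reward=np.float32(0.0), discount=np.float32(1.0), observation=obs)
assert ts_default.first() and ts_custom.first()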
def observe(self, action: types.NestedArray, next_timestep: dm_env.TimeStep):
    # Create a new timestep with the latent variable in the observation.
    new_timestep = dm_env.TimeStep(
        step_type=next_timestep.step_type,
        reward=next_timestep.reward,
        discount=next_timestep.discount,
        observation=self._concat_latent_variable(next_timestep.observation),
    )
    return self._agent._actor.observe(action, new_timestep)