def reset(self) -> dm_env.TimeStep:
    """Resets the episode."""
    self._reset_next_step = False
    self.dones = {agent: False for agent in self.possible_agents}
    self._agents = self._possible_agents[:]
    self._prev_timestep: Optional[rl_environment.TimeStep] = None
    self._current_player_id = 0

    # Reset the underlying OpenSpiel environment and build the first
    # observation for the agent whose turn it is.
    opnspl_tmstep = self._environment.reset()
    agent = self.current_agent
    done = self.dones[agent]

    observe = self._to_observation(opnspl_tmstep)
    observation = self._convert_observation(agent, observe, done)

    self._discount = convert_np_type(self.discount_spec()[agent].dtype, 1)
    reward = convert_np_type(
        self.reward_spec()[agent].dtype,
        (
            opnspl_tmstep.rewards[self.current_player_id]
            if opnspl_tmstep.rewards
            else 0
        ),
    )

    return parameterized_restart(reward, self._discount, observation)
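# The reset() implementations in this section all rely on two small helpers,
# convert_np_type and parameterized_restart. The snippet below is a minimal
# sketch of what they are assumed to do; the actual implementations live in the
# project's wrapper utilities and may differ in detail.
import dm_env
import numpy as np


def convert_np_type(dtype, value):
    """Casts a Python scalar to the numpy dtype declared in a spec (assumed behaviour)."""
    return np.dtype(dtype).type(value)


def parameterized_restart(reward, discount, observation) -> dm_env.TimeStep:
    """Builds an initial TimeStep with explicit reward/discount values, unlike
    dm_env.restart(), which leaves them as None (assumed behaviour)."""
    return dm_env.TimeStep(dm_env.StepType.FIRST, reward, discount, observation)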
def reset(self) -> Tuple[dm_env.TimeStep, Dict]:
    """Resets the episode."""
    self._reset_next_step = False
    self._step_type = dm_env.StepType.FIRST

    discount_spec = self.discount_spec()
    observe = self._environment.reset()

    self._discounts = {
        agent: convert_np_type(discount_spec[agent].dtype, 1)
        for agent in self.possible_agents
    }

    # Some environments return (observations, extras) from reset().
    if isinstance(observe, tuple):
        observe, env_extras = observe
    else:
        env_extras = {}

    observations = self._convert_observations(
        observe, {agent: False for agent in self.possible_agents}
    )

    rewards_spec = self.reward_spec()
    rewards = {
        agent: convert_np_type(rewards_spec[agent].dtype, 0)
        for agent in self.possible_agents
    }

    return parameterized_restart(rewards, self._discounts, observations), env_extras
def reset(self) -> Tuple[dm_env.TimeStep, np.ndarray]:
    """Resets the episode."""
    self._reset_next_step = False
    self._step_type = dm_env.StepType.FIRST

    observe, state_infos = self._environment.reset()
    observations = self._convert_observations(
        observe, {agent: False for agent in self.possible_agents}
    )

    rewards_spec = self.reward_spec()
    rewards = {
        agent: convert_np_type(rewards_spec[agent].dtype, 0)
        for agent in self.possible_agents
    }

    # The initial discount for every agent is 1, cast to the spec's dtype.
    discount_spec = self.discount_spec()
    self._discounts = {
        agent: convert_np_type(discount_spec[agent].dtype, 1)
        for agent in self.possible_agents
    }

    return (
        parameterized_restart(rewards, self._discounts, observations),
        state_infos,
    )
def reset(self) -> dm_env.TimeStep:
    """Resets the episode."""
    observation = self._generate_fake_observation()
    discount = convert_np_type("float32", 1)  # Not used in pettingzoo
    reward = convert_np_type("float32", 0)
    self._step = 1
    return parameterized_restart(
        reward=reward, discount=discount, observation=observation
    )
def reset(self) -> Tuple[dm_env.TimeStep, Dict[str, np.ndarray]]:
    """Resets the env and returns observations from ready agents.

    Returns:
        A first dm_env.TimeStep containing observations for each ready agent,
        plus an extras dict holding the global environment state under "s_t".
    """
    self._env_done = False
    self._reset_next_step = False
    self._step_type = dm_env.StepType.FIRST

    # Reset the internal SC2 env.
    obs_list, state = self._environment.reset()

    # Convert observations: pair each agent's observation with its action mask.
    observe: Dict[str, Dict[str, np.ndarray]] = {}
    for i, obs in enumerate(obs_list):
        observe[f"agent_{i}"] = {
            "observation": obs,
            "action_mask": np.array(
                self._environment.get_avail_agent_actions(i), dtype=np.float32
            ),
        }
    observations = self._convert_observations(
        observe, {agent: False for agent in self._possible_agents}
    )
    self._agents = list(observe.keys())

    # Initial discounts, cast to the dtype declared in the discount spec.
    discount_spec = self.discount_spec()
    self._discounts = {
        agent: convert_np_type(discount_spec[agent].dtype, 1)
        for agent in self._possible_agents
    }

    # Zero initial rewards, cast to the dtype declared in the reward spec.
    rewards_spec = self.reward_spec()
    rewards = {
        agent: convert_np_type(rewards_spec[agent].dtype, 0)
        for agent in self._possible_agents
    }

    # dm_env timestep
    timestep = parameterized_restart(rewards, self._discounts, observations)

    return timestep, {"s_t": state}
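# Usage sketch for the SMAC-style reset above. The wrapper instance and the
# helper name run_smac_reset_example are illustrative and not part of the
# original code; the point is only how the (timestep, extras) pair unpacks.
def run_smac_reset_example(env_wrapper):
    """Unpacks the (timestep, extras) pair returned by reset() (sketch)."""
    timestep, extras = env_wrapper.reset()
    per_agent_obs = timestep.observation  # dict keyed by "agent_0", "agent_1", ...
    global_state = extras["s_t"]  # global SC2 state, e.g. for a centralised critic
    return per_agent_obs, global_state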
def reset(self) -> dm_env.TimeStep:
    observations = {}
    for agent in self.agents:
        observation = self._generate_fake_observation()
        observations[agent] = observation

    rewards = {agent: convert_np_type("float32", 0) for agent in self.agents}
    discounts = {agent: convert_np_type("float32", 1) for agent in self.agents}

    self._step = 1
    return parameterized_restart(rewards, discounts, observations)  # type: ignore
def reset(self) -> dm_env.TimeStep:
    """Resets the episode."""
    self._reset_next_step = False
    self._environment.reset()

    self._step_types = {
        agent: dm_env.StepType.FIRST for agent in self.possible_agents
    }
    self._first_step_performed = {
        agent: False for agent in self.possible_agents
    }

    observe, _, done, _ = self._environment.last()
    agent = self.current_agent
    observation = self._convert_observation(agent, observe, done)

    self._discount = convert_np_type(self.discount_spec()[agent].dtype, 1)
    reward = convert_np_type(self.reward_spec()[agent].dtype, 0)

    return parameterized_restart(reward, self._discount, observation)
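# For contrast with the parallel wrappers, the sequential wrappers in this
# section return a timestep for a single agent (the one whose turn it is)
# rather than per-agent dicts. A hedged usage sketch, with env_wrapper standing
# in for an already-constructed wrapper instance:
def run_sequential_reset_example(env_wrapper):
    """Shows that a sequential wrapper's reset() yields single-agent values (sketch)."""
    timestep = env_wrapper.reset()
    # reward, discount and observation belong to the current agent only.
    return timestep.reward, timestep.discount, timestep.observation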
def reset(self) -> dm_env.TimeStep:
    """Resets the episode."""
    self._reset_next_step = False
    self._agents = self.possible_agents[:]

    observe, info = self._environment.reset()
    observations = self._create_observations(
        observe, info, self._environment.dones
    )

    rewards_spec = self.reward_spec()
    rewards = {
        agent: convert_np_type(rewards_spec[agent].dtype, 0)
        for agent in self.possible_agents
    }

    discount_spec = self.discount_spec()
    self._discounts = {
        agent: convert_np_type(discount_spec[agent].dtype, 1)
        for agent in self.possible_agents
    }

    return parameterized_restart(rewards, self._discounts, observations)