Example #1
    def reset(self) -> dm_env.TimeStep:
        """Resets the episode."""
        self._reset_next_step = False
        self.dones = {agent: False for agent in self.possible_agents}
        self._agents = self._possible_agents[:]

        self._prev_timestep: Optional[rl_environment.TimeStep] = None
        self._current_player_id = 0

        openspiel_timestep = self._environment.reset()
        agent = self.current_agent
        done = self.dones[agent]

        observe = self._to_observation(openspiel_timestep)
        observation = self._convert_observation(agent, observe, done)

        self._discount = convert_np_type(self.discount_spec()[agent].dtype, 1)

        reward = convert_np_type(
            self.reward_spec()[agent].dtype,
            (openspiel_timestep.rewards[self.current_player_id]
             if openspiel_timestep.rewards else 0),
        )

        return parameterized_restart(reward, self._discount, observation)
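
All of the examples here delegate TimeStep construction to a parameterized_restart helper. A minimal sketch of what it could look like, assuming it simply mirrors dm_env.restart() while allowing explicit rewards and discounts (the name and argument order are taken from the snippets, not from the dm_env API itself):

import dm_env
from typing import Any

def parameterized_restart(
    reward: Any, discount: Any, observation: Any
) -> dm_env.TimeStep:
    """Returns a FIRST TimeStep with caller-supplied reward and discount.

    dm_env.restart() hard-codes reward=None and discount=None; multi-agent
    wrappers instead want typed zero rewards and unit discounts per agent.
    """
    return dm_env.TimeStep(
        step_type=dm_env.StepType.FIRST,
        reward=reward,
        discount=discount,
        observation=observation,
    )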
Example #2
    def reset(self) -> Tuple[dm_env.TimeStep, Dict]:
        """Resets the episode."""
        self._reset_next_step = False
        self._step_type = dm_env.StepType.FIRST
        discount_spec = self.discount_spec()
        observe = self._environment.reset()

        self._discounts = {
            agent: convert_np_type(discount_spec[agent].dtype, 1)
            for agent in self.possible_agents
        }

        # Some environments return an (observations, extras) tuple on reset.
        if isinstance(observe, tuple):
            observe, env_extras = observe
        else:
            env_extras = {}

        observations = self._convert_observations(
            observe, {agent: False for agent in self.possible_agents}
        )
        rewards_spec = self.reward_spec()
        rewards = {
            agent: convert_np_type(rewards_spec[agent].dtype, 0)
            for agent in self.possible_agents
        }

        return parameterized_restart(rewards, self._discounts,
                                     observations), env_extras
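
The recurring convert_np_type(dtype, value) call casts a plain Python scalar to the NumPy dtype declared in the corresponding spec, so the reward and discount fields of the TimeStep match reward_spec()/discount_spec() exactly. A plausible one-liner, assuming that is all the helper does:

from typing import Union

import numpy as np

def convert_np_type(dtype: np.dtype, value: Union[int, float]) -> np.generic:
    """Casts a scalar to the given NumPy dtype, e.g. float32(0.0)."""
    return np.dtype(dtype).type(value)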
Example #3
    def reset(self) -> Tuple[dm_env.TimeStep, np.ndarray]:
        """Resets the episode."""
        self._reset_next_step = False
        self._step_type = dm_env.StepType.FIRST
        discount_spec = self.discount_spec()
        self._discounts = {
            agent: convert_np_type(discount_spec[agent].dtype, 1)
            for agent in self._environment.possible_agents
        }
        observe, state_infos = self._environment.reset()
        observations = self._convert_observations(
            observe, {agent: False for agent in self.possible_agents}
        )
        rewards_spec = self.reward_spec()
        rewards = {
            agent: convert_np_type(rewards_spec[agent].dtype, 0)
            for agent in self.possible_agents
        }

        return (
            parameterized_restart(rewards, self._discounts, observations),
            state_infos,
        )
Example #4
File: mocks.py  Project: NetColby/DNRL
    def reset(self) -> dm_env.TimeStep:
        observation = self._generate_fake_observation()
        discount = convert_np_type("float32", 1)  # Not used in PettingZoo
        reward = convert_np_type("float32", 0)
        self._step = 1
        return parameterized_restart(reward=reward,
                                     discount=discount,
                                     observation=observation)
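
A quick smoke test for a single-agent mock like this one (the MockEnvironment name is hypothetical; the actual class in mocks.py may be named differently):

env = MockEnvironment()
timestep = env.reset()

assert timestep.first()          # step_type is StepType.FIRST
assert timestep.reward == 0.0    # typed zero reward
assert timestep.discount == 1.0  # unit discount on restart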
Example #5
    def reset(self) -> Tuple[dm_env.TimeStep, Dict[str, np.ndarray]]:
        """Resets the env and returns observations from ready agents.

        Returns:
            A FIRST dm_env.TimeStep and extras holding the global state.
        """
        self._env_done = False
        self._reset_next_step = False
        self._step_type = dm_env.StepType.FIRST

        # reset internal SC2 env
        obs_list, state = self._environment.reset()

        # Convert observations
        observe: Dict[str, np.ndarray] = {}

        for i, obs in enumerate(obs_list):
            observe[f"agent_{i}"] = {
                "observation": obs,
                "action_mask": np.array(
                    self._environment.get_avail_agent_actions(i),
                    dtype=np.float32,
                ),
            }

        observations = self._convert_observations(
            observe, {agent: False for agent in self._possible_agents}
        )

        self._agents = list(observe.keys())

        # create discount spec
        discount_spec = self.discount_spec()
        self._discounts = {
            agent: convert_np_type(discount_spec[agent].dtype, 1)
            for agent in self._possible_agents
        }

        # create rewards spec
        rewards_spec = self.reward_spec()
        rewards = {
            agent: convert_np_type(rewards_spec[agent].dtype, 0)
            for agent in self._possible_agents
        }

        # dm_env timestep
        timestep = parameterized_restart(rewards, self._discounts,
                                         observations)

        return timestep, {"s_t": state}
Example #6
File: mocks.py  Project: NetColby/DNRL
    def reset(self) -> dm_env.TimeStep:
        observations = {}
        for agent in self.agents:
            observation = self._generate_fake_observation()
            observations[agent] = observation

        rewards = {
            agent: convert_np_type("float32", 0)
            for agent in self.agents
        }
        discounts = {
            agent: convert_np_type("float32", 1)
            for agent in self.agents
        }

        self._step = 1
        return parameterized_restart(rewards, discounts,
                                     observations)  # type: ignore
Example #7
    def reset(self) -> dm_env.TimeStep:
        """Resets the episode."""
        self._reset_next_step = False
        self._environment.reset()
        self._step_types = {
            agent: dm_env.StepType.FIRST
            for agent in self.possible_agents
        }
        self._first_step_performed = {
            agent: False
            for agent in self.possible_agents
        }

        observe, _, done, _ = self._environment.last()
        agent = self.current_agent
        observation = self._convert_observation(agent, observe, done)

        self._discount = convert_np_type(self.discount_spec()[agent].dtype, 1)

        reward = convert_np_type(self.reward_spec()[agent].dtype, 0)

        return parameterized_restart(reward, self._discount, observation)
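
Example #7 targets a turn-based (sequential, AEC-style) environment: after reset, environment.last() yields the observation and done flag for the agent about to act, and the wrapper substitutes a typed zero reward since a FIRST step has no predecessor. The classic PettingZoo AEC reset pattern, for reference (the four-value last() signature matches the snippet above; newer PettingZoo releases split done into termination/truncation flags):

env.reset()
observation, reward, done, info = env.last()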
Example #8
File: flatland.py  Project: NetColby/DNRL
    def reset(self) -> dm_env.TimeStep:
        """Resets the episode."""
        self._reset_next_step = False
        self._agents = self.possible_agents[:]
        observe, info = self._environment.reset()
        observations = self._create_observations(observe, info,
                                                 self._environment.dones)
        rewards_spec = self.reward_spec()
        rewards = {
            agent: convert_np_type(rewards_spec[agent].dtype, 0)
            for agent in self.possible_agents
        }

        discount_spec = self.discount_spec()
        self._discounts = {
            agent: convert_np_type(discount_spec[agent].dtype, 1)
            for agent in self.possible_agents
        }
        return parameterized_restart(rewards, self._discounts, observations)