Example #1
    def sample(self, explore=False):
        self.step += 1
        if self._current_observation is None:
            self._current_observation = self.env.reset()
        # Flatten the observation to a 1-D array before acting on it.
        self._current_observation = np.squeeze(
            self._current_observation).flatten()

        if explore:
            # Warm-up: draw a uniformly random action from the action space.
            action = self.env.action_space.sample()
        else:
            # The current observation was already flattened above.
            action = self.agent.act(self._current_observation)

        action = np.asarray(action)
        next_observation, reward, done, info = self.env.step(action)
        # Normalize shapes so the replay buffer always receives flat arrays.
        next_observation = np.squeeze(next_observation).flatten()
        reward = np.squeeze(reward).flatten()
        action = np.squeeze(action).flatten()
        done = np.squeeze(done).astype(np.int8)

        self._path_length += 1
        self._path_return += np.mean(reward)
        self._total_samples += 1
        self.agent.replay_buffer.add_sample(
            observation=self._current_observation,
            action=action,
            reward=reward,
            terminal=done,
            next_observation=next_observation,
        )

        self._current_observation = next_observation

        # Episode boundary: the environment signalled done, or the rollout hit its length cap.
        if np.all(done) or self._path_length >= self._max_path_length:
            self._max_path_return = np.maximum(self._max_path_return,
                                               self._path_return)
            self._mean_path_return = self._path_return / self._path_length
            self._last_path_return = self._path_return
            self._terminal_position = self._current_observation

            self._current_observation = self.env.reset()
            self._path_length = 0
            self._path_return = np.zeros(1)
            self._n_episodes += 1

            # FIXME: temporary bookkeeping, delete afterwards.
            if not explore:
                self.episode_rewards.append(self._last_path_return.item())
                self.episode_positions.append([
                    self._terminal_position[0].item(),
                    self._terminal_position[1].item(),
                ])

            self.log_diagnostics()
            logger.log(tabular)
            logger.dump_all()

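The method above collects a single transition per call, pushes it into the agent's replay buffer, and handles episode resets itself. A minimal sketch of a loop that could drive it, assuming a hypothetical trainer with `sampler` and `agent` objects shaped like the ones in this example (the function name and step counts below are illustrative, not part of the original code):

# Hypothetical driver loop: warm up the replay buffer with random actions,
# then alternate on-policy sampling with agent updates.
def run(sampler, agent, n_warmup_steps=1000, n_train_steps=100000):
    for _ in range(n_warmup_steps):
        sampler.sample(explore=True)   # random actions fill the buffer
    for _ in range(n_train_steps):
        sampler.sample(explore=False)  # act with the learned policy
        agent.train()                  # hypothetical gradient update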
Example #2
    def sample(self, explore=False):
        self.step += 1
        if self._current_observation_n is None:
            self._current_observation_n = self.env.reset()
        action_n = []
        if explore:
            # Warm-up: sample a random joint action for every agent.
            action_n = self.env.action_spaces.sample()
        else:
            # Query each agent's policy on its own observation.
            for agent, current_observation in zip(self.agents,
                                                  self._current_observation_n):
                action = agent.act(current_observation.astype(np.float32))
                action_n.append(np.array(action))

        action_n = np.asarray(action_n)

        next_observation_n, reward_n, done_n, info = self.env.step(action_n)
        if self._global_reward:
            # Cooperative setting: every agent receives the summed team reward.
            reward_n = np.array([np.sum(reward_n)] * self.agent_num)

        self._path_length += 1
        self._path_return += np.array(reward_n, dtype=np.float32)
        self._total_samples += 1
        for i, agent in enumerate(self.agents):
            # Actions of every other agent, concatenated into a single flat vector.
            opponent_action = action_n[[
                j for j in range(len(action_n)) if j != i
            ]].flatten()
            agent.replay_buffer.add_sample(
                observation=self._current_observation_n[i].astype(np.float32),
                action=action_n[i].astype(np.float32),
                reward=reward_n[i].astype(np.float32),
                terminal=done_n[i],
                next_observation=next_observation_n[i].astype(np.float32),
                opponent_action=opponent_action.astype(np.float32),
            )

        self._current_observation_n = next_observation_n

        if np.all(done_n) or self._path_length >= self._max_path_length:
            self._current_observation_n = self.env.reset()
            self._max_path_return = np.maximum(self._max_path_return,
                                               self._path_return)
            self._mean_path_return = self._path_return / self._path_length
            self._last_path_return = self._path_return
            self.container["path_rw"].append(self._path_return)
            self.container["mean_rw"].append(self._mean_path_return)
            self._path_length = 0
            self._path_return = np.zeros(self.agent_num)
            self._n_episodes += 1
            self.log_diagnostics()
            logger.log(tabular)
            logger.dump_all()
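Both multi-agent versions build each agent's opponent_action by indexing the joint action array with every index except the agent's own and flattening the result. A standalone sketch of that selection, with purely illustrative shapes and values:

import numpy as np

# Joint actions of 3 agents, each with a 2-dimensional continuous action.
action_n = np.array([[0.1, 0.2],
                     [0.3, 0.4],
                     [0.5, 0.6]])

i = 1  # the agent whose opponents we want
opponent_action = action_n[[j for j in range(len(action_n)) if j != i]].flatten()
print(opponent_action)  # [0.1 0.2 0.5 0.6] -> actions of agents 0 and 2, concatenated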
Example #3
    def sample(self, explore=False):
        self.step += 1
        if self._current_observation_n is None:
            self._current_observation_n = self.env.reset()
        action_n = []
        if explore:
            # Warm-up: sample a random joint action for every agent.
            action_n = self.env.action_spaces.sample()
        else:
            # Query each agent's policy on its own observation.
            for agent, current_observation in zip(self.agents, self._current_observation_n):
                action = agent.act(current_observation.astype(np.float32))
                action_n.append(np.array(action))

        action_n = np.asarray(action_n)

        # Step the environment (here a fortAttackGlobal environment).
        next_observation_n, reward_n, done_n, info = self.env.step(action_n)

        if self._global_reward:
            # Cooperative setting: every agent receives the summed team reward.
            reward_n = np.array([np.sum(reward_n)] * self.agent_num)
            
        self._path_length += 1
        self._last_path_return = np.array(reward_n, dtype=np.float32)
        self._path_return += self._last_path_return

        self._total_samples += 1
        for i, agent in enumerate(self.agents):
            # Actions of every other agent, concatenated into a single flat vector.
            opponent_action = action_n[[j for j in range(len(action_n)) if j != i]].flatten()
            agent.replay_buffer.add_sample(
                observation=self._current_observation_n[i].astype(np.float32),
                action=action_n[i].astype(np.float32),
                reward=np.array(reward_n[i], dtype=np.float32),
                terminal=done_n[i],
                next_observation=next_observation_n[i].astype(np.float32),
                opponent_action=opponent_action.astype(np.float32),
            )


        # Periodically render the environment for visual inspection.
        if self.render_after is not None and self._n_episodes % self.render_after == 0:
            self.env.render(mode="rgb_array")
        self._current_observation_n = next_observation_n

        if np.all(done_n) or self._path_length >= self._max_path_length:
            self._max_path_return = np.maximum(self._max_path_return, self._path_return)
            self._mean_path_return = self._path_return / self._path_length
            self._path_length = 0
            self._path_return = np.zeros(self.agent_num)
            self._n_episodes += 1
            self.log_diagnostics()  # note: one of these logging calls prints to the screen
            logger.log(tabular)
            logger.dump_all()
            
            self._current_observation_n = self.env.reset()
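When _global_reward is enabled, the last two examples replace the per-agent rewards with the summed team reward broadcast to every agent, so every agent optimises the same shared return. A standalone sketch of that transformation, with illustrative values:

import numpy as np

reward_n = [1.0, -0.5, 2.0]   # illustrative per-agent rewards
agent_num = len(reward_n)

# Every agent receives the same summed team reward.
reward_n = np.array([np.sum(reward_n)] * agent_num)
print(reward_n)  # [2.5 2.5 2.5]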