def arrange_match(self):
    """Play one self-play episode in which the learning agent controls both players."""
    self.episode_count += 1
    brain_name = self.env.brain_names[0]
    env_info = self.env.reset(train_mode=True)[brain_name]
    state = env_info.vector_observations
    episode_reward1 = 0
    episode_reward2 = 0
    for step in range(self.max_steps):
        # Both rackets are driven by the same (learning) agent.
        actions = [self.agent.get_action(state[0]), self.agent.get_action(state[1])]
        env_info = self.env.step(actions)[brain_name]
        reward = env_info.rewards
        next_state = env_info.vector_observations
        done = env_info.local_done
        # Store each player's transition separately so the agent learns from both perspectives.
        self.agent.learn_experience(buffer.Experience(state[0], actions[0], reward[0], next_state[0], done[0]))
        self.agent.learn_experience(buffer.Experience(state[1], actions[1], reward[1], next_state[1], done[1]))
        episode_reward1 += reward[0]
        episode_reward2 += reward[1]
        if done[0] or done[1] or step == self.max_steps - 1:
            break
        state = next_state
    return max(episode_reward1, episode_reward2), step
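# A minimal sketch of how a training driver might call arrange_match above. The
# train function, n_episodes, and target_score names are assumptions for
# illustration only; they are not part of the original trainer class.
from collections import deque

import numpy as np


def train(trainer, n_episodes=2000, target_score=0.5):
    """Repeatedly arrange matches and track the rolling mean of the better player's score."""
    recent_scores = deque(maxlen=100)
    for episode in range(1, n_episodes + 1):
        score, steps = trainer.arrange_match()
        recent_scores.append(score)
        rolling_mean = np.mean(recent_scores)
        if episode % 100 == 0:
            print(f"episode {episode}: rolling mean score {rolling_mean:.3f} ({steps} steps)")
        if len(recent_scores) == recent_scores.maxlen and rolling_mean >= target_score:
            print(f"reached target score after {episode} episodes")
            break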
def learn_episode(self, max_steps):
    """Run one training episode for a single-agent environment and return its total reward."""
    brain_name = self.env.brain_names[0]
    env_info = self.env.reset(train_mode=True)[brain_name]
    state = env_info.vector_observations[0]
    episode_reward = 0
    for step in range(max_steps):
        action = self.get_action(state)
        env_info = self.env.step(action)[brain_name]
        reward = env_info.rewards[0]
        next_state = env_info.vector_observations[0]
        done = env_info.local_done[0]
        self.replay_buffer.add(buffer.Experience(state, action, reward, next_state, done))
        episode_reward += reward
        # Only start learning once the buffer holds enough samples for a batch.
        if self.replay_buffer.ready_to_sample():
            self.update()
        if done or step == max_steps - 1:
            break
        state = next_state
    return episode_reward
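# A minimal sketch of the buffer module these methods rely on, assuming a plain
# uniform replay buffer. Experience, add, and ready_to_sample match the calls
# above; the buffer_size and batch_size defaults are assumed for illustration.
import random
from collections import deque, namedtuple

Experience = namedtuple("Experience", ["state", "action", "reward", "next_state", "done"])


class ReplayBuffer:
    def __init__(self, buffer_size=100_000, batch_size=128):
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size

    def add(self, experience):
        # Append one transition; the deque discards the oldest entry when full.
        self.memory.append(experience)

    def ready_to_sample(self):
        # Learning updates start only once a full batch can be drawn.
        return len(self.memory) >= self.batch_size

    def sample(self):
        # Uniformly sample a batch of stored transitions.
        return random.sample(self.memory, self.batch_size)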
def play_episode(self):
    """Play one episode, feeding the joint observations of both players to the agent."""
    brain_name = self.env.brain_names[0]
    env_info = self.env.reset(train_mode=True)[brain_name]
    state = env_info.vector_observations
    episode_reward1 = 0
    episode_reward2 = 0
    for step in range(self.max_steps):
        # The agent produces actions for both players from the stacked observations.
        actions = self.agent.get_actions(state)
        env_info = self.env.step(actions)[brain_name]
        reward = env_info.rewards
        next_state = env_info.vector_observations
        done = env_info.local_done
        # The whole joint transition is stored as a single experience.
        self.agent.learn_experience(buffer.Experience(state, actions, reward, next_state, done))
        episode_reward1 += reward[0]
        episode_reward2 += reward[1]
        if done[0] or done[1] or step == self.max_steps - 1:
            break
        state = next_state
    return max(episode_reward1, episode_reward2), step
def arrange_match(self):
    """Play one episode against either the current agent (self-play) or a frozen past copy."""
    self.episode_count += 1
    # Periodically snapshot the learning agent into the pool of frozen opponents.
    if self.steps_till_freeze < 0:
        self.steps_till_freeze = self.freeze_steps
        self.frozen_agents[self.next_freeze_agent].copy_and_freeze(self.agent)
        self.next_freeze_agent = (self.next_freeze_agent + 1) % len(self.frozen_agents)
    brain_name = self.env.brain_names[0]
    env_info = self.env.reset(train_mode=True)[brain_name]
    state = env_info.vector_observations
    episode_reward1 = 0
    episode_reward2 = 0
    # Pick the opponent: the learning agent itself or a randomly chosen frozen copy.
    if random.random() < self.self_play_probability:
        opponent_agent = self.agent
    else:
        opponent_agent = random.choice(self.frozen_agents)
    for step in range(self.max_steps):
        actions = [self.agent.get_action(state[0]), opponent_agent.get_action(state[1])]
        env_info = self.env.step(actions)[brain_name]
        reward = env_info.rewards
        next_state = env_info.vector_observations
        done = env_info.local_done
        # The learning agent trains on both players' transitions, including the opponent's.
        self.agent.learn_experience(buffer.Experience(state[0], actions[0], reward[0], next_state[0], done[0]))
        self.agent.learn_experience(buffer.Experience(state[1], actions[1], reward[1], next_state[1], done[1]))
        episode_reward1 += reward[0]
        episode_reward2 += reward[1]
        if done[0] or done[1] or step == self.max_steps - 1:
            break
        self.steps_till_freeze -= 1
        state = next_state
    return max(episode_reward1, episode_reward2), step
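# A minimal sketch of the copy_and_freeze helper used above, assuming a PyTorch
# agent whose policy network lives in self.actor. The actor attribute name and
# the greedy get_action behaviour are assumptions about the agent class, not
# confirmed by the original code.
import torch


class FrozenAgentMixin:
    def copy_and_freeze(self, source_agent):
        """Copy the learning agent's actor weights and disable further updates."""
        self.actor.load_state_dict(source_agent.actor.state_dict())
        self.actor.eval()
        for param in self.actor.parameters():
            param.requires_grad = False

    def get_action(self, state):
        # Frozen opponents act greedily, with no exploration noise and no gradients.
        state_tensor = torch.from_numpy(state).float().unsqueeze(0)
        with torch.no_grad():
            action = self.actor(state_tensor).squeeze(0).numpy()
        return action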