예제 #1
0
    def train(self):
        """Train the policy over ``self.episodes`` episodes, keeping snapshots.

        For every episode an ``Episode`` is played with the current policy and
        epsilon; the states, distributions and rewards it returns are appended
        to ``self.states``, ``self.distributions`` and ``self.rewards``, after
        which ``self.train_policy()`` is called and ``self.epsilon`` is lowered
        by ``self.epsilon_decay_rate``.

        A clone of the policy is stored every
        ``self.episodes // self.amount_of_players`` episodes (at least every
        episode when that quotient would be zero).

        A ``KeyboardInterrupt`` stops training early instead of propagating:
        the current policy is cloned, stored under the episode number that was
        in progress, and the snapshots collected so far are returned.

        Returns:
            dict: Maps an episode number to the policy clone taken at that
            point. Key ``0`` holds the clone taken before any training.
        """
        policies = {0: self.policy.clone_policy()}
        # Bind the loop variable up front so the interrupt handler below never
        # hits an unbound name if Ctrl-C arrives before the first iteration.
        episode_number = 0
        try:
            for episode_number in range(1, self.episodes + 1):
                print(f"training on episode {episode_number}/{self.episodes}")
                episode = Episode(self.policy, self.epsilon)
                # The winner returned by play() is not needed for training.
                episode_states, episode_distributions, rewards, _winner = episode.play()
                self.states += episode_states
                self.distributions += episode_distributions
                self.rewards += rewards
                self.train_policy()
                self.epsilon -= self.epsilon_decay_rate
                # With fewer episodes than players the floor division yields 0
                # and the modulo would raise ZeroDivisionError; clamp to 1 so
                # such runs snapshot after every episode instead of crashing.
                snapshot_interval = max(1, self.episodes // self.amount_of_players)
                if episode_number % snapshot_interval == 0:
                    policies[episode_number] = self.policy.clone_policy()
        except KeyboardInterrupt:
            # Keep whatever progress was made when the user aborts the run.
            policies[episode_number] = self.policy.clone_policy()

        return policies