def get_player_memories(self, ids=None):
    """
    Combines the memories (states, actions, rewards, ...) of the selected players.

    :param ids: the ids of the players whose memories are collected (defaults to all four players)
    :type ids: list
    :return: the combined memories of the selected players
    :rtype: Memory
    """
    memory = Memory()
    if ids is None:
        ids = range(4)
    for i in ids:
        memory.append_memory(self.players[i].memory)
    return memory
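# Illustrative usage sketch (not part of the original module): assuming an
# object `trainer` that owns `self.players`, memories can be combined for a
# subset of players or for all four:
#
#   subset_memory = trainer.get_player_memories(ids=[0, 2])
#   full_memory = trainer.get_player_memories()  # defaults to all four players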
def main():
    print("Cuda available: " + str(torch.cuda.is_available()))

    # start tensorboard
    tb = program.TensorBoard()
    tb.configure(argv=[None, '--logdir', Settings.runs_folder])
    tb.launch()

    # set seed for debugging
    if Settings.random_seed:
        torch.manual_seed(Settings.random_seed)

    # loading initial policy
    policy = Settings.model().to(Settings.device)

    # take the newest generation available
    i_episode = max_gen = 0
    generations = [int(f[:8]) for f in listdir(Settings.checkpoint_folder) if f.endswith(".pt")]
    if len(generations) > 0:
        max_gen = max(generations)
        policy.load_state_dict(torch.load(Settings.checkpoint_folder + "/" + str(max_gen).zfill(8) + ".pt"))
        i_episode = max_gen

    # create ppo
    ppo = PPO(policy, [Settings.lr, Settings.lr_stepsize, Settings.lr_gamma], Settings.betas,
              Settings.gamma, Settings.K_epochs, Settings.eps_clip,
              Settings.batch_size, Settings.mini_batch_size,
              c1=Settings.c1, c2=Settings.c2, start_episode=max_gen - 1)

    # create four players, all acting with the current (old) policy
    players = [RlPlayer(ppo.policy_old) for _ in range(4)]

    # create a game simulation
    schafkopf_env = SchafkopfEnv(Settings.random_seed)
    game_statistics = GameStatistics()

    # training loop
    for _ in range(0, 90000000):
        Settings.logger.info("playing " + str(Settings.update_games) + " games")

        # play a batch of games
        t0 = time.time()
        for _ in range(Settings.update_games):
            state, reward, terminal = schafkopf_env.reset()
            while not terminal:
                action, prob = players[state["game_state"].current_player].act(state)
                state, reward, terminal = schafkopf_env.step(action, prob)
            for p in range(4):
                players[p].retrieve_reward(reward[p])
            i_episode += 1
            game_statistics.update_statistics(state["game_state"], reward)
        t1 = time.time()

        # update the policy
        Settings.logger.info("updating policy")
        player_memories = Memory()
        for p in players:
            player_memories.append_memory(p.memory)
        ppo.update(player_memories, i_episode)
        t2 = time.time()
        ppo.lr_scheduler.step(i_episode)

        # writing game statistics for tensorboard
        Settings.logger.info("Episode: " + str(i_episode)
                             + " game simulation (s) = " + str(t1 - t0)
                             + " update (s) = " + str(t2 - t1))
        schafkopf_env.print_game()
        game_statistics.write_and_reset(i_episode)

        # reset memories and replace the acting policy with the updated one
        players = [RlPlayer(ppo.policy_old) for _ in range(4)]

        # save and evaluate the policy
        Settings.logger.info("Saving Checkpoint")
        torch.save(ppo.policy_old.state_dict(), Settings.checkpoint_folder + "/" + str(i_episode).zfill(8) + ".pt")
        Settings.logger.info("Evaluation")
        play_against_other_players(Settings.checkpoint_folder, Settings.model,
                                   [RandomPlayer, RandomCowardPlayer, RuleBasedPlayer],
                                   Settings.eval_games, Settings.summary_writer)
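# Entry-point guard; a standard addition assumed here (the original file may
# define it elsewhere), so importing this module does not start training.
if __name__ == "__main__":
    main()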