Example #1
def get_player_memories(self, ids=None):
    """
    Summarizes the memories (states, actions, rewards, ...) of the selected players.

    :param ids: the ids of the players whose memories to combine (defaults to all four)
    :type ids: list
    :return: the combined memories of the selected players
    :rtype: Memory
    """
    memory = Memory()
    if ids is None:
        ids = range(4)
    for i in ids:
        memory.append_memory(self.players[i].memory)
    return memory
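
For illustration, a minimal sketch of how this method might be called. The enclosing class is not shown in the source, so the Trainer name and how it is constructed are assumptions, not part of the original code:

# hypothetical caller; only get_player_memories() itself comes from the source
trainer = Trainer()                                      # assumed to hold self.players, a list of 4 players
all_memories = trainer.get_player_memories()             # combine the memories of all four players
some_memories = trainer.get_player_memories(ids=[0, 2])  # only players 0 and 2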
Example #2
# Imports needed by this snippet; project-local modules (Settings, PPO, Memory,
# RlPlayer, SchafkopfEnv, GameStatistics, the baseline players and
# play_against_other_players) are assumed to be importable from the project.
import time
from os import listdir

import torch
from tensorboard import program


def main():

  print("CUDA available: " + str(torch.cuda.is_available()))

  # start TensorBoard and point it at the configured run folder
  tb = program.TensorBoard()
  tb.configure(argv=[None, '--logdir', Settings.runs_folder])
  tb.launch()

  # set seed for debugging
  if Settings.random_seed:
    torch.manual_seed(Settings.random_seed)

  # load the initial policy
  policy = Settings.model().to(Settings.device)

  # resume from the newest generation available; checkpoint files are named
  # by zero-padded episode number, e.g. 00001234.pt
  i_episode = max_gen = 0
  generations = [int(f[:8]) for f in listdir(Settings.checkpoint_folder) if f.endswith(".pt")]
  if len(generations) > 0:
    max_gen = max(generations)
    policy.load_state_dict(torch.load(Settings.checkpoint_folder + "/" + str(max_gen).zfill(8) + ".pt"))
    i_episode = max_gen

  # create the PPO trainer
  ppo = PPO(policy, [Settings.lr, Settings.lr_stepsize, Settings.lr_gamma], Settings.betas,
            Settings.gamma, Settings.K_epochs, Settings.eps_clip,
            Settings.batch_size, Settings.mini_batch_size,
            c1=Settings.c1, c2=Settings.c2, start_episode=max_gen - 1)

  # create four players that all act with the current (old) policy
  players = [RlPlayer(ppo.policy_old) for _ in range(4)]
  #create a game simulation
  schafkopf_env = SchafkopfEnv(Settings.random_seed)
  game_statistics = GameStatistics()

  # training loop
  for _ in range(90000000):
    Settings.logger.info("playing " + str(Settings.update_games) + " games")

    # play a bunch of games
    t0 = time.time()
    for _ in range(Settings.update_games):
      state, reward, terminal = schafkopf_env.reset()
      while not terminal:
        action, prob = players[state["game_state"].current_player].act(state)
        state, reward, terminal = schafkopf_env.step(action, prob)
      for p in range(4):
        players[p].retrieve_reward(reward[p])
      i_episode += 1
      game_statistics.update_statistics(state["game_state"], reward)
    t1 = time.time()

    # update the policy
    Settings.logger.info("updating policy")

    # gather the transitions collected by the four players into one memory
    player_memories = Memory()
    for p in players:
      player_memories.append_memory(p.memory)

    ppo.update(player_memories, i_episode)
    t2 = time.time()
    ppo.lr_scheduler.step(i_episode)

    # write game statistics for TensorBoard
    Settings.logger.info("Episode: " + str(i_episode) + " game simulation (s) = " + str(t1 - t0) + " update (s) = " + str(t2 - t1))
    schafkopf_env.print_game()
    game_statistics.write_and_reset(i_episode)

    # reset the memories by recreating the players with the freshly updated policy
    players = [RlPlayer(ppo.policy_old) for _ in range(4)]

    # save and evaluate the policy
    Settings.logger.info("Saving Checkpoint")
    torch.save(ppo.policy_old.state_dict(), Settings.checkpoint_folder + "/" + str(i_episode).zfill(8) + ".pt")
    Settings.logger.info("Evaluation")
    play_against_other_players(Settings.checkpoint_folder, Settings.model, [RandomPlayer, RandomCowardPlayer, RuleBasedPlayer], Settings.eval_games,
                               Settings.summary_writer)
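
The source snippet does not show how main() is invoked; if it is run as a script, a conventional entry-point guard (not part of the original code) would be:

if __name__ == "__main__":
  main()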