Example #1
    def simulation(self, selected_node):

        schafkopf_env = SchafkopfEnv()

        #state, reward, terminal = schafkopf_env.set_state(deepcopy(selected_node.game_state), deepcopy(selected_node.player_hands))
        state, reward, terminal = schafkopf_env.set_state(
            deepcopy(selected_node.game_state),
            [copy(selected_node.player_hands[i]) for i in range(4)])
        while not terminal:
            action, _ = self.player.act(state)
            state, reward, terminal = schafkopf_env.step(action)

        return reward
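Example #1 relies on the same interaction contract that the remaining snippets use: reset() or set_state() yields a (state, reward, terminal) triple, and step() is called with the current player's action until terminal becomes true. A minimal stand-alone sketch of that loop, assuming RandomPlayer exposes the act(state) method used above; the import paths are assumptions and may differ in the actual repository:

# Minimal rollout sketch; the import paths below are assumptions.
from schafkopf_env import SchafkopfEnv
from players.random_player import RandomPlayer

env = SchafkopfEnv(seed=1)
players = [RandomPlayer() for _ in range(4)]

state, reward, terminal = env.reset()
while not terminal:
    # the acting player is read from the public game state, as in the examples
    action, _ = players[state["game_state"].current_player].act(state)
    state, reward, terminal = env.step(action)

print(reward)  # list of per-player rewards for the finished game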
Example #2
def play_against_other_players(checkpoint_folder, model_class, other_player_classes, runs, summary_writer):

  generations = [int(f[:8]) for f in listdir(checkpoint_folder) if f.endswith(".pt")]
  max_gen = max(generations)
  policy = model_class()
  policy.to(device=Settings.device)
  policy.load_state_dict(torch.load(checkpoint_folder + "/" + str(max_gen).zfill(8) + ".pt"))

  for other_player_class in other_player_classes:

    players = [other_player_class(), RlPlayer(policy), other_player_class(), RlPlayer(policy)]
    schafkopf_env = SchafkopfEnv(1)

    all_rewards = np.array([0., 0., 0., 0.])
    for j in range(runs):
      state, reward, terminal = schafkopf_env.reset()
      while not terminal:
        action, prob = players[state["game_state"].current_player].act(state)
        state, reward, terminal = schafkopf_env.step(action, prob)

      all_rewards += reward

    all_rewards = all_rewards[[1, 0, 3, 2]]

    players = [RlPlayer(policy), other_player_class(), RlPlayer(policy), other_player_class()]
    schafkopf_env = SchafkopfEnv(1)

    for j in range(runs):
      state, reward, terminal = schafkopf_env.reset()
      while not terminal:
        action, prob = players[state["game_state"].current_player].act(state)
        state, reward, terminal = schafkopf_env.step(action, prob)

      all_rewards += reward

    summary_writer.add_scalar('Evaluation/' + str(other_player_class.__name__),
                              (all_rewards[0] + all_rewards[2]) / (4 * runs), max_gen)
Example #3
    def expand(self, node):
        not_visited_actions = copy(node.allowed_actions)
        for child in node.children:
            not_visited_actions.remove(child.previous_action)

        #TODO: check if this should be random or chosen by player policy
        chosen_action = random.choice(tuple(not_visited_actions))

        schafkopf_env = SchafkopfEnv()
        schafkopf_env.set_state(deepcopy(node.game_state),
                                [copy(node.player_hands[i]) for i in range(4)])
        state, _, terminal = schafkopf_env.step(chosen_action)

        new_node = Node(parent=node,
                        game_state=state["game_state"],
                        previous_action=chosen_action,
                        player_hands=schafkopf_env.player_cards,
                        allowed_actions=state["allowed_actions"])
        node.add_child(child_node=new_node)
        return new_node
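Taken together, expand() (this example) and simulation() (Example #1) cover two of the four phases of a standard Monte Carlo tree search iteration. A hedged sketch of how they could be wired together; select_child, is_fully_expanded and backpropagate are hypothetical names standing in for the parts of the tree search not shown in these examples:

    def mcts_iteration(self, root):
        # 1. Selection: descend while the node has children and no unexpanded actions
        node = root
        while node.children and self.is_fully_expanded(node):   # hypothetical helper
            node = self.select_child(node)                       # hypothetical UCT selection

        # 2. Expansion: attach one unvisited action as a new child (this example)
        if not self.is_fully_expanded(node):
            node = self.expand(node)

        # 3. Simulation: roll the game out to the end with the rollout policy (Example #1)
        reward = self.simulation(node)

        # 4. Backpropagation: propagate the per-player rewards back up the tree
        self.backpropagate(node, reward)                          # hypothetical helper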
Example #4
def main():

    pimc_player = PIMCPlayer(10, 40, RandomPlayer())

    policy = ActorCriticNetworkLSTM().to(Settings.device)
    policy.load_state_dict(torch.load("../policies/pretrained/lstm-policy.pt"))
    rl_player = RlPlayer(policy, action_shaping=False, eval=True)

    hp = HandPredictor().to(Settings.device)
    hp.load_state_dict(torch.load("../policies/pretrained/hand-predictor.pt"))
    # pass the hand predictor with the loaded weights, not a freshly initialized one
    smart_pimc_player = HPPIMCPlayer(10, 40, RandomPlayer(), hp)

    ip = ImmitationPolicy().to(Settings.device)
    ip.load_state_dict(torch.load("../policies/00010340.pt"))
    immitation_player = RlPlayer(ip, action_shaping=False, eval=True)

    participants = [
        rl_player,
        immitation_player,
        smart_pimc_player,
        pimc_player,
        RuleBasedPlayer(),
        RandomCowardPlayer(),
        RandomPlayer(),
    ]

    number_of_games = 1000

    for i in range(len(participants)):
        for j in range(i + 1, len(participants)):
            p1 = participants[i]
            p2 = participants[j]

            cummulative_reward = [0, 0, 0, 0]
            for k in range(2):  # run the same tournament twice with different positions of players
                print(' ')
                schafkopf_env = SchafkopfEnv(seed=1)
                if k == 0:
                    players = [p1, p1, p2, p2]
                else:
                    players = [p2, p2, p1, p1]
                    cummulative_reward.reverse()

                # tournament loop
                for game_nr in range(1, number_of_games + 1):
                    state, reward, terminal = schafkopf_env.reset()
                    while not terminal:
                        action, prob = players[
                            state["game_state"].current_player].act(state)
                        state, reward, terminal = schafkopf_env.step(
                            action, prob)

                    cummulative_reward = [
                        cummulative_reward[m] + reward[m] for m in range(4)
                    ]

                    if game_nr % 100 == 0:
                        print('.', end='')
                    #schafkopf_env.print_game()

            print("player " + str(i) + " vs. player " + str(j) + " = " +
                  str((cummulative_reward[2] + cummulative_reward[3]) /
                      (2 * 2 * number_of_games)) + " to " +
                  str((cummulative_reward[0] + cummulative_reward[1]) /
                      (2 * 2 * number_of_games)))
Example #5
def main():

  print("Cuda available: "+str(torch.cuda.is_available()))

  #start tensorboard
  tb = program.TensorBoard()
  tb.configure(argv=[None, '--logdir', Settings.runs_folder])
  tb.launch()

  # set seed for debugging
  if Settings.random_seed:
      torch.manual_seed(Settings.random_seed)

  #loading initial policy
  policy = Settings.model().to(Settings.device)
  # take the newest generation available
  i_episode = max_gen = 0
  generations = [int(f[:8]) for f in listdir(Settings.checkpoint_folder) if f.endswith(".pt")]
  if len(generations) > 0:
      max_gen = max(generations)
      policy.load_state_dict(torch.load(Settings.checkpoint_folder+"/" + str(max_gen).zfill(8) + ".pt"))
      i_episode = max_gen
  #create ppo
  ppo = PPO(policy, [Settings.lr, Settings.lr_stepsize, Settings.lr_gamma],
            Settings.betas, Settings.gamma, Settings.K_epochs, Settings.eps_clip,
            Settings.batch_size, Settings.mini_batch_size,
            c1=Settings.c1, c2=Settings.c2, start_episode=max_gen - 1)

  #create four players
  players = [RlPlayer(ppo.policy_old), RlPlayer(ppo.policy_old), RlPlayer(ppo.policy_old), RlPlayer(ppo.policy_old)]
  #create a game simulation
  schafkopf_env = SchafkopfEnv(Settings.random_seed)
  game_statistics = GameStatistics()

  # training loop
  for _ in range(0, 90000000):
    Settings.logger.info("playing " +str(Settings.update_games)+ " games")

    # play a bunch of games
    t0 = time.time()
    for _ in range(Settings.update_games):
      state, reward, terminal = schafkopf_env.reset()
      while not terminal:
        action, prob = players[state["game_state"].current_player].act(state)
        state, reward, terminal = schafkopf_env.step(action, prob)
      for p in range(4):
        players[p].retrieve_reward(reward[p])
      i_episode += 1
      game_statistics.update_statistics(state["game_state"], reward)
    t1 = time.time()

    #update the policy
    Settings.logger.info("updating policy")

    player_memories = Memory()
    for p in players:
      player_memories.append_memory(p.memory)

    ppo.update(player_memories, i_episode)
    t2 = time.time()
    ppo.lr_scheduler.step(i_episode)

    # writing game statistics for tensorboard
    Settings.logger.info("Episode: "+str(i_episode) + " game simulation (s) = "+str(t1-t0) + " update (s) = "+str(t2-t1))
    schafkopf_env.print_game()
    game_statistics.write_and_reset(i_episode)

    # reset memories and replace policy
    players = [RlPlayer(ppo.policy_old), RlPlayer(ppo.policy_old), RlPlayer(ppo.policy_old), RlPlayer(ppo.policy_old)]

    # save and evaluate the policy
    Settings.logger.info("Saving Checkpoint")
    torch.save(ppo.policy_old.state_dict(), Settings.checkpoint_folder + "/" + str(i_episode).zfill(8) + ".pt")
    Settings.logger.info("Evaluation")
    play_against_other_players(Settings.checkpoint_folder, Settings.model, [RandomPlayer, RandomCowardPlayer, RuleBasedPlayer], Settings.eval_games,
                               Settings.summary_writer)
Example #6
    def sample_player_hands(self,
                            game_state,
                            ego_player_hand,
                            card_probabilities,
                            remaining_cards,
                            needed_player_cards,
                            only_valid=False):

        valid_card_distribution = False
        player_cards = None

        # loop over random card distributions until we found a valid one
        while not valid_card_distribution:
            # randomly distribute cards so that each player gets as many as he needs
            valid_card_distribution = True
            player_cards = [[], [], [], []]
            player_cards[game_state.current_player] = ego_player_hand
            random.shuffle(remaining_cards)

            card_probs = card_probabilities.clone().detach().cpu()

            for p in range(4):
                index = (p - game_state.current_player - 1) % 4
                if len(player_cards[p]) >= needed_player_cards[p]:
                    card_probs[:, index] = 0

            for card in remaining_cards:
                card_index = card[1] * 4 + card[0]  #self.rules.cards.index(card)

                sample_player = Categorical(card_probs[card_index]).sample()

                player_id = (game_state.current_player + sample_player + 1) % 4
                player_cards[player_id].append(card)

                if len(player_cards[player_id]) == needed_player_cards[player_id]:
                    card_probs[:, sample_player] = 0

            #from_card = 0
            #for i, nededed_cards in enumerate(needed_player_cards):
            #  if i == game_state.current_player:
            #    continue
            #  player_cards[i] = remaining_cards[from_card:from_card + nededed_cards]
            #  from_card += nededed_cards
            if not only_valid:
                break

            # check if with the current card distribution every made move was valid

            schafkopf_env = SchafkopfEnv()
            simulation_player_cards = [
                player_hand.copy() for player_hand in player_cards
            ]
            for i in range(4):
                simulation_player_cards[i] += [
                    game_state.course_of_game_playerwise[trick][i]
                    for trick in range(8)
                    if game_state.course_of_game_playerwise[trick][i] != [None, None]
                ]

            state, _, _ = schafkopf_env.set_state(
                PublicGameState(game_state.dealer), simulation_player_cards)

            while True:
                eval_game_state, allowed_actions = state["game_state"], state[
                    "allowed_actions"]

                if eval_game_state.game_stage == Rules.BIDDING:
                    action = game_state.bidding_round[
                        eval_game_state.current_player]
                    if action == None:
                        break
                    elif action not in allowed_actions:
                        valid_card_distribution = False
                        break
                elif eval_game_state.game_stage == Rules.CONTRA:
                    action = game_state.contra[eval_game_state.current_player]
                    if action == None:
                        break
                    elif action not in allowed_actions:
                        valid_card_distribution = False
                        break
                elif eval_game_state.game_stage == Rules.RETOUR:
                    action = game_state.retour[eval_game_state.current_player]
                    if action == None:
                        break
                    elif action not in allowed_actions:
                        valid_card_distribution = False
                        break
                else:
                    action = game_state.course_of_game_playerwise[
                        eval_game_state.trick_number][
                            eval_game_state.current_player]
                    if action == [None, None]:
                        break
                    elif action not in allowed_actions:
                        valid_card_distribution = False
                        break
                state, _, _ = schafkopf_env.step(action)
        return player_cards
Example #7
def main():

  print("Cuda available: "+str(torch.cuda.is_available()))

  #start tensorboard
  tb = program.TensorBoard()
  tb.configure(argv=[None, '--logdir', Settings.runs_folder])
  tb.launch()

  # set seed for debugging
  if Settings.random_seed:
      torch.manual_seed(Settings.random_seed)

  #loading initial policy
  hand_predictor = HandPredictor().to(Settings.device)
  # take the newest generation available
  i_episode = max_gen = 0
  generations = [int(f[:8]) for f in listdir(Settings.checkpoint_folder) if f.endswith(".pt")]
  if len(generations) > 0:
      max_gen = max(generations)
      hand_predictor.load_state_dict(torch.load(Settings.checkpoint_folder+"/" + str(max_gen).zfill(8) + ".pt"))
      i_episode = max_gen

  optimizer = torch.optim.Adam(hand_predictor.parameters(), lr=Settings.lr,
                               betas=Settings.betas, weight_decay=Settings.optimizer_weight_decay)

  # training loop
  for _ in range(0, 90000000):
    Settings.logger.info("playing " +str(Settings.update_games)+ " games")

    smart_mcts_player = HPPIMCPlayer(30, 120, RandomPlayer(), hand_predictor)
    # create four players
    players = [smart_mcts_player, smart_mcts_player, smart_mcts_player, smart_mcts_player]
    # create a game simulation
    schafkopf_env = SchafkopfEnv(Settings.random_seed)
    game_statistics = GameStatistics()


    memory_states = []
    memory_player_hands = []

    # play a bunch of games
    t0 = time.time()
    for _ in range(Settings.update_games):
      state, reward, terminal = schafkopf_env.reset()

      while not terminal:
        memory_states.append(hand_predictor.preprocess(state)) #TODO: happens twice now and could be optimized
        memory_player_hands.append(hand_predictor.encode_player_hands(schafkopf_env.player_cards, state["game_state"].current_player))

        action, prob = players[state["game_state"].current_player].act(state)
        state, reward, terminal = schafkopf_env.step(action, prob)

        if state["game_state"].game_type[1] == 2:
          schafkopf_env.print_game()

      print("game "+str(i_episode))
      i_episode += 1
      game_statistics.update_statistics(state["game_state"], reward)
    t1 = time.time()

    #update the policy
    Settings.logger.info("updating policy")
    # Create dataset from collected experiences
    dataset = PredictionDatasetLSTM(memory_states, memory_player_hands)
    training_generator = data.DataLoader(dataset, collate_fn=dataset.custom_collate,
                                         batch_size=Settings.mini_batch_size, shuffle=True)

    #logging
    avg_loss = 0
    count = 0

    hand_predictor.train()
    for epoch in range(Settings.K_epochs):  # epoch

      mini_batches_in_batch = int(Settings.batch_size / Settings.mini_batch_size)
      optimizer.zero_grad()

      for i, (states, hands) in enumerate(training_generator):  # mini batch
        # Transfer to GPU
        states = [state.to(Settings.device) for state in states]
        hands = hands.to(Settings.device)
        pred = hand_predictor(states)
        #loss = nn.MSELoss()(pred, hands) #TODO: replace by cross entropy
        loss = nn.BCELoss()(pred, hands)

        avg_loss += loss.mean().item()
        count +=1

        loss.mean().backward()

        if (i + 1) % mini_batches_in_batch == 0:
          optimizer.step()
          optimizer.zero_grad()
    t2 = time.time()
    hand_predictor.eval()

    # writing game statistics for tensorboard
    Settings.logger.info("Episode: "+str(i_episode) + " game simulation (s) = "+str(t1-t0) + " update (s) = "+str(t2-t1))
    schafkopf_env.print_game()
    game_statistics.write_and_reset(i_episode)
    Settings.summary_writer.add_scalar('Loss/BCE_Loss', avg_loss / count, i_episode)

    # save and evaluate the policy
    Settings.logger.info("Saving Checkpoint")
    torch.save(hand_predictor.state_dict(), Settings.checkpoint_folder + "/" + str(i_episode).zfill(8) + ".pt")
    Settings.logger.info("Evaluation")
Example #8
def get_states_actions(game_transcript, policy):

  states = []
  actions = []

  schafkopf_env = SchafkopfEnv()

  state, _, _ = schafkopf_env.set_state(PublicGameState(3), [game_transcript.player_hands[i] for i in range(4)])
  states.append(policy.preprocess(state))

  #Bidding stage

  game_player = None
  game_type = None

  if len(game_transcript.bidding_round) != 4:  # not all said weiter
    player_bidding = None
    for i in range(1, 5):
      if "Vortritt" not in game_transcript.bidding_round[-i]:
        player_bidding = game_transcript.bidding_round[-i]
        break

    if player_bidding.startswith("Ex-Sauspieler"):
      game_player = game_transcript.player_dict[player_bidding.split(" ")[0] + " " + player_bidding.split(" ")[1]]
    else:
      game_player = game_transcript.player_dict[player_bidding.split(" ")[0]]

    player_bidding = player_bidding.split(' ', 1)[1] #remove player name in case it contains one of the following words
    if "Hundsgfickte" in player_bidding:
      game_type = [0, 0]
    elif "Blaue" in player_bidding:
      game_type = [2, 0]
    elif "Alte" in player_bidding:
      game_type = [3, 0]
    elif "Schelle" in player_bidding:
      game_type = [0, 2]
    elif "Herz" in player_bidding:
      game_type = [1, 2]
    elif "Gras" in player_bidding:
      game_type = [2, 2]
    elif "Eichel" in player_bidding:
      game_type = [3, 2]
    elif "Wenz" in player_bidding:
      game_type = [None, 1]


  for i in range(4):
    action = [None, None]
    if i == game_player:
      action = game_type
    actions.append(preprocess_action(Rules.BIDDING, action))
    state, _, _ = schafkopf_env.step(action)
    if not (len(game_transcript.bidding_round) == 4 and i == 3): #don't take the last state of the game into the dataset
      states.append(policy.preprocess(state))

  if len(game_transcript.bidding_round) != 4: # if not all said weiter

    con_ret = [game_transcript.player_dict[p] for p in game_transcript.kontra]

    #CONTRA stage
    for i in range(4):
      action = False
      if len(con_ret) > 0 and i == con_ret[0]:
        action = True
      actions.append(preprocess_action(Rules.CONTRA, action))
      state, _, _ = schafkopf_env.step(action)
      states.append(policy.preprocess(state))

    # RETOUR stage
    if len(con_ret) > 0:
      for i in range(4):
        action = False
        if len(con_ret) == 2 and i == con_ret[1]:
          action = True
        actions.append(preprocess_action(Rules.RETOUR, action))
        state, _, _ = schafkopf_env.step(action)
        states.append(policy.preprocess(state))

    # TRICK stage

    for trick in range(8):
      for c in range(4):
        action = game_transcript.course_of_game[trick][c]
        actions.append(preprocess_action(Rules.TRICK, action))
        state, _, _ = schafkopf_env.step(action)
        if not (trick == 7 and c == 3): # all but the last state
          states.append(policy.preprocess(state))

  return states, actions
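The (states, actions) pairs produced here can be used for behavioral cloning, as with the ImmitationPolicy checkpoint loaded in Example #4. A minimal sketch of one supervised update, assuming the policy returns unnormalized action logits for a preprocessed state and that preprocess_action yields an integer class index (neither is shown in the snippet, so both are assumptions):

import torch
import torch.nn as nn

def imitation_update(policy, optimizer, states, actions):
    # assumes policy(state) -> action logits and that actions are integer class indices
    criterion = nn.CrossEntropyLoss()
    optimizer.zero_grad()
    logits = torch.stack([policy(s) for s in states])   # shape (N, num_actions)
    targets = torch.tensor(actions, dtype=torch.long)   # shape (N,)
    loss = criterion(logits, targets)
    loss.backward()
    optimizer.step()
    return loss.item()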
Example #9
    def sample_player_hands(self, game_state, ego_player_hand):

        # precomputations
        played_cards = [
            card for trick in game_state.course_of_game for card in trick
            if card != [None, None]
        ]
        remaining_cards = [
            card for card in self.rules.cards
            if ((card not in played_cards) and (card not in ego_player_hand))
        ]

        needed_player_cards = [8, 8, 8, 8]

        for trick in range(game_state.trick_number + 1):
            for i, card in enumerate(
                    game_state.course_of_game_playerwise[trick]):
                if card != [None, None]:
                    needed_player_cards[i] -= 1

        needed_player_cards[game_state.current_player] = 0

        valid_card_distribution = False
        player_cards = None

        # loop over random card distributions until we found a valid one
        while not valid_card_distribution:

            # randomly distribute cards so that each player gets as many as he needs
            valid_card_distribution = True
            player_cards = [[], [], [], []]
            player_cards[game_state.current_player] = ego_player_hand
            random.shuffle(remaining_cards)

            from_card = 0
            for i, needed_cards in enumerate(needed_player_cards):
                if i == game_state.current_player:
                    continue
                player_cards[i] = remaining_cards[from_card:from_card + needed_cards]
                from_card += needed_cards

            # check if with the current card distribution every made move was valid
            schafkopf_env = SchafkopfEnv()
            state, _, _ = schafkopf_env.set_state(
                PublicGameState(game_state.dealer), player_cards)

            while True:
                eval_game_state, allowed_actions = state["game_state"], state[
                    "allowed_actions"]

                if eval_game_state.game_stage == Rules.BIDDING:
                    action = eval_game_state.bidding_round[
                        eval_game_state.current_player]
                    if action == None:
                        break
                    elif action not in allowed_actions:
                        valid_card_distribution = False
                        break
                elif eval_game_state.game_stage == Rules.CONTRA:
                    action = eval_game_state.contra[
                        eval_game_state.current_player]
                    if action == None:
                        break
                    elif action not in allowed_actions:
                        valid_card_distribution = False
                        break
                elif eval_game_state.game_stage == Rules.RETOUR:
                    action = eval_game_state.retour[
                        eval_game_state.current_player]
                    if action == None:
                        break
                    elif action not in allowed_actions:
                        valid_card_distribution = False
                        break
                else:
                    action = eval_game_state.course_of_game_playerwise[
                        eval_game_state.trick_number][
                            eval_game_state.current_player]
                    if action == [None, None]:
                        break
                    elif action not in allowed_actions:
                        valid_card_distribution = False
                        break
                state, _, _ = schafkopf_env.step(action)

        return player_cards
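sample_player_hands is the determinization step of perfect-information Monte Carlo: each sampled deal is treated as a fully observable game and rolled out. A sketch of how a PIMC-style player might score a candidate action across several sampled worlds, using only the set_state/step API shown above; the function and parameter names (evaluate_action, sample_hands, rollout_player, n_worlds) are illustrative and not taken from the repository:

from copy import deepcopy

def evaluate_action(env_cls, sample_hands, rollout_player, game_state, ego_hand, action, n_worlds=10):
    """Average return of `action` over several determinized worlds (sketch)."""
    total = 0.0
    for _ in range(n_worlds):
        player_hands = sample_hands(game_state, ego_hand)    # determinization (this example)
        env = env_cls()
        state, reward, terminal = env.set_state(deepcopy(game_state), player_hands)
        state, reward, terminal = env.step(action)           # play the candidate action
        while not terminal:                                  # roll out to the end of the game
            a, _ = rollout_player.act(state)
            state, reward, terminal = env.step(a)
        total += reward[game_state.current_player]
    return total / n_worlds

Averaging the ego player's reward over the sampled worlds gives an estimate of the action's value under the current public information; the action with the highest average would then be played.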