Example #1
    def __init__(self, game: GameInterface):
        super().__init__()
        self.game = game
        # One actor-critic head per parallel model, sized to the game's feature/action spaces
        self.actor_critics = torch.nn.ModuleList([
            ActorCritic(game.feature_dim(), game.action_dim())
            for _ in range(NUM_PARALLEL_MODELS)
        ])
        self.learning_rate = 0.01
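The constructor above only stores the parallel heads. As a rough sketch of how such a ModuleList might be consumed (assuming each ActorCritic returns an (action_logits, value) pair, which these examples do not show):

import torch

def mean_forward(actor_critics: torch.nn.ModuleList, features: torch.Tensor):
    # Hypothetical helper: average the outputs of the parallel actor-critic heads.
    outputs = [ac(features) for ac in actor_critics]
    logits = torch.stack([o[0] for o in outputs]).mean(dim=0)
    values = torch.stack([o[1] for o in outputs]).mean(dim=0)
    return logits, values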
Example #2
def start_traverse(
    game: GameInterface,
    player_to_train: int,
    regretModels: List[Optional[RegretMatching]],
    strategyModels: List[Optional[RegretMatching]],
) -> Tuple[int, ExpandableTensorSet, ExpandableTensorSet, Counter]:
    lowpriority()  # run this worker at reduced OS priority
    NUM_INNER_GAME_ITERATIONS = 100
    with torch.no_grad():
        # Growable sample buffers; each row holds (features, one-hot action mask, per-action target)
        playerRegret = ExpandableTensorSet(
            16 * 1024,
            (game.feature_dim(), game.action_dim(), game.action_dim()))
        strategyData = ExpandableTensorSet(
            16 * 1024,
            (game.feature_dim(), game.action_dim(), game.action_dim()))
        metrics: Counter = Counter()
        for _ in range(NUM_INNER_GAME_ITERATIONS):
            ng = game.clone()
            ng.reset()
            with Profiler(False):
                traverse(
                    ng,
                    player_to_train,
                    regretModels,
                    playerRegret,
                    strategyModels,
                    strategyData,
                    metrics,
                    0,
                    True,
                    1,
                )
        # print(metrics)

    return player_to_train, playerRegret, strategyData, metrics
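A sketch of how start_traverse might be fanned out across worker processes with the standard library's Pool; the driver function and its arguments are illustrative, and the game and model objects must be picklable for this to work:

from multiprocessing import Pool

def run_traversals(game, num_players, regretModels, strategyModels):
    # Hypothetical driver: one traversal batch per player, executed in parallel workers.
    jobs = [(game, p, regretModels, strategyModels) for p in range(num_players)]
    with Pool() as pool:
        results = pool.starmap(start_traverse, jobs)
    for player, playerRegret, strategyData, metrics in results:
        print("player", player, dict(metrics))
    return results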
Example #3
    def __init__(self, game: GameInterface, max_games: int, policy_networks, pool):
        super().__init__()
        self.on_game = 0
        self.game = game.clone()
        self.policy_networks = policy_networks
        self.max_games = max_games
        self.pool = pool
        self.results = []
        self.futures = []
        self.on_iter = 0

        # Spin up some games
        for _ in range(NUM_PARALLEL_GAMES):
            self.start_game()
Example #4
def train(iterations: int, game: GameInterface, output_file: str):
    NUM_GAME_ITERATIONS = 1000

    model = MeanActorCritic(game.feature_dim(), game.action_dim())

    # Not defined in the original snippet: the regret/strategy buffers consumed below.
    # They are assumed to follow the ExpandableTensorSet layout used by start_traverse.
    playerRegrets = [
        ExpandableTensorSet(
            16 * 1024,
            (game.feature_dim(), game.action_dim(), game.action_dim()))
        for _ in range(game.num_players)
    ]
    strategyData = ExpandableTensorSet(
        16 * 1024,
        (game.feature_dim(), game.action_dim(), game.action_dim()))

    with Pool(os.cpu_count()) as gamePool:
        for iteration in range(iterations):
            print("ON ITERATION", iteration)
            if iteration > 0:
                policy = model.get_policy()
            else:
                policy = None

            with torch.no_grad():
                starts = []
                for player_to_train in range(game.num_players):
                    for game_iteration in range(NUM_GAME_ITERATIONS):
                        # print("Queueing Game", player_to_train, game_iteration)
                        new_game = game.clone()
                        new_game.reset()
                        starts.append((new_game, policy))

                if True:  # debug toggle: set to False to run the traversals serially in-process
                    results = gamePool.starmap(start_traverse, starts)
                else:
                    results = []
                    for start in starts:
                        results.append(start_traverse(*start))
                print("Finished playing games")

                metrics: Counter = Counter()
                for result in results:
                    (
                        player_to_train,
                        new_player_regret,
                        new_strategy_data,
                        new_metrics,
                    ) = result
                    playerRegrets[player_to_train].cat(new_player_regret)
                    strategyData.cat(new_strategy_data)
                    metrics.update(new_metrics)
                print(metrics)

            if (iteration + 1) % 1 == 0:  # evaluation/checkpoint cadence (currently every iteration)
                stratModel = RegretMatching(game.feature_dim(),
                                            game.action_dim())
                features, active_labels, labels = strategyData.getFilled()
                stratModel.train_model(features, active_labels, labels,
                                       "strategy", None)
                bestStrategy = RegretMatchingForward(stratModel)
                # print(
                #     "Learned Strategy at " + str(iteration) + ": ", str(bestStrategy),
                # )
                # print("***")
                torch.save(bestStrategy, output_file)

                with torch.no_grad():
                    # Check winrate against random player
                    scoreCounter: Counter = Counter()
                    NUM_RANDOM_GAMES = 1000
                    num_decisions = 0
                    average_decision = torch.zeros((game.action_dim(), ),
                                                   dtype=torch.float)
                    for on_game in range(NUM_RANDOM_GAMES):
                        gameState = game.clone()
                        gameState.reset()
                        features = torch.zeros((1, gameState.feature_dim()),
                                               dtype=torch.float)
                        while (not gameState.terminal()
                               ):  # gameState.phase != GamePhase.GAME_OVER:
                            seatToAct = gameState.get_player_to_act()
                            if seatToAct == 0:
                                possible_action_mask = gameState.get_one_hot_actions(
                                    True)
                                gameState.populate_features(features[0])
                                action_probs = (
                                    bestStrategy(features).detach()[0].clamp(
                                        min=1e-6))
                            else:
                                possible_action_mask = gameState.get_one_hot_actions(
                                    True)
                                action_probs = possible_action_mask.float()
                            action_prob_dist = torch.distributions.Categorical(
                                action_probs * possible_action_mask)
                            if on_game == 0:
                                print("ACTION", action_prob_dist.probs)
                            action_index = int(
                                action_prob_dist.sample().item())
                            average_decision[action_index] += 1.0
                            num_decisions += 1
                            gameState.act(seatToAct, action_index)
                        payoffs = gameState.payoffs()
                        for i, p in enumerate(payoffs):
                            scoreCounter[str(i)] += p
                    print("DECISION HISTOGRAM")
                    print(average_decision / num_decisions)
                    print("SCORE AGAINST RANDOM")
                    for x in range(gameState.num_players):
                        print(x,
                              scoreCounter[str(x)] / float(NUM_RANDOM_GAMES))

    stratModel = RegretMatching(game.feature_dim(), game.action_dim())
    stratModel.train_model(*strategyData.getFilled())
    return stratModel
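A minimal usage sketch for train above; the concrete game class is a placeholder, since no GameInterface implementation appears in these examples:

if __name__ == "__main__":
    game = MyCardGame()  # placeholder: any GameInterface implementation
    train(iterations=10, game=game, output_file="best_strategy.pt")
    # Each evaluation round overwrites output_file with a RegretMatchingForward module.
    best_strategy = torch.load("best_strategy.pt")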
Example #5
def traverse(
    game: GameInterface,
    player_to_train: int,
    regretModels: List[Optional[FullyConnectedForward]],
    playerRegret: ExpandableTensorSet,
    strategyModels: List[Optional[FullyConnectedForward]],
    strategyData: ExpandableTensorSet,
    metrics: Counter,
    level: int,
    first_pass: bool,
    branch_factor_estimate: float,
) -> torch.Tensor:
    if game.terminal():
        return game.payoffs()

    features = torch.zeros((game.feature_dim(), ), dtype=torch.float)
    game.populate_features(features)

    player_to_act = game.get_player_to_act()
    model = regretModels[player_to_act]
    possible_actions = game.get_one_hot_actions(True)
    num_choices = possible_actions.sum()
    # Running average of the branching factor along the path to this node
    branch_factor_estimate = float((branch_factor_estimate * level) +
                                   (num_choices)) / (level + 1.0)
    metrics.update({"possible_actions_" + str(possible_actions.sum()): 1})
    has_a_choice = num_choices > 1
    if model is None:
        # No trained regret model yet: sample uniformly over the legal actions
        strategy = possible_actions.float()
        active_sampling_chances = None
    else:
        assert strategyModels[player_to_act] is not None
        model_regrets = model.forward_cache(features.unsqueeze(0))[0]
        model_probs = model_regrets.clamp(min=1e-3) * possible_actions.float()
        strategy = model_probs
        active_sampling_chances = (
            strategyModels[player_to_act]  # type: ignore
            .forward(features.unsqueeze(0))[0].clamp(min=1e-3) *
            possible_actions.float())
        active_sampling_chances_sum = float(
            active_sampling_chances.sum().item())

    action_dist = torch.distributions.Categorical(strategy)
    if action_dist.probs.min() < 0 or action_dist.probs.max() == 0:
        print("Invalid action dist:", action_dist.probs)
    if strategy.min() < 0 or strategy.max() == 0:
        print("Invalid strategy:", strategy)

    # Always expand the first two levels; deeper nodes are sampled with a probability
    # that decays with depth and with the estimated branching factor.
    chance_to_sample = (1.0 if level < 2 else 1.0 -
                        (1.0 / (100.0**(1.0 /
                                        ((level)**branch_factor_estimate)))))
    do_sample = random.random() < chance_to_sample

    if has_a_choice and first_pass:
        strategyData.append((
            features.unsqueeze(0),
            possible_actions.unsqueeze(0),
            action_dist.probs.unsqueeze(0),
        ))

    metrics.update({"visit_level_" + str(level): 1})
    metrics["visit"] += 1
    if metrics["visit"] % 100000 == 0:
        print("Visits", metrics["visit"])

    can_traverse = player_to_train == player_to_act
    if can_traverse and has_a_choice and do_sample:
        # print("PASSED",level,chance_to_sample)
        metrics.update({"sample_level_" + str(level): 1})
        metrics["sample"] += 1
        if metrics["sample_level_" + str(level)] % 10000 == 0:
            print(
                "Samples",
                metrics["sample"],
                metrics["sample_level_" + str(level)],
                level,
                chance_to_sample,
            )

        payoff_for_action = torch.zeros(
            (possible_actions.size()[0], game.num_players), dtype=torch.float)
        chosen_actions = torch.zeros_like(possible_actions)
        enum_actions = list(enumerate(possible_actions))
        random.shuffle(enum_actions)
        num_chosen = 0
        for i, a in enum_actions:
            if a == 0:
                continue
            g = game.clone()
            g.act(player_to_act, i)

            # Active sampling: https://papers.nips.cc/paper/4569-efficient-monte-carlo-counterfactual-regret-minimization-in-games-with-many-player-actions.pdf
            EPSILON = 0.05
            BONUS = 1e-6
            THRESHOLD = 1
            if active_sampling_chances is None:
                # Do Outcome sampling for the first iteration
                as_pass = num_chosen == 0
            else:
                as_pass = random.random() < float(
                    ((BONUS + THRESHOLD * active_sampling_chances[i]) /
                     (BONUS + active_sampling_chances_sum)).item())
            if level == 0:
                # Do external sampling for the game tree root
                as_pass = True
            # NOTE: the leading `True` below disables the epsilon/active-sampling gate,
            # so every legal action is currently traversed.
            if True or i == 0 or random.random() < EPSILON or as_pass:
                value = traverse(
                    g,
                    player_to_train,
                    regretModels,
                    playerRegret,
                    strategyModels,
                    strategyData,
                    metrics,
                    level + 1,
                    first_pass and num_chosen == 0,
                    branch_factor_estimate,
                )
                payoff_for_action[i] = value
                chosen_actions[i] = 1.0
                num_chosen += 1
        weighted_action_dist = torch.distributions.Categorical(
            action_dist.probs * chosen_actions.float())
        assert payoff_for_action.size()[0] == weighted_action_dist.probs.size(
        )[0]
        expected_utility = payoff_for_action * weighted_action_dist.probs.unsqueeze(
            1)
        assert expected_utility.size() == payoff_for_action.size()
        expected_utility_over_all_actions = expected_utility.sum(dim=0)
        playerRegret.append((
            features.unsqueeze(0),
            chosen_actions.unsqueeze(0),
            (payoff_for_action[:, player_to_act] -
             expected_utility_over_all_actions[player_to_act]).unsqueeze(0),
        ))
        assert expected_utility_over_all_actions.size() == (
            game.num_players, ), str(expected_utility_over_all_actions.size())
        return expected_utility_over_all_actions
    else:
        game.act(player_to_act, int(action_dist.sample().item()))
        return traverse(
            game,
            player_to_train,
            regretModels,
            playerRegret,
            strategyModels,
            strategyData,
            metrics,
            level + int(can_traverse),
            first_pass,
            branch_factor_estimate,
        )
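The per-action gate above follows the active-sampling rule from the paper linked in the snippet; restated here as a small helper for clarity (the constant names mirror the snippet, and folding the epsilon floor into one function is an assumption about intent):

import random
import torch

def should_sample_action(i: int, chances: torch.Tensor,
                         epsilon: float = 0.05, bonus: float = 1e-6,
                         threshold: float = 1.0) -> bool:
    # Explore an action with probability proportional to its predicted strategy weight,
    # but never let the exploration probability drop below the epsilon floor.
    p = (bonus + threshold * chances[i]) / (bonus + chances.sum())
    return random.random() < epsilon or random.random() < float(p.item())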
Example #6
def traverse(
    game: GameInterface, policy_network, metrics: Counter, level: int,
) -> GameRollout:
    if game.terminal():
        gr = GameRollout(
            torch.zeros((level, game.feature_dim()), dtype=torch.float),  # States
            torch.zeros((level, 1), dtype=torch.long),  # Actions
            torch.zeros(
                (level, game.action_dim()), dtype=torch.long
            ),  # Possible Actions
            torch.zeros((level, 1), dtype=torch.long),  # Player to act
            game.payoffs().float().repeat((level, 1)),  # Payoffs
            torch.arange(level - 1, -1, -1, dtype=torch.float).unsqueeze(
                1
            ),  # Distance to payoff
            torch.zeros((level, game.action_dim()), dtype=torch.float),  # Policy
        )
        return gr

    features = torch.zeros((game.feature_dim(),), dtype=torch.float)
    game.populate_features(features)
    player_to_act = game.get_player_to_act()
    possible_actions = game.get_one_hot_actions(True)
    num_choices = possible_actions.sum()
    metrics.update({"possible_actions_" + str(possible_actions.sum()): 1})
    has_a_choice = num_choices > 1
    if policy_network is None or not has_a_choice:
        strategy = possible_actions.float()
        active_sampling_chances = None
    else:
        strategy = policy_network(features.unsqueeze(0), possible_actions.unsqueeze(0))[
            0
        ][0]
        assert (strategy * (1 - possible_actions)).sum() == 0

    action_dist = torch.distributions.Categorical(strategy)
    if action_dist.probs.min() < 0 or action_dist.probs.max() == 0:
        print("Invalid action dist:", action_dist.probs)
    if strategy.min() < 0 or strategy.max() == 0:
        print("Invalid strategy:", strategy)

    metrics.update({"visit_level_" + str(level): 1})
    metrics["visit"] += 1
    if metrics["visit"] % 100000 == 0:
        print("Visits", metrics["visit"])

    action_taken = int(action_dist.sample().item())
    game.act(player_to_act, action_taken)
    if has_a_choice:
        result = traverse(game, policy_network, metrics, level + 1)
        # Record this decision point in the rollout buffers
        result.states[level] = features
        result.actions[level] = action_taken
        result.player_to_act[level] = player_to_act
        result.possible_actions[level] = possible_actions
        result.policy[level] = strategy
    else:
        # Don't advance the level, skip this non-choice
        result = traverse(game, policy_network, metrics, level)
    return result
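A sketch of driving this rollout-collecting traverse and reading back the trajectory; the game object is a placeholder, and passing None for the policy network falls back to uniform play over legal actions, as in the snippet:

from collections import Counter

metrics: Counter = Counter()
g = game.clone()  # placeholder: any GameInterface instance
g.reset()
rollout = traverse(g, None, metrics, 0)
# Rows are aligned across fields: one row per decision actually taken.
print(rollout.states.shape, rollout.actions.shape, rollout.payoffs.shape)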
Example #7
    def __init__(self, game: GameInterface, max_games: int, policy_networks):
        self.max_games = max_games
        self.game = game.clone()
        self.policy_networks = policy_networks
        # One worker process per concurrently running game
        self.pool = multiprocessing.Pool(NUM_PARALLEL_GAMES)
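Neither this snippet nor Example #3 shows the start_game helper that Example #3 calls. A hedged sketch of what it might look like, assuming the bookkeeping fields from Example #3 (on_game, futures, max_games) and the rollout traverse from Example #6:

    def start_game(self):
        # Hypothetical: hand a fresh, reset copy of the game to a worker process.
        if self.on_game >= self.max_games:
            return
        g = self.game.clone()
        g.reset()
        self.futures.append(
            self.pool.apply_async(traverse, (g, self.policy_networks, Counter(), 0)))
        self.on_game += 1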