def __init__(self, game: GameInterface):
    super().__init__()
    self.game = game
    self.actor_critics = torch.nn.ModuleList([
        ActorCritic(game.feature_dim(), game.action_dim())
        for _ in range(NUM_PARALLEL_MODELS)
    ])
    self.learning_rate = 0.01
def start_traverse(
    game: GameInterface,
    player_to_train: int,
    regretModels: List[Optional[RegretMatching]],
    strategyModels: List[Optional[RegretMatching]],
) -> Tuple[int, ExpandableTensorSet, ExpandableTensorSet, Counter]:
    lowpriority()
    NUM_INNER_GAME_ITERATIONS = 100
    with torch.no_grad():
        playerRegret = ExpandableTensorSet(
            16 * 1024, (game.feature_dim(), game.action_dim(), game.action_dim())
        )
        strategyData = ExpandableTensorSet(
            16 * 1024, (game.feature_dim(), game.action_dim(), game.action_dim())
        )
        metrics: Counter = Counter()
        for _ in range(NUM_INNER_GAME_ITERATIONS):
            ng = game.clone()
            ng.reset()
            with Profiler(False):
                traverse(
                    ng,
                    player_to_train,
                    regretModels,
                    playerRegret,
                    strategyModels,
                    strategyData,
                    metrics,
                    0,
                    True,
                    1,
                )
        # print(metrics)

    return player_to_train, playerRegret, strategyData, metrics
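
# Usage sketch (assumption): farming start_traverse out to worker processes with
# starmap, matching its four-argument signature. Assumes `from multiprocessing
# import Pool` and `import os`; regret_models and strategy_models are per-player
# lists (None entries before any model is trained).
def run_traversals_sketch(game: GameInterface, regret_models, strategy_models):
    starts = [
        (game.clone(), player_to_train, regret_models, strategy_models)
        for player_to_train in range(game.num_players)
    ]
    with Pool(os.cpu_count()) as pool:
        # Each worker returns (player_to_train, playerRegret, strategyData, metrics).
        return pool.starmap(start_traverse, starts)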
def __init__(self, game: GameInterface, max_games: int, policy_networks, pool):
    super().__init__()
    self.on_game = 0
    self.game = game.clone()
    self.policy_networks = policy_networks
    self.max_games = max_games
    self.pool = pool
    self.results = []
    self.futures = []
    self.on_iter = 0

    # Spin up some games
    for _ in range(NUM_PARALLEL_GAMES):
        self.start_game()
def train(iterations: int, game: GameInterface, output_file: str):
    NUM_GAME_ITERATIONS = 1000
    model = MeanActorCritic(game.feature_dim(), game.action_dim())

    # Assumption: accumulators for regret and strategy samples, sized to match
    # the tensors produced in start_traverse.
    playerRegrets = [
        ExpandableTensorSet(
            16 * 1024, (game.feature_dim(), game.action_dim(), game.action_dim())
        )
        for _ in range(game.num_players)
    ]
    strategyData = ExpandableTensorSet(
        16 * 1024, (game.feature_dim(), game.action_dim(), game.action_dim())
    )

    with Pool(os.cpu_count()) as gamePool:
        for iteration in range(iterations):
            print("ON ITERATION", iteration)
            if iteration > 0:
                policy = model.get_policy()
            else:
                policy = None
            with torch.no_grad():
                # Queue up the games to traverse for each player.
                starts = []
                for player_to_train in range(game.num_players):
                    for game_iteration in range(NUM_GAME_ITERATIONS):
                        # print("Queueing Game", player_to_train, game_iteration)
                        new_game = game.clone()
                        new_game.reset()
                        starts.append((new_game, policy))
                if True:
                    # Traverse in parallel across the worker pool.
                    results = gamePool.starmap(start_traverse, starts)
                else:
                    # In-process fallback, useful for debugging.
                    results = []
                    for start in starts:
                        results.append(start_traverse(*start))
                print("Finished playing games")

            metrics: Counter = Counter()
            for result in results:
                (
                    player_to_train,
                    new_player_regret,
                    new_strategy_data,
                    new_metrics,
                ) = result
                playerRegrets[player_to_train].cat(new_player_regret)
                strategyData.cat(new_strategy_data)
                metrics.update(new_metrics)
            print(metrics)

            # Evaluate every iteration (raise the modulus to checkpoint less often).
            if (iteration + 1) % 1 == 0:
                stratModel = RegretMatching(game.feature_dim(), game.action_dim())
                features, active_labels, labels = strategyData.getFilled()
                stratModel.train_model(features, active_labels, labels, "strategy", None)
                bestStrategy = RegretMatchingForward(stratModel)
                # print(
                #     "Learned Strategy at " + str(iteration) + ": ", str(bestStrategy),
                # )
                # print("***")
                torch.save(bestStrategy, output_file)

                with torch.no_grad():
                    # Check winrate against random player
                    scoreCounter: Counter = Counter()
                    NUM_RANDOM_GAMES = 1000
                    num_decisions = 0
                    average_decision = torch.zeros((game.action_dim(),), dtype=torch.float)
                    for on_game in range(NUM_RANDOM_GAMES):
                        gameState = game.clone()
                        gameState.reset()
                        features = torch.zeros((1, gameState.feature_dim()), dtype=torch.float)
                        while not gameState.terminal():  # gameState.phase != GamePhase.GAME_OVER:
                            seatToAct = gameState.get_player_to_act()
                            if seatToAct == 0:
                                # Seat 0 plays the learned strategy.
                                possible_action_mask = gameState.get_one_hot_actions(True)
                                gameState.populate_features(features[0])
                                action_probs = bestStrategy(features).detach()[0].clamp(min=1e-6)
                            else:
                                # All other seats play uniformly at random.
                                possible_action_mask = gameState.get_one_hot_actions(True)
                                action_probs = possible_action_mask.float()
                            action_prob_dist = torch.distributions.Categorical(
                                action_probs * possible_action_mask
                            )
                            if on_game == 0:
                                print("ACTION", action_prob_dist.probs)
                            action_index = int(action_prob_dist.sample().item())
                            average_decision[action_index] += 1.0
                            num_decisions += 1
                            gameState.act(seatToAct, action_index)
                        payoffs = gameState.payoffs()
                        for i, p in enumerate(payoffs):
                            scoreCounter[str(i)] += p
                    print("DECISION HISTOGRAM")
                    print(average_decision / num_decisions)
                    print("SCORE AGAINST RANDOM")
                    for x in range(gameState.num_players):
                        print(x, scoreCounter[str(x)] / float(NUM_RANDOM_GAMES))

    stratModel = RegretMatching(game.feature_dim(), game.action_dim())
    stratModel.train_model(*strategyData.getFilled())
    return stratModel
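
# Usage sketch (assumption): a minimal entry point for train(). "MyGame" is a
# hypothetical GameInterface implementation; substitute any concrete game class
# from this project.
if __name__ == "__main__":
    my_game = MyGame()  # hypothetical concrete GameInterface
    final_strategy_model = train(iterations=10, game=my_game, output_file="strategy.pt")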
def traverse(
    game: GameInterface,
    player_to_train: int,
    regretModels: List[Optional[FullyConnectedForward]],
    playerRegret: ExpandableTensorSet,
    strategyModels: List[Optional[FullyConnectedForward]],
    strategyData: ExpandableTensorSet,
    metrics: Counter,
    level: int,
    first_pass: bool,
    branch_factor_estimate: float,
) -> torch.Tensor:
    if game.terminal():
        return game.payoffs()

    features = torch.zeros((game.feature_dim(),), dtype=torch.float)
    game.populate_features(features)
    player_to_act = game.get_player_to_act()
    model = regretModels[player_to_act]
    possible_actions = game.get_one_hot_actions(True)
    num_choices = possible_actions.sum()
    # Running average of the branching factor along this path.
    branch_factor_estimate = float(
        (branch_factor_estimate * level) + num_choices
    ) / (level + 1.0)
    metrics.update({"possible_actions_" + str(possible_actions.sum()): 1})
    has_a_choice = num_choices > 1

    if model is None:
        # No regret model yet: play uniformly over the legal actions.
        strategy = possible_actions.float()
        active_sampling_chances = None
    else:
        assert strategyModels[player_to_act] is not None
        model_regrets = model.forward_cache(features.unsqueeze(0))[0]
        model_probs = model_regrets.clamp(min=1e-3) * possible_actions.float()
        strategy = model_probs
        active_sampling_chances = (
            strategyModels[player_to_act]  # type: ignore
            .forward(features.unsqueeze(0))[0]
            .clamp(min=1e-3)
            * possible_actions.float()
        )
        active_sampling_chances_sum = float(active_sampling_chances.sum().item())

    action_dist = torch.distributions.Categorical(strategy)
    if action_dist.probs.min() < 0 or action_dist.probs.max() == 0:
        print("Invalid action dist:", action_dist.probs)
    if strategy.min() < 0 or strategy.max() == 0:
        print("Invalid strategy:", strategy)

    # Sample deeper levels with decreasing probability.
    chance_to_sample = (
        1.0
        if level < 2
        else 1.0 - (1.0 / (100.0 ** (1.0 / (level ** branch_factor_estimate))))
    )
    do_sample = random.random() < chance_to_sample

    if has_a_choice and first_pass:
        strategyData.append((
            features.unsqueeze(0),
            possible_actions.unsqueeze(0),
            action_dist.probs.unsqueeze(0),
        ))

    metrics.update({"visit_level_" + str(level): 1})
    metrics["visit"] += 1
    if metrics["visit"] % 100000 == 0:
        print("Visits", metrics["visit"])

    can_traverse = player_to_train == player_to_act
    if can_traverse and has_a_choice and do_sample:
        # print("PASSED",level,chance_to_sample)
        metrics.update({"sample_level_" + str(level): 1})
        metrics["sample"] += 1
        if metrics["sample_level_" + str(level)] % 10000 == 0:
            print(
                "Samples",
                metrics["sample"],
                metrics["sample_level_" + str(level)],
                level,
                chance_to_sample,
            )
        payoff_for_action = torch.zeros(
            (possible_actions.size()[0], game.num_players), dtype=torch.float
        )
        chosen_actions = torch.zeros_like(possible_actions)
        enum_actions = list(enumerate(possible_actions))
        random.shuffle(enum_actions)
        num_chosen = 0
        for i, a in enum_actions:
            if a == 0:
                continue
            g = game.clone()
            g.act(player_to_act, i)
            # Active sampling: https://papers.nips.cc/paper/4569-efficient-monte-carlo-counterfactual-regret-minimization-in-games-with-many-player-actions.pdf
            EPSILON = 0.05
            BONUS = 1e-6
            THRESHOLD = 1
            if active_sampling_chances is None:
                # Do outcome sampling for the first iteration
                as_pass = num_chosen == 0
            else:
                as_pass = random.random() < float(
                    (
                        (BONUS + THRESHOLD * active_sampling_chances[i])
                        / (BONUS + active_sampling_chances_sum)
                    ).item()
                )
            if level == 0:
                # Do external sampling for the game tree root
                as_pass = True
            if True or i == 0 or random.random() < EPSILON or as_pass:
                value = traverse(
                    g,
                    player_to_train,
                    regretModels,
                    playerRegret,
                    strategyModels,
                    strategyData,
                    metrics,
                    level + 1,
                    True if first_pass and num_chosen == 0 else False,
                    branch_factor_estimate,
                )
                payoff_for_action[i] = value
                chosen_actions[i] = 1.0
                num_chosen += 1

        # Expected utility under the current strategy, restricted to the actions
        # actually explored at this node.
        weighted_action_dist = torch.distributions.Categorical(
            action_dist.probs * chosen_actions.float()
        )
        assert payoff_for_action.size()[0] == weighted_action_dist.probs.size()[0]
        expected_utility = payoff_for_action * weighted_action_dist.probs.unsqueeze(1)
        assert expected_utility.size() == payoff_for_action.size()
        expected_utility_over_all_actions = expected_utility.sum(dim=0)

        # Regret target: how much better each explored action did than the
        # expected utility for the acting player.
        playerRegret.append((
            features.unsqueeze(0),
            chosen_actions.unsqueeze(0),
            (
                payoff_for_action[:, player_to_act]
                - expected_utility_over_all_actions[player_to_act]
            ).unsqueeze(0),
        ))
        assert expected_utility_over_all_actions.size() == (game.num_players,), str(
            expected_utility_over_all_actions.size()
        )
        return expected_utility_over_all_actions
    else:
        # Not this player's decision (or not sampling): just follow the strategy.
        game.act(player_to_act, int(action_dist.sample().item()))
        return traverse(
            game,
            player_to_train,
            regretModels,
            playerRegret,
            strategyModels,
            strategyData,
            metrics,
            level + int(can_traverse),
            True if first_pass else False,
            branch_factor_estimate,
        )
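
# Illustrative sketch (assumption; not called by the traversal above): the regret
# target appended to playerRegret is "payoff of each explored action minus the
# expected utility under the current strategy". A tiny standalone example with
# made-up numbers for one player and three actions:
#
#   payoffs  = [1.0, 0.0, -1.0]
#   probs    = [0.5, 0.3, 0.2]
#   expected = 1.0*0.5 + 0.0*0.3 + (-1.0)*0.2 = 0.3
#   regrets  = [0.7, -0.3, -1.3]
def _regret_target_example() -> torch.Tensor:
    payoffs = torch.tensor([1.0, 0.0, -1.0])
    probs = torch.tensor([0.5, 0.3, 0.2])
    expected_utility = (payoffs * probs).sum()
    return payoffs - expected_utility  # tensor([ 0.7000, -0.3000, -1.3000])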
def traverse(
    game: GameInterface,
    policy_network,
    metrics: Counter,
    level: int,
) -> GameRollout:
    if game.terminal():
        gr = GameRollout(
            torch.zeros((level, game.feature_dim()), dtype=torch.float),  # States
            torch.zeros((level, 1), dtype=torch.long),  # Actions
            torch.zeros((level, game.action_dim()), dtype=torch.long),  # Possible Actions
            torch.zeros((level, 1), dtype=torch.long),  # Player to act
            game.payoffs().float().repeat((level, 1)),  # Payoffs
            torch.arange(level - 1, -1, -1, dtype=torch.float).unsqueeze(1),  # Distance to payoff
            torch.zeros((level, game.action_dim()), dtype=torch.float),  # Policy
        )
        return gr

    features = torch.zeros((game.feature_dim(),), dtype=torch.float)
    game.populate_features(features)
    player_to_act = game.get_player_to_act()
    possible_actions = game.get_one_hot_actions(True)
    num_choices = possible_actions.sum()
    metrics.update({"possible_actions_" + str(possible_actions.sum()): 1})
    has_a_choice = num_choices > 1

    if policy_network is None or not has_a_choice:
        strategy = possible_actions.float()
        active_sampling_chances = None
    else:
        strategy = policy_network(
            features.unsqueeze(0), possible_actions.unsqueeze(0)
        )[0][0]
        assert (strategy * (1 - possible_actions)).sum() == 0

    action_dist = torch.distributions.Categorical(strategy)
    if action_dist.probs.min() < 0 or action_dist.probs.max() == 0:
        print("Invalid action dist:", action_dist.probs)
    if strategy.min() < 0 or strategy.max() == 0:
        print("Invalid strategy:", strategy)

    metrics.update({"visit_level_" + str(level): 1})
    metrics["visit"] += 1
    if metrics["visit"] % 100000 == 0:
        print("Visits", metrics["visit"])

    action_taken = int(action_dist.sample().item())
    game.act(player_to_act, action_taken)

    if has_a_choice:
        result = traverse(game, policy_network, metrics, level + 1)
        payoff = result.payoffs[player_to_act]
        result.states[level] = features
        result.actions[level] = action_taken
        result.player_to_act[level] = player_to_act
        result.possible_actions[level] = possible_actions
        result.policy[level] = strategy
    else:
        # Don't advance the level, skip this non-choice
        result = traverse(game, policy_network, metrics, level)

    return result
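
# Usage sketch (assumption): collecting a single rollout with uniform-random play
# by passing policy_network=None. `game` is any concrete GameInterface instance;
# the helper name is hypothetical.
def collect_random_rollout_sketch(game: GameInterface) -> GameRollout:
    ng = game.clone()
    ng.reset()
    metrics: Counter = Counter()
    return traverse(ng, None, metrics, 0)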
def __init__(self, game: GameInterface, max_games: int, policy_networks):
    self.max_games = max_games
    self.game = game.clone()
    self.policy_networks = policy_networks
    self.pool = multiprocessing.Pool(NUM_PARALLEL_GAMES)