def test_learner_against_rando(n_games=10000):
    # see how the learner performs against a player making random moves
    metrics = run_simulator(
        p1=Player(strategy="basic_q", learning=False, load_Q=True),
        p2=Player(strategy="random"),
        n_games=n_games,
    )
    visualize_win_ratio(metrics, "Performance Over Time (testing)")
def __init__(self, environment, No=100, discount_factor=1):
    Player.__init__(self)
    self.env = environment
    self.No = No
    self.disc_factor = discount_factor
    # state-value table indexed by [dealer showing, agent sum]
    self.V = np.zeros(
        [self.env.dealer_max_value + 1, self.env.agent_max_value + 1])
    self.wins = 0.0
    self.iterations = 0.0
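# The No constant and the dealer/agent value table above suggest an
# Easy21-style Monte Carlo agent, where exploration typically decays per
# state as epsilon_t = No / (No + N(s_t)). The sketch below illustrates that
# schedule; the visit-count table `N` (indexed like self.V) is an assumption,
# not something shown in this class.
def epsilon_for_state(N, dealer, agent, No=100):
    # explore less in states that have been visited more often
    return float(No) / (No + N[dealer, agent])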
def train_learner_against_rando(n_games=20000):
    # loads the existing Q strategy and continues training it against a
    # random opponent
    metrics = run_simulator(
        p1=Player(strategy="basic_q", learning=True, load_Q=True),
        p2=Player(strategy="random"),
        save_Q=True,
        n_games=n_games,
    )
    visualize_win_ratio(metrics, "Performance Over Time (training)")
def test_learner_against_self():
    # see how the learner performs against itself; results should be even
    metrics = run_simulator(
        p1=Player(strategy="basic_q", learning=False, load_Q=True),
        p2=Player(strategy="basic_q", learning=False, load_Q=True),
    )
    visualize_win_ratio(metrics, "Performance Over Time (testing)")
def run_simulator(
    n_games=1000,
    p1=Player(strategy="basic_q", learning=True),
    p2=Player(strategy="random", learning=False),
    save_Q=False,
):
    """Runs a number of tic-tac-toe games

    Arguments:
        n_games: number of tic-tac-toe games to be played
        p1: instance of the Player() class; specifies the AI during
            training or testing
        p2: instance of the Player() class; specifies the opponent
        save_Q: boolean, determines if p1's updated Q should be saved to a
            pickled file after training is complete

    Returns:
        metrics: list of per-game outcomes from P1's perspective (wins,
            losses, and ties)

    Use this function to run many games in a row. Depending on the
    parameters for P1 and P2, this can be used for training or testing.
    This function also decays epsilon (the exploration factor) over time
    as the learner moves from high exploration to low.
    """
    games = [Game(p1=p1, p2=p2) for i in range(n_games)]
    metrics = []
    index = 0
    starting_epsilon = p1._epsilon
    for game in games:
        index += 1
        # run game and get results + P1's states and actions
        outcome, x_decisions, o_decisions = game.play_game()
        # log game results
        metrics.append(outcome)
        # update q learner after each game
        if p1._learning:
            p1.update_q(outcome, x_decisions)
        # reduce exploration factor after each game
        p1._epsilon = starting_epsilon - starting_epsilon * (
            1. * index / n_games) ** 2
    print(Counter(metrics))
    # save pickled Q learning file
    if save_Q:
        pickle.dump(p1._Q, open(p1.q_file, "wb"))
    return metrics
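# run_simulator decays epsilon quadratically rather than linearly, so
# exploration stays high early in training and drops off sharply near the
# end. A minimal standalone sketch of the same schedule (the function name
# is illustrative, not part of the source):
def decayed_epsilon(starting_epsilon, index, n_games):
    # quadratic decay: close to starting_epsilon for small index,
    # exactly 0 when index == n_games
    return starting_epsilon - starting_epsilon * (1. * index / n_games) ** 2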
def train_learner_against_self(n_sessions=5, games_per_session=10000):
    """Train a learner against itself

    Loads the Q file for both P1 and P2. The learner (P1) starts with a
    high exploration factor that decays over time, while P2 uses no
    exploration factor. P1 updates its Q file over the course of
    *games_per_session*. After *games_per_session* games have been played,
    P2 updates its Q strategy to match P1's again, and the process repeats
    for *n_sessions*.
    """
    for n in range(n_sessions):
        run_simulator(
            n_games=games_per_session,
            p1=Player(strategy="basic_q", learning=True, load_Q=True),
            p2=Player(strategy="basic_q", learning=False, load_Q=True),
            save_Q=True,
        )
        print("%d training sessions completed out of %d" % (n + 1, n_sessions))
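# A usage sketch tying the helpers in this file together: bootstrap against
# a random opponent, refine via self-play, then verify. The game counts are
# illustrative defaults, not prescribed values.
def example_training_workflow():
    train_learner_against_rando(n_games=20000)
    train_learner_against_self(n_sessions=5, games_per_session=10000)
    test_learner_against_rando(n_games=10000)
    test_learner_against_self()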
def SaveGame(self, request, context):
    """Deserializes a finished game from the request and stores it in the replay buffer."""
    game_history = GameHistory()
    game_history.observations = [tf.make_ndarray(observation)
                                 for observation in request.observations]
    game_history.actions = [Action(index) for index in request.actions]
    game_history.rewards = request.rewards
    game_history.to_plays = [Player(player_id) for player_id in request.to_plays]
    game_history.root_values = request.root_values
    game_history.policies = [policy.probabilities for policy in request.policies]

    self.replay_buffer.save_history(game_history)
    print('Number of games in buffer: {}'.format(len(self.replay_buffer.buffer)))
    return replay_buffer_pb2.SaveGameResponse(success=True)
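# A hedged client-side sketch of calling SaveGame over gRPC. The stub class
# name, the request message name, and the field layout are assumptions
# inferred from the servicer above (request.observations, request.actions,
# and so on); the actual generated modules may differ.
def save_game_example(history, address='localhost:50051'):
    import grpc
    channel = grpc.insecure_channel(address)
    stub = replay_buffer_pb2_grpc.ReplayBufferStub(channel)  # assumed stub name
    request = replay_buffer_pb2.SaveGameRequest(  # assumed request message
        observations=[tf.make_tensor_proto(o) for o in history.observations],
        actions=[action.index for action in history.actions],
        rewards=history.rewards,
        to_plays=[player.player_id for player in history.to_plays],
        root_values=history.root_values,
        # policies omitted: the policy message type is not shown in the source
    )
    return stub.SaveGame(request).success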
def run_mcts(self, root, num_moves):
    min_max_stats = MinMaxStats(self.config.known_bounds)

    for _ in range(self.config.num_simulations):
        # root.print()
        # walk down the tree to an unexpanded leaf
        action, leaf, cur_moves = self.select_leaf(root, num_moves, min_max_stats)
        to_play = Player(cur_moves % self.config.game_config.num_players)

        # evaluate the leaf with the learned dynamics + prediction networks
        batch_hidden_state = tf.expand_dims(leaf.parent.hidden_state, axis=0)
        network_output = self.network.recurrent_inference(
            batch_hidden_state, [action]).split_batch()[0]

        # expand the leaf and propagate its value back up the tree
        self.expand_node(node=leaf,
                         to_play=to_play,
                         actions=self.config.game_config.action_space,
                         network_output=network_output)
        self.backpropagate(leaf, network_output.value, to_play, min_max_stats)
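# run_mcts normalizes backed-up values with a MinMaxStats tracker, as in the
# public MuZero pseudocode. The class itself is not shown in this file, so
# the sketch below is the conventional implementation, assuming known_bounds
# is an optional object with .min and .max attributes; the actual class used
# here may differ.
class MinMaxStatsSketch(object):
    def __init__(self, known_bounds=None):
        self.minimum = known_bounds.min if known_bounds else float('inf')
        self.maximum = known_bounds.max if known_bounds else float('-inf')

    def update(self, value):
        # widen the observed value range as the tree is explored
        self.minimum = min(self.minimum, value)
        self.maximum = max(self.maximum, value)

    def normalize(self, value):
        # map values into [0, 1] once real bounds have been observed
        if self.maximum > self.minimum:
            return (value - self.minimum) / (self.maximum - self.minimum)
        return value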
def human_vs_human():
    Game(p1=Player(strategy="human"),
         p2=Player(strategy="human"),
         verbose=True).play_game()
def adversarial_training(
    p1=Player(strategy="basic_q", learning=False, load_Q=True),
    p2=Player(strategy="random", learning=False),
    p3=Player(strategy="basic_q", learning=True, load_Q=True),
    p4=Player(strategy="adversarial", learning=False),
):
    """Finds strategies that beat the AI for targeted training

    Arguments:
        p1: trained Q-learner in "test mode" (no learning)
        p2: cpu player choosing random moves
        p3: trained Q-learner that will continue to train
        p4: p3's opponent, uses the saved strategy uncovered by p2 that beat p1

    Plays a trained AI against a random player until the random player
    wins (if it ever does). If the random player wins, its winning move
    sequence is captured as a dedicated Q lookup. That adversarial scenario
    is then replayed many times so the AI can learn a better response.
    Training is very specific and deep (many repetitions), so this is not
    meant to be a general training strategy and is best used after the AI
    is already sufficiently robust. Each run covers only one adversarial
    example, so the function may need to be run many times.
    """
    # phase 1: AI vs. random opponent
    max_games = 50000
    games = [Game(p1=p1, p2=p2) for i in range(max_games)]
    for game in games:
        # run game and get results + P1's states and actions
        outcome, x_decisions, o_decisions = game.play_game()
        # build a Q network for player O only where X lost
        if outcome == "lost":
            # phase 2: adversarial training
            print("AI lost. Playing adversarial games...")
            Q_adversarial = {board: action for board, action in o_decisions}
            num_games = 10000
            a_games = [Game(p1=p3, p2=p4) for i in range(num_games)]
            starting_epsilon = p3._epsilon
            a_index = 0
            a_metrics = []
            for a_game in a_games:
                p4._Q = Q_adversarial
                a_index += 1
                outcome, x_decisions, o_decisions = a_game.play_game()
                a_metrics.append(outcome)
                p3.update_q(outcome, x_decisions)
                p3._epsilon = starting_epsilon - starting_epsilon * (
                    1. * a_index / num_games) ** 2
            print("Adversarial game outcomes:\n")
            print(Counter(a_metrics))
            print("Building better, stronger Q...")
            pickle.dump(p3._Q, open(p3.q_file, "wb"))
            # end training
            return
    print("played %d games without losing!" % max_games)
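# Because adversarial_training handles at most one adversarial example per
# call, a typical workflow loops it; the repetition count here is
# illustrative, and the wrapper name is not part of the source.
def harden_ai(n_rounds=10):
    for _ in range(n_rounds):
        adversarial_training()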
def test_learner_against_human():
    # tests the Q-learner against a human player ("O")
    p1 = Player(strategy="basic_q", load_Q=True)
    p2 = Player(strategy="human")
    game = Game(p1=p1, p2=p2, verbose=True)
    game.play_game()