def simulation(self, node):
    """
    Leaf Evaluation - Estimating the value of a leaf node in the tree by doing
    a roll-out simulation using the default policy from the leaf node's state
    to a final state.

    :return: int - 1 if player 1 won the simulated game, else 0
    """
    current_node = node
    children = self.state_manager.get_child_nodes(current_node.state)
    player = node.player
    while len(children) != 0:
        # Use the default policy (random) to select a child
        current_node = random.choice(children)
        player = get_next_player(player)
        children = self.state_manager.get_child_nodes(current_node.state)
    # The winner is the previous player, i.e. the one who made the last move
    winner = get_next_player(player)
    return int(winner == 1)
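# The roll-out above leans on a two-player turn helper and a simple tree node,
# neither of which is part of this excerpt. A minimal sketch, under the
# assumption of exactly two players numbered 1 and 2 and only the attributes
# the methods here actually touch:
import random


def get_next_player(player):
    """Toggle between player 1 and player 2."""
    return 2 if player == 1 else 1


class Node:
    """A node in the Monte Carlo search tree (sketch, not the project code)."""

    def __init__(self, state, action, player=None):
        self.state = state      # Game state housed by this node
        self.action = action    # Action that led from the parent to this state
        self.player = player    # Player to move in this state
        self.parent = None      # Set by expansion()
        self.children = []      # Child nodes, filled in by expansion()
        self.visits = 0         # Visit count, updated by backward()
        self.value = 0          # Accumulated roll-out reward, updated by backward()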
def simulate(self):
    """
    Run G consecutive games (aka. episodes) of the self.game_type using fixed
    values for the game parameters: N and K for NIM, B_init for Ledge. When
    the G games have finished, the simulator summarizes the win-loss
    statistics, e.g. for G = 50: "Player 1 wins 40 of 50 games (80%)".
    """
    wins = 0  # Number of times player 1 wins
    # Actual games being played
    for episode in range(1, self.episodes + 1):
        logging.info("Episode: {}".format(episode))
        # The actual game being played this episode
        game = get_new_game(self.game_type, self.game_config, verbose=self.verbose)
        # For each game, a new Monte Carlo search tree is made
        mcts = MonteCarloSearchTree(self.game_type, self.game_config)
        state, player = game.get_current_state(), self.get_start_player()
        mcts.set_root(Node(state, None, player=player))
        # While the actual game is not finished
        while not game.is_winning_state():
            # Before selecting a new action, perform M simulations in MCTS
            for _ in range(self.num_sim):
                # One iteration of Monte Carlo Tree Search consists of four steps
                # 1. Selection
                leaf = mcts.selection()
                # 2. Expand the selected leaf node
                sim_node = mcts.expansion(leaf)
                # 3. Simulation
                z = mcts.simulation(sim_node)
                # 4. Backward propagation
                mcts.backward(sim_node, z)
            # Now use the search tree to choose the next action
            new_root = mcts.select_actual_action(player)
            # Perform this action, moving the game from state s -> s'
            game.perform_action(player, new_root.action)
            # Update player
            player = get_next_player(player)
            # Set new root of the MCTS
            mcts.set_root(new_root)
        # If the next player is 2 in a win state, player 1 made the winning move
        if player == 2:
            wins += 1
    # Report statistics
    logging.info("Player 1 wins {} of {} games ({}%)".format(
        wins, self.episodes, round(100 * (wins / self.episodes))))
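# selection() and backward() are called above but not shown in this excerpt.
# A common realization, and a plausible fit for the calls here, is UCT for the
# tree policy plus visit/value backpropagation. The sketch below is an
# assumption, not the project's actual code; a real two-player version would
# also flip the exploitation term's sign for the minimizing player.
import math


def selection(self):
    """Tree policy: follow UCT from the root down to a leaf node."""
    node = self.root
    while node.children:
        parent = node
        # Child maximizing average value plus the c-weighted exploration bonus
        node = max(
            parent.children,
            key=lambda c: c.value / (c.visits + 1)
            + self.c * math.sqrt(math.log(parent.visits + 1) / (c.visits + 1)),
        )
    return node


def backward(self, node, z):
    """Propagate the evaluation z from the simulated node up to the root."""
    while node is not None:
        node.visits += 1
        node.value += z
        node = node.parent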
def play_game(self, p1, p2):
    """
    Play one game and return the winner

    :param p1: Actor - player 1
    :param p2: Actor - player 2
    :return: tuple - the name of the winning actor and the log of actions taken
    """
    actors = {1: p1, 2: p2}
    self.state_manager.init_new_game()
    player = random.randint(1, 2)  # Choose a random player to start
    action_log = []
    while not self.state_manager.is_winning_state():
        current_state = self.state_manager.get_current_state()
        action_index = actors[player].topp_policy(player, current_state)
        action = self.state_manager.get_action(player, action_index)
        self.state_manager.perform_actual_action(action)
        player = get_next_player(player)
        action_log.append(action)
    # The winner is the previous player, i.e. the one who made the last move
    winner = get_next_player(player)
    return actors[winner].name, action_log
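# In a TOPP-style tournament, play_game would typically be called for every
# pair of saved agents. A hypothetical round-robin driver, assuming only that
# each Actor has the name attribute the method above already relies on:
import logging
from collections import defaultdict
from itertools import combinations


def run_tournament(self, actors, games_per_pair=25):
    """Round-robin: every pair of actors plays games_per_pair games."""
    wins = defaultdict(int)
    for p1, p2 in combinations(actors, 2):
        for _ in range(games_per_pair):
            winner_name, _ = self.play_game(p1, p2)
            wins[winner_name] += 1
    # Report agents sorted by number of games won
    for name, won in sorted(wins.items(), key=lambda kv: -kv[1]):
        logging.info("{} won {} games".format(name, won))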
def simulation(self, node):
    """
    Leaf Evaluation - Estimating the value of a leaf node in the tree, either
    by querying the critic or by doing a roll-out simulation using the default
    policy from the leaf node's state to a final state.

    :return: the reward z - either the critic's value estimate, or 1 if
        player 1 won the roll-out and 0 otherwise
    """
    current_state, player = node.state, node.player
    # Use the critic's value estimate with probability 1 - epsilon_critic
    if random.random() > self.actor.epsilon_critic:
        reward = self.actor.value_function(player, current_state)
    # Otherwise, roll out to the end of the game
    else:
        while not self.state_manager.verify_winning_state(current_state):
            # Get the next action using the default policy
            action_index = self.actor.default_policy(player, current_state)
            current_state = self.state_manager.get_next_state(
                player, current_state, action_index)
            player = get_next_player(player)
        # The winner is the previous player, i.e. the one who made the last move
        winner = get_next_player(player)
        reward = int(winner == 1)
    return reward
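# The critic branch above must return a reward on the same scale as the
# roll-out branch (1 if player 1 wins, else 0). A minimal sketch of
# value_function, assuming the CNET mentioned in the training loop is a torch
# module ending in a sigmoid, and using a hypothetical to_features() state
# encoder (neither interface is shown in this excerpt):
import torch


def value_function(self, player, state):
    """Estimate the probability that player 1 wins from this state."""
    features = torch.tensor(self.to_features(player, state), dtype=torch.float32)
    with torch.no_grad():
        # Forward pass on a batch of one; .item() unwraps the scalar in [0, 1]
        return self.cnet(features.unsqueeze(0)).item()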
def expansion(self, leaf):
    """
    Node Expansion - Generating some or all child states of a parent state,
    and then connecting the tree node housing the parent state (a.k.a. parent
    node) to the nodes housing the child states (a.k.a. child nodes).

    :return: Node - the expanded leaf, from which the roll-out starts
    """
    # Get all legal child states from the leaf state
    leaf.children = self.state_manager.get_child_nodes(leaf.state)
    # Set the leaf as their parent node; the opponent moves next
    child_player = get_next_player(leaf.player)
    for child in leaf.children:
        child.player = child_player
        child.parent = leaf
    # The tree is now expanded; return the leaf and simulate to game over
    return leaf
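# expansion() relies on the state manager handing back ready-made Node objects
# for every legal successor state. A hypothetical get_child_nodes for NIM
# (assuming state = number of pieces left and moves that remove 1..K pieces),
# reusing the Node sketch above:
def get_child_nodes(self, state):
    """Return one Node per legal action from `state`."""
    pieces_left = state
    return [
        Node(pieces_left - take, action=take)
        for take in range(1, min(self.K, pieces_left) + 1)
    ]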
def simulate(self):
    """
    Run G consecutive games (aka. episodes) of the self.game_type using fixed
    values for the game parameters
    """
    save_interval = int(self.episodes / (self.save_interval - 1))  # Save interval for ANET
    visualizer = Visualizer(self.game_config)  # Visualizer that visualizes games
    actor = Actor(self.anet_config)  # Initialize the Actor, which holds the ANET
    rbuf = ReplayBuffer()  # Buffer for saving training data (node, D)
    game = StateManager(self.game_config)  # StateManager that takes care of the actual game
    wins = 0  # Number of times player 1 wins
    # Actual games being played
    for episode in range(1, self.episodes + 1):
        logging.info("Episode: {}".format(episode))
        # Initialize the actual game
        game.init_new_game()
        action_log = []
        # Initialize the MonteCarloSearchTree to a single node with the initialized game state
        state, player = game.get_current_state(), self.get_start_player()
        mcts = MonteCarloSearchTree(actor, self.game_config, c=self.mcts_config["c"])
        mcts.set_root(Node(state, None, player=player))
        # While the actual game is not finished
        while not game.is_winning_state():
            # Before selecting a new action, perform M simulations in MCTS
            for _ in range(self.num_sim):
                # One iteration of Monte Carlo Tree Search consists of four steps
                leaf = mcts.selection()
                sim_node = mcts.expansion(leaf)
                z = mcts.simulation(sim_node)
                mcts.backward(sim_node, z)
            # Get the probability distribution over actions from the current root/state
            D = mcts.get_root_distribution()
            # Modify D for obvious wins or other heuristics
            D = game.apply_heuristics(mcts.root, D)
            # Add training data to the ReplayBuffer (node, D, reward)
            rbuf.add_case((mcts.root, D, mcts.root.value))
            # Select the actual move based on D
            new_root = mcts.select_actual_action(player)
            action_log.append(new_root.action)
            # Perform this action, moving the game from state s -> s'
            game.perform_actual_action(new_root.action)
            # Update player
            player = get_next_player(player)
            # Set new root of the MCTS
            mcts.set_root(new_root)
        # End of episode
        visualizer.add_game_log(action_log)
        # Update epsilon for the next round of simulations
        actor.update_epsilon()
        # Train ANET and CNET on a random mini-batch of cases from the ReplayBuffer
        actor.train(rbuf.get_batch(self.batch_size))
        # Save ANET
        if episode % save_interval == 0 or episode == 1:
            path = "./pretrained/ANET_E{}.pth".format(episode)
            logging.info("Saving model to file {}".format(path))
            torch.save(actor.anet.state_dict(), path)
        # Save visualization of the last game
        if self.visualize and episode % self.visualize_interval == 0:
            visualizer.animate_latest_game()
        # If the next player is 2 in a win state, player 1 made the winning move
        if player == 2:
            wins += 1
    actor.visualize_loss()
    logging.info("Player 1 wins {} of {} games ({}%)".format(
        wins, self.episodes, round(100 * (wins / self.episodes))))
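# get_root_distribution() is the link between search and training: D is
# normally the root's child visit counts normalized into a probability
# distribution over the full action space, so it can serve directly as an
# ANET target. A sketch under that assumption (num_actions and
# action_to_index are hypothetical helpers, not shown in this excerpt):
import numpy as np


def get_root_distribution(self):
    """Distribution D over all actions, derived from child visit counts."""
    D = np.zeros(self.num_actions)  # One slot per action in the full action space
    for child in self.root.children:
        D[self.action_to_index(child.action)] = child.visits
    return D / D.sum()  # Normalize visit counts to probabilities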