def simulate(self, history):
    """Run one simulation pass of MCTS from ``history``.

    Descends the per-player search trees while the current player's
    information set is in-tree, expands both players' trees at the
    frontier, then continues with a random rollout. The discounted
    running reward is propagated back into both players' trees.

    :param history: the current (full-information) game history.
    :return: the discounted running reward accumulated below ``history``.
    """
    if util.is_terminal(history):
        return self.handle_terminal_state(history)
    player = util.player(history)
    if self.out_of_tree[player]:
        # Past this player's tree frontier: finish the episode by rollout.
        return self.rollout(history)
    player_history = util.information_function(history, player)
    player_tree = get_tree(player)
    if player_history in player_tree and player_tree[player_history].children:
        # In-tree: pick an action via the selection policy.
        action = self.select(history)
    else:
        # Frontier node: expand both players' trees.
        # FIX: the original referenced undefined names `player_one_tree` /
        # `player_two_tree`; the trees are resolved via get_tree() everywhere
        # else in this module, so do the same here.
        expand(get_tree(1), history, 1)
        expand(get_tree(-1), history, -1)
        action = random.choice(util.get_available_actions(history))
        if player != 0:
            # After expanding, everything below is outside both trees.
            self.out_of_tree[1] = True
            self.out_of_tree[-1] = True
    new_history = history + action
    running_reward = (evaluator.calculate_reward_full_info(history)
                      + self.discount_factor * self.simulate(new_history))
    # The same reward sample updates both players' trees (zero-sum setting
    # is presumably handled inside update/PoNode — TODO confirm).
    update_player_tree(history, action, 1, running_reward)
    update_player_tree(history, action, -1, running_reward)
    return running_reward
def apply_mcts_strategy_from_tree(tree, full_tree, best_response_tree, current_history, terminals):
    """Project the MCTS tree's player-1 choices onto the full game tree.

    Recursively copies ``full_tree`` into ``best_response_tree``, but at
    every player-1 decision keeps only the best child recorded in ``tree``.
    Terminal histories encountered on the way are collected in ``terminals``.

    :return: ``best_response_tree`` (mutated in place).
    """
    if current_history not in full_tree:
        return best_response_tree
    node = potree.PoNode()
    best_response_tree[current_history] = node
    children = full_tree[current_history].children
    if util.is_terminal(current_history):
        terminals.append(current_history)
    if util.player(current_history) != 1:
        # Opponent/chance node: keep every successor.
        node.children = set(children)
    else:
        player_history = util.information_function(current_history, 1)
        best_child = util.get_best_child(tree, player_history)
        if best_child is None:
            children = []
        else:
            # Recover the action suffix from the best information-set child.
            action = best_child.replace(player_history, "")
            successor = current_history + action
            children = [successor]
            node.children = {successor}
    for child_history in children:
        apply_mcts_strategy_from_tree(tree, full_tree, best_response_tree, child_history, terminals)
    return best_response_tree
def apply_mcts_strategy_from_deterministic_strategy(strategy, full_tree, best_response_tree, current_history, terminals):
    """Project a deterministic player-1 strategy onto the full game tree.

    Mirrors ``apply_mcts_strategy_from_tree`` but takes an explicit mapping
    ``strategy`` from player-1 information sets to actions. At every player-1
    node only the prescribed action's successor is kept; terminal histories
    are collected in ``terminals``.

    :return: ``best_response_tree`` (mutated in place).
    """
    if current_history not in full_tree:
        return best_response_tree
    best_response_tree[current_history] = potree.PoNode()
    children = full_tree[current_history].children
    if util.is_terminal(current_history):
        terminals.append(current_history)
    if util.player(current_history) == 1:
        player_history = util.information_function(current_history, 1)
        if player_history in strategy:
            child = current_history + strategy[player_history]
            # FIX: PoNode.children is a set everywhere else in this module
            # (see the set() assignment below and expand()'s .add calls);
            # the original assigned a list here.
            best_response_tree[current_history].children = {child}
            children = [child]
        else:
            children = []
    else:
        best_response_tree[current_history].children = set(children)
    for history in children:
        apply_mcts_strategy_from_deterministic_strategy(
            strategy, full_tree, best_response_tree, history, terminals)
    return best_response_tree
def get_best_action_ucb(self, history, player, tree):
    """Return the available action whose successor node value is highest.

    Runs a simple argmax of ``calculate_next_node_value`` over the actions
    available at ``history``; ties are broken in favour of the first action.

    :return: the best action, or None if no action is available.
    """
    player_history = util.information_function(history, player)
    best_action, best_value = None, float("-inf")
    for candidate in util.get_available_actions(history):
        value = self.calculate_next_node_value(tree, player_history, candidate, player)
        if value > best_value:
            best_value, best_action = value, candidate
    return best_action
def expand(tree, history, player):
    """Add ``history``'s information set (and stub successor nodes) to ``tree``.

    Ensures a PoNode exists for the player's information set, creates empty
    nodes for every available successor, and links them as children.
    """
    player_history = util.information_function(history, player)
    if player_history not in tree:
        tree[player_history] = potree.PoNode()
    parent = tree[player_history]
    for action in util.get_available_actions(player_history, player=player):
        successor = player_history + action
        if successor not in tree:
            tree[successor] = potree.PoNode()
        parent.children.add(successor)
def select(self, history):
    """Select an action at ``history`` during tree descent.

    For the two real players, mixes the UCB-greedy action and the average
    strategy: with probability ``eta`` (which decays toward GAMMA as the
    node's visitation count grows) the UCB action is chosen, otherwise the
    average-strategy action. Any other player id acts uniformly at random.
    """
    player = util.player(history)
    player_history = util.information_function(history, player)
    if player not in (-1, 1):
        # Chance / neutral player: uniform random action.
        return random.choice(util.get_available_actions(history))
    tree = get_tree(player)
    count = tree[player_history].visitation_count
    decayed = math.pow(1 + (.1 * math.sqrt(count)), -1)
    eta = max(GAMMA, .9 * decayed)
    if random.uniform(0, 1) < eta:
        return self.get_best_action_ucb(history, player, tree)
    return self.get_best_action_avg_strategy(player_history, tree)
def get_action(self, history):
    """Return player 1's action at ``history`` according to ``self.strategy``.

    A strategy entry may be either a single action (deterministic) or a
    dict mapping actions to probabilities (mixed), in which case an action
    is sampled according to those probabilities.
    """
    player_history = util.information_function(history, 1)
    entry = self.strategy[player_history]
    if not isinstance(entry, dict):
        # Deterministic strategy: the entry is the action itself.
        return entry
    actions = list(entry.keys())
    weights = list(entry.values())
    return choice(actions, p=weights)
def update_player_tree(history, action, player, reward):
    """Back up ``reward`` along the edge ``history -> history + action``
    in ``player``'s information-set tree."""
    source = util.information_function(history, player)
    # NOTE(review): the str() mirrors the original; history + action is
    # presumably already a string — confirm against util.information_function.
    target = util.information_function(str(history + action), player)
    update(get_tree(player), source, target, reward)