def __init__(self, action_n, state, checkpoint_idx, parent, tree,
             prior_prob=None, is_head=False):
    """Initialise the per-action bookkeeping of a new tree node.

    Args:
        action_n: Number of action slots; every per-action list gets
            this length.
        state: State object held by the node (stored as-is).
        checkpoint_idx: Stored as-is on the node.
        parent: Parent node (stored as-is).
        tree: Owning tree, or ``None``. When given, its ``max_width``
            is copied onto the node; otherwise ``max_width`` is 0.
        prior_prob: Optional prior over actions. When ``None`` a
            uniform float32 vector of length ``action_n`` is used.
        is_head: Flag stored on the node.
    """
    self.state = state
    self.action_n = action_n
    self.checkpoint_idx = checkpoint_idx
    self.parent = parent
    self.tree = tree
    self.is_head = is_head

    # Without an owning tree there is no width limit to inherit.
    self.max_width = tree.max_width if tree is not None else 0

    # One slot per action in each of the following lists.
    slots = self.action_n
    self.children = [None] * slots
    self.rewards = [0.0] * slots
    self.dones = [False] * slots
    self.children_visit_count = [0] * slots
    self.children_completed_visit_count = [0] * slots
    self.Q_values = [0.0] * slots
    self.visit_count = 0

    # Fall back to a uniform prior when the caller supplies none.
    self.prior_prob = (
        prior_prob if prior_prob is not None
        else np.ones([self.action_n], dtype=np.float32) / self.action_n
    )

    # Traverse history, keyed by traversal index.
    self.traverse_history = dict()
    # Number of distinct children visited at least once.
    self.visited_node_count = 0
    # Number of distinct children updated at least once.
    self.updated_node_count = 0
    # Moving-average calculator over the most recent 500 entries.
    self.moving_aveg_calculator = MovingAvegCalculator(window_length=500)
class WU_UCTnode:
    """Search-tree node that tracks started and completed visits separately.

    Each action slot has two counters: ``children_visit_count`` is bumped
    by :meth:`update_incomplete` (a traversal went through the action),
    and ``children_completed_visit_count`` is bumped by
    :meth:`update_complete` (the traversal's return has been recorded).
    ``Q_values`` accumulates the returns of completed visits only.
    """

    def __init__(self, action_n, state, checkpoint_idx, parent, tree,
                 prior_prob=None, is_head=False):
        """Create a node with empty statistics.

        Args:
            action_n: Number of action slots; every per-action list gets
                this length.
            state: State object held by the node. It must offer
                ``is_over()`` and ``get_legal_actions()`` for
                :meth:`isTerminal` and :meth:`select_expand_action`.
            checkpoint_idx: Stored as-is on the node.
            parent: Parent node (stored as-is).
            tree: Owning tree, or ``None``. When given, its
                ``max_width`` is copied; otherwise ``max_width`` is 0.
            prior_prob: Optional prior over actions. When ``None`` a
                uniform float32 vector of length ``action_n`` is used.
            is_head: When true, the "all children ..." predicates
                compare against ``action_n`` instead of ``max_width``.
        """
        self.state = state
        self.action_n = action_n
        self.checkpoint_idx = checkpoint_idx
        self.parent = parent
        self.tree = tree
        self.is_head = is_head

        if tree is not None:
            self.max_width = tree.max_width
        else:
            self.max_width = 0

        self.children = [None for _ in range(self.action_n)]
        self.rewards = [0.0 for _ in range(self.action_n)]
        self.dones = [False for _ in range(self.action_n)]
        # Visits started through each action (completed or not).
        self.children_visit_count = [0 for _ in range(self.action_n)]
        # Visits through each action whose return has been recorded.
        self.children_completed_visit_count = [
            0 for _ in range(self.action_n)
        ]
        # Sum of returns recorded per action (divide by the completed
        # count to get the mean).
        self.Q_values = [0.0 for _ in range(self.action_n)]
        self.visit_count = 0

        if prior_prob is not None:
            self.prior_prob = prior_prob
        else:
            self.prior_prob = (
                np.ones([self.action_n], dtype=np.float32) / self.action_n
            )

        # Record traverse history: traversal idx -> action taken.
        self.traverse_history = dict()
        # Number of distinct actions visited at least once.
        self.visited_node_count = 0
        # Number of distinct actions with at least one completed update.
        self.updated_node_count = 0
        # Moving average calculator, fed by update_complete().
        self.moving_aveg_calculator = MovingAvegCalculator(window_length=500)

    @staticmethod
    def _action_index(action):
        """Return the per-action list slot for an ``Action`` member.

        The list is built at call time so the class can be defined
        before ``Action`` is resolvable.

        Raises:
            ValueError: If ``action`` is not one of the six listed
                members.
        """
        return [
            Action.FOLD,
            Action.CHECK,
            Action.CALL,
            Action.RAISE_HALF_POT,
            Action.RAISE_POT,
            Action.ALL_IN,
        ].index(action)

    def no_child_available(self):
        """Return True when no action has a completed update yet."""
        return self.updated_node_count == 0

    def all_child_visited(self):
        """Return True when the visited-action count has hit its target.

        The target is ``action_n`` for a head node and ``max_width``
        otherwise. Visited does not imply updated.
        """
        if self.is_head:
            return self.visited_node_count == self.action_n
        return self.visited_node_count == self.max_width

    def all_child_updated(self):
        """Return True when the updated-action count has hit its target.

        The target is ``action_n`` for a head node and ``max_width``
        otherwise.
        """
        if self.is_head:
            return self.updated_node_count == self.action_n
        return self.updated_node_count == self.max_width

    def isTerminal(self):
        """Return ``self.state.is_over()``."""
        return self.state.is_over()

    def shallow_clone(self):
        """Return a detached copy carrying only the visit bookkeeping.

        The clone has a deep copy of ``state``, copies of both visit
        count lists, the visited/updated counters, ``max_width`` and a
        copy of ``prior_prob``. It has no parent and no tree. Existing
        children are represented by the placeholder ``1`` rather than
        by node objects. ``Q_values``, ``visit_count`` and the traverse
        history are NOT copied; they keep their freshly initialised
        values.
        """
        node = WU_UCTnode(
            action_n=self.action_n,
            state=deepcopy(self.state),
            checkpoint_idx=self.checkpoint_idx,
            parent=None,
            tree=None,
            prior_prob=None,
            is_head=False,
        )

        for action in range(self.action_n):
            if self.children[action] is not None:
                # Placeholder: marks the slot as expanded without
                # holding a reference to the real child.
                node.children[action] = 1

        node.children_visit_count = deepcopy(self.children_visit_count)
        node.children_completed_visit_count = deepcopy(
            self.children_completed_visit_count)

        node.visited_node_count = self.visited_node_count
        node.updated_node_count = self.updated_node_count

        # The constructor set max_width to 0 because tree is None.
        node.max_width = self.max_width

        node.prior_prob = self.prior_prob.copy()

        return node

    def select_action(self):
        """Select an action according to the tree policy.

        Score per expanded action::

            Q / completed_visits
              + std * 2.0 * sqrt(2 * ln(visit_count) / visits)

        where ``std`` comes from the moving-average calculator.

        Children that have no completed update (or no recorded visit)
        cannot be scored and are skipped; previously they raised
        ``ZeroDivisionError``.

        Returns:
            Index of the best-scoring action, or 0 when no child could
            be scored.
        """
        best_score = -10000.0
        best_action = 0

        for action in range(self.action_n):
            if self.children[action] is None:
                continue

            completed = self.children_completed_visit_count[action]
            visits = self.children_visit_count[action]
            if completed == 0 or visits == 0:
                # No mean return / exploration term can be formed yet.
                continue

            exploit_score = self.Q_values[action] / completed
            explore_score = math.sqrt(
                2.0 * math.log(self.visit_count) / visits)
            score_std = self.moving_aveg_calculator.get_standard_deviation()
            score = exploit_score + score_std * 2.0 * explore_score

            if score > best_score:
                best_score = score
                best_action = action

        return best_action

    def max_utility_action(self):
        """Return the action with the highest mean completed return.

        Children without any completed update have no mean and are
        skipped; previously they raised ``ZeroDivisionError``.

        Returns:
            Index of the best action, or 0 when no child could be
            scored.
        """
        best_score = -10000.0
        best_action = 0

        for action in range(self.action_n):
            if self.children[action] is None:
                continue

            completed = self.children_completed_visit_count[action]
            if completed == 0:
                continue

            score = self.Q_values[action] / completed

            if score > best_score:
                best_score = score
                best_action = action

        return best_action

    def select_expand_action(self):
        """Choose an action to expand.

        Draws one entry from ``self.state.get_legal_actions()`` with
        ``np.random.choice`` (uniform by default).
        """
        return np.random.choice(self.state.get_legal_actions())

    def update_history(self, idx, action_taken):
        """Record the action taken by traversal ``idx``.

        Returns:
            False if ``idx`` was already recorded (nothing is changed),
            True otherwise.
        """
        if idx in self.traverse_history:
            return False
        self.traverse_history[idx] = action_taken
        return True

    def update_incomplete(self, idx):
        """Register a started visit for traversal ``idx``.

        Looks up the action recorded for ``idx`` (the entry is kept),
        then bumps that action's visit count and the node's
        ``visit_count``. The first visit to an action also bumps
        ``visited_node_count``.

        Raises:
            KeyError: If ``idx`` has no recorded action.
            ValueError: If the recorded action is not a known
                ``Action`` member.
        """
        action_taken = self._action_index(self.traverse_history[idx])

        if self.children_visit_count[action_taken] == 0:
            self.visited_node_count += 1

        self.children_visit_count[action_taken] += 1
        self.visit_count += 1

    def update_complete(self, idx, accu_reward):
        """Record the return of traversal ``idx`` and forget the entry.

        Bumps the completed-visit count of the recorded action, adds
        ``accu_reward`` to its ``Q_values`` entry and feeds it to the
        moving-average calculator. The first completed update of an
        action also bumps ``updated_node_count``.

        Returns:
            ``accu_reward`` unchanged (no discounting or per-step
            reward is applied here).

        Raises:
            RuntimeError: If ``idx`` has no recorded action.
            ValueError: If the recorded action is not a known
                ``Action`` member.
        """
        if idx not in self.traverse_history:
            raise RuntimeError(
                "idx {} should be in traverse_history".format(idx))

        action_taken = self._action_index(self.traverse_history.pop(idx))

        if self.children_completed_visit_count[action_taken] == 0:
            self.updated_node_count += 1

        self.children_completed_visit_count[action_taken] += 1
        self.Q_values[action_taken] += accu_reward

        self.moving_aveg_calculator.add_number(accu_reward)

        return accu_reward

    def add_child(self, action, child_state, checkpoint_idx,
                  prior_prob=None):
        """Return the child for ``action``, creating it if missing.

        Args:
            action: An ``Action`` member; it is mapped to a slot index.
            child_state: State for a newly created child.
            checkpoint_idx: Checkpoint index for a newly created child.
            prior_prob: Prior for a newly created child.

        An existing child is returned untouched; the other arguments
        are ignored in that case.

        Raises:
            ValueError: If ``action`` is not a known ``Action`` member.
        """
        action = self._action_index(action)

        if self.children[action] is not None:
            node = self.children[action]
        else:
            node = WU_UCTnode(
                action_n=self.action_n,
                state=child_state,
                checkpoint_idx=checkpoint_idx,
                parent=self,
                tree=self.tree,
                prior_prob=prior_prob,
            )
            self.children[action] = node

        return node

    @staticmethod
    def categorical(pvals):
        """Draw an index from the categorical distribution ``pvals``.

        ``pvals`` must expose ``.size`` and integer indexing (e.g. a
        1-D NumPy array). The last index is returned if the running
        subtraction never falls below an entry, which covers
        probabilities that sum to slightly less than 1.
        """
        num = np.random.random()
        for i in range(pvals.size):
            if num < pvals[i]:
                return i
            num -= pvals[i]

        return pvals.size - 1
class UCTnode:
    """Search-tree node with one visit counter per action.

    ``children_visit_count`` and ``Q_values`` are both advanced by
    :meth:`update`, so a child's mean return is
    ``Q_values[a] / children_visit_count[a]``.
    """

    def __init__(self, action_n, state, checkpoint_idx, parent, tree,
                 prior_prob=None, is_head=False, allowed_actions=None):
        """Create a node with empty statistics.

        Args:
            action_n: Number of action slots; every per-action list gets
                this length.
            state: State object held by the node (stored as-is).
            checkpoint_idx: Stored as-is on the node.
            parent: Parent node (stored as-is).
            tree: Owning tree, or ``None``. When given, its
                ``max_width`` is copied; otherwise ``max_width`` is 0.
                :meth:`update` also reads ``tree.gamma``.
            prior_prob: Optional prior over actions. When ``None`` a
                uniform float32 vector of length ``action_n`` is used.
            is_head: When true, :meth:`all_child_visited` compares
                against the number of (allowed) actions instead of
                ``max_width``.
            allowed_actions: Optional collection of action indices that
                restricts expansion and selection at this node.
        """
        self.action_n = action_n
        self.state = state
        self.checkpoint_idx = checkpoint_idx
        self.parent = parent
        self.tree = tree
        self.is_head = is_head
        self.allowed_actions = allowed_actions

        if tree is not None:
            self.max_width = tree.max_width
        else:
            self.max_width = 0

        self.children = [None for _ in range(self.action_n)]
        self.rewards = [0.0 for _ in range(self.action_n)]
        self.dones = [False for _ in range(self.action_n)]
        self.children_visit_count = [0 for _ in range(self.action_n)]
        # Sum of returns recorded per action.
        self.Q_values = [0 for _ in range(self.action_n)]
        self.visit_count = 0

        if prior_prob is not None:
            self.prior_prob = prior_prob
        else:
            self.prior_prob = (
                np.ones([self.action_n], dtype=np.float32) / self.action_n
            )

        # Record traverse history. update_history() replaces this with
        # an (action_taken, reward) tuple.
        self.traverse_history = list()
        # Number of distinct actions updated at least once.
        self.updated_node_count = 0
        # Moving average calculator, fed by update().
        self.moving_aveg_calculator = MovingAvegCalculator(window_length=500)

    def no_child_available(self):
        """Return True when no action has been updated yet."""
        return self.updated_node_count == 0

    def all_child_visited(self):
        """Return True when the updated-action count has hit its target.

        For a head node the target is ``len(allowed_actions)`` when a
        restriction is set, else ``action_n``. For any other node it is
        ``max_width``.
        """
        if self.is_head:
            if self.allowed_actions is None:
                return self.updated_node_count == self.action_n
            return self.updated_node_count == len(self.allowed_actions)
        return self.updated_node_count == self.max_width

    def _is_scoreable(self, action):
        """Return True if ``action`` has a child, is allowed, and has
        at least one recorded visit (so its mean return is defined)."""
        if self.children[action] is None:
            return False
        if (self.allowed_actions is not None
                and action not in self.allowed_actions):
            return False
        return self.children_visit_count[action] != 0

    def select_action(self):
        """Select an action according to the tree policy.

        Score per expanded, allowed action::

            Q / visits + std * sqrt(ln(visit_count) / visits)

        where ``std`` comes from the moving-average calculator.
        Children with no recorded visit are skipped; previously they
        raised ``ZeroDivisionError``.

        Returns:
            Index of the best-scoring action, or 0 when no child could
            be scored.
        """
        best_score = -10000.0
        best_action = 0

        for action in range(self.action_n):
            if not self._is_scoreable(action):
                continue

            visits = self.children_visit_count[action]
            exploit_score = self.Q_values[action] / visits
            explore_score = math.sqrt(
                1.0 * math.log(self.visit_count) / visits)
            score_std = self.moving_aveg_calculator.get_standard_deviation()
            score = exploit_score + score_std * explore_score

            if score > best_score:
                best_score = score
                best_action = action

        return best_action

    def max_utility_action(self):
        """Return the action with the highest mean return.

        Applies the same filters as :meth:`select_action`: actions
        outside ``allowed_actions`` (when set) and children with no
        recorded visit are skipped. The latter previously raised
        ``ZeroDivisionError``.

        Returns:
            Index of the best action, or 0 when no child could be
            scored.
        """
        best_score = -10000.0
        best_action = 0

        for action in range(self.action_n):
            if not self._is_scoreable(action):
                continue

            score = self.Q_values[action] / self.children_visit_count[action]

            if score > best_score:
                best_score = score
                best_action = action

        return best_action

    def select_expand_action(self):
        """Draw an action to expand, preferring unexpanded slots.

        Draw source per attempt:
          * with ``allowed_actions`` set: ``random.choice`` over it;
          * otherwise the prior (via :meth:`categorical`) for the first
            20 attempts, then a uniform integer in ``[0, action_n)``.

        An action is accepted as soon as it has no child. During the
        first 10 attempts, already-visited actions are rejected before
        that check. After more than 100 attempts the current draw is
        returned regardless, so the loop always terminates.
        """
        count = 0

        while True:
            if self.allowed_actions is None:
                if count < 20:
                    action = self.categorical(self.prior_prob)
                else:
                    action = np.random.randint(0, self.action_n)
            else:
                action = random.choice(self.allowed_actions)

            if count > 100:
                # Give up looking for an unexpanded slot.
                return action

            if self.children_visit_count[action] > 0 and count < 10:
                count += 1
                continue

            if self.children[action] is None:
                return action

            count += 1

    def update_history(self, action_taken, reward):
        """Remember the last action taken from this node and its reward.

        Overwrites any previously stored pair.
        """
        self.traverse_history = (action_taken, reward)

    def update(self, accu_reward):
        """Back up a return through the action stored by update_history.

        Computes ``reward + tree.gamma * accu_reward`` using the stored
        reward, adds it to the action's ``Q_values`` entry, bumps the
        action's and the node's visit counts and feeds the value to the
        moving-average calculator. The first update of an action also
        bumps ``updated_node_count``.

        Returns:
            The discounted return computed above, for the caller to
            pass further up.
        """
        action_taken = self.traverse_history[0]
        reward = self.traverse_history[1]

        accu_reward = reward + self.tree.gamma * accu_reward

        if self.children_visit_count[action_taken] == 0:
            self.updated_node_count += 1

        self.children_visit_count[action_taken] += 1
        self.Q_values[action_taken] += accu_reward

        self.visit_count += 1

        self.moving_aveg_calculator.add_number(accu_reward)

        return accu_reward

    def add_child(self, action, child_state, checkpoint_idx, prior_prob):
        """Return the child at index ``action``, creating it if missing.

        An existing child is returned untouched; the other arguments
        are ignored in that case. A new child shares this node's
        ``action_n`` and ``tree`` and gets no ``allowed_actions``.
        """
        if self.children[action] is not None:
            node = self.children[action]
        else:
            node = UCTnode(
                action_n=self.action_n,
                state=child_state,
                checkpoint_idx=checkpoint_idx,
                parent=self,
                tree=self.tree,
                prior_prob=prior_prob,
            )
            self.children[action] = node

        return node

    @staticmethod
    def categorical(pvals):
        """Draw an index from the categorical distribution ``pvals``.

        ``pvals`` must expose ``.size`` and integer indexing (e.g. a
        1-D NumPy array). The last index is returned if the running
        subtraction never falls below an entry, which covers
        probabilities that sum to slightly less than 1.
        """
        num = np.random.random()
        for i in range(pvals.size):
            if num < pvals[i]:
                return i
            num -= pvals[i]

        return pvals.size - 1