def act(self, player_index: int, information_state: InformationState,
        available_actions: 'Iterable[int]') -> int:
    """Sample an action from the policy network and buffer the transition.

    The brain scores every (state, one-hot action) pair, the scores are
    normalised into a probability distribution, and one action is sampled
    from it.  A transition stub (state, action, reward=0, terminal=False)
    is appended to the trajectory buffer so the training loop can fill in
    the reward later.

    :param player_index: index of the acting player (unused here).
    :param information_state: the acting player's view of the game.
    :param available_actions: ids of the legal actions.
    :return: the sampled action id.
    """
    # FIX: materialize the iterable once so len() and np.random.choice
    # are safe even when a generator is passed (consistent with the
    # other agents in this file).
    available_actions = list(available_actions)
    vectorized_states = np.array(
        [information_state.vectorize()] * len(available_actions))
    actions_vectorized = np.array([
        to_categorical(action, self.action_size)
        for action in available_actions
    ])
    logits = self.brain.predict_policies(vectorized_states,
                                         actions_vectorized)
    # FIX: the original bound the builtin name `sum`; use a descriptive
    # local instead of shadowing it.
    logits_sum = np.sum(logits)
    probabilities = np.reshape(logits / logits_sum,
                               (len(available_actions),))
    chosen_action = np.random.choice(available_actions, p=probabilities)
    transition = dict()
    transition['s'] = information_state.vectorize()
    transition['a'] = to_categorical(chosen_action, self.action_size)
    transition['r'] = 0.0
    transition['t'] = False
    self.current_trajectory_buffer.append(transition)
    return chosen_action
def act(self, player_index: int, information_state: InformationState,
        available_actions: 'Iterable[int]') -> int:
    """Choose an action with MCTS guided by the brain's value predictor.

    Runs ``self.iteration_count`` select/expand/evaluate/backpropagate
    iterations on trees rooted at the current information state, then
    returns the action id of the child with the highest visit count.
    Also finalizes the transition opened by the previous call (we are
    acting again, so that step was evidently not terminal) and opens a
    new transition for the step taken now.

    :param player_index: index of the acting player.
    :param information_state: the acting player's view of the game.
    :param available_actions: legal action ids (not used directly —
        expansion re-queries the simulated game state instead).
    :return: the most-visited child action's id.
    """
    # Close out the pending transition from the previous act() call.
    if self.current_transition:
        self.current_transition['terminal'] = False
        self.current_trajectory.append(self.current_transition)
        self.current_transition = None
    for i in range(self.iteration_count):
        # Nodes visited this iteration, keyed by player id (filled by
        # add_visited_node during select/expand).
        self.current_iteration_selected_nodes = {}
        gs = information_state.create_game_state_from_information_state()
        # SELECT: descend the per-player trees to a leaf or a terminal
        # simulated state.
        gs, info_state, current_player, terminal = self.select(gs)
        if not terminal:
            # EXPAND: create the leaf's child entries, pick one uniformly
            # at random and play it in the simulated game state.
            node = self.current_trees[current_player][info_state]
            available_actions = gs.get_available_actions_id_for_player(
                current_player)
            node['a'] = [{
                'n': 0,  # visit count of this child
                'r': 0,  # cumulative backed-up score
                'action_id': action_id
            } for action_id in available_actions]
            child_action = random.choice(node['a'])
            action_to_execute = child_action['action_id']
            self.add_visited_node(node, child_action, current_player)
            gs, reward, terminal = gs.step(current_player,
                                           action_to_execute)
        # EVALUATE: score the reached information state with the value
        # network (per-player scores, indexed by player id below).
        scores = self.brain.predict_state(info_state.vectorize())
        # BACKPROPAGATE SCORE: update counts/rewards along every
        # player's selection path, leaf to root.
        for player_id in self.current_iteration_selected_nodes.keys():
            visited_nodes = self.current_iteration_selected_nodes[
                player_id]
            for node, child_action in reversed(visited_nodes):
                node['nprime'] += 1
                child_action['n'] += 1
                child_action['r'] += scores[player_id]
    # Final move: most-visited child of the first node selected in the
    # last iteration (presumably the root — confirm against select()).
    child_action = max(
        self.current_iteration_selected_nodes[player_index][0][0]['a'],
        key=lambda child: child['n'])
    # Open a transition for this step; reward/terminal are filled in by a
    # later call (or by the end-of-episode handler).
    self.current_transition = {
        's': information_state.vectorize(),
        'r': 0,
        'player_index': player_index,
        'terminal': False
    }
    return child_action['action_id']
def act(self, player_index: int, information_state: InformationState,
        available_actions: 'Iterable[int]') -> int:
    """Epsilon-greedy action selection for the Q-learning agent.

    Builds one (state, one-hot action) pair per legal action, completes
    the transition pending from the previous step (triggering a learning
    update), then picks either the argmax-Q action or a uniformly random
    one depending on epsilon.  The chosen pair is stashed on ``self`` so
    the next call can finish the transition.

    :param player_index: index of the acting player (unused here).
    :param information_state: the acting player's view of the game.
    :param available_actions: ids of the legal actions.
    :return: the chosen action id.
    """
    actions = list(available_actions)
    state_vec = information_state.vectorize()
    # One copy of the state per legal action, aligned with the one-hot
    # encodings of those actions.
    state_batch = np.array([state_vec] * len(actions))
    action_batch = np.array(
        [keras.utils.to_categorical(a, self.action_size) for a in actions])
    # A previous step is pending: attach the successor information and
    # run a learning update before acting.
    if self.s is not None:
        self.s_next_duplicated = state_batch
        self.s_next_available_actions = action_batch
        self.t = False
        self.learn()
    # Explore with probability epsilon, otherwise exploit the Q-network.
    explore = random.random() <= self.epsilon
    if explore:
        chosen = random.randint(0, len(actions) - 1)
    else:
        q_values = self.Q.predict([state_batch, action_batch]).flatten()
        chosen = q_values.argmax()
    # Remember the (state, action) pair for the next transition.
    self.s = state_batch[chosen]
    self.a = action_batch[chosen]
    return actions[chosen]
def act(self, player_index: int, information_state: InformationState, available_actions: 'Iterable[int]') -> int: available_actions = list(available_actions) num_actions = len(available_actions) vectorized_state = information_state.vectorize() full_actions_probability, value = self.brain.predict_policy_and_value(vectorized_state) available_actions_probabilities = full_actions_probability[available_actions] sum_available_action_probabilities = np.sum(available_actions_probabilities) if sum_available_action_probabilities > 0.0000001: # just in case all is zero, but unlikely probabilities = available_actions_probabilities / sum_available_action_probabilities chosen_index = np.random.choice(list(range(num_actions)), p=probabilities) chosen_action = available_actions[chosen_index] else: print("No action eligible, this should be extremely rare") chosen_index = np.random.choice(list(range(num_actions))) chosen_action = available_actions[chosen_index] transition = dict() transition['s'] = vectorized_state transition['a'] = chosen_action transition['r'] = 0.0 transition['t'] = False transition['p_old'] = full_actions_probability.tolist() self.current_trajectory_buffer.append(transition) return chosen_action
def act(self, player_index: int, information_state: InformationState, available_actions: 'Iterable[int]') -> int: actions = tuple(available_actions) action_count = len(actions) action_scores = np.zeros(action_count) for i in range(action_count): gs = information_state.create_game_state_from_information_state() (result_gs, score, terminal) = gs.step(player_index, actions[i]) # Two player zero sum game hypothesis player_score = (1 if player_index == 0 else -1) * score if not terminal: history = self.runner.run(self.num_rollouts_per_available_action, gs) player_score += history[player_index] - history[(player_index + 1) % 2] player_score = player_score / (1.0 if terminal else self.num_rollouts_per_available_action) action_scores[i] = player_score return actions[np.argmax(action_scores)]
def act(self, player_index: int, information_state: InformationState,
        available_actions: 'Iterable[int]') -> int:
    """Choose an action with MCTS using random-rollout evaluation.

    Runs ``self.iteration_count`` select/expand/evaluate/backpropagate
    iterations on trees rooted at the current information state, then
    returns the action id of the child with the highest visit count.

    :param player_index: index of the acting player.
    :param information_state: the acting player's view of the game.
    :param available_actions: legal action ids (not used directly —
        expansion re-queries the simulated game state instead).
    :return: the most-visited child action's id.
    """
    for i in range(self.iteration_count):
        # Nodes visited this iteration, keyed by player id (filled by
        # add_visited_node during select/expand).
        self.current_iteration_selected_nodes = {}
        gs = information_state.create_game_state_from_information_state()
        # SELECT: descend the per-player trees to a leaf or a terminal
        # simulated state.
        gs, info_state, current_player, terminal = self.select(gs)
        if not terminal:
            # EXPAND: create the leaf's child entries, pick one uniformly
            # at random and play it in the simulated game state.
            node = self.current_trees[current_player][info_state]
            available_actions = gs.get_available_actions_id_for_player(
                current_player)
            node['a'] = [{
                'n': 0,  # visit count of this child
                'r': 0,  # cumulative backed-up score
                'action_id': action_id
            } for action_id in available_actions]
            child_action = random.choice(node['a'])
            action_to_execute = child_action['action_id']
            self.add_visited_node(node, child_action, current_player)
            gs, reward, terminal = gs.step(current_player,
                                           action_to_execute)
        # EVALUATE: play out from the reached state via the runner
        # (per-player scores, indexed by player id below).
        scores = self.runner.run(initial_game_state=gs, max_rounds=1)
        # BACKPROPAGATE SCORE: update counts/rewards along every
        # player's selection path, leaf to root.
        for player_id in self.current_iteration_selected_nodes.keys():
            visited_nodes = self.current_iteration_selected_nodes[
                player_id]
            for node, child_action in reversed(visited_nodes):
                node['nprime'] += 1
                child_action['n'] += 1
                child_action['r'] += scores[player_id]
    # Final move: most-visited child of the first node selected in the
    # last iteration (presumably the root — confirm against select()).
    child_action = max(
        self.current_iteration_selected_nodes[player_index][0][0]['a'],
        key=lambda child: child['n'])
    return child_action['action_id']
def act(self, player_index: int, information_state: InformationState,
        available_actions: 'Iterable[int]') -> int:
    """Act with MCTS while harvesting training data, or with the trained
    model during evaluation episodes.

    Phases are driven by ``self.current_episode``: for the first
    ``training_episodes`` the agent plays with MCTS and appends every
    (vectorized state, one-hot chosen action) pair to ``self.X`` /
    ``self.Y``; past that threshold it acts greedily from
    ``self.model``'s predicted probabilities.  Once the counter exceeds
    training + evaluation episodes, the dataset and counter are reset.

    :param player_index: index of the acting player.
    :param information_state: the acting player's view of the game.
    :param available_actions: legal action ids.
    :return: the chosen action id.
    """
    available_actions = list(available_actions)
    # Phase rollover: past training + evaluation, drop the collected
    # dataset and restart the episode counter.
    if self.evaluation_episodes + self.training_episodes < self.current_episode:
        self.X = []
        self.Y = []
        self.current_episode = 0
    # Evaluation phase: act greedily from the learned policy model.
    if self.current_episode > self.training_episodes:
        probs = self.model.predict(
            np.array([information_state.vectorize()]))[0]
        available_probs = probs[np.array(available_actions)]
        probs_sum = np.sum(available_probs)
        if (probs_sum > 0.001):
            # Greedy pick among the legal actions' probabilities.
            chosen_action_index = np.argmax(available_probs)
            action = available_actions[chosen_action_index]
        else:
            # Degenerate model output: fall back to a uniform choice.
            action = random.choice(available_actions)
        return action
    # Training phase: plain MCTS with rollout evaluation.
    for i in range(self.iteration_count):
        # Nodes visited this iteration, keyed by player id (filled by
        # add_visited_node during select/expand).
        self.current_iteration_selected_nodes = {}
        gs = information_state.create_game_state_from_information_state()
        # SELECT: descend the per-player trees to a leaf or a terminal
        # simulated state.
        gs, info_state, current_player, terminal = self.select(gs)
        if not terminal:
            # EXPAND: create the leaf's child entries, pick one uniformly
            # at random and play it in the simulated game state.
            node = self.current_trees[current_player][info_state]
            available_actions = gs.get_available_actions_id_for_player(
                current_player)
            node['a'] = [{
                'n': 0,  # visit count of this child
                'r': 0,  # cumulative backed-up score
                'action_id': action_id
            } for action_id in available_actions]
            child_action = random.choice(node['a'])
            action_to_execute = child_action['action_id']
            self.add_visited_node(node, child_action, current_player)
            gs, reward, terminal = gs.step(current_player,
                                           action_to_execute)
        # EVALUATE: play out from the reached state via the runner
        # (per-player scores, indexed by player id below).
        scores = self.runner.run(initial_game_state=gs, max_rounds=1)
        # BACKPROPAGATE SCORE: update counts/rewards along every
        # player's selection path, leaf to root.
        for player_id in self.current_iteration_selected_nodes.keys():
            visited_nodes = self.current_iteration_selected_nodes[
                player_id]
            for node, child_action in reversed(visited_nodes):
                node['nprime'] += 1
                child_action['n'] += 1
                child_action['r'] += scores[player_id]
    # Final move: most-visited child of the first node selected in the
    # last iteration (presumably the root — confirm against select()).
    child_action = max(
        self.current_iteration_selected_nodes[player_index][0][0]['a'],
        key=lambda child: child['n'])
    # Record the MCTS decision as a supervised training example.
    self.X.append(information_state.vectorize().tolist())
    self.Y.append(
        to_categorical(child_action['action_id'], self.action_size))
    return child_action['action_id']