def evaluate_state(self, state: State, list_of_actions: List[Action] = None) -> float:
    # Value of the state minus the value of the same state with the active player
    # switched, so the score is always taken from the active player's perspective.
    inversed_state = StateAsDict(state).to_state()
    inversed_state.change_active_player()
    return self.value_function.evaluate(state) - self.value_function.evaluate(inversed_state)
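# Illustrative sketch (not from the repository): the construction above,
# f(s) = V(s) - V(s with the active player switched), is antisymmetric by design,
# so evaluating the swapped state flips the sign. The toy value function and the
# dict-based "state" below are hypothetical stand-ins for State / ValueFunction.
def _swap(state):
    return {'p0_points': state['p1_points'], 'p1_points': state['p0_points']}

def _toy_value(state):
    return 0.6 * state['p0_points'] + 0.1 * state['p1_points']

def _toy_evaluate(state):
    return _toy_value(state) - _toy_value(_swap(state))

s = {'p0_points': 5, 'p1_points': 2}
assert abs(_toy_evaluate(s) + _toy_evaluate(_swap(s))) < 1e-9  # f(swap(s)) == -f(s)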
def choose_act(self, mode, info=False):
    current_state_as_dict = StateAsDict(self.env.current_state_of_the_game)
    list_of_actions = self.env.action_space.list_of_actions
    if list_of_actions:
        best_action = None
        best_action_value = -float('inf')
        for action in list_of_actions:
            # Evaluate each action on a fresh copy of the current state.
            state_copy = current_state_as_dict.to_state()
            action.execute(state_copy)
            current_value = self.model.get_value(state_copy)
            if current_value > best_action_value:
                best_action_value = current_value
                best_action = action
        return (best_action, best_action_value) if info else best_action
    else:
        return (None, -1) if info else None
def generate_all_tree_data_main_track(self,
                                      confidence_threshold: float = 0.1,
                                      count_threshold: int = 6,
                                      confidence_limit: int = 2):
    self.clean_memory()
    print('Collecting tree data.')
    X = []
    Y = []
    vertex = self.last_vertex
    if len(vertex.children) > 0:
        # Add one randomly chosen child of the last vertex, together with the
        # player-swapped copy of its state labelled with the negated value.
        child_idx = random.randint(0, len(vertex.children) - 1)
        child_state = StateAsDict(vertex.children[child_idx].return_state()).to_state()
        child_state_inversed = StateAsDict(vertex.children[child_idx].return_state()).to_state()
        child_state_inversed.change_active_player()
        X.append(child_state)
        Y.append(vertex.children[child_idx].value_acc.get())
        X.append(child_state_inversed)
        Y.append(-vertex.children[child_idx].value_acc.get())
    # Walk back along the main track from the last vertex to the root.
    while vertex.parent is not None:
        vertex_state = StateAsDict(vertex.return_state()).to_state()
        vertex_state_inversed = StateAsDict(vertex.return_state()).to_state()
        vertex_state_inversed.change_active_player()
        vertex_value = vertex.value_acc.get()
        X.append(vertex_state)
        Y.append(vertex_value)
        X.append(vertex_state_inversed)
        Y.append(-vertex_value)
        vertex = vertex.parent
    return {'state': X, 'mcts_value': Y}
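# Illustrative sketch (not from the repository): the data collection above augments
# every sample with the player-swapped state labelled by the negated value, keeping
# the targets consistent with "score from the active player's perspective".
# The swap function and the pair-based samples below are hypothetical stand-ins.
def _swap(sample):
    return (sample[1], sample[0])

X, Y = [], []
for sample, mcts_value in [((5, 2), 0.8), ((1, 7), -0.5)]:
    X.append(sample)
    Y.append(mcts_value)
    X.append(_swap(sample))
    Y.append(-mcts_value)

print(list(zip(X, Y)))
# [((5, 2), 0.8), ((2, 5), -0.8), ((1, 7), -0.5), ((7, 1), 0.5)]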
def choose_act(self, mode, info=False):
    current_state_as_dict = StateAsDict(self.env.current_state_of_the_game)
    list_of_actions = self.env.action_space.list_of_actions
    if list_of_actions:
        best_action = None
        best_action_value = -float('inf')
        for action in list_of_actions:
            # Execute the action on a copy of the state, switch the active player
            # and score the resulting position with the evaluator.
            state_copy = current_state_as_dict.to_state()
            action.execute(state_copy)
            state_copy.change_active_player()
            current_value = self.evaluator.evaluate(state_copy)
            if current_value > best_action_value:
                best_action_value = current_value
                best_action = action
        return (best_action, best_action_value) if info else best_action
    else:
        return (None, -1) if info else None
def choose_act(self, mode):
    # Epsilon-greedy action selection with a slowly decaying exploration rate.
    self.epsilon = self.epsilon * 0.999995
    current_state_as_dict = StateAsDict(self.env.current_state_of_the_game)
    list_of_actions = self.env.action_space.list_of_actions

    if not self.explore:
        if self.info:
            return self.model.choose_best_action_with_q_value(current_state_as_dict, list_of_actions)
        return self.model.choose_best_action(current_state_as_dict, list_of_actions)

    p = np.random.uniform(0, 1)
    best_action, best_eval = self.model.choose_best_action_with_q_value(current_state_as_dict, list_of_actions)
    if p >= self.epsilon:
        # Exploit: take the greedy action.
        actual_action = best_action
        actual_eval = best_eval
    else:
        # Explore: take a random legal action, if any exist.
        if list_of_actions:
            actual_action = random.choice(list_of_actions)
            actual_eval = self.model.get_q_value(current_state_as_dict, actual_action)[0]
        else:
            actual_action = None
            actual_eval = -1
            best_eval = -1
    return actual_action, actual_eval, best_eval
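# Illustrative sketch (not from the repository): with the per-call decay factor
# 0.999995 used above, epsilon shrinks very slowly. The snippet below is a
# back-of-the-envelope check of how many choose_act calls it takes for epsilon
# to halve; it is not part of the agent.
import math

decay = 0.999995
halving_calls = math.log(0.5) / math.log(decay)
print(f'epsilon halves roughly every {halving_calls:,.0f} calls')  # ~138,629 calls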
def choose_act(self, mode) -> Action:
    # First we load the observation into the private environment.
    current_points = self.env.current_state_of_the_game.active_players_hand().number_of_my_points()
    if len(self.env.action_space.list_of_actions) > 0:
        actions = []
        potential_reward_max = self.action_to_avoid
        numerator = self.depth - 1
        primary_state = StateAsDict(self.env.current_state_of_the_game)
        self.env_dict[numerator] = StateAsDict(self.env.current_state_of_the_game)
        for action in self.env.action_space.list_of_actions:
            ae = action.evaluate(self.env.current_state_of_the_game)
            # Linear heuristic: win-threshold indicator plus weighted action features.
            potential_reward = (np.floor((current_points + ae["card"][2]) / POINTS_TO_WIN) * self.weight[0] +
                                self.weight[1] * ae["card"][2] + self.weight[2] * ae["nobles"] +
                                self.weight[3] * ae["card"][0] + self.weight[4] * sum(ae["gems_flow"]))
            # Subtract the recursively estimated value of the best reply, discounted by decay.
            potential_reward -= self.decay * self.deep_evaluation(action, numerator - 1, mode)
            self.restore_env(numerator)
            if self.collect_stats:
                self.stats_dataframe = self.stats_dataframe.append(
                    {'state': primary_state,
                     'action': action.to_dict(),
                     'evaluation': potential_reward},
                    ignore_index=True)
                self.stats_dataframe_vectorized = self.stats_dataframe_vectorized.append(
                    {'state_vector': vectorize_state(primary_state),
                     'action_vector': vectorize_action(action),
                     'evaluation': potential_reward},
                    ignore_index=True)
            if potential_reward > potential_reward_max:
                potential_reward_max = potential_reward
                actions = [action]
            elif potential_reward == potential_reward_max:
                actions.append(action)
        self.env.reset()
        self.env.load_state_from_dict(self.env_dict[numerator])
        # Break ties between equally scored actions at random.
        return random.choice(actions)
    else:
        return None
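# Illustrative sketch (not from the repository): a worked instance of the linear
# heuristic used above. The weights, the POINTS_TO_WIN value and the action
# evaluation dict below are hypothetical; only the shape of the formula mirrors
# the code (a win-threshold indicator plus a weighted sum of action features).
import numpy as np

POINTS_TO_WIN = 15                      # assumed threshold
weight = [100.0, 2.0, 10.0, 1.5, 0.1]   # hypothetical weights
current_points = 13
ae = {'card': [1, 0, 3], 'nobles': 0, 'gems_flow': [0, -1, -1, 0, -2, 0]}

potential_reward = (np.floor((current_points + ae['card'][2]) / POINTS_TO_WIN) * weight[0]
                    + weight[1] * ae['card'][2] + weight[2] * ae['nobles']
                    + weight[3] * ae['card'][0] + weight[4] * sum(ae['gems_flow']))
# floor(16/15)*100 + 2*3 + 10*0 + 1.5*1 + 0.1*(-4) = 100 + 6 + 0 + 1.5 - 0.4 = 107.1
print(potential_reward)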
class DeterministicObservation(SplendorObservation):
    def __init__(self, state: State):
        super().__init__('deterministic')
        self.observation_dict = StateAsDict(state)

    def recreate_state(self):
        return self.observation_dict.to_state()
def deep_evaluation(self, action, numerator, mode):
    self.get_temp_env(action, numerator, mode)
    if numerator > 1:
        current_points = self.env.current_state_of_the_game.active_players_hand().number_of_my_points()
        self.env_dict[numerator] = StateAsDict(self.env.current_state_of_the_game)
        if len(self.env.action_space.list_of_actions) > 0:
            potential_reward_list = []
            for action in self.env.action_space.list_of_actions:
                ae = action.evaluate(self.env.current_state_of_the_game)
                potential_reward = (np.floor((current_points + ae["card"][2]) / POINTS_TO_WIN) * self.weight[0] +
                                    self.weight[1] * ae["card"][2] + self.weight[2] * ae["nobles"] +
                                    self.weight[3] * ae["card"][0] + self.weight[4] * sum(ae["gems_flow"]))
                # Recurse one level deeper; the sign alternates with depth so consecutive
                # levels are evaluated from opposite players' perspectives.
                potential_reward -= self.decay * self.deep_evaluation(action, numerator - 1, mode) * pow(-1, self.depth - numerator + 1)
                potential_reward_list.append(potential_reward)
                self.restore_env(numerator)
            self.restore_env(numerator + 1)
            reward = max(potential_reward_list)
        else:
            reward = self.action_to_avoid
    else:
        reward = self.evaluate_actions()
    return reward
def choose_act(self, mode) -> Action:
    # First we load the observation into the private environment.
    current_points = self.env.current_state_of_the_game.active_players_hand().number_of_my_points()
    if len(self.env.action_space.list_of_actions) > 0:
        actions = []
        points = []
        numerator = self.depth - 1
        self.env_dict[numerator] = StateAsDict(self.env.current_state_of_the_game)
        # Shallow pass: score every legal action with the linear heuristic.
        for action in self.env.action_space.list_of_actions:
            ae = action.evaluate(self.env.current_state_of_the_game)
            potential_reward = (np.floor((current_points + ae["card"][2]) / POINTS_TO_WIN) * self.weight[0] +
                                self.weight[1] * ae["card"][2] + self.weight[2] * ae["nobles"] +
                                self.weight[3] * ae["card"][0] + self.weight[4] * sum(ae["gems_flow"]))
            points.append(potential_reward)
            actions.append(action)
        # Keep only the self.breadth best-scoring actions.
        values = set(points)
        if len(values) >= self.breadth:
            actions = [actions[i] for i, point in enumerate(points)
                       if point >= sorted(values)[-self.breadth]]
        if len(actions) > 1:
            # Deep pass: re-score the shortlisted actions with a recursive look-ahead.
            actions_ = []
            points_ = []
            for action in actions:
                ae = action.evaluate(self.env.current_state_of_the_game)
                potential_reward = (np.floor((current_points + ae["card"][2]) / POINTS_TO_WIN) * self.weight[0] +
                                    self.weight[1] * ae["card"][2] + self.weight[2] * ae["nobles"] +
                                    self.weight[3] * ae["card"][0] + self.weight[4] * sum(ae["gems_flow"]))
                potential_reward -= self.decay * self.deep_evaluation(action, numerator - 1, mode)
                points_.append(potential_reward)
                actions_.append(action)
                self.restore_env(numerator)
            actions = [actions_[i] for i, point in enumerate(points_)
                       if point >= sorted(set(points_))[-1]]
        self.env.reset()
        self.env.load_state_from_dict(self.env_dict[numerator])
        return random.choice(actions)
    else:
        return None
def generate_all_tree_data(self):
    self.clean_memory()
    # BFS over the search tree, starting from the root.
    kiu = [self.root]
    print('Collecting tree data.')
    while len(kiu) > 0:
        # take first:
        node_to_eval = kiu.pop(0)
        if node_to_eval.value_acc.count() > 0:
            for child in node_to_eval.children:
                if child.value_acc.count() > 0:
                    kiu.append(child)
                    child_state_as_dict = StateAsDict(child.return_state())
                    self.stats_dataframe = self.stats_dataframe.append(
                        {'state': child_state_as_dict.to_state(),
                         'mcts_value': child.value_acc.get()},
                        ignore_index=True)
    return self.stats_dataframe
def generate_all_tree_data_as_list(self,
                                   confidence_threshold: float = 0.1,
                                   count_threshold: int = 6,
                                   confidence_limit: int = 2):
    self.clean_memory()
    # BFS over the search tree, starting from the root.
    kiu = [self.root]
    confidence_count = 0
    print('Collecting tree data.')
    X = []
    Y = []
    while len(kiu) > 0:
        # take first:
        node_to_eval = kiu.pop(0)
        if node_to_eval.value_acc.count() > 0:
            for child in node_to_eval.children:
                if child.value_acc.get_confidence() >= confidence_threshold:
                    confidence_count += 1
                # Keep a child if it is confident enough (up to confidence_limit such
                # nodes) or if it has been visited at least count_threshold times.
                if (child.value_acc.get_confidence() >= confidence_threshold
                        and confidence_count <= confidence_limit) \
                        or child.value_acc.count() >= count_threshold:
                    kiu.append(child)
                    child_state_as_dict = StateAsDict(child.return_state())
                    current_state = child_state_as_dict.to_state()
                    current_state_inversed = child_state_as_dict.to_state()
                    current_state_inversed.change_active_player()
                    current_value = child.value_acc.get()
                    X.append(current_state)
                    Y.append(current_value)
                    X.append(current_state_inversed)
                    Y.append(-current_value)
    return {'state': X, 'mcts_value': Y}
def eval_leaf(self, state: State) -> float:
    mode = "deterministic"
    self.env.update_actions_light()
    current_points = self.env.current_state_of_the_game.active_players_hand().number_of_my_points()
    if len(self.env.action_space.list_of_actions) > 0:
        actions = []
        points = []
        numerator = self.depth - 1
        self.env_dict[numerator] = StateAsDict(self.env.current_state_of_the_game)
        # Shallow pass: score every legal action with the linear heuristic.
        for action in self.env.action_space.list_of_actions:
            ae = action.evaluate(self.env.current_state_of_the_game)
            potential_reward = (np.floor((current_points + ae["card"][2]) / POINTS_TO_WIN) * self.weight[0] +
                                self.weight[1] * ae["card"][2] + self.weight[2] * ae["nobles"] +
                                self.weight[3] * ae["card"][0] + self.weight[4] * sum(ae["gems_flow"]))
            points.append(potential_reward)
            actions.append(action)
        # Keep only the self.breadth best-scoring actions.
        values = set(points)
        if len(values) >= self.breadth:
            actions = [actions[i] for i, point in enumerate(points)
                       if point >= sorted(values)[-self.breadth]]
        if len(actions) > 1:
            # Deep pass: re-score the shortlisted actions with a recursive look-ahead.
            points_ = []
            for action in actions:
                ae = action.evaluate(self.env.current_state_of_the_game)
                potential_reward = (np.floor((current_points + ae["card"][2]) / POINTS_TO_WIN) * self.weight[0] +
                                    self.weight[1] * ae["card"][2] + self.weight[2] * ae["nobles"] +
                                    self.weight[3] * ae["card"][0] + self.weight[4] * sum(ae["gems_flow"]))
                potential_reward -= self.decay * self.deep_evaluation(action, numerator - 1, mode)
                points_.append(potential_reward)
                self.restore_env(numerator)
            self.env.reset()
            self.env.load_state_from_dict(self.env_dict[numerator])
            return max(points_)
        # Only one candidate action left after filtering: fall back to the best shallow score.
        self.env.reset()
        self.env.load_state_from_dict(self.env_dict[numerator])
        return max(points)
    else:
        return -100
def evaluate_states(files_dir, dump_dir):
    # Re-label pickled states with the current value function and dump them again.
    evaluator = ValueFunction()
    list_of_files = os.listdir(files_dir)
    for file_name in list_of_files:
        with open(os.path.join(files_dir, file_name), 'rb') as f:
            X, _ = pickle.load(f)
        Y = []
        for x in X:
            state_to_eval = StateAsDict(x).to_state()
            Y.append(evaluator.evaluate(state_to_eval))
            del state_to_eval
        with open(os.path.join(dump_dir, file_name), 'wb') as f:
            pickle.dump((X, Y), f)
        print(len(X))
        del X
        del Y
def load_state_from_dict(self, state_as_dict: StateAsDict):
    self.current_state_of_the_game = state_as_dict.to_state()
    self.is_done = False
def dict_to_state(s_dict):
    s_as_dict = StateAsDict()
    s_as_dict.load_from_dict(s_dict)
    state_to_return = s_as_dict.to_state()
    state_to_return.active_player_id = 0
    return state_to_return
def choose_action(self, state: State) -> Action:
    list_of_actions = generate_all_legal_actions(state)
    return self.model.choose_best_action(StateAsDict(state), list_of_actions)
def run_one_duel(self,
                 list_of_agents: List[Agent],
                 starting_agent_id: int = 0,
                 render_game: bool = False) -> GameStatisticsDuels:
    """Runs one game between two agents.

    :param list_of_agents: List of agents to play; they will play in the order given by the list.
    :param starting_agent_id: Id of the agent who starts the game.
    :param render_game: If True, a GUI will appear showing the game.
    """
    # prepare the game
    self.env.reset()
    self.env.set_active_player(starting_agent_id)
    # set players' names:
    self.env.set_players_names([agent.name for agent in list_of_agents])
    is_done = False
    # set the initial agent id
    active_agent_id = starting_agent_id
    # set the initial observation
    full_state = self.env.current_state_of_the_game
    number_of_actions = 0
    results_dict = {}
    # id of the player who first reaches the number of points needed to win
    first_winner_id = None
    checked_all_players_after_first_winner = False
    previous_actions = [None]

    if render_game:
        self.env.render()
        time.sleep(GAME_INITIAL_DELAY)

    while number_of_actions < MAX_NUMBER_OF_MOVES and not (is_done and checked_all_players_after_first_winner):
        action = list_of_agents[active_agent_id].deterministic_choose_action(full_state, previous_actions)
        if action is None:
            print('None action by {}'.format(list_of_agents[active_agent_id].name))
            print('Current state of the game:')
            state_str = StateAsDict(self.env.current_state_of_the_game)
            print(state_str)
        previous_actions = [action]
        if render_game:
            self.env.render()
        full_state, reward, is_done, info = self.env.deterministic_step(action)
        if is_done:
            results_dict[list_of_agents[active_agent_id].my_name_with_id()] = \
                OneAgentStatistics(reward,
                                   self.env.points_of_player_by_id(active_agent_id),
                                   int(reward == 1))
            if first_winner_id is None:
                first_winner_id = active_agent_id
            # After the first winner appears, let every other player finish the round.
            checked_all_players_after_first_winner = active_agent_id == (first_winner_id - 1) % len(list_of_agents)
        active_agent_id = (active_agent_id + 1) % len(list_of_agents)
        number_of_actions += 1

    one_game_statistics = GameStatisticsDuels(list_of_agents)
    one_game_statistics.register_from_dict(results_dict)
    return one_game_statistics
        'name': 'RandomAgent - uniform_on_types '
    },
    'board': {
        'nobles_on_board': {104, 108, 102},
        'cards_on_board': {33, 4, 69, 71, 72, 41, 9, 10, 44, 79, 86, 26},
        'gems_on_board': [2, 4, 3, 3, 4, 3],
        'deck_order': [
            {'Row.CHEAP': [38, 0, 76, 18, 55, 19, 3, 40, 7, 43, 23, 39, 37, 42, 77, 75, 25, 59,
                           2, 5, 54, 73, 56, 21, 57, 22, 1, 36, 6, 60, 61, 78, 74, 58, 24]},
            {'Row.MEDIUM': [12, 49, 47, 31, 45, 62, 11, 81, 46, 80, 66, 67, 48, 83, 82, 65,
                            8, 84, 13, 29, 27, 64, 85, 63, 30]},
            {'Row.EXPENSIVE': [35, 53, 87, 88, 15, 14, 50, 17, 16, 68, 70, 32, 52, 51, 89, 34]}
        ]
    },
    'active_player_id': 1
}

stanek = StateAsDict()
stanek.load_from_dict(dict)
fufu = SplendorGUI()
fufu.draw_state(stanek.to_state())
fufu.keep_window_open(300)
def run_one_game_and_collect_data(self, debug_info=True):
    last_value_player_0 = None
    last_value_player_1 = None
    old_value = None
    self.env.reset()
    observation = self.env.show_observation('deterministic')
    is_done = False
    number_of_moves = 0
    debug_collected_data = pd.DataFrame(columns=('active_player_id', 'winner_id', 'reward', 'best_value'))
    collected_data = pd.DataFrame(columns=('state_as_vector', 'value'))
    extra_move_done = False

    while not (is_done and extra_move_done) and number_of_moves < MAX_NUMBER_OF_MOVES:
        if is_done:
            extra_move_done = True
        action, best_value = self.agent.choose_action(observation, [None], info=True)
        observation, reward, is_done, info = self.env.step('deterministic', action)
        previous_player_id = self.env.previous_player_id()
        winner_id = info['winner_id']
        current_state_as_dict = StateAsDict(self.env.current_state_of_the_game)
        # Recall the value recorded on this player's previous move.
        if previous_player_id == 0:
            old_value = last_value_player_0
        if previous_player_id == 1:
            old_value = last_value_player_1
        if debug_info:
            debug_collected_data = debug_collected_data.append({
                'new_value': self.new_value_formula(old_value, best_value, winner_id, reward, alpha=0.1),
                'active_player_id': self.env.previous_player_id(),
                'winner_id': winner_id,
                'reward': reward,
                'best_value': best_value,
                'old_value': old_value,
                'pa_points': self.env.previous_players_hand().number_of_my_points()
            }, ignore_index=True)
        collected_data = collected_data.append({
            'state_as_vector': vectorize_state(current_state_as_dict),
            'value': self.new_value_formula(old_value, best_value, winner_id, reward, alpha=0.1)
        }, ignore_index=True)
        if previous_player_id == 0:
            last_value_player_0 = best_value
        if previous_player_id == 1:
            last_value_player_1 = best_value
        # let the opponent move:
        number_of_moves += 1

    if debug_info:
        debug_collected_data.to_csv('debug_info.csv')
    # Drop the first two rows, for which no previous value exists yet.
    collected_data = collected_data.iloc[2:]
    return collected_data
def run_one_game_and_collect_data(self, debug_info=True):
    there_was_no_action = False
    self.agent.train_mode()
    last_actual_player_0 = None
    last_actual_player_1 = None
    last_state_player_0 = None
    last_state_player_1 = None
    last_action_vec_player_0 = None
    last_action_vec_player_1 = None
    old_value = None
    old_state = None
    old_action_vec = None
    self.env.reset()
    observation = self.env.show_observation('deterministic')
    is_done = False
    number_of_moves = 0
    debug_collected_data = pd.DataFrame(columns=('active_player_id', 'winner_id', 'reward', 'best_value'))
    collected_data = pd.DataFrame(columns=('state_as_vector', 'value'))
    extra_move_done = False

    while not (is_done and extra_move_done) and number_of_moves < MAX_NUMBER_OF_MOVES:
        if is_done:
            extra_move_done = True
        current_state_as_dict = StateAsDict(self.env.current_state_of_the_game)
        actual_action, actual_eval, best_eval = self.agent.choose_action(observation, [None])
        if actual_action is None:
            there_was_no_action = True
            break
        observation, reward, is_done, info = self.env.step('deterministic', actual_action)
        previous_player_id = self.env.previous_player_id()
        winner_id = info['winner_id']
        # Recall the value, state and action vector recorded on this player's previous move.
        if previous_player_id == 0:
            old_value = last_actual_player_0
            old_state = last_state_player_0
            old_action_vec = last_action_vec_player_0
        if previous_player_id == 1:
            old_value = last_actual_player_1
            old_state = last_state_player_1
            old_action_vec = last_action_vec_player_1
        if debug_info:
            state_status = old_state.__repr__() if old_state is not None else 'NONE'
            state_vector = vectorize_state(old_state) if old_state is not None else 'NONE'
            debug_collected_data = debug_collected_data.append({
                'state_ex': state_status,
                'state_vec': state_vector,
                'new_value': self.new_value_formula(old_value, best_eval, winner_id, reward, self.alpha),
                'active_player_id': self.env.previous_player_id(),
                'winner_id': winner_id,
                'reward': reward,
                'best_eval': best_eval,
                'actual_eval': actual_eval,
                'old_value': old_value,
                'pa_points': self.env.previous_players_hand().number_of_my_points()
            }, ignore_index=True)
        if old_state is not None:
            collected_data = collected_data.append({
                'state_as_vector': vectorize_state(old_state),
                'action_vector': old_action_vec,
                'value': self.new_value_formula(old_value, best_eval, winner_id, reward, self.alpha)
            }, ignore_index=True)
        # Remember this move as the player's most recent one.
        if previous_player_id == 0:
            last_actual_player_0 = actual_eval
            last_state_player_0 = current_state_as_dict
            last_action_vec_player_0 = vectorize_action(actual_action)
        if previous_player_id == 1:
            last_actual_player_1 = actual_eval
            last_state_player_1 = current_state_as_dict
            last_action_vec_player_1 = vectorize_action(actual_action)
        # let the opponent move:
        number_of_moves += 1

    if debug_info:
        debug_collected_data.to_csv('debug_info.csv')
    collected_data = collected_data.iloc[0:]
    self.agent.test_mode()
    return collected_data