# module-level imports used by these snippets (the exact import paths may differ
# in the original repository)
import copy
import random

import numpy as np

import helpers


def step(self, move=None):
    """
    Perform one step of the environment: the agents take turns choosing a move.
    :param move: externally determined move to be performed by agent 0 (useful for training)
    :return: reward accumulated in this step,
             boolean: whether the environment is in a terminal state,
             flag: whether agent 0 won
    """
    self.reward = 0
    self.steps += 1  # an illegal move also counts as a step

    # are there still pieces that agent 0 can move?
    if not helpers.get_poss_moves(self.board, team=0):
        self.reward += self.reward_loss
        self.score += self.reward
        return self.reward, True, -2  # agent 0 lost (no moves left)

    # move decided by the agent itself or supplied externally?
    if move is not None:
        # this enables working with the environment in external functions (e.g. train.py)
        agent_move = move
    else:
        agent_move = self.agents[0].decide_move()

    # is the move legal?
    if not helpers.is_legal_move(self.board, agent_move):
        # illegal move -> no change in the environment, receive reward_illegal
        self.reward += self.reward_illegal
        self.illegal_moves += 1
        # print("Warning: agent 0 selected an illegal move: {}".format(agent_move))
        self.score += self.reward
        done, won = self.goal_test()
        return self.reward, done, won

    self.do_move(agent_move, team=0)
    self.move_count += 1

    # opponent's move: only if the opponent can actually move does capturing all of its
    # movable pieces count as a win (the opponent could e.g. consist of a flag only)
    if self.opp_can_move:
        # are there still pieces the opponent can move?
        if not helpers.get_poss_moves(self.board, team=1):
            self.reward += self.reward_win
            self.score += self.reward
            return self.reward, True, 2  # agent 0 won (opponent has no moves left)

        opp_move = self.agents[1].decide_move()
        # the opponent is assumed to only perform legal moves
        if not helpers.is_legal_move(self.board, opp_move):
            pass
            # print("Warning: agent 1 selected an illegal move: {}".format(opp_move))
        self.do_move(opp_move, team=1)  # assuming only legal moves are selected
        self.move_count += 1

    done, won = self.goal_test()
    self.score += self.reward
    return self.reward, done, -1 + 2 * won

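# A minimal usage sketch (assumption: step() and reset() live on an environment
# class; "env", "external_policy" and play_one_episode are hypothetical names that
# only illustrate how the (reward, done, won) triple returned by step() could be
# consumed from an external training script such as train.py).
def play_one_episode(env, external_policy=None):
    env.reset()
    done, won = False, 0
    episode_reward = 0
    while not done:
        # either let agent 0 decide internally, or feed in an externally chosen move
        move = external_policy(env.board) if external_policy is not None else None
        reward, done, won = env.step(move=move)
        episode_reward += reward
    return episode_reward, won
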
def min_val(self, board, current_reward, alpha, beta, depth):
    """
    Step of the minimizing player in the minimax algorithm. See max_val for documentation.
    """
    # this is what the opponent will think, the min-player

    # get my possible actions, then shuffle them to ensure randomness when no action
    # stands out as the best
    my_doable_actions = helpers.get_poss_moves(board, self.other_team)
    np.random.shuffle(my_doable_actions)

    # check for terminal-state scenario or maximum depth
    done, won = self.goal_test(my_doable_actions, board, max_val=False)
    if done or depth == 0:
        return current_reward + self.get_terminal_reward(done, won, depth), None

    val = float('inf')  # initial value set, so the min comparison below is possible
    best_action = None
    # iterate through all actions
    for action in my_doable_actions:
        board, fight_result = self.do_move(action, board=board,
                                           bookkeeping=False, true_gameplay=False)
        temp_reward = current_reward - self.add_temp_reward(fight_result)
        new_val = self.max_val(board, temp_reward, alpha, beta, depth - 1)[0]
        if val > new_val:
            val = new_val
            best_action = action
        if val <= alpha:
            self.undo_last_move(board)
            return val, best_action
        beta = min(beta, val)
        board = self.undo_last_move(board)
    return val, best_action

def poss_actions(self, action_dim):
    """
    Convert the set of possible moves in the whole game into the set of actions
    available to the agent.
    :param action_dim: how many actions are possible for the agent
    :return: list of legal actions
    """
    poss_moves = helpers.get_poss_moves(self.board, self.team)  # moves currently possible in the game
    poss_actions = []
    all_actions = range(0, action_dim)
    for action in all_actions:
        move = self.action_to_move(action)  # convert each action to a move (which can be illegal)
        if move in poss_moves:  # only keep the actions whose moves are legal
            poss_actions.append(action)
    return poss_actions

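# Sketch of how the legal-action indices could be used to mask an action
# distribution before sampling (assumptions: "probs" is a length-action_dim
# probability vector produced elsewhere, e.g. by a policy network, and
# legal_actions is the non-empty output of poss_actions(); this masking helper
# is illustrative and not part of the original code).
def sample_legal_action(probs, legal_actions):
    mask = np.zeros_like(probs)
    mask[legal_actions] = 1.0
    masked = probs * mask
    if masked.sum() == 0:  # no probability mass on any legal action
        masked = mask      # fall back to a uniform choice over the legal actions
    masked = masked / masked.sum()
    return np.random.choice(len(probs), p=masked)
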
def decide_move(self):
    """
    Copy the known board, assign the unknown enemy pieces at random while still
    respecting the current knowledge, and rate each possible move by simulating
    games on the resulting boards (see approximate_value_of_board). The move with
    the highest average value over all drawn enemy setups is chosen.
    :return: tuple of position tuples
    """
    possible_moves = helpers.get_poss_moves(self.board, self.team)
    next_action = None
    if possible_moves:
        values_of_moves = dict.fromkeys(possible_moves, 0)
        for move in possible_moves:
            for draw in range(self._nr_of_enemy_setups_to_draw):
                curr_board = self.draw_consistent_enemy_setup(copy.deepcopy(self.board))
                curr_board, _ = self.do_move(move, curr_board,
                                             bookkeeping=False, true_gameplay=False)
                values_of_moves[move] += (self.approximate_value_of_board(curr_board)
                                          / self._nr_of_enemy_setups_to_draw)
                self.undo_last_move(curr_board)
        evaluations = list(values_of_moves.values())
        actions = list(values_of_moves.keys())
        next_action = actions[evaluations.index(max(evaluations))]
    return next_action

def max_val(self, board, current_reward, alpha, beta, depth):
    """
    Do the max player's step in the minimax algorithm. First check whether the given
    board is in a terminal state. If it is not, perform each possible move once and
    hand the process over to min_val for the min player's step.
    :param board: the current board, numpy array
    :param current_reward: the value the current path has accumulated so far
    :param alpha: alpha threshold of the minimax algorithm
    :param beta: beta threshold of the minimax algorithm
    :param depth: the depth the process is at, integer
    :return: tuple of the best value and the associated best action (float, tuple)
    """
    # this is what the expectimax agent will think

    # get my possible actions, then shuffle them to ensure randomness when no action
    # stands out as the best
    my_doable_actions = helpers.get_poss_moves(board, self.team)
    np.random.shuffle(my_doable_actions)

    # check for terminal-state scenario
    done, won = self.goal_test(my_doable_actions, board, max_val=True)
    if done or depth == 0:
        return current_reward + self.get_terminal_reward(done, won, depth), None

    val = -float('inf')
    best_action = None
    for action in my_doable_actions:
        board, fight_result = self.do_move(action, board=board,
                                           bookkeeping=False, true_gameplay=False)
        temp_reward = current_reward + self.add_temp_reward(fight_result)
        new_val = self.min_val(board, temp_reward, alpha, beta, depth - 1)[0]
        if val < new_val:
            val = new_val
            best_action = action
        if val >= beta:
            self.undo_last_move(board)
            best_action = action
            return val, best_action
        alpha = max(alpha, val)
        board = self.undo_last_move(board)
    return val, best_action

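# How the alpha-beta search above would typically be started (a sketch only:
# self.max_depth is an assumed configuration attribute, while self.board, max_val
# and min_val are the names used in this file).
def decide_move(self):
    value, best_action = self.max_val(self.board, current_reward=0,
                                      alpha=-float('inf'), beta=float('inf'),
                                      depth=self.max_depth)
    return best_action
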
def approximate_value_of_board(self, board):
    """
    Simulate the game from the given board with random moves, many times and up to a
    maximum number of turns, and evaluate each simulation by whether this agent won
    and by how many more pieces it has left than the opponent.
    :param board: the board to evaluate, numpy array
    :return: mean evaluation over all simulations
    """
    evals = []
    for i in range(self._nr_iterations_of_game_sim):
        board_copy = copy.deepcopy(board)
        finished = False
        turn = 0
        while not finished:
            actions = helpers.get_poss_moves(board_copy, turn)
            if actions:  # as long as actions are left to be done, we do them
                move = random.choice(actions)
                board_copy, _ = self.do_move(move, board_copy)
            # check whether the game is terminal
            done, won = self.goal_test(actions, board_copy, turn)
            if done:
                # if terminal, calculate the bonus we want to reward this simulation with
                my_team = self.get_team_from_board(board_copy, self.team)
                enemy_team = self.get_team_from_board(board_copy, self.other_team)
                bonus = (len(my_team) - len(enemy_team)) / 20
                # -1 + 2*won equals -1 for won=False and 1 for won=True;
                # the bonus is negative if the enemy team has more pieces left
                evals.append(-1 + 2 * won + bonus)
                finished = True
            elif turn > self._nr_of_max_turn_sim:
                # the maximum number of turns is reached: only the piece bonus counts
                my_team = self.get_team_from_board(board_copy, self.team)
                enemy_team = self.get_team_from_board(board_copy, self.other_team)
                bonus = (len(my_team) - len(enemy_team)) / 20
                # the bonus is negative if the enemy team has more pieces left
                evals.append(bonus)
                finished = True
            turn = (turn + 1) % 2
    return sum(evals) / len(evals)

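# Worked example of the per-simulation score computed above (plain numbers, no game
# state; the helper below merely mirrors the evaluation and is not part of the
# original code): a won simulation with a 3-piece lead scores -1 + 2*1 + 3/20 = 1.15,
# while one that hits the turn limit 2 pieces behind scores only the bonus, -2/20 = -0.1.
def simulation_score(won, my_pieces, enemy_pieces, terminal=True):
    bonus = (my_pieces - enemy_pieces) / 20
    return (-1 + 2 * won + bonus) if terminal else bonus

print(simulation_score(True, 10, 7))                  # ~1.15
print(simulation_score(False, 5, 7, terminal=False))  # -0.1
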
def decide_move(self):
    actions = helpers.get_poss_moves(self.board, self.team)
    if not actions:
        return None
    return random.choice(actions)

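# The random choice above and the np.random.shuffle calls in max_val/min_val all draw
# from the global random number generators, so seeding both modules makes experiments
# reproducible (a usage note only; the seed value is arbitrary).
random.seed(42)
np.random.seed(42)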