def value_move(board, active_turn, output_fun, exploration):
    board = board.reshape((1, 9))
    X_sym = theano.tensor.matrix()     # note: these symbolic variables are never used below
    y_sym = theano.tensor.ivector()
    player_dict = {'X': 1, 'O': -1}
    # flip signs so that, from the active player's point of view, 1 is always "mine"
    dummy_board = player_dict[active_turn] * board[:]
    options = ttt.available_moves(dummy_board)
    if exploration > random.random():
        move = random.choice(options)
    else:
        move_values = np.zeros(9)
        for move in options:
            dummy_board = player_dict[active_turn] * board[:]
            dummy_board[0][move] = 1
            # score the resulting position from the opponent's perspective and negate
            move_values[move] = -1 * output_fun(-1 * dummy_board)
        available_move_values = np.array([move_values[move] for move in options])
        move = options[available_move_values.argmax(-1)]
    return move + 1

def alpha_beta_move(board, active_turn, depth, alpha=2):
    swap_dict = {'X': 'O', 'O': 'X'}
    dummy_board = np.arange(9)
    dummy_board[:] = board[:]
    options = ttt.available_moves(board)
    random.shuffle(options)
    player_dict = {'X': 1, 'O': -1}
    if len(options) == 1:
        dummy_board[options[0]] = player_dict[active_turn]
        if ttt.winner(dummy_board):
            return (1, options[0] + 1)
        else:
            return (0, options[0] + 1)
    if depth == 0:
        return (0, options[np.random.randint(len(options))] + 1)
    best_value = -2
    candidate_move = None
    for x in options:
        dummy_board[x] = player_dict[active_turn]
        if ttt.winner(dummy_board):
            return (1, x + 1)
        (opp_value, opp_move) = alpha_beta_move(dummy_board, swap_dict[active_turn], depth - 1, -best_value)
        if -opp_value > best_value:
            candidate_move = x + 1
            best_value = -opp_value
        if -opp_value >= alpha:
            #print (options, x, best_value, alpha)
            break
        dummy_board[x] = board[x]
    return (best_value, candidate_move)

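# The snippets above and below call into a small `ttt` helper module (and assume
# `import numpy as np` and `import random`) that is not included here. The
# following is a hedged reconstruction of its interface, inferred from the call
# sites: boards are flat length-9 arrays holding 1 for X, -1 for O and 0 for an
# empty square. It would live in something like ttt.py and is illustrative only.
import numpy as np

WIN_LINES = [(0, 1, 2), (3, 4, 5), (6, 7, 8),   # rows
             (0, 3, 6), (1, 4, 7), (2, 5, 8),   # columns
             (0, 4, 8), (2, 4, 6)]              # diagonals

def available_moves(board):
    """Indices of the empty squares on a flat length-9 board."""
    flat = np.asarray(board).reshape(9)
    return [i for i in range(9) if flat[i] == 0]

def winner(board):
    """Return 1 if X has a completed line, -1 if O does, otherwise 0."""
    flat = np.asarray(board).reshape(9)
    for i, j, k in WIN_LINES:
        line_sum = flat[i] + flat[j] + flat[k]
        if line_sum == 3:
            return 1
        if line_sum == -3:
            return -1
    return 0

def is_full(board):
    """True when no empty squares remain."""
    return not available_moves(board)
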
def monte_carlo_sample(board_state, side):
    """Sample a single rollout from the current board_state and side.
    Moves are made to the current board_state until we reach a terminal state,
    then the result and the first move made to get there are returned.

    Args:
        board_state (3x3 tuple of int): state of the board
        side (int): side currently to play. +1 for the plus player, -1 for the minus player

    Returns:
        (result(int), move(int,int)): The result from this rollout, +1 for a win for the plus player,
            -1 for a win for the minus player, 0 for a draw
    """
    result = has_winner(board_state)
    if result != 0:
        return result, None
    moves = list(available_moves(board_state))
    if not moves:
        return 0, None

    # select a random move
    move = random.choice(moves)
    result, next_move = monte_carlo_sample(apply_move(board_state, move, side), -side)
    return result, move

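# monte_carlo_sample (and the min_max / UCT functions below) use a different
# board representation: a 3x3 tuple of ints with moves given as (x, y) pairs.
# The helpers they call (available_moves, apply_move, has_winner) are not part
# of these snippets; the sketch below is an assumption about their behaviour,
# not the original implementation.
import itertools

def available_moves(board_state):
    """Yield the (x, y) coordinates of every empty square."""
    for x, y in itertools.product(range(3), range(3)):
        if board_state[x][y] == 0:
            yield (x, y)

def apply_move(board_state, move, side):
    """Return a new 3x3 tuple board with `side` (+1/-1) played at `move`."""
    move_x, move_y = move
    new_row = board_state[move_x][:move_y] + (side,) + board_state[move_x][move_y + 1:]
    return board_state[:move_x] + (new_row,) + board_state[move_x + 1:]

def has_winner(board_state):
    """Return +1 or -1 if that side has three in a row, otherwise 0."""
    lines = list(board_state)                                     # rows
    lines += list(zip(*board_state))                              # columns
    lines.append(tuple(board_state[i][i] for i in range(3)))      # diagonal
    lines.append(tuple(board_state[i][2 - i] for i in range(3)))  # anti-diagonal
    for line in lines:
        if all(v == 1 for v in line):
            return 1
        if all(v == -1 for v in line):
            return -1
    return 0
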
def alpha_beta_move(board, turn, depth=0, alpha=(-inf, -inf), beta=(inf, inf), evaluation=lambda x: 0):
    dummy_board = np.copy(board).reshape(9)  # we don't want to change the board state
    swap_player = {1: -1, -1: 1}  # so we can change whose turn it is
    options = ttt.available_moves(board)  # get legal moves
    random.shuffle(options)  # should inherit move order instead of randomizing
    best_value = (-inf, -inf)
    if not options:
        print(board, cccc.game_over(board))
        print('oops, no available moves')
    cand_move = options[0]
    if depth == 0:
        for x in options:
            update_move(dummy_board, x, turn)
            op_value = (evaluation(dummy_board * swap_player[turn]), depth)
            if tuple(-1 * el for el in op_value) > best_value:
                cand_move = x
                best_value = tuple(-1 * el for el in op_value)
            alpha = max(alpha, best_value)
            if alpha >= beta:
                break  # alpha-beta cutoff
            unupdate_move(dummy_board, x)
    else:
        for x in options:
            update_move(dummy_board, x, turn)
            if ttt.winner(dummy_board):  # should check over and tied too
                return ((inf, depth), x)
            if ttt.is_full(dummy_board):  # this assumes you can't lose on your own turn
                return ((0, depth), x)
            op_value, _ = alpha_beta_move(dummy_board, swap_player[turn], depth - 1,
                                          alpha=tuple(-1 * el for el in beta),
                                          beta=tuple(-1 * el for el in alpha),
                                          evaluation=evaluation)
            if tuple(-1 * el for el in op_value) > best_value:
                cand_move = x
                best_value = tuple(-1 * el for el in op_value)
            alpha = max(alpha, best_value)
            # print depth, -op_value, best_value, cand_move, alpha, beta
            if alpha >= beta:
                # print 'pruned'
                break  # alpha-beta cutoff
            unupdate_move(dummy_board, x)
            # dummy_board[height, x] = 0
    return (best_value, cand_move)

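# `update_move` / `unupdate_move` above are not defined in these snippets; the
# stray `dummy_board[height, x] = 0` comment suggests they were adapted from a
# Connect-Four (`cccc`) version. A minimal guess for the tic-tac-toe case on a
# flat length-9 board:

def update_move(board, move, turn):
    """Play `turn` (+1 or -1) on square `move`, modifying the board in place."""
    board[move] = turn

def unupdate_move(board, move):
    """Undo a move by emptying the square again."""
    board[move] = 0
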
def min_max_alpha_beta(board_state, side, max_depth, evaluation_func=evaluate,
                       alpha=-sys.float_info.max, beta=sys.float_info.max):
    """Runs the min_max algorithm on a given board_state for a given side, to a given depth,
    in order to find the best move

    Args:
        board_state (3x3 tuple of int): The board state we are evaluating
        side (int): either +1 or -1
        max_depth (int): how deep we want our tree to go before we use the evaluate method to
            determine how good the position is.
        evaluation_func (board_state -> int): Function used to evaluate the position for the plus player
        alpha (float): Used when this is called recursively, normally ignore
        beta (float): Used when this is called recursively, normally ignore

    Returns:
        (best_score(int), best_score_move((int, int)): the move found to be best and what its min-max score was
    """
    best_score_move = None
    moves = list(available_moves(board_state))
    if not moves:
        return 0, None

    for move in moves:
        new_board_state = apply_move(board_state, move, side)
        winner = has_winner(new_board_state)
        if winner != 0:
            return winner * 10000, move
        else:
            if max_depth <= 1:
                score = evaluation_func(new_board_state)
            else:
                # pass evaluation_func through so alpha and beta bind to the correct parameters
                score, _ = min_max_alpha_beta(new_board_state, -side, max_depth - 1,
                                              evaluation_func, alpha, beta)

        if side > 0:
            if score > alpha:
                alpha = score
                best_score_move = move
        else:
            if score < beta:
                beta = score
                best_score_move = move
        if alpha >= beta:
            break

    return alpha if side > 0 else beta, best_score_move

def get_max_future(future_board, value_fun):
    options = ttt.available_moves(future_board)
    dummy_board = np.copy(future_board)
    move_values = np.zeros(9)
    for move in options:
        dummy_board = np.copy(future_board)
        dummy_board[move] = -1
        dummy_board = dummy_board.reshape(1, 9)
        if ttt.winner(dummy_board):
            move_values[move] = ttt.winner(dummy_board)
        else:
            move_values[move] = value_fun(dummy_board)
    available_move_values = np.array([move_values[move] for move in options])
    dummy_board = np.copy(future_board)
    options_index = np.argmin(available_move_values)
    dummy_board[options[options_index]] = -1
    return np.amin(available_move_values), dummy_board

def min_max(board_state, side, max_depth, evaluation_func=evaluate):
    """Runs the min_max algorithm on a given board_state for a given side, to a given depth,
    in order to find the best move

    Args:
        board_state (3x3 tuple of int): The board state we are evaluating
        side (int): either +1 or -1
        max_depth (int): how deep we want our tree to go before we use the evaluate method to
            determine how good the position is.
        evaluation_func (board_state -> int): Function used to evaluate the position for the plus player

    Returns:
        (best_score(int), best_score_move((int, int)): the move found to be best and what its min-max score was
    """
    best_score = None
    best_score_move = None
    moves = list(available_moves(board_state))
    if not moves:
        # this is a draw
        return 0, None

    for move in moves:
        new_board_state = apply_move(board_state, move, side)
        winner = has_winner(new_board_state)
        if winner != 0:
            return winner * 10000, move
        else:
            if max_depth <= 1:
                score = evaluation_func(new_board_state)
            else:
                score, _ = min_max(new_board_state, -side, max_depth - 1, evaluation_func)

        if side > 0:
            if best_score is None or score > best_score:
                best_score = score
                best_score_move = move
        else:
            if best_score is None or score < best_score:
                best_score = score
                best_score_move = move
    return best_score, best_score_move

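# The `evaluate` default used by min_max and min_max_alpha_beta is not shown.
# A hedged sketch of a typical heuristic on the 3x3 tuple board: score every
# line that only one side occupies, so the plus player prefers positions with
# more unopposed lines. The exact weighting here is an assumption.

def evaluate(board_state):
    """Rough positional score for the plus player."""
    lines = list(board_state)                                     # rows
    lines += list(zip(*board_state))                              # columns
    lines.append(tuple(board_state[i][i] for i in range(3)))      # diagonal
    lines.append(tuple(board_state[i][2 - i] for i in range(3)))  # anti-diagonal
    score = 0
    for line in lines:
        plus, minus = line.count(1), line.count(-1)
        if minus == 0:
            score += plus    # line still open for the plus player
        elif plus == 0:
            score -= minus   # line still open for the minus player
    return score
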
def policy_move(board, active_turn, output_fun, exploration):
    board = board.reshape((1, 9))
    X_sym = theano.tensor.matrix()     # note: these symbolic variables are never used below
    y_sym = theano.tensor.ivector()
    player_dict = {'X': 1, 'O': -1}
    dummy_board = player_dict[active_turn] * board[:]  # make 1s good and -1s bad
    move_weights = output_fun(dummy_board)
    move_weights = move_weights.reshape(9)
    options = ttt.available_moves(dummy_board)
    if exploration > random.random():
        move = random.choice(options)
    else:
        available_move_weights = np.array([move_weights[i] for i in options])
        move = options[available_move_weights.argmax(-1)]
    return move + 1

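# Hedged usage sketch for policy_move (and value_move above): `output_fun` can
# be any callable taking a (1, 9) board array; in the original it is presumably
# a compiled Theano forward pass, but a plain function is enough to exercise
# the move selection. `uniform_policy` below is a stand-in, not original code;
# value_move would instead need a callable returning a single scalar value.
import numpy as np

def uniform_policy(board_row):
    """Stand-in policy network: equal weight on every square."""
    return np.ones((1, 9)) / 9.0

# example call (assumes the ttt helpers and a length-9 numpy board):
# move = policy_move(np.zeros(9), 'X', uniform_policy, exploration=0.1)
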
def monte_carlo(board, epsilon=0.5, duration=1, player=1):
    plays = {}
    results = {}
    t0 = time.time()
    plays[tuple(board)] = 0
    results[tuple(board)] = 0
    while time.time() - t0 < duration:
        current_player = player
        dummy_board = np.copy(board)
        branch = [(np.copy(dummy_board), current_player)]
        while not game_over(dummy_board):
            options = ttt.available_moves(dummy_board)
            future_boards = [next_board(dummy_board, move, current_player) for move in options]
            if all(plays.get(tuple(b)) for b in future_boards):
                if random.random() > epsilon:
                    dummy_board = random.choice(future_boards)
                else:
                    # min here because the stored results for the future boards are from the
                    # perspective of the player to move in them (the opponent), so minimizing
                    # their score maximizes ours.
                    dummy_board = min(future_boards, key=lambda x: results[tuple(x)] / float(plays[tuple(x)]))
            else:
                dummy_board = random.choice(future_boards)
                plays[tuple(dummy_board)] = 0
                results[tuple(dummy_board)] = 0
            current_player *= -1
            branch.append((np.copy(dummy_board), current_player))
        for b, p in branch:
            plays[tuple(b)] += 1
            results[tuple(b)] += p * ttt.winner(dummy_board)
    return results[tuple(board)] / float(plays[tuple(board)])

def mc_step(branch, results, epsilon, cutoff=10000):
    dummy_board = np.copy(branch[-1])
    # To help convergence we will randomly drop stored values
    # if random.random() < 1/float(cutoff):
    #     results[tuple(dummy_board)] = {'result': 0, 'plays': 0}
    if not results.get(tuple(dummy_board)):
        results[tuple(dummy_board)] = {'result': 0, 'plays': 0}
    board_plays = results[tuple(dummy_board)]['plays']
    board_result = results[tuple(dummy_board)]['result']
    if game_over(dummy_board):
        result = ttt.winner(dummy_board)
    elif board_plays > cutoff:
        result = results[tuple(dummy_board)]['result'] / float(results[tuple(dummy_board)]['plays'])
    else:
        options = ttt.available_moves(dummy_board)
        future_boards = [next_board(dummy_board, move, 1) for move in options]
        if all(results.get(tuple(-1 * b)) for b in future_boards):
            if epsilon(board_plays) > random.random():
                dummy_board = random.choice(future_boards)
            else:
                dummy_board = min(future_boards,
                                  key=lambda x: results[tuple(-1 * x)]['result'] / float(results[tuple(-1 * x)]['plays']))
        else:
            dummy_board = random.choice(future_boards)
        branch.append(-1 * np.copy(dummy_board))
        result, _ = mc_step(branch, results, epsilon, cutoff)
        result = -1 * result
    return result, branch

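# `game_over` and `next_board` used by monte_carlo and mc_step are further
# assumed helpers on the flat numpy board; a plausible version, reusing the
# ttt-style helpers sketched earlier:
import numpy as np

def game_over(board):
    """True when either side has won or the board is full."""
    return bool(ttt.winner(board)) or not ttt.available_moves(board)

def next_board(board, move, player):
    """Return a copy of `board` with `player` (+1/-1) placed on square `move`."""
    new_board = np.copy(board)
    new_board[move] = player
    return new_board
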
def random_move(board, turn):
    options = ttt.available_moves(board)
    move = random.choice(options)
    dummy_board = np.copy(board)
    dummy_board[move] = turn
    return dummy_board

def monte_carlo_tree_search_uct(board_state, side, number_of_samples):
    """Evaluate the best move from the current board_state for the given side using monte carlo sampling
    with upper confidence bounds for trees.

    Args:
        board_state (3x3 tuple of int): state of the board
        side (int): side currently to play. +1 for the plus player, -1 for the minus player
        number_of_samples (int): number of sample rollouts to run from the current position; the higher the
            number, the better the estimation of the position

    Returns:
        (result(int), move(int,int)): The average result for the best move from this position and what that move was.
    """
    state_results = collections.defaultdict(float)
    state_samples = collections.defaultdict(float)
    for _ in range(number_of_samples):
        current_side = side
        current_board_state = board_state
        first_unvisited_node = True
        rollout_path = []
        result = 0

        while result == 0:
            move_states = {move: apply_move(current_board_state, move, current_side)
                           for move in available_moves(current_board_state)}

            if not move_states:
                result = 0
                break

            if all((state in state_samples) for _, state in move_states.items()):
                log_total_samples = math.log(sum(state_samples[s] for s in move_states.values()))
                move, state = max(move_states.items(),
                                  key=lambda item: _upper_confidence_bounds(state_results[item[1]],
                                                                            state_samples[item[1]],
                                                                            log_total_samples))
            else:
                move = random.choice(list(move_states.keys()))

            current_board_state = move_states[move]

            if first_unvisited_node:
                rollout_path.append((current_board_state, current_side))
                if current_board_state not in state_samples:
                    first_unvisited_node = False

            current_side = -current_side
            result = has_winner(current_board_state)

        for path_board_state, path_side in rollout_path:
            state_samples[path_board_state] += 1.
            result *= path_side
            # normalize results to be between 0 and 1; before this it is between -1 and 1
            result /= 2.
            result += .5
            state_results[path_board_state] += result

    move_states = {move: apply_move(board_state, move, side) for move in available_moves(board_state)}

    move = max(move_states,
               key=lambda x: state_results[move_states[x]] / state_samples[move_states[x]])

    return state_results[move_states[move]] / state_samples[move_states[move]], move

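# `_upper_confidence_bounds` is referenced above but not defined in these
# snippets. It is presumably the standard UCB1 formula; the exploration
# constant (the factor of 2 under the square root) is the textbook choice and
# an assumption here.
import math

def _upper_confidence_bounds(payout, samples, log_total_samples):
    """UCB1 score: average payout plus an exploration bonus for rarely tried states."""
    return payout / samples + math.sqrt(2. * log_total_samples / samples)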