def V(self, state, alpha=-game.INT_INF, beta=game.INT_INF):
    # BEGIN_YOUR_CODE
    if game.is_end(state):
        return game.utility(state)
    actions = game.get_possible_actions(state)
    player = game.get_player_from_state(state)
    if player == game.MAX_PLAYER:
        value = -game.INT_INF
        for action in actions:
            value = max(value,
                        self.V(game.get_next_state(state, action), alpha, beta))
            alpha = max(alpha, value)
            if beta <= alpha:
                break
    else:
        value = game.INT_INF
        for action in actions:
            value = min(value,
                        self.V(game.get_next_state(state, action), alpha, beta))
            beta = min(beta, value)
            if beta <= alpha:
                break
    return value
def V(self, state, alpha=-game.INT_INF, beta=game.INT_INF):
    # If IsEnd(s)
    if game.is_end(state):
        return game.utility(state)
    # Get possible actions
    actions = game.get_possible_actions(state)
    assert len(actions) > 0
    # If player == agent (maximizing player)
    if game.get_player_from_state(state) == game.MAX_PLAYER:
        value = -game.INT_INF
        for action in actions:
            value = max(value,
                        self.V(game.get_next_state(state, action), alpha, beta))
            alpha = max(alpha, value)
            if beta <= alpha:
                break
    # If player == opponent (minimizing player)
    else:
        value = game.INT_INF
        for action in actions:
            value = min(value,
                        self.V(game.get_next_state(state, action), alpha, beta))
            beta = min(beta, value)
            if beta <= alpha:
                break
    return value
def V(self, state):
    # If IsEnd(s)
    if game.is_end(state):
        return game.utility(state)
    # Get possible actions
    actions = game.get_possible_actions(state)
    assert len(actions) > 0
    # If player == agent (maximizing player)
    if game.get_player_from_state(state) == game.MAX_PLAYER:
        value = -game.INT_INF
        for action in actions:
            value = max(value, self.V(game.get_next_state(state, action)))
    # If player == opponent (minimizing player)
    else:
        value = game.INT_INF
        for action in actions:
            value = min(value, self.V(game.get_next_state(state, action)))
    return value
def V(self, state, depth):
    # If IsEnd(s)
    if game.is_end(state):
        return game.utility(state)
    # If depth == 0, fall back to the heuristic evaluation function
    if depth == 0:
        return eval(state)
    # Get possible actions
    actions = game.get_possible_actions(state)
    assert len(actions) > 0
    # If player == agent (maximizing player)
    if game.get_player_from_state(state) == game.MAX_PLAYER:
        value = -game.INT_INF
        for action in actions:
            value = max(value,
                        self.V(game.get_next_state(state, action), depth))
    # If player == opponent (minimizing player): depth is decremented once per
    # full (agent, opponent) turn, i.e. only after the opponent's move
    else:
        value = game.INT_INF
        for action in actions:
            value = min(value,
                        self.V(game.get_next_state(state, action), depth - 1))
    return value
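The depth-limited V above bottoms out in eval(state), a heuristic evaluation function that is not defined in this section (note the name shadows Python's builtin eval). A minimal sketch of one plausible heuristic for a tic-tac-toe-style board, counting lines still open for each side; the flat-list board encoding, the LINES index triples, and the mark constants are all assumptions for illustration, not the code this project uses:

# Hypothetical heuristic sketch: one plausible shape for eval(state).
LINES = [(0, 1, 2), (3, 4, 5), (6, 7, 8),   # rows
         (0, 3, 6), (1, 4, 7), (2, 5, 8),   # columns
         (0, 4, 8), (2, 4, 6)]              # diagonals

def heuristic_eval(board, MAX_MARK='x', MIN_MARK='o'):
    """Score = (# lines still open for MAX) - (# lines still open for MIN)."""
    score = 0
    for i, j, k in LINES:
        line = (board[i], board[j], board[k])
        if MIN_MARK not in line:
            score += 1   # MAX can still complete this line
        if MAX_MARK not in line:
            score -= 1   # MIN can still complete this line
    return score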
def policy(self, state):
    # BEGIN_YOUR_CODE
    actions = game.get_possible_actions(state)
    alpha = -game.INT_INF
    beta = game.INT_INF
    player = game.get_player_from_state(state)
    if player == game.MAX_PLAYER:
        values = []
        for action in actions:
            next_state = game.get_next_state(state, action)
            value = self.V(next_state, alpha, beta)
            values.append(value)
            alpha = max(alpha, value)
            if beta <= alpha:
                break
        return actions[np.argmax(values)]
    else:
        values = []
        for action in actions:
            next_state = game.get_next_state(state, action)
            value = self.V(next_state, alpha, beta)
            values.append(value)
            beta = min(beta, value)
            if beta <= alpha:
                break
        return actions[np.argmin(values)]
def policy(self, state):
    # BEGIN_YOUR_CODE
    actions = game.get_possible_actions(state)
    player = game.get_player_from_state(state)
    if player == game.MAX_PLAYER:
        return actions[np.argmax(
            [self.V(game.get_next_state(state, action)) for action in actions])]
    else:
        return actions[np.argmin(
            [self.V(game.get_next_state(state, action)) for action in actions])]
def V(self, state):
    # BEGIN_YOUR_CODE
    if game.is_end(state):
        return game.utility(state)
    player = game.get_player_from_state(state)
    if player == game.MAX_PLAYER:
        value = -game.INT_INF
        for action in game.get_possible_actions(state):
            value = max(value, self.V(game.get_next_state(state, action)))
    else:
        value = game.INT_INF
        for action in game.get_possible_actions(state):
            value = min(value, self.V(game.get_next_state(state, action)))
    return value
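Every V and policy variant in this section programs against the same game module, whose implementation is not shown here. The stub below only records the interface those calls assume; the constant values and the `...` bodies are placeholders inferred from the call sites, not the real module:

# Interface sketch only: signatures inferred from the calls above.
INT_INF = 10 ** 9          # sentinel "infinity" for minimax bounds (assumed value)
MAX_PLAYER = 1             # marker for the maximizing player (assumed value)

def is_end(state): ...                   # True if state is terminal
def utility(state): ...                  # terminal score from MAX's perspective
def get_possible_actions(state): ...     # list of legal actions in state
def get_player_from_state(state): ...    # whose turn it is
def get_next_state(state, action): ...   # successor state after taking action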
def user_turn(state):
    game.draw_board(state)
    while True:
        print('What is your next move? (1-9):', end=' ')
        action = int(input())
        if state[action] == game.EMPTY:
            break
    state = game.get_next_state(state, action)
    if game.is_win(state):
        game.draw_board(state)
        print('Lose!')
        return None
    if game.is_lose(state):
        game.draw_board(state)
        print('Win!')
        return None
    if game.is_draw(state):
        game.draw_board(state)
        print('Draw!')
        return None
    return state
def run_episode(self):
    """
    Runs one episode of self-play, starting with player 1, and returns
    training samples as (canonical_state, policy, value) tuples.
    """
    train_samples = []
    state = game.get_init_state()
    current_player = 1
    episode_step = 0
    while True:
        episode_step += 1
        canon_state = game.get_canonical_form(state, current_player)
        temp = int(episode_step < self.config.temperature_threshold)
        policy = self.mcts.get_move_probabilities(canon_state, temp=temp)
        sym = game.get_symmetries(canon_state, policy)
        for s, p in sym:
            train_samples.append([s, current_player, p, None])
        move = np.random.choice(len(policy), p=policy)
        state, current_player = game.get_next_state(state, current_player, move)
        r = game.get_state_score(state, current_player)
        if r != 0:
            # value target is +r for samples stored from the final player's
            # perspective, -r for the other player's
            return [(s, pi, r * ((-1) ** (player != current_player)))
                    for s, player, pi, _ in train_samples]
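The (-1) ** (player != current_player) factor converts the final score r into a value from each stored sample's own perspective. A tiny standalone illustration of that sign convention, with made-up sample data:

# Made-up illustration of the sign convention used above.
final_player, r = -1, 1                        # player -1 to move at the end, score +1
samples = [("s0", 1), ("s1", -1), ("s2", 1)]   # (state, player who stored the sample)
targets = [r * ((-1) ** (p != final_player)) for _, p in samples]
print(targets)                                 # [-1, 1, -1]: wins for -1, losses for +1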
def policy(self, state):
    actions = game.get_possible_actions(state)
    assert len(actions) > 0
    if game.get_player_from_state(state) == game.MAX_PLAYER:
        return max(actions, key=lambda x: self.V(game.get_next_state(state, x)))
    else:
        # the opponent plays uniformly at random
        return random.choice(actions)
def policy(self, state):
    actions = game.get_possible_actions(state)
    assert len(actions) > 0
    optimal = max if game.get_player_from_state(state) == game.MAX_PLAYER else min
    return optimal(actions, key=lambda x: self.V(game.get_next_state(state, x)))
def V(self, state, alpha=-game.INT_INF, beta=game.INT_INF):
    # BEGIN_YOUR_CODE
    # alpha and beta are accepted but unused: expectimax does not prune
    if game.is_end(state):
        return game.utility(state)
    actions = game.get_possible_actions(state)
    if game.get_player_from_state(state) == game.MAX_PLAYER:  # my turn
        value = -game.INT_INF
        for action in actions:
            value = max(value, self.V(game.get_next_state(state, action)))
    else:  # opponent's turn: expectimax, opponent modeled as uniformly random
        value = 0
        for action in actions:
            value += self.V(game.get_next_state(state, action)) / len(actions)
    return value
def V(self, state, depth):
    # BEGIN_YOUR_CODE
    if game.is_end(state):
        return game.utility(state)
    if depth == 0:
        return eval(state)
    if game.get_player_from_state(state) == game.MAX_PLAYER:  # my turn
        value = -game.INT_INF
        for action in game.get_possible_actions(state):
            value = max(value,
                        self.V(game.get_next_state(state, action), depth))
    else:  # opponent's turn: depth decreases once per full turn
        value = game.INT_INF
        for action in game.get_possible_actions(state):
            value = min(value,
                        self.V(game.get_next_state(state, action), depth - 1))
    return value
def policy(self, state):
    actions = game.get_possible_actions(state)
    assert len(actions) > 0
    alpha = -game.INT_INF
    beta = game.INT_INF
    if game.get_player_from_state(state) == game.MAX_PLAYER:
        values = []
        for action in actions:
            value = self.V(game.get_next_state(state, action), alpha, beta)
            values.append(value)
            alpha = max(alpha, value)
        return max(zip(actions, values), key=lambda x: x[1])[0]
    else:
        values = []
        for action in actions:
            value = self.V(game.get_next_state(state, action), alpha, beta)
            values.append(value)
            beta = min(beta, value)
        return min(zip(actions, values), key=lambda x: x[1])[0]
def policy(self, state):
    # BEGIN_YOUR_CODE
    actions = game.get_possible_actions(state)
    player = game.get_player_from_state(state)
    values = [self.V(game.get_next_state(state, action), self.max_depth)
              for action in actions]
    if player == game.MAX_PLAYER:
        return actions[np.argmax(values)]
    else:
        return actions[np.argmin(values)]
def play_game(self):
    """
    Run one episode and return the result of the game: 1 if player1 won,
    -1 if player2 won, or a small non-zero draw score (neither 1, -1, nor 0).
    """
    players = [self.player2, None, self.player1]
    current_player = 1
    state = game.get_init_state()
    while game.get_state_score(state, current_player) == 0:
        canon_state = game.get_canonical_form(state, current_player)
        move = players[current_player + 1](canon_state)
        legal_moves = game.get_legal_moves(canon_state, 1)
        if legal_moves[move] == 0:
            print(move)
        assert legal_moves[move] > 0
        state, current_player = game.get_next_state(state, current_player, move)
    return current_player * game.get_state_score(state, current_player)
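A sketch of how play_game might be looped to compare two agents over a series of games; the driver function, its name, and its parameters are assumptions for illustration, not code from this project:

# Hypothetical driver: tallies results over repeated games.
def compare_agents(arena, num_games=20):
    """Return (player1 wins, player2 wins, draws) over num_games games."""
    wins, losses, draws = 0, 0, 0
    for _ in range(num_games):
        result = arena.play_game()
        if result == 1:
            wins += 1       # player1 won
        elif result == -1:
            losses += 1     # player2 won
        else:
            draws += 1      # small non-zero draw score
    return wins, losses, draws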
def system_turn(state):
    action = agent.policy(state)
    print('action =', action)
    state = game.get_next_state(state, action)
    if game.is_win(state):
        game.draw_board(state)
        print('Win!')
        return None
    if game.is_lose(state):
        game.draw_board(state)
        print('Lose!')
        return None
    if game.is_draw(state):
        game.draw_board(state)
        print('Draw!')
        return None
    return state
def search(self, state):
    """
    One iteration of MCTS. This method is called recursively until a leaf
    node is found. The action chosen at each node is the one with the
    maximum upper confidence bound.

    Returns:
        v: the negative of the value of the current state
    """
    s = game.hash_state(state)

    if s not in self.states_ending_score:
        self.states_ending_score[s] = game.get_state_score(state, 1)
    if self.states_ending_score[s] != 0:
        # terminal node: outcome propagated up the search path
        return -self.states_ending_score[s]

    # leaf node: the neural net gives an initial policy and value for the state
    if s not in self.states_P:
        # transform the state with a randomly selected symmetry before the NN
        # evaluates it, so the MC evaluation is averaged over different biases
        transformed_state = random.choice(game.get_symmetries(state))
        self.states_P[s], v = self.neural_net.predict(transformed_state)
        legal_moves = game.get_legal_moves(state, 1)
        # zero out the policy for illegal moves
        self.states_P[s] = self.states_P[s] * legal_moves
        # renormalize the policy
        policy_sum = self.states_P[s].sum().item()
        if policy_sum > 0:
            self.states_P[s] /= policy_sum
        else:
            # if all legal moves have probability 0, make them equiprobable;
            # print a warning, as this is not expected to happen often
            print("All legal moves probabilities are 0! "
                  "Replacing with uniform distribution...")
            self.states_P[s] = self.states_P[s] + legal_moves
            self.states_P[s] /= np.sum(self.states_P[s])
        self.states_valid_moves[s] = legal_moves
        self.states_N[s] = 0
        # the value is propagated up the search path
        return -v

    legal_moves = self.states_valid_moves[s]
    current_best = -float("inf")
    best_move = -1
    # pick the action with the highest upper confidence bound
    for a in range(game.ACTION_SIZE):
        if not legal_moves[a]:
            continue
        Q = self.states_actions_Q.get((s, a), 0)
        N = self.states_actions_N.get((s, a), 0)
        U = Q + self.config.cpuct * self.states_P[s][a] * math.sqrt(
            self.states_N[s]) / (1 + N)
        if U > current_best:
            current_best = U
            best_move = a
    a = best_move

    next_state, next_player = game.get_next_state(state, 1, a)
    next_state = game.get_canonical_form(next_state, next_player)
    # the value is retrieved from the next state
    v = self.search(next_state)

    if (s, a) in self.states_actions_Q:
        # running average of Q over the visit count
        self.states_actions_Q[(s, a)] = (
            self.states_actions_N[(s, a)] * self.states_actions_Q[(s, a)] + v
        ) / (self.states_actions_N[(s, a)] + 1)
        self.states_actions_N[(s, a)] += 1
    else:
        self.states_actions_Q[(s, a)] = v
        self.states_actions_N[(s, a)] = 1
    self.states_N[s] += 1
    # the value is propagated up the remainder of the search path
    return -v
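run_episode above relies on self.mcts.get_move_probabilities, which is not shown in this section. A plausible sketch in the usual AlphaZero style, assuming the simulation count lives in self.config.num_mcts_sims and reusing the visit-count tables from search above; this is an assumption about the method's shape, not the project's exact code:

def get_move_probabilities(self, state, temp=1):
    # Sketch under assumptions: num_mcts_sims is assumed to live in
    # self.config; the count tables match those used in search() above.
    for _ in range(self.config.num_mcts_sims):
        self.search(state)

    s = game.hash_state(state)
    counts = [self.states_actions_N.get((s, a), 0)
              for a in range(game.ACTION_SIZE)]

    if temp == 0:
        # deterministic: all probability mass on a most-visited move
        probs = [0.0] * game.ACTION_SIZE
        probs[np.argmax(counts)] = 1.0
        return probs

    # temp > 0: visit counts sharpened by 1/temp, then normalized
    counts = [c ** (1.0 / temp) for c in counts]
    total = float(sum(counts))
    return [c / total for c in counts]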