def pick_move(self, game, side):
    """Choose a move for `side` by sampling from the MCTS visit distribution.

    Runs `self.monte_carlo` to obtain per-move visit counts, sharpens them
    with the temperature `self.tau`, and samples a move proportionally.
    When `side` has no legal move, the pass move (-1, -1) is used
    (assumed to map to policy index 64 via Othello.move_id -- TODO confirm).

    In training mode the pair (input planes, normalized visit distribution)
    is appended to `self.temp_state` BEFORE temperature is applied, so the
    stored policy target reflects the raw visit proportions.
    """
    possible_moves = game.possible_moves(side)
    if not possible_moves:
        # Forced pass: keep the sampling loop well-defined.
        possible_moves.append((-1, -1))
    monte_prob = self.monte_carlo(game, side)
    if self.train:
        self.temp_state.append((self.preprocess_input(game.board, side),
                                np.divide(monte_prob, np.sum(monte_prob))))
    # Temperature sharpening: tau < 1 concentrates probability mass on the
    # most-visited moves; tau = 1 leaves the distribution unchanged.
    monte_prob = np.float_power(monte_prob, 1 / self.tau)
    monte_prob = np.divide(monte_prob, np.sum(monte_prob))
    # Roulette-wheel sampling over the legal moves.
    r = random()
    for move in possible_moves:
        r -= monte_prob[Othello.move_id(move)]
        if r <= 0:
            return move
    # Floating-point round-off can leave r marginally positive after the
    # loop; fall back to the last legal move rather than returning None.
    return possible_moves[-1]
def pick_move(self, game, side):
    """Sample a move for `side` from the tempered MCTS visit distribution.

    Falls back to the pass move (-1, -1) when no legal move exists. In
    training mode, stores (preprocessed board, normalized raw visit
    distribution) in `self.temp_state` before applying the temperature.
    """
    legal = game.possible_moves(side)
    if len(legal) == 0:
        legal.append((-1, -1))
    visit_dist = self.monte_carlo(game, side)
    if self.train:
        planes = self.preprocess_input(game.board, side)
        self.temp_state.append((planes, np.divide(visit_dist, np.sum(visit_dist))))
    # Apply the temperature, then renormalize into a probability vector.
    visit_dist = np.float_power(visit_dist, 1 / self.tau)
    visit_dist = np.divide(visit_dist, np.sum(visit_dist))
    # Inverse-CDF sampling: walk the legal moves until the running mass
    # crosses the random threshold; default to the final move so that
    # round-off cannot leave us without a choice.
    threshold = random()
    chosen = legal[-1]
    for candidate in legal:
        threshold -= visit_dist[Othello.move_id(candidate)]
        if threshold <= 0:
            chosen = candidate
            break
    return chosen
def monte_carlo(self, game, side):
    """Run self.sim_count MCTS simulations from `game` for player `side`.

    Returns a length-65 numpy vector of root visit counts indexed by
    Othello.move_id (index 64 is presumably the pass move -- the
    no-legal-move branch puts all mass there). Positions with zero or one
    legal move short-circuit to a one-hot policy without simulating.

    Search statistics are keyed by (state_id, move_id) pairs:
      N -- visit counts, W -- accumulated backed-up value, Q = W / N,
      P -- priors taken from the network's policy head.
    """
    N = defaultdict(lambda: 0)
    W = defaultdict(lambda: 0)
    Q = defaultdict(lambda: 0)
    P = defaultdict(lambda: 0)
    possible_moves = game.possible_moves(side)
    if len(possible_moves) == 0:
        # Forced pass: all probability on the pass index.
        policy = np.zeros((65))
        policy[64] = 1
        return policy
    elif len(possible_moves) == 1:
        # Single legal move: no search needed.
        policy = np.zeros((65))
        policy[Othello.move_id(possible_moves[0])] = 1
        return policy
    # Seed the root priors from the network's policy head, renormalized
    # over the legal moves only (1e-10 guards against division by zero
    # when every prior is zero).
    current_input = self.preprocess_input(game.board, side)
    sid = Othello.state_id(game.board)
    pred = self.network.predict(current_input[np.newaxis, :])
    policy = pred[0][0]
    total = 1e-10
    for i, move in enumerate(possible_moves):
        total += policy[Othello.move_id(move)]
    for move in possible_moves:
        P[(sid, Othello.move_id(move))] = policy[Othello.move_id(move)] / total
    for i in range(self.sim_count):
        #print("Sim #%d"% i)
        clone = deepcopy(game)
        current_side = side
        visited = deque()
        while True:
            # Selection: pick the move maximizing Q + P / (N + 1)
            # (a simplified PUCT score without the sqrt(parent-N) term).
            possible_moves = clone.possible_moves(current_side)
            if len(possible_moves) == 0:
                possible_moves.append((-1, -1))
            best_move = None
            best_move_value = -2  # sentinel below any expected score
            sid = Othello.state_id(clone.board)
            for move in possible_moves:
                mid = Othello.move_id(move)
                qu_val = Q[(sid, mid)] + P[(sid, mid)] / (N[(sid, mid)] + 1)
                if qu_val > best_move_value:
                    best_move_value = qu_val
                    best_move = move
            #print(best_move)
            if N[(sid, Othello.move_id(best_move))] == 0:
                # Unvisited edge: play it, then evaluate/expand the leaf.
                visited.append((sid, Othello.move_id(best_move)))
                clone.play_move(best_move[0], best_move[1], current_side)
                current_side *= -1
                if clone.game_over():
                    # Terminal leaf: back up the true game result along
                    # the path taken this simulation.
                    for node in visited:
                        N[node] += 1
                        W[node] += clone.get_winner() * side
                        Q[node] = W[node] / N[node]
                    break
                # Expansion: store renormalized priors for the new state
                # and back up the network's value estimate.
                current_input = self.preprocess_input(clone.board, current_side)
                sid = Othello.state_id(clone.board)
                pred = self.network.predict(current_input[np.newaxis, :])
                policy = pred[0][0]
                value = pred[1][0]
                possible_moves = clone.possible_moves(current_side)
                if len(possible_moves) == 0:
                    possible_moves.append((-1, -1))
                total = 1e-10
                for i, move in enumerate(possible_moves):
                    total += policy[Othello.move_id(move)]
                for move in possible_moves:
                    P[(sid, Othello.move_id(move))] = policy[Othello.move_id(move)] / total
                # NOTE(review): the leaf value is multiplied by the root
                # `side` only; verify whether `current_side` (the
                # perspective the network evaluated) should also enter
                # this backup.
                for node in visited:
                    N[node] += 1
                    W[node] += value * side
                    Q[node] = W[node] / N[node]
                #print()
                break
            else:
                # Already-visited edge: descend deeper into the tree.
                visited.append((sid, Othello.move_id(best_move)))
                clone.play_move(best_move[0], best_move[1], current_side)
                current_side *= -1
                if clone.game_over():
                    for node in visited:
                        N[node] += 1
                        W[node] += clone.get_winner() * side
                        Q[node] = W[node] / N[node]
                    break
    # Root policy: raw visit counts for each legal root move.
    policy = np.zeros((65))
    possible_moves = game.possible_moves(side)
    sid = Othello.state_id(game.board)
    for move in possible_moves:
        mid = Othello.move_id(move)
        policy[mid] = N[(sid, mid)]
    return policy
def monte_carlo(self, game, side):
    """Run self.sim_count MCTS simulations from `game` for player `side`.

    Returns a length-65 numpy vector of root visit counts, indexed by
    Othello.move_id (index 64 presumably being the pass move). Positions
    with zero or one legal move return a one-hot policy immediately.
    """
    visits = defaultdict(lambda: 0)   # N(s, a): edge visit counts
    wins = defaultdict(lambda: 0)     # W(s, a): accumulated backed-up value
    q_vals = defaultdict(lambda: 0)   # Q(s, a) = W / N
    priors = defaultdict(lambda: 0)   # P(s, a): network policy priors

    root_moves = game.possible_moves(side)
    if len(root_moves) == 0:
        # Forced pass: all mass on the pass index.
        forced = np.zeros((65))
        forced[64] = 1
        return forced
    elif len(root_moves) == 1:
        # Single legal move: no search needed.
        forced = np.zeros((65))
        forced[Othello.move_id(root_moves[0])] = 1
        return forced

    # Seed the root priors from the policy head, renormalized over the
    # legal moves (1e-10 guards against an all-zero prior).
    root_input = self.preprocess_input(game.board, side)
    root_sid = Othello.state_id(game.board)
    root_pred = self.network.predict(root_input[np.newaxis, :])
    root_policy = root_pred[0][0]
    norm = 1e-10
    for move in root_moves:
        norm += root_policy[Othello.move_id(move)]
    for move in root_moves:
        priors[(root_sid, Othello.move_id(move))] = root_policy[Othello.move_id(move)] / norm

    for _ in range(self.sim_count):
        sim = deepcopy(game)
        to_play = side
        path = deque()
        while True:
            moves = sim.possible_moves(to_play)
            if len(moves) == 0:
                moves.append((-1, -1))
            # Selection: maximize Q + P / (N + 1), a simplified PUCT score.
            sid = Othello.state_id(sim.board)
            chosen = None
            chosen_score = -2  # sentinel below any expected score
            for move in moves:
                mid = Othello.move_id(move)
                score = q_vals[(sid, mid)] + priors[(sid, mid)] / (visits[(sid, mid)] + 1)
                if score > chosen_score:
                    chosen_score = score
                    chosen = move
            chosen_mid = Othello.move_id(chosen)
            # An edge with no visits is a leaf; remember that before the
            # move is played (visits is only mutated during backup).
            is_leaf = visits[(sid, chosen_mid)] == 0
            path.append((sid, chosen_mid))
            sim.play_move(chosen[0], chosen[1], to_play)
            to_play *= -1
            if sim.game_over():
                # Terminal state: back up the true game result.
                for node in path:
                    visits[node] += 1
                    wins[node] += sim.get_winner() * side
                    q_vals[node] = wins[node] / visits[node]
                break
            if is_leaf:
                # Expansion: store renormalized priors for the new state
                # and back up the network's value estimate.
                leaf_input = self.preprocess_input(sim.board, to_play)
                sid = Othello.state_id(sim.board)
                leaf_pred = self.network.predict(leaf_input[np.newaxis, :])
                leaf_policy = leaf_pred[0][0]
                leaf_value = leaf_pred[1][0]
                moves = sim.possible_moves(to_play)
                if len(moves) == 0:
                    moves.append((-1, -1))
                norm = 1e-10
                for move in moves:
                    norm += leaf_policy[Othello.move_id(move)]
                for move in moves:
                    priors[(sid, Othello.move_id(move))] = leaf_policy[Othello.move_id(move)] / norm
                for node in path:
                    visits[node] += 1
                    wins[node] += leaf_value * side
                    q_vals[node] = wins[node] / visits[node]
                break
            # Otherwise keep descending along already-visited edges.

    # Root policy: raw visit counts for each legal root move.
    result = np.zeros((65))
    final_sid = Othello.state_id(game.board)
    for move in game.possible_moves(side):
        result[Othello.move_id(move)] = visits[(final_sid, Othello.move_id(move))]
    return result