def npc_action(self):
    print("NPC action")
    valid_actions = othello.get_valid_actions(self.state, self.npc)

    if self.npc_type == "random":
        action = random.choice(valid_actions)
        self.state, done = othello.step(self.state, action, self.npc)
        self.refresh()
        self.update_label()
        if done:
            self.update_label(game_end=True)
            return

    elif self.npc_type == "eps-greedy":
        if random.random() > self.epsilon:
            #: greedy: pick the move that maximizes the resulting stone count
            #: (second element returned by count_stone)
            best_action = None
            best_score = 0
            for action in valid_actions:
                next_state, done = othello.step(self.state, action, self.npc)
                _, score = othello.count_stone(next_state)
                if score > best_score:
                    best_score = score
                    best_action = action
            self.state, done = othello.step(self.state, best_action, self.npc)
        else:
            action = random.choice(valid_actions)
            self.state, done = othello.step(self.state, action, self.npc)
        self.refresh()
        self.update_label()
        if done:
            self.update_label(game_end=True)
            return

    elif self.npc_type == "alphazero":
        mcts_policy = self.mcts.search(root_state=self.state,
                                       current_player=self.npc,
                                       num_simulations=50)
        #: show the MCTS policy over the 6x6 board (the last entry is the pass move)
        print(np.array(mcts_policy[:-1]).reshape(6, 6))
        action = np.argmax(mcts_policy)
        self.state, done = othello.step(self.state, action, self.npc)
        self.refresh()
        self.update_label()
        if done:
            self.update_label(game_end=True)
            return

    else:
        raise NotImplementedError()
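#: The eps-greedy branch above mirrors the othello.greedy_action helper that
#: testplay() calls later. The othello module itself is not shown in this
#: section, so the following is only a minimal sketch of how such a helper
#: could look, assuming step/get_valid_actions/count_stone behave as used
#: above; it is not the module's actual implementation.
def greedy_action_sketch(state, player, epsilon=0.3):
    """With probability epsilon play a random legal move; otherwise pick the
    legal move that maximizes this player's resulting stone count."""
    valid_actions = othello.get_valid_actions(state, player)
    if random.random() < epsilon:
        return random.choice(valid_actions)
    best_action, best_score = valid_actions[0], -1
    for action in valid_actions:
        next_state, _ = othello.step(state, action, player)
        stone_first, stone_second = othello.count_stone(next_state)
        score = stone_first if player == 1 else stone_second
        if score > best_score:
            best_score, best_action = score, action
    return best_action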
def _expand(self, state, current_player):
    s = self.state_to_str(state, current_player)

    with tf.device("/cpu:0"):
        nn_policy, nn_value = self.network.predict(
            othello.encode_state(state, current_player))

    nn_policy, nn_value = nn_policy.numpy().tolist()[0], nn_value.numpy()[0][0]

    self.P[s] = nn_policy
    self.N[s] = [0] * othello.ACTION_SPACE
    self.W[s] = [0] * othello.ACTION_SPACE

    valid_actions = othello.get_valid_actions(state, current_player)

    #: cache valid actions and next states to save computation
    self.next_states[s] = [
        othello.step(state, action, current_player)[0]
        if (action in valid_actions) else None
        for action in range(othello.ACTION_SPACE)
    ]

    return nn_value
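#: _expand() populates the P/N/W tables and the next_states cache for one
#: node. For context, a typical PUCT selection step would consume them roughly
#: as below. This is only a sketch assuming Q = W / N and the standard PUCT
#: exploration bonus; the class's real search/selection methods are not shown
#: in this section.
def _puct_action_sketch(self, state, current_player, c_puct=1.0):
    s = self.state_to_str(state, current_player)
    valid_actions = othello.get_valid_actions(state, current_player)
    total_visits = sum(self.N[s])
    best_action, best_score = None, -float("inf")
    for action in valid_actions:
        n, w, p = self.N[s][action], self.W[s][action], self.P[s][action]
        q = w / n if n > 0 else 0.0                        #: mean action value
        u = c_puct * p * (total_visits ** 0.5) / (1 + n)   #: exploration bonus
        if q + u > best_score:
            best_score, best_action = q + u, action
    #: the cached successor state is reused instead of recomputing othello.step
    return best_action, self.next_states[s][best_action]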
def selfplay(weights, num_mcts_simulations, dirichlet_alpha):

    record = []

    state = othello.get_initial_state()

    network = AlphaZeroResNet(action_space=othello.ACTION_SPACE)
    #: build the network variables with a dummy forward pass before loading weights
    network.predict(othello.encode_state(state, 1))
    network.set_weights(weights)

    mcts = MCTS(network=network, alpha=dirichlet_alpha)

    current_player = 1
    done = False
    i = 0

    while not done:

        mcts_policy = mcts.search(root_state=state,
                                  current_player=current_player,
                                  num_simulations=num_mcts_simulations)

        if i <= 10:
            #: for the opening moves the temperature is set to tau = 1
            #: (the first 30 moves in the original paper, the first 11 here),
            #: which samples moves in proportion to their MCTS visit counts
            action = np.random.choice(range(othello.ACTION_SPACE), p=mcts_policy)
        else:
            #: afterwards play greedily (tau -> 0), breaking ties at random
            action = random.choice(
                np.where(np.array(mcts_policy) == max(mcts_policy))[0])

        record.append(Sample(state, mcts_policy, current_player, None))

        next_state, done = othello.step(state, action, current_player)

        state = next_state
        current_player = -current_player
        i += 1

    #: win: 1, lose: -1, draw: 0
    reward_first, reward_second = othello.get_result(state)

    for sample in reversed(record):
        sample.reward = reward_first if sample.player == 1 else reward_second

    return record
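#: selfplay() appends Sample objects and later assigns sample.reward in place,
#: so the container must be mutable. Its definition is not part of this
#: section; a minimal sketch consistent with how it is used above (field types
#: are assumptions) could be:
import dataclasses

@dataclasses.dataclass
class Sample:
    state: list          #: board state when the move was chosen
    mcts_policy: list    #: visit-count policy returned by MCTS
    player: int          #: 1 for the first player, -1 for the second
    reward: int          #: filled in after the game ends (+1 / -1 / 0)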
def player_action(self, event):
    if not self.is_player_turn or self.is_gameend:
        return
    else:
        self.is_player_turn = False

    print("Player action")

    #: map the clicked pixel (100 px per cell) to a board position
    row = event.y // 100
    col = event.x // 100
    action = othello.xy_to_idx(row, col)

    valid_actions = othello.get_valid_actions(self.state, self.human)

    #: if the player has no legal move, the only valid action is a pass
    if valid_actions == [othello.ACTION_NOOP]:
        action = othello.ACTION_NOOP

    if action in valid_actions:
        self.state, done = othello.step(self.state, action, self.human)
        self.refresh()
        self.update_label()
        if done:
            self.update_label(game_end=True)
            return
        time.sleep(0.3)
        self.npc_action()
        if self.is_gameend:
            return
    else:
        print("Invalid action")

    self.is_player_turn = True
    return
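#: player_action() relies on othello.xy_to_idx and othello.ACTION_NOOP, which
#: are defined elsewhere. Judging from npc_action() above, which reshapes
#: mcts_policy[:-1] to 6x6, the board appears to be 6x6 with the pass move as
#: the final action index; a sketch consistent with that reading (not the
#: module's actual code) would be:
N_ROWS = N_COLS = 6                  #: assumed 6x6 board
ACTION_SPACE = N_ROWS * N_COLS + 1   #: 36 board squares plus 1 pass action
ACTION_NOOP = ACTION_SPACE - 1       #: index of the pass ("no-op") move

def xy_to_idx_sketch(row, col):
    #: row-major flattening of a (row, col) board position
    return row * N_COLS + col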
def testplay(current_weights, num_mcts_simulations,
             dirichlet_alpha=None, n_testplay=24):

    t = time.time()

    win_count = 0

    network = AlphaZeroResNet(action_space=othello.ACTION_SPACE)
    dummy_state = othello.get_initial_state()
    network.predict(othello.encode_state(dummy_state, 1))
    network.set_weights(current_weights)

    for n in range(n_testplay):

        #: randomly assign AlphaZero to black (1) or white (-1)
        alphazero = random.choice([1, -1])

        mcts = MCTS(network=network, alpha=dirichlet_alpha)

        state = othello.get_initial_state()

        current_player = 1
        done = False

        while not done:
            if current_player == alphazero:
                mcts_policy = mcts.search(root_state=state,
                                          current_player=current_player,
                                          num_simulations=num_mcts_simulations)
                action = np.argmax(mcts_policy)
            else:
                #: the opponent is an epsilon-greedy baseline player
                action = othello.greedy_action(state, current_player, epsilon=0.3)

            next_state, done = othello.step(state, action, current_player)
            state = next_state
            current_player = -1 * current_player

        reward_first, reward_second = othello.get_result(state)
        reward = reward_first if alphazero == 1 else reward_second

        result = "win" if reward == 1 else "lose" if reward == -1 else "draw"

        if reward > 0:
            win_count += 1

        stone_first, stone_second = othello.count_stone(state)

        if alphazero == 1:
            stone_az, stone_tester = stone_first, stone_second
            color = "black"
        else:
            stone_az, stone_tester = stone_second, stone_first
            color = "white"

        message = f"AlphaZero ({color}) {result}: {stone_az} vs {stone_tester}"
        othello.save_img(state, "img", f"test_{n}.png", message)

    elapsed = time.time() - t

    return win_count, win_count / n_testplay, elapsed
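#: selfplay() and testplay() are free functions that only exchange network
#: weights, so a driver loop can call them directly. The project's actual
#: training loop (replay buffer, loss, optimizer, any parallelism) is not
#: shown here; the following is only an illustrative sequential sketch with
#: made-up names (buffer, train_network, num_iterations) and example
#: hyperparameter values.
def train_loop_sketch(num_iterations=100, num_mcts_simulations=50,
                      dirichlet_alpha=0.35):
    network = AlphaZeroResNet(action_space=othello.ACTION_SPACE)
    network.predict(othello.encode_state(othello.get_initial_state(), 1))
    buffer = []
    for it in range(num_iterations):
        weights = network.get_weights()
        #: generate one self-play game and store its (state, policy, reward) samples
        buffer += selfplay(weights, num_mcts_simulations, dirichlet_alpha)
        #: train_network is a placeholder for the actual policy/value update
        # train_network(network, buffer)
        if it % 10 == 0:
            wins, win_ratio, elapsed = testplay(network.get_weights(),
                                                num_mcts_simulations)
            print(f"iter {it}: win_ratio={win_ratio:.2f} ({elapsed:.1f}s)")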