def play(next_actions):
    """Play one complete game between two agents.

    next_actions: pair of callables (first player's, second player's); each
    maps a State to a legal action.
    Returns first_player_point(final_state) — the first player's result.
    """
    state = State()
    # Run until the game reaches a terminal state.
    while not state.is_done():
        # Index 0 acts on the first player's turn, index 1 otherwise.
        chooser = next_actions[0] if state.is_first_player() else next_actions[1]
        state = state.next_state(chooser(state))
    return first_player_point(state)
def play(model):
    """Run one self-play game with PV-MCTS and collect training data.

    model: policy/value network handed to pv_mcts_scores.
    Returns a list of records ``[[pieces, enemy_pieces], policies, value]``,
    one per ply, where ``policies`` is the MCTS visit distribution expanded
    to the full action space and ``value`` is the game outcome from the
    perspective of the player to move at that ply.
    """
    history = []
    state = State()
    while not state.is_done():
        # MCTS search probabilities over the current legal actions.
        scores = pv_mcts_scores(model, state, SP_TEMPERATURE)

        # Expand the per-legal-action scores to the full output vector.
        policies = [0] * DN_OUTPUT_SIZE
        for action, policy in zip(state.legal_actions(), scores):
            policies[action] = policy

        # Value (index 2) is unknown until the game ends; back-filled below.
        history.append([[state.pieces, state.enemy_pieces], policies, None])

        # Sample the move from the MCTS distribution.
        action = np.random.choice(state.legal_actions(), p=scores)
        state = state.next_state(action)

    # Back-fill outcomes: players alternate each ply, so the sign flips.
    value = first_player_value(state)
    for record in history:
        record[2] = value
        value = -value
    return history
# ---- tail of a Node method: PUCB-based child selection ----------------------
# NOTE(review): the enclosing `def` line lies outside this chunk; the header
# `def next_child_node(self):` is inferred from the body (it reads
# `self.child_nodes`) — confirm against the full file.
def next_child_node(self):
    """Return the child node maximizing the PUCB score."""
    C_PUCT = 1.0
    # Total visit count across all children.
    t = sum(nodes_to_scores(self.child_nodes))
    pucb_values = []
    for child in self.child_nodes:
        # Exploitation term: mean value from this player's perspective
        # (0.0 for unvisited children).
        exploit = -child.w / child.n if child.n > 0 else 0.0
        # Exploration term: prior-weighted upper-confidence bonus.
        explore = C_PUCT * child.policy * math.sqrt(t) / (1 + child.n)
        pucb_values.append(exploit + explore)
    return self.child_nodes[np.argmax(pucb_values)]


def pv_mcts_action(model, temperature=0):
    """Build an action chooser that samples from PV-MCTS scores.

    model: policy/value network passed through to pv_mcts_scores.
    temperature: Boltzmann temperature forwarded to the search.
    Returns a callable State -> action.
    """
    def choose(state):
        scores = pv_mcts_scores(model, state, temperature)
        return np.random.choice(state.legal_actions(), p=scores)
    return choose


if __name__ == '__main__':
    # Demo: load the most recent saved model and let it play a full game,
    # printing the board after every move.
    path = sorted(Path('./model').glob('*.h5'))[-1]
    model = tf.keras.models.load_model(str(path))

    state = State()
    next_action = pv_mcts_action(model, 1.0)
    while not state.is_done():
        state = state.next_state(next_action(state))
        print(state)