Exemplo n.º 1
0
def wythoff_dqn2(epsilon=0.1,
                 gamma=0.5,
                 learning_rate=1e-6,
                 num_episodes=100,
                 batch_size=20,
                 memory_capacity=100,
                 game='Wythoff10x10',
                 network='DQN_xy1',
                 anneal=False,
                 tensorboard=None,
                 update_every=5,
                 double=False,
                 double_update=10,
                 save=False,
                 save_model=False,
                 monitor=None,
                 return_none=False,
                 debug=False,
                 device='cpu',
                 clip_grad=False,
                 progress=False,
                 zero=False,
                 seed=None):
    """Learn Wythoff's game with a DQN.

    A single network (`player`) picks every move of every game with an
    e-greedy policy. The (state, action, next state, reward) transitions
    of each finished game are pushed into a replay memory and the network
    is then trained with one call to `train_dqn`.

    Parameters
    ----------
    epsilon : float
        Exploration rate given to `e_greedy`.
    gamma : float
        Forwarded to `train_dqn`.
    learning_rate : float
        Learning rate of the SGD optimizer.
    num_episodes : int
        Number of games to play (cast with `int`).
    batch_size : int
        Forwarded to `train_dqn`; training is also skipped while
        `episode < batch_size`.
    memory_capacity : int
        Capacity handed to `ReplayMemory`.
    game : str
        Environment name handed to `create_env`.
    network : str or class
        If it has a `forward` attribute it is used as the model class,
        otherwise it is looked up by name in `azad.models`.
    anneal : bool
        If True, epsilon is scaled by `1 / log(episode + e)`.
    tensorboard : str or None
        Directory for tensorboard logs and board images; None disables.
    update_every : int
        Log / monitor every `update_every` episodes.
    double : bool
        If True, a target network is kept and passed to `train_dqn`.
    double_update : int
        The target is re-synced to the player every `double_update`
        episodes.
    save : str or False
        If set, the result is written to `save + ".pytorch"` and monitored
        values are handed to `save_monitored`.
    save_model : bool
        Not used by this function.
    monitor : iterable of str or None
        Names of local variables to record (read through `locals()`), so
        local names in this function must stay stable.
    return_none : bool
        If True, return None instead of the result dict.
    debug, progress : bool
        Print diagnostics.
    device : str
        Torch device the networks live on.
    clip_grad : bool
        Forwarded to `train_dqn`.
    zero : bool
        Not used by this function.
    seed : int or None
        Seeds the environment and `np.random`.

    Returns
    -------
    result : dict or None
        `{"player": state_dict, "score": score}`, plus `"target"` in double
        mode and `"monitored"` when monitoring without `save`.
    """

    # ------------------------------------------------------------------------
    # Init
    num_episodes = int(num_episodes)
    batch_size = int(batch_size)
    memory_capacity = int(memory_capacity)
    update_every = int(update_every)

    # Logs...
    if tensorboard is not None:
        os.makedirs(tensorboard, exist_ok=True)
        writer = SummaryWriter(log_dir=tensorboard)

    if monitor is not None:
        monitored = create_monitored(monitor)

    # Env... (the env's own monitor is switched on only when tensorboard
    # logging was requested)
    env = create_env(game, monitor=tensorboard is not None)
    env.seed(seed)
    np.random.seed(seed)

    # ------------------------------------------------------------------------
    # Init
    #
    # Scores
    score = 0
    total_reward = 0

    # Agents, etc
    m, n, board, available = peek(env)
    all_possible_moves = create_all_possible_moves(m, n)

    # Is network a nn.Module?
    if hasattr(network, "forward"):
        Model = network
    # Is it the name of a azad model?
    else:
        Model = getattr(azad.models, network)

    player = Model().to(device)
    target = Model().to(device)

    if double:
        # The target starts as a frozen copy of the player.
        target.load_state_dict(player.state_dict())
        target.eval()
    else:
        target = None

    if debug:
        print(f"---------------------------------------")
        print("Setting up....")
        print(f">>> Device: {device}")
        print(f">>> Network is {player}")
        print(f">>> Memory capacity {memory_capacity} ({batch_size})")

    memory = ReplayMemory(memory_capacity)
    optimizer = optim.SGD(player.parameters(), learning_rate)
    moves = MoveCount(m, n)
    opts = OptimalCount(0)  # not used below

    # ------------------------------------------------------------------------
    for episode in range(1, num_episodes + 1):
        # Re-init
        transitions = []
        state = env.reset()
        x, y, board, available = state
        moves.update((x, y))
        if debug:
            print(f"---------------------------------------")
            print(f">>> NEW GAME ({episode}).")
            print(f">>> Initial position ({x}, {y})")
            print(f">>> Initial moves {available}")
            print(f">>> Cold available {locate_cold_moves(x, y, available)}")
            print(f">>> All cold {locate_all_cold_moves(x, y)}")

        # Anneal epsilon?
        if anneal:
            epsilon_e = epsilon * (1.0 / np.log((episode + np.e)))
        else:
            epsilon_e = epsilon

        # -------------------------------------------------------------------
        # Play a game
        steps = 1
        done = False
        while not done:
            # Choose a move
            Qs = build_Qs(player,
                          state,
                          available,
                          device=device,
                          mode="numpy")
            move_i = e_greedy(Qs, epsilon=epsilon_e, mode='numpy')
            move = available[move_i]
            moves.update(move)

            # Analyze it... `score` is a running average of how often a
            # cold move was chosen when one was available.
            best = 0.0
            if cold_move_available(x, y, available):
                if move in locate_cold_moves(x, y, available):
                    best = 1.0
                score += (best - score) / (episode + 1)

            # Play it
            state_next, reward, done, _ = env.step(move)
            (x_next, y_next, board_next, available_next) = state_next

            # Track value statistics. These are not used below but can be
            # picked up by `monitor` through locals().
            total_reward += reward
            Q = Qs[move_i]
            prediction_error = Qs.max() - Q
            advantage = Q - Qs[np.nonzero(Qs)].mean()

            # Save transitions, as tensors to be used at training time
            # (onto GPU)
            transitions.append([
                # S
                torch.tensor((x, y)).unsqueeze(0).unsqueeze(1).float(),
                # A
                torch.tensor(move).unsqueeze(0),
                # S'
                torch.tensor(
                    (x_next, y_next)).unsqueeze(0).unsqueeze(1).float(),
                # R
                torch.tensor([reward]).unsqueeze(0).float(),
            ])

            # -
            if debug:
                print(f">>> position: {(x, y)}")
                print(f">>> num available: {len(available)}")
                print(f">>> available: {available}")
                print(f">>> Qs (filtered): {Qs}")
                print(f">>> new position: ({x_next}, {y_next})")

            # Shift states
            state = deepcopy(state_next)
            board = deepcopy(board_next)
            available = deepcopy(available_next)
            x = deepcopy(x_next)
            y = deepcopy(y_next)

            steps += 1

        # ----------------------------------------------------------------
        # Learn from the game
        #
        # Find the losers transition and update its reward w/ -reward.
        # (`steps` starts at 1, so `steps > 2` means at least two moves
        # were played and a second-to-last transition exists.)
        if steps > 2:
            transitions[-2][3] = transitions[-1][3] * -1

        # Update the memories using the transitions from this game
        for transition in transitions:
            memory.push(*transition)

        if debug:
            print(f">>> final transitions: {transitions[-2:]}")

        # Bypass if we don't have enough in memory to learn. This also
        # skips logging and monitoring for these early episodes.
        if episode < batch_size:
            continue

        # Learn, sampling a batch of transitions from memory
        player, loss = train_dqn(batch_size,
                                 player,
                                 memory,
                                 optimizer,
                                 device,
                                 target=target,
                                 gamma=gamma,
                                 clip_grad=clip_grad)

        # Update target net, if in double mode and time is right.
        if double and (episode % double_update == 0):
            target.load_state_dict(player.state_dict())

        # ----------------------------------------------------------------
        # Logs...
        if progress:
            print(f"---")
        if progress or debug:
            print(f">>> episode: {episode}")
            print(f">>> loss {loss}")
            print(f">>> Q(last,a): {Q}")
            print(f">>> epsilon: {epsilon_e}")
            print(f">>> score: {score}")

        if tensorboard and (int(episode) % update_every) == 0:
            writer.add_scalar('reward', reward, episode)
            writer.add_scalar('epsilon_e', epsilon_e, episode)
            writer.add_scalar('loss', loss, episode)
            writer.add_scalar('steps', steps, episode)
            writer.add_scalar('score', score, episode)

            # Cold ref:
            cold = create_cold_board(m, n)
            _dqn2_log_board_image(writer, 'cold_positions', cold,
                                  tensorboard, 'cold_board.png',
                                  vmin=0, vmax=1)

            # Extract all value boards, and find extrema
            values = torch.zeros((len(all_possible_moves), m, n))
            for i, a in enumerate(all_possible_moves):
                sample_hat = np.asarray(create_board(a[0], a[1], m, n))

                sample_hat = torch.from_numpy(sample_hat)
                sample_hat = sample_hat.unsqueeze(0).unsqueeze(1).float()

                # The player lives on `device`; feed it an input on the
                # same device and bring the output back to the CPU, where
                # `values` is kept and later converted to numpy.
                values[i, :, :] = player(
                    sample_hat.to(device)).detach().cpu().float().reshape(
                        m, n)

            mean_values = torch.mean(values, 0)
            max_values, _ = torch.max(values, 0)
            min_values, _ = torch.min(values, 0)

            # Log
            writer.add_scalar('Q_mean', torch.mean(mean_values), episode)
            writer.add_scalar('Q_min', torch.mean(min_values), episode)
            writer.add_scalar('Q_max', torch.mean(max_values), episode)

            # Plot mean / max / min boards, each scaled to its own range
            for tag, name, board_values in (
                ('mean player', 'player_mean_values.png', mean_values),
                ('max player', 'player_max_values.png', max_values),
                ('min player', 'player_min_values.png', min_values),
            ):
                board_array = board_values.numpy()
                _dqn2_log_board_image(writer, tag, board_array,
                                      tensorboard, name,
                                      vmin=board_array.min(),
                                      vmax=board_array.max())

            # Plot move count
            _dqn2_log_board_image(writer, 'moves', moves.count,
                                  tensorboard, 'moves.png',
                                  vmin=0, vmax=moves.count.max() / 10)

        if monitor and (int(episode) % update_every) == 0:
            # Monitored names are resolved against this function's locals.
            all_variables = locals()
            for k in monitor:
                monitored[k].append(float(all_variables[k]))

    # --------------------------------------------------------------------
    if monitor and save:
        save_monitored(save, monitored)
    if tensorboard:
        writer.close()

    result = {"player": player.state_dict(), "score": score}
    if target is not None:
        result['target'] = target.state_dict()
    if save:
        torch.save(result, save + ".pytorch")

    if monitor and not save:
        result["monitored"] = monitored

    if return_none:
        result = None

    return result


def _dqn2_log_board_image(writer, tag, board, path, name, vmin, vmax):
    """Plot `board` to `path/name` and add the saved image to tensorboard."""
    plot_wythoff_board(board, vmin=vmin, vmax=vmax, path=path, name=name)
    image = skimage.io.imread(os.path.join(path, name))
    writer.add_image(tag, torch.from_numpy(image), 0, dataformats='HWC')
Exemplo n.º 2
0
def wythoff_stumbler(num_episodes=10,
                     epsilon=0.1,
                     gamma=0.8,
                     learning_rate=0.1,
                     game='Wythoff10x10',
                     model=None,
                     opponent=None,
                     anneal=False,
                     bias_board=None,
                     influence=0.0,
                     score=0.0,
                     total_reward=0.0,
                     tensorboard=None,
                     update_every=5,
                     initial=0,
                     self_play=False,
                     save=False,
                     load_model=None,
                     save_model=False,
                     monitor=None,
                     return_none=False,
                     debug=False,
                     seed=None):
    """Learn to play Wythoff's w/ e-greedy random exploration.

    Note: Learning is based on a player-opponent joint action formalism
    and tabular Q-learning.

    The player (`model`) and the `opponent` alternate moves, the player
    first. Both are dicts mapping a flattened board (tuple) to an array of
    Q values, one per available move. After each game both tables are
    updated by replaying the recorded game.

    Parameters
    ----------
    num_episodes : int
        Number of games to play.
    epsilon : float
        Exploration rate given to `epsilon_greedy`.
    gamma : float
        Discount applied to the max Q of the next state.
    learning_rate : float
        Step size of the tabular Q update.
    game : str
        Environment name handed to `create_env`.
    model, opponent : dict or None
        Q tables to continue from; empty tables are created when None.
        Tables that are passed in are updated in place.
    anneal : bool
        If True, epsilon is scaled by `1 / log(episode + e)`.
    bias_board, influence
        Forwarded to `add_bias_board` together with the Q values.
    score, total_reward : float
        Starting values of the running score and reward totals.
    tensorboard : str or None
        Directory for tensorboard logs and board images; None disables.
    update_every : int
        Log / monitor every `update_every` episodes.
    initial : int
        Index of the first episode; episodes run over
        `range(initial, initial + num_episodes)`.
    self_play : bool
        Not used by this function.
    save : str or False
        Path prefix used by `torch.save` (when `save_model`) and by
        `save_monitored` (when `monitor`).
    load_model : str or None
        If set, tables are (re)loaded through `load_stumbler`.
    save_model : bool
        If True, both tables are written to `save + ".pytorch"`.
    monitor : iterable of str or None
        Names of local variables to record (read through `locals()`), so
        local names in this function must stay stable.
    return_none : bool
        If True, return None instead of the result.
    debug : bool
        Print diagnostics.
    seed : int or None
        Seeds the environment and `np.random`.

    Returns
    -------
    result : tuple or None
        `((model, opponent), (score, total_reward))`.
    """

    # ------------------------------------------------------------------------
    # Init env
    if tensorboard is not None:
        try:
            os.makedirs(tensorboard)
        except OSError as exception:
            if exception.errno != errno.EEXIST:
                raise
        writer = SummaryWriter(log_dir=tensorboard)

    # Create env
    if tensorboard is not None:
        env = create_env(game, monitor=True)
    else:
        env = create_env(game, monitor=False)
    env.seed(seed)
    np.random.seed(seed)

    if monitor is not None:
        monitored = create_monitored(monitor)

    # ------------------------------------------------------------------------
    # Init Agents
    default_Q = 0.0
    m, n, board, available = peek(env)
    if model is None:
        model = {}
    if opponent is None:
        opponent = {}

    # Override from file?
    if load_model is not None:
        if debug:
            print(">>> Loadiing model/opponent from {}".format(load_model))

        model, opponent = load_stumbler(model, opponent, load_model)

    # ------------------------------------------------------------------------
    for episode in range(initial, initial + num_episodes):
        # Re-init
        steps = 1

        x, y, board, available = env.reset()
        board = tuple(flatten_board(board))
        if debug:
            print("---------------------------------------")
            print(">>> NEW GAME ({}).".format(episode))
            print(">>> Initial position ({}, {})".format(x, y))
            print(">>> Initial moves {}".format(available))
            print("---------------------------------------")

        # Game record. t_state[k] / t_available[k] describe the position
        # before move k; t_move[k], t_move_i[k], t_reward[k] describe
        # move k itself.
        t_state = [
            board,
        ]
        t_available = [available]
        t_move = []
        t_move_i = []
        t_reward = []

        # -------------------------------------------------------------------
        # Anneal epsilon?
        if anneal:
            epsilon_e = epsilon * (1.0 / np.log((episode + np.e)))
        else:
            # Use the configured exploration rate (this used to be set to
            # the episode index by mistake).
            epsilon_e = epsilon

        # -------------------------------------------------------------------
        # Play!
        done = False
        player_win = False
        while not done:
            # PLAYER CHOOSES A MOVE
            try:
                Qs_episode = add_bias_board(model[board], available,
                                            bias_board, influence)
                move_i = epsilon_greedy(
                    Qs_episode, epsilon=epsilon_e, mode='numpy')
            except KeyError:
                # Unseen position: create its table row and move randomly.
                model[board] = np.ones(len(available)) * default_Q
                move_i = np.random.randint(0, len(available))

            move = available[move_i]

            # Analyze it... `score` is a running average of how often the
            # player chose a cold move when one was available.
            best = 0.0
            if cold_move_available(x, y, available):
                if move in locate_cold_moves(x, y, available):
                    best = 1.0
                score += (best - score) / (episode + 1)

            # PLAY THE MOVE
            (x, y, board, available), reward, done, _ = env.step(move)
            board = tuple(flatten_board(board))
            steps += 1

            # Log....
            if debug:
                print(">>> PLAYER move {}".format(move))

            t_state.append(board)
            t_move.append(move)
            t_available.append(available)
            t_move_i.append(move_i)
            t_reward.append(reward)

            if done:
                player_win = True
                # Pad the record with a copy of the last entry so the
                # `i + 1` / `i + 2` look-ahead in the learning loops below
                # stays in range.
                t_state.append(board)
                t_move.append(move)
                t_available.append(available)
                t_move_i.append(move_i)
                t_reward.append(reward)

            # ----------------------------------------------------------------
            if not done:
                # OPPONENT CHOOSES A MOVE
                try:
                    Qs_episode = add_bias_board(opponent[board], available,
                                                bias_board, influence)
                    move_i = epsilon_greedy(
                        Qs_episode, epsilon=epsilon_e, mode='numpy')
                except KeyError:
                    opponent[board] = np.ones(len(available)) * default_Q
                    move_i = np.random.randint(0, len(available))

                move = available[move_i]

                # PLAY THE MOVE
                (x, y, board, available), reward, done, _ = env.step(move)
                board = tuple(flatten_board(board))
                steps += 1

                # Log....
                if debug:
                    print(">>> OPPONENT move {}".format(move))

                t_state.append(board)
                t_move.append(move)
                t_available.append(available)
                t_move_i.append(move_i)
                t_reward.append(reward)

                if done:
                    # Same padding as for a player win.
                    t_state.append(board)
                    t_move.append(move)
                    t_available.append(available)
                    t_move_i.append(move_i)
                    t_reward.append(reward)

        # ----------------------------------------------------------------
        # Learn by unrolling the last game...

        # PLAYER (model): the player made the even-numbered moves.
        s_idx = np.arange(0, steps - 1, 2)
        for i in s_idx:
            # States and actions
            s = t_state[i]
            next_s = t_state[i + 2]
            m_i = t_move_i[i]

            # Value and reward
            Q = model[s][m_i]

            try:
                max_Q = model[next_s].max()
            except KeyError:
                # NOTE(review): the new row is sized from the moves
                # available at `s` (index i), not at `next_s` -- confirm
                # this is intended.
                model[next_s] = np.ones(len(t_available[i])) * default_Q
                max_Q = model[next_s].max()

            if player_win:
                r = t_reward[i]
            else:
                # The player's reward is the negated reward of the
                # opponent's reply.
                r = -1 * t_reward[i + 1]

            # Update running reward total for player
            total_reward += r

            # Loss and learn
            next_Q = r + (gamma * max_Q)
            loss = next_Q - Q
            model[s][m_i] = Q + (learning_rate * loss)

        # OPPONENT: the opponent made the odd-numbered moves.
        s_idx = np.arange(1, steps - 1, 2)
        for i in s_idx:
            # States and actions
            s = t_state[i]
            next_s = t_state[i + 2]
            m_i = t_move_i[i]

            # Value and reward
            Q = opponent[s][m_i]

            try:
                max_Q = opponent[next_s].max()
            except KeyError:
                # NOTE(review): sized from index i, as for the player.
                opponent[next_s] = np.ones(len(t_available[i])) * default_Q
                max_Q = opponent[next_s].max()

            if not player_win:
                r = t_reward[i]
            else:
                r = -1 * t_reward[i + 1]

            # Loss and learn
            next_Q = r + (gamma * max_Q)
            loss = next_Q - Q
            opponent[s][m_i] = Q + (learning_rate * loss)

        # ----------------------------------------------------------------
        # Update the log. `r`, `Q`, `next_Q` and `loss` hold whatever the
        # last update above left in them (the opponent's last update when
        # the opponent moved at all, otherwise the player's).
        if debug:
            print(">>> Reward {}; Loss(Q {}, next_Q {}) -> {}".format(
                r, Q, next_Q, loss))

            if done and (r > 0):
                print("*** WIN ***")
            if done and (r < 0):
                print("*** OPPONENT WIN ***")

        if tensorboard and (int(episode) % update_every) == 0:
            writer.add_scalar('reward', r, episode)
            writer.add_scalar('Q', Q, episode)
            writer.add_scalar('epsilon_e', epsilon_e, episode)
            writer.add_scalar('stumber_error', loss, episode)
            writer.add_scalar('stumber_steps', steps, episode)
            writer.add_scalar('stumbler_score', score, episode)

            # Cold ref:
            cold = create_cold_board(m, n)
            plot_wythoff_board(
                cold, vmin=0, vmax=1, path=tensorboard, name='cold_board.png')
            writer.add_image(
                'cold_positions',
                skimage.io.imread(os.path.join(tensorboard, 'cold_board.png')))

            # Agent max(Q) boards
            values = expected_value(m, n, model)
            plot_wythoff_board(
                values, path=tensorboard, name='player_max_values.png')
            writer.add_image(
                'player',
                skimage.io.imread(
                    os.path.join(tensorboard, 'player_max_values.png')))

            values = expected_value(m, n, opponent)
            plot_wythoff_board(
                values, path=tensorboard, name='opponent_max_values.png')
            writer.add_image(
                'opponent',
                skimage.io.imread(
                    os.path.join(tensorboard, 'opponent_max_values.png')))

        if monitor and (int(episode) % update_every) == 0:
            # Monitored names are resolved against this function's locals.
            all_variables = locals()
            for k in monitor:
                monitored[k].append(float(all_variables[k]))

    # --------------------------------------------------------------------
    # NOTE(review): both saves below use `save` as a path prefix even when
    # it was left at its default of False -- confirm callers always set it
    # together with `save_model` / `monitor`.
    if save_model:
        state = {
            'stumbler_player_dict': model,
            'stumbler_opponent_dict': opponent
        }
        torch.save(state, save + ".pytorch")

    if monitor:
        save_monitored(save, monitored)

    if tensorboard:
        writer.close()

    result = (model, opponent), (score, total_reward)
    if return_none:
        result = None

    return result
Exemplo n.º 3
0
def evaluate_dqn2(path,
                  game='Wythoff15x15',
                  num_episodes=100,
                  opponent='self',
                  model_name="player",
                  network='DQN_xy2',
                  monitor=None,
                  save=None,
                  debug=False,
                  return_none=False,
                  seed=None):
    """Evaluate transfer on a frozen DQN model.

    The saved model always plays greedily as player 0 and moves first;
    player 1 is chosen by `opponent`.

    Parameters
    ----------
    path : str
        File read with `torch.load`; `result[model_name]` must be the
        state dict to load into the network.
    game : str
        Environment name handed to `create_env`.
    num_episodes : int
        Number of games to play (cast with `int`).
    opponent : {'self', 'random', 'optimal', 'efficient'}
        'self' plays the same model greedily; 'random' picks any available
        move; 'optimal' picks a random cold move when one exists;
        'efficient' picks the cold move with the smallest coordinate sum.
        The last two fall back to a random move.
    model_name : str
        Key of the state dict inside the loaded file.
    network : str or class
        If it has a `forward` attribute it is used as the model class,
        otherwise it is looked up by name in `azad.models`.
    monitor : iterable of str or None
        Names of local variables recorded after every game (read through
        `locals()`), so local names in this function must stay stable.
    save : str or None
        If set, monitored values are handed to `save_monitored`.
    debug : bool
        Print diagnostics.
    return_none : bool
        If True, return None instead of the result dict.
    seed : int or None
        Seeds the environment and `np.random`; when not None it also
        seeds the `random` module used for the opponents' random choices.

    Returns
    -------
    result : dict or None
        `{"total_reward": ...}` plus `"monitored"` when monitoring without
        `save`.

    Raises
    ------
    ValueError
        If `opponent` is not one of the supported names.
    """

    # ------------------------------------------------------------------------
    # Arg sanity
    num_episodes = int(num_episodes)
    opponents = ('self', 'random', 'optimal', 'efficient')
    if opponent not in opponents:
        raise ValueError(f"opponent must be {opponents}")

    # Logs
    if monitor is not None:
        monitored = create_monitored(monitor)

    # Init
    total_reward = 0
    opts = OptimalCount(0)  # not used below

    # Env
    env = create_env(game, monitor=False)
    env.seed(seed)
    np.random.seed(seed)
    # The non-'self' opponents draw moves with `random.choice`, so seed
    # that generator too; otherwise seeded evaluations are not
    # reproducible. Left untouched when no seed was requested.
    if seed is not None:
        random.seed(seed)

    m, n, board, available = peek(env)
    all_possible_moves = create_all_possible_moves(m, n)

    # Load the final model to evaluate.
    # NOTE(review): torch.load unpickles the file; only load trusted paths.
    result = torch.load(path, map_location=torch.device('cpu'))
    state_dict = result[model_name]

    # Is network a nn.Module?
    if hasattr(network, "forward"):
        Model = network
    # Is it the name of a azad model?
    else:
        Model = getattr(azad.models, network)

    model = Model().to("cpu")
    model.load_state_dict(state_dict)

    # ------------------------------------------------------------------------
    for episode in range(1, num_episodes + 1):
        # The model being evaluated (player 0) moves first
        player = 0

        # Init this game
        state = env.reset()
        x, y, board, available = state
        if debug:
            print(f"---------------------------------------")
            print(f">>> NEW GAME ({episode}).")
            print(f">>> Initial position ({x}, {y})")
            print(f">>> Initial moves {available}")
            print(f">>> Cold available {locate_cold_moves(x, y, available)}")
            print(f">>> All cold {locate_all_cold_moves(x, y)}")

        # ---
        # Play a game
        done = False
        steps = 0
        score = 0  # reset for every game
        while not done:
            # Optimal moves
            colds = locate_cold_moves(x, y, available)

            # Build value array
            Qs = build_Qs(model, state, available, device="cpu", mode="numpy")

            # Choose a move, based on the player code and the opponent type.
            # Player 0 is always the final model we are evaluating.
            if player == 0:
                move_i = e_greedy(Qs, epsilon=0, mode='numpy')
                move = available[move_i]
            elif (player == 1) and (opponent == 'self'):
                move_i = e_greedy(Qs, epsilon=0, mode='numpy')
                move = available[move_i]
            elif (player == 1) and (opponent == 'random'):
                move = random.choice(available)
            elif (player == 1) and (opponent == 'optimal'):
                if len(colds) > 0:
                    move = random.choice(colds)
                else:
                    move = random.choice(available)
            elif (player == 1) and (opponent == 'efficient'):
                if len(colds) > 0:
                    # Cold move with the smallest x + y
                    distances = [sum(c) for c in colds]
                    move_i = np.argmin(distances)
                    move = colds[move_i]
                else:
                    move = random.choice(available)

            # Play it
            state_next, reward, done, _ = env.step(move)
            (x_next, y_next, board_next, available_next) = state_next

            # Analyze it if it was the player
            best = 0.0
            if (player == 0) and cold_move_available(x, y, available):
                if move in locate_cold_moves(x, y, available):
                    best = 1.0
                score += (best - score) / (episode + 1)

            # -
            if debug:
                print(f">>> available: {available}")
                print(f">>> Qs: {Qs}")
                print(f">>> new position: ({x_next}, {y_next})")

            # Shift for next play?
            if not done:
                # Shift states
                state = deepcopy(state_next)
                board = deepcopy(board_next)
                available = deepcopy(available_next)
                x = deepcopy(x_next)
                y = deepcopy(y_next)
                player = shift_player(player)
                steps += 1

            # Tabulate wins, and totals (p1). `player` is not shifted once
            # the game is done, so it still names whoever moved last.
            if done:
                winner = player
            if player == 0 and done:
                total_reward += reward

        # ---
        # Save into monitored after each game
        if monitor:
            all_variables = locals()
            for k in monitor:
                monitored[k].append(float(all_variables[k]))

    # ---
    # Tournament over....
    # Build a result, focused on player
    result = {}
    result["total_reward"] = total_reward

    # Save? The end.
    if monitor and save is not None:
        save_monitored(save, monitored)
    if monitor and not save:
        result["monitored"] = monitored
    if return_none:
        return None
    else:
        return result
Exemplo n.º 4
0
def evaluate_wythoff(stumbler=None,
                     strategist=None,
                     stumbler_game='Wythoff10x10',
                     strategist_game='Wythoff50x50',
                     random_stumbler=False,
                     load_model=None,
                     save=None,
                     return_none=False,
                     num_episodes=100,
                     debug=False):
    """Compare stumblers to strategists.

    A stumbler and a strategist take turns on the strategist's board, the
    stumbler moving first. The stumbler uses its Q table only while the
    position lies inside its own (smaller) board and the position is in
    the table; otherwise it moves at random. The strategist picks greedily
    from a hot/cold table.

    Parameters
    ----------
    stumbler : dict or None
        Q table mapping a flattened board (tuple) to move values. None is
        treated as an empty table, i.e. the stumbler always moves randomly.
    strategist
        Handed to `create_bias_board` to build the hot/cold table; when
        None an all-zero table shaped like the board is used.
    stumbler_game, strategist_game : str
        Environment names handed to `create_env`; the game is played in
        the `strategist_game` environment.
    random_stumbler : bool
        If True, the stumbler's move is always replaced by a random one.
    load_model : str or None
        If set, stumbler and strategist are loaded with `load_for_eval`.
    save : str or None
        If set, the raw win count and both scores are written there as one
        CSV row.
    return_none : bool
        If True, return None instead of the result.
    num_episodes : int
        Number of games to play.
    debug : bool
        Print diagnostics.

    Returns
    -------
    result : tuple or None
        `(wins / num_episodes, stumbler_score, strategist_score)`, where
        the first entry is the fraction of games won by the strategist.
    """
    # ------------------------------------------------------------------------
    if load_model is not None:
        stumbler, _, strategist = load_for_eval(load_model)

    # Without a table every lookup below should miss and fall back to a
    # random move; indexing None would raise TypeError instead, which the
    # KeyError handler does not catch.
    if stumbler is None:
        stumbler = {}

    # Init boards, etc
    # Strategist
    env = create_env(strategist_game, monitor=False)
    m, n, board, _ = peek(env)
    if strategist is not None:
        hot_cold_table = create_bias_board(m, n, strategist)
    else:
        hot_cold_table = np.zeros_like(board)

    # Stumbler: (o, p) is the size of the board it was trained on
    o, p, _, _ = peek(create_env(stumbler_game, monitor=False))

    # ------------------------------------------------------------------------
    # A stumbler and a strategist take turns playing a (m,n) game of wythoffs
    wins = 0.0
    strategist_score = 0.0
    stumbler_score = 0.0
    for episode in range(num_episodes):
        # Re-init
        steps = 0

        # Start the game, and process the result
        x, y, board, available = env.reset()
        board = tuple(flatten_board(board))

        if debug:
            print("---------------------------------------")
            print(">>> NEW MODEL EVALUATION ({}).".format(episode))
            print(">>> Initial position ({}, {})".format(x, y))

        done = False
        while not done:
            # ----------------------------------------------------------------
            # STUMBLER
            if (x < o) and (y < p):
                # Inside the stumbler's own board: re-express the position
                # on an (o, p) board so it can be looked up in the table.
                s_board = tuple(flatten_board(create_board(x, y, o, p)))
                s_available = create_moves(x, y)
                try:
                    values = stumbler[s_board]
                    move_i = epsilon_greedy(values, epsilon=0.0, mode='numpy')
                    move = s_available[move_i]
                except KeyError:
                    move_i = np.random.randint(0, len(s_available))
                    move = s_available[move_i]
            else:
                # Outside the stumbler's board: move at random.
                s_available = available
                move_i = np.random.randint(0, len(s_available))
                move = s_available[move_i]

            # ----------------------------------------------------------------
            # RANDOM PLAYER (overrides whatever was chosen above)
            if random_stumbler:
                move_i = np.random.randint(0, len(available))
                move = available[move_i]

            # Analyze the choice: running average of how often a cold move
            # was chosen when one was available.
            best = 0.0
            if cold_move_available(x, y, s_available):
                if move in locate_cold_moves(x, y, s_available):
                    best = 1.0
                stumbler_score += (best - stumbler_score) / (episode + 1)

            # Move
            (x, y, board, available), reward, done, _ = env.step(move)
            board = tuple(flatten_board(board))
            if debug:
                print(">>> STUMBLER move {}".format(move))

            if done:
                break

            # ----------------------------------------------------------------
            # STRATEGIST
            # Choose.
            hot_cold_move_values = [hot_cold_table[i, j] for i, j in available]
            move_i = epsilon_greedy(
                np.asarray(hot_cold_move_values), epsilon=0.0, mode='numpy')
            move = available[move_i]

            if debug:
                print(">>> STRATEGIST move {}".format(move))

            # Analyze the choice
            best = 0.0
            if cold_move_available(x, y, available):
                if move in locate_cold_moves(x, y, available):
                    best = 1.0
                strategist_score += (best - strategist_score) / (episode + 1)

            # Make a move
            (x, y, board, available), reward, done, _ = env.step(move)
            board = tuple(flatten_board(board))
            if done:
                # The strategist made the final move.
                wins += 1.0
                break

        if debug:
            print("Wins {}, Scores ({}, {})".format(wins, stumbler_score,
                                                    strategist_score))

    if save is not None:
        # Note that the saved `wins` is the raw count, not the fraction.
        np.savetxt(
            save,
            np.asarray([wins, stumbler_score, strategist_score]).reshape(1, 3),
            fmt='%.1f,%.4f,%.4f',
            comments="",
            header="wins,stumbler_score,strategist_score")

    result = (wins / num_episodes), stumbler_score, strategist_score
    if return_none:
        result = None

    return result