Example #1
    def _process_state_reward(self, state: np.ndarray, transform: dict,
                              move: tuple, reward: float, temp_disc: float,
                              equiv: bool, ind_to_loc: List[Tuple]):
        actions = state_to_actions(tuple(state.flatten()), ind_to_loc, 0)
        valid_inds = [ind_to_loc.index(a) for a in actions]
        values = self._state_values(state)
        adj_move = reverse_function(move, ind_to_loc, transform['func'],
                                    transform['args'])
        move_ind = ind_to_loc.index(adj_move)
        current = values[move_ind].item()

        # The network outputs a probability for each action. When we receive a
        # reward for taking an action, we adjust that action's probability
        # accordingly and adjust the other actions to compensate, so the
        # probabilities still sum to 1. Earlier moves receive less credit,
        # according to the temporal discount rate.
        updated = self._update_value_with_reward(current, reward,
                                                 self.learning_rate, temp_disc)
        target = self._calc_target_values(values, current, updated, move_ind,
                                          valid_inds)
        loss = self.loss_fn(values, target)
        loss.backward()
        self.opt.step()
        self.opt.zero_grad()

        new_values = self._state_values(state)
        result = new_values[move_ind].item()
        mod = ValueMod(state=state,
                       move=move,
                       previous=current,
                       target=updated,
                       result=result,
                       equiv=equiv)
        return mod
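The two helpers called above, _update_value_with_reward and _calc_target_values, are not shown in this listing. Below is a minimal sketch, written as standalone functions rather than class methods, assuming the chosen move's value is nudged toward the reward (scaled by the learning rate and temporal discount) and the remaining valid actions are rescaled so the target distribution still sums to 1. The actual implementations may differ.

import torch

def _update_value_with_reward(current: float, reward: float,
                              learning_rate: float, temp_disc: float) -> float:
    # Move the chosen action's value toward the reward; earlier moves in the
    # game receive less credit via the temporal discount factor.
    updated = current + learning_rate * temp_disc * (reward - current)
    return min(max(updated, 0.0), 1.0)  # keep it a valid probability

def _calc_target_values(values: torch.Tensor, current: float, updated: float,
                        move_ind: int, valid_inds: list) -> torch.Tensor:
    # Copy the network output, overwrite the chosen move with its updated
    # value, and rescale the other valid moves so the targets sum to 1.
    # (current is accepted to mirror the call above but isn't needed here.)
    target = values.detach().clone()
    target[move_ind] = updated
    others = [i for i in valid_inds if i != move_ind]
    remaining = sum(target[i].item() for i in others)
    if remaining > 0:
        scale = (1.0 - updated) / remaining
        for i in others:
            target[i] = target[i] * scale
    return target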
Example #2
def test_state_to_actions():
    # arrange
    state = (0, 1, -1, 1, 0, -1, -1, 1, 0)
    expected_actions = [(0, 0), (1, 1), (2, 2)]

    # act
    actions = state_to_actions(state, Game.ind_to_loc, Game.empty_marker)

    # assert
    assert set(actions) == set(expected_actions)
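state_to_actions itself does not appear in these examples. A minimal sketch consistent with this test, assuming it simply returns every board location whose cell still holds the empty marker:

from typing import List, Tuple

def state_to_actions(state: Tuple[int, ...],
                     ind_to_loc: List[Tuple[int, int]],
                     empty_marker: int) -> List[Tuple[int, int]]:
    # Any cell still holding the empty marker is an available action.
    return [loc for ind, loc in enumerate(ind_to_loc)
            if state[ind] == empty_marker]

For the state above, cells 0, 4, and 8 are empty, which maps to (0, 0), (1, 1), and (2, 2) when ind_to_loc enumerates the board row by row.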
Example #3
def test_NeuralPlayer_play(net):
    game = Game()
    lr = 0.25
    agent = NeuralPlayer(net, lr)
    marker = 1
    state = np.reshape((0, 1, -1, 0, 1, 0, -1, 0, 0), game.board_shape)
    game.state = state
    actions = state_to_actions(tuple(state.flatten()), game.ind_to_loc,
                               game.empty_marker)

    loc = agent.play(marker, game)

    assert isinstance(loc, tuple)
    assert loc in actions
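The net fixture the test depends on is not listed. A minimal sketch, assuming a small fully connected torch network that maps the nine board cells to one probability per action (the layer sizes here are illustrative only):

import pytest
import torch.nn as nn

@pytest.fixture
def net():
    # Hypothetical network: 9 board cells in, 9 action probabilities out.
    return nn.Sequential(
        nn.Linear(9, 36),
        nn.ReLU(),
        nn.Linear(36, 9),
        nn.Softmax(dim=-1),
    )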
Example #4
def show_move_values(player: Player, game: Game):
    """Show learned values for game state.

    Args:
        player: instance of Player class
        game: instance of Game class
    """

    # TODO: factor the shared action/value logic into a helper used here and in play()
    actions = state_to_actions(tuple(game.state.flatten()), game.ind_to_loc,
                               game.empty_marker)
    raw_values = player._policy(game.turn, game)
    valid_inds = [game.ind_to_loc.index(a) for a in actions]
    valid_values = [
        raw_values[ind] if ind in valid_inds else 0
        for ind in range(len(raw_values))
    ]
    if sum(valid_values) <= 0:
        values = [1 / len(valid_values) for v in valid_values]
    else:
        values = [v / sum(valid_values) for v in valid_values]
    values = np.reshape(values, game.board_shape)

    _, ax = plt.subplots(figsize=(4.5, 4.5))
    _ = plt.plot([1, 1], [0, -3], 'k-', linewidth=4)
    _ = plt.plot([2, 2], [0, -3], 'k-', linewidth=4)
    _ = plt.plot([0, 3], [-1, -1], 'k-', linewidth=4)
    _ = plt.plot([0, 3], [-2, -2], 'k-', linewidth=4)
    for x, y in game.ind_to_loc:
        if game.state[x, y] != 0:
            mark = 'x' if game.state[x, y] == 1 else 'o'
            plt.text(y + 0.275, -x - 0.725, mark, size=60)
        else:
            plt.text(y + 0.35, -x - 0.575, round(values[x, y], 2), size=15)
            square = patches.Rectangle((y, -x - 1),
                                       1,
                                       1,
                                       linewidth=0,
                                       edgecolor='none',
                                       facecolor='r',
                                       alpha=values[x, y] * 0.75)
            ax.add_patch(square)
    _ = ax.axis('off')
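A typical call, assuming agent is an already trained player and np/plt are the usual numpy and matplotlib imports:

game = Game()
game.state = np.reshape((0, 1, -1, 0, 1, 0, -1, 0, 0), game.board_shape)
show_move_values(agent, game)  # draws the board with a value per open square
plt.show()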
Example #5
    def play(self, marker: int, game: Game) -> Tuple[int, int]:
        """Player's action during their turn.

        Args:
            marker: int, player's marker in this game
            game: instance of Game

        Returns:
            loc: tuple of int, action (board location)
        """

        actions = state_to_actions(tuple(game.state.flatten()),
                                   game.ind_to_loc, game.empty_marker)
        if len(actions) == 0:
            raise ValueError('no available actions')
        raw_values = self._policy(marker, game)
        valid_inds = [game.ind_to_loc.index(a) for a in actions]
        valid_values = [raw_values[ind] for ind in valid_inds]
        if sum(valid_values) <= 0:
            values = [1 / len(valid_values) for v in valid_values]
        else:
            values = [v / sum(valid_values) for v in valid_values]
        loc_inds = [i for i in range(len(values))]
        if self.explore:
            # limit the minimum probability for an action
            probs = [
                v if v > self.min_probability else self.min_probability
                for v in values
            ]
            probs = [v / sum(probs) for v in probs]
            # take action with probability proportional to value
            loc_ind = np.random.choice(loc_inds, p=probs)
        else:
            # exploit - take action with highest value
            loc_ind = loc_inds[np.argmax(values)]
        loc = actions[loc_ind]
        return loc
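The exploration floor in the explore branch can be illustrated with a quick standalone computation, assuming min_probability = 0.1:

values = [0.85, 0.03, 0.12]   # normalized values for three available actions
min_probability = 0.1
probs = [v if v > min_probability else min_probability for v in values]
probs = [v / sum(probs) for v in probs]
# probs ~ [0.79, 0.09, 0.11]: even a low-value move keeps a real chance of being tried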
Example #6
def initialize_value_map(init_val: float) -> dict:
    """Initialize a value map.

    Args:
        init_val: float, initial value

    Returns:
        init_value_map: dict, value map
    """

    prod_combs = product(Game.valid_markers + [Game.empty_marker],
                         repeat=Game.board_shape[0]**2)
    valid_combs = [pc for pc in prod_combs if abs(sum(pc)) < 2]

    non_dupes = []
    for vc in valid_combs:
        swap = tuple([elem * -1 for elem in vc])
        if swap not in non_dupes:
            non_dupes.append(vc)

    combs = []
    for nd in non_dupes:
        c_box = np.reshape(nd, Game.board_shape)
        rot90 = np.rot90(c_box)
        if tuple(rot90.flatten()) in combs:
            continue
        rot180 = np.rot90(c_box, k=2)
        if tuple(rot180.flatten()) in combs:
            continue
        rot270 = np.rot90(c_box, k=3)
        if tuple(rot270.flatten()) in combs:
            continue
        lr = np.fliplr(c_box)
        if tuple(lr.flatten()) in combs:
            continue
        ud = np.flipud(c_box)
        if tuple(ud.flatten()) in combs:
            continue
        combs.append(nd)

    # a state can't have more than one winning line; _update_won() raises
    # ValueError for impossible boards, so those are skipped
    states = []
    for c in combs:
        game = Game()
        game.state = np.reshape(c, Game.board_shape)
        try:
            game._update_won()
            states.append(c)
        except ValueError:
            pass

    init_value_map = {
        s: {
            m: {
                a: init_val
                for a in state_to_actions(s, Game.ind_to_loc,
                                          Game.empty_marker)
            }
            for m in [-1, 1]
        }
        for s in states
    }

    for s in init_value_map:
        game = Game()
        game.state = np.reshape(s, game.board_shape)
        game._update_won()
        for m in init_value_map[s]:
            # won state: no actions, just reward value
            if game.won in game.valid_markers:
                init_value_map[s][m] = 1 if m == game.won else 0
            # full board: no actions, just initial value
            elif len(init_value_map[s][m]) == 0:
                init_value_map[s][m] = init_val
            # cannot be marker's turn: no actions
            # NOTE: I don't explicitly reverse transform a marker swap
            #       so can't assume markers will match
            # elif sum(s) == m:
            #     init_value_map[s][m] = {}

    return init_value_map
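A typical use, assuming an initial action value of 0.5; the empty board survives the symmetry filtering, so it can be used to spot-check the structure of the returned map:

value_map = initialize_value_map(0.5)
empty_board = tuple([0] * 9)
print(value_map[empty_board][1])
# {(0, 0): 0.5, (0, 1): 0.5, ..., (2, 2): 0.5}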