Example #1
def test_boardenv_move_logic_four_in_a_row():
    # make sure merging is correct when a row is full of the same value.
    init_state = [
        [2.0, 2.0, 2.0, 2.0],
        [0.0, 0.0, 0.0, 0.0],
        [0.0, 0.0, 0.0, 0.0],
        [0.0, 0.0, 0.0, 0.0],
    ]
    b = BoardEnv().from_init_state(init_state)
    assert np.array_equal(init_state, b.state)
    state, reward, done, _ = b.step(BoardEnv.RIGHT)
    assert reward == 8
    assert state[0, 2] == 4 and state[0, 3] == 4, b.state
    state, reward, done, _ = b.step(BoardEnv.RIGHT)
    assert reward >= 8
    assert state[0, 3] == 8, b.state
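The two asserts above pin down the merge rule: a full row of equal tiles collapses into two merged tiles packed against the far edge, and a second move merges those again. Below is a minimal single-row sketch of that rule, assuming exactly the semantics these asserts describe; the helper is illustrative and is not part of BoardEnv.

def merge_row_right(row):
    # Hypothetical helper: slide a 4-cell row to the right, merging each pair
    # of equal neighbours at most once per move and summing merges as reward.
    tiles = [v for v in row if v != 0]      # drop empty cells
    merged, reward = [], 0
    while tiles:
        value = tiles.pop()                 # take the right-most remaining tile
        if tiles and tiles[-1] == value:    # merge with its left neighbour once
            value += tiles.pop()
            reward += value
        merged.append(value)
    merged.reverse()
    return [0] * (4 - len(merged)) + merged, reward

# [2, 2, 2, 2] -> ([0, 0, 4, 4], 8), matching the first two asserts;
# a second move right then gives ([0, 0, 0, 8], 8).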
Example #2
def test_board_env_step_three():
    init_state = [
        [2.0, 0.0, 0.0, 0.0],
        [0.0, 0.0, 0.0, 0.0],
        [0.0, 2.0, 0.0, 0.0],
        [0.0, 0.0, 0.0, 0.0],
    ]
    b = BoardEnv().from_init_state(init_state)
    state, reward, done, _ = b.step(BoardEnv.RIGHT)
    assert state[0, 3] == 2.0 and state[2, 3] == 2.0, state
Example #3
def test_boardenv_done_logic():
    init_state = [
        [16.0, 8.0, 16.0, 4.0],
        [4.0, 2.0, 4.0, 8.0],
        [32.0, 2.0, 32.0, 4.0],
        [4.0, 16.0, 4.0, 8.0],
    ]
    b = BoardEnv().from_init_state(init_state)
    state, reward, done, _ = b.step(BoardEnv.RIGHT)
    assert not done and np.array_equal(state, np.array(init_state))
    assert reward == 0
    state, reward, done, _ = b.step(BoardEnv.RIGHT)
    assert not done and np.array_equal(state, np.array(init_state))
    assert reward == 0
    state, reward, done, _ = b.step(BoardEnv.LEFT)
    assert not done and np.array_equal(state, np.array(init_state))
    assert reward == 0
    state, reward, done, _ = b.step(BoardEnv.DOWN)
    assert done, state
    assert reward == 4
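This test implies that done is raised only when, after the move, the board is full and no equal neighbours remain. A minimal sketch of such a check on a 4x4 numpy board follows; it is an assumption about how BoardEnv decides done, not its actual code.

import numpy as np

def no_moves_left(state):
    # Hypothetical done check: the board is full and no horizontally or
    # vertically adjacent cells hold equal values.
    state = np.asarray(state)
    if (state == 0).any():                              # an empty cell means a move exists
        return False
    horizontal = (state[:, :-1] == state[:, 1:]).any()
    vertical = (state[:-1, :] == state[1:, :]).any()
    return not (horizontal or vertical)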
Example #4
def test_board_env_step_one():
    # make sure tiles just slide to the edge when there is nothing to merge.
    init_state = [
        [2.0, 0.0, 0.0, 0.0],
        [0.0, 0.0, 0.0, 0.0],
        [0.0, 2.0, 0.0, 0.0],
        [0.0, 0.0, 0.0, 0.0],
    ]
    b = BoardEnv().from_init_state(init_state)
    state, reward, done, _ = b.step(BoardEnv.RIGHT)
    assert state[0, 3] == 2.0 and state[2, 3] == 2.0
Example #5
def test_board_env_step_two():
    init_state = [
        [4.0, 2.0, 2.0, 4.0],
        [0.0, 0.0, 0.0, 0.0],
        [0.0, 2.0, 0.0, 0.0],
        [0.0, 0.0, 0.0, 0.0],
    ]
    b = BoardEnv().from_init_state(init_state)
    state, reward, done, _ = b.step(BoardEnv.RIGHT)
    assert state[0, 1] == 4.0
    assert state[0, 2] == 4.0
    assert state[0, 3] == 4.0
Example #6
def test_boardenv_fill_on_move_logic():
    # make sure a new piece is added that is either a 2 or a 4
    init_state = [
        [2.0, 2.0, 0.0, 0.0],
        [0.0, 0.0, 0.0, 0.0],
        [0.0, 0.0, 0.0, 0.0],
        [0.0, 0.0, 0.0, 0.0],
    ]
    b = BoardEnv().from_init_state(init_state)
    state, reward, done, _ = b.step(BoardEnv.LEFT)
    num_non_zero_spots = (b.state != 0).sum().sum()
    assert num_non_zero_spots == 2, state
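The assert counts two non-zero cells: the merged 4 plus one freshly spawned tile. A minimal sketch of the spawn step this test relies on is shown below; the 90/10 split between 2s and 4s is an assumption for illustration, not taken from BoardEnv.

import numpy as np

def spawn_tile(state):
    # Hypothetical spawn step: place a 2 (or, less often, a 4) on a random empty cell.
    empty = np.argwhere(state == 0)
    if len(empty):
        row, col = empty[np.random.randint(len(empty))]
        state[row, col] = 2.0 if np.random.random() < 0.9 else 4.0
    return state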
Example #7
def test_boardenv_move_logic_three_in_a_row():
    # make sure merging is correct when three equal tiles are in a line.
    init_state = [
        [0.0, 2.0, 0.0, 0.0],
        [0.0, 2.0, 0.0, 0.0],
        [0.0, 2.0, 0.0, 0.0],
        [0.0, 0.0, 0.0, 0.0],
    ]
    b = BoardEnv().from_init_state(init_state)
    assert np.array_equal(init_state, b.state)
    state, reward, done, _ = b.step(BoardEnv.DOWN)
    assert reward == 4
    assert state[3, 1] == 4 and state[2, 1] == 2, b.state
Example #8
class Andy2048(Base2048):
    info = "Andy's implementation of 2048"
    UP = BoardEnv.UP
    RIGHT = BoardEnv.RIGHT
    DOWN = BoardEnv.DOWN
    LEFT = BoardEnv.LEFT

    @classmethod
    def from_init_state(cls, init_state):
        andy_wrapper = cls()
        if isinstance(init_state, list) and len(init_state) == 16:
            init_state = np.array(init_state).reshape((4, 4))
        andy_wrapper.andy = BoardEnv.from_init_state(init_state)
        return andy_wrapper

    def __init__(self, random_seed=None):
        self.andy = BoardEnv(random_seed=random_seed)

    @property
    def board(self):
        board = []
        for row in self.andy.state:
            for el in row:
                board.append(int(el))
        return board

    @property
    def score(self):
        return self.andy.value

    @property
    def action_space(self):
        return self.andy.action_space

    def step(self, direction):
        _, reward, _, c = self.andy.step(direction)
        return self.board, reward, self.andy.done, c

    def get_state(self):
        return self.board, self.score, self.andy.done

    def set_board(self, board):
        self.andy.state = np.array(board[:]).reshape(4, 4)
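For context, a short usage sketch of the wrapper above; the starting board is made up for illustration, everything else follows the methods shown in the class.

init_state = [2, 2, 0, 0] + [0] * 12                  # flat 16-cell board, accepted by from_init_state
game = Andy2048.from_init_state(init_state)
board, reward, done, _ = game.step(Andy2048.LEFT)     # board comes back as a flat list of ints
board, score, done = game.get_state()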
Example #9
    q_model.build(input_shape=(1, 16))
    optimizer = keras.optimizers.Adam(learning_rate=params["learning_rate"])
    p_loss_fn = keras.losses.CategoricalCrossentropy()

    b = BoardEnv()
    done = False
    for episode_num in range(params["num_episodes"]):
        state = b.reset()
        action_probs = tf.squeeze(p_model(state[np.newaxis]), axis=0)
        dice_roll = tfp.distributions.Multinomial(total_count=1,
                                                  probs=action_probs).sample(1)
        action = b.action_space[np.argmax(dice_roll)]
        game_score = 0
        for step_num in range(params["max_steps_per_episode"]):
            # compute s'
            next_state, reward, done, _ = b.step(action)
            if np.array_equal(next_state,
                              state):  # don't keep trying dud moves
                break
            # compute a' and grad log pi(a'|s')
            with tf.GradientTape() as p_tape:
                action_probs = tf.squeeze(p_model(next_state[np.newaxis]),
                                          axis=0)
                dice_roll = tfp.distributions.Multinomial(
                    total_count=1, probs=action_probs).sample(1)
                p_loss = p_loss_fn(dice_roll, action_probs)
            p_grads = p_tape.gradient(p_loss, p_model.trainable_variables)
            next_action = b.action_space[np.argmax(dice_roll)]
            # compute q(s,a), q(s',a') and update q_model
            with tf.GradientTape() as q_tape:
                q_val = tf.squeeze(q_model(state[np.newaxis]))[action]